Merge tag 'for-7.1/block-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull block updates from Jens Axboe:

 - Add shared memory zero-copy I/O support for ublk, bypassing per-I/O
   copies between kernel and userspace by matching registered buffer
   PFNs at I/O time. Includes selftests.

 - Refactor bio integrity to support filesystem initiated integrity
   operations and arbitrary buffer alignment.

 - Clean up bio allocation, splitting bio_alloc_bioset() into clear fast
   and slow paths. Add bio_await() and bio_submit_or_kill() helpers,
   unify synchronous bi_end_io callbacks.

 - Fix zone write plug refcount handling and plug removal races. Add
   support for serializing zone writes at QD=1 for rotational zoned
   devices, yielding significant throughput improvements.

 - Add SED-OPAL ioctls for Single User Mode management and a STACK_RESET
   command.

 - Add io_uring passthrough (uring_cmd) support to the BSG layer.

 - Replace pp_buf in partition scanning with struct seq_buf.

 - zloop improvements and cleanups.

 - drbd genl cleanup, switching to pre_doit/post_doit.

 - NVMe pull request via Keith:
      - Fabrics authentication updates
      - Enhanced block queue limits support
      - Workqueue usage updates
      - A new write zeroes device quirk
      - Tagset cleanup fix for loop device

 - MD pull requests via Yu Kuai:
      - Fix raid5 soft lockup in retry_aligned_read()
      - Fix raid10 deadlock with check operation and nowait requests
      - Fix raid1 overlapping writes on writemostly disks
      - Fix sysfs deadlock on array_state=clear
      - Proactive RAID-5 parity building with llbitmap, with
        write_zeroes_unmap optimization for initial sync
      - Fix llbitmap barrier ordering, rdev skipping, and bitmap_ops
        version mismatch fallback
      - Fix bcache use-after-free and uninitialized closure
      - Validate raid5 journal metadata payload size
      - Various cleanups

 - Various other fixes, improvements, and cleanups

* tag 'for-7.1/block-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (146 commits)
  ublk: fix tautological comparison warning in ublk_ctrl_reg_buf
  scsi: bsg: fix buffer overflow in scsi_bsg_uring_cmd()
  block: refactor blkdev_zone_mgmt_ioctl
  MAINTAINERS: update ublk driver maintainer email
  Documentation: ublk: address review comments for SHMEM_ZC docs
  ublk: allow buffer registration before device is started
  ublk: replace xarray with IDA for shmem buffer index allocation
  ublk: simplify PFN range loop in __ublk_ctrl_reg_buf
  ublk: verify all pages in multi-page bvec fall within registered range
  ublk: widen ublk_shmem_buf_reg.len to __u64 for 4GB buffer support
  xfs: use bio_await in xfs_zone_gc_reset_sync
  block: add a bio_submit_or_kill helper
  block: factor out a bio_await helper
  block: unify the synchronous bi_end_io callbacks
  xfs: fix number of GC bvecs
  selftests/ublk: add read-only buffer registration test
  selftests/ublk: add filesystem fio verify test for shmem_zc
  selftests/ublk: add hugetlbfs shmem_zc test for loop target
  selftests/ublk: add shared memory zero-copy test
  selftests/ublk: add UBLK_F_SHMEM_ZC support for loop target
  ...
This commit is contained in:
Linus Torvalds
2026-04-13 15:51:31 -07:00
121 changed files with 5504 additions and 3124 deletions

View File

@@ -886,6 +886,21 @@ Description:
zone commands, they will be treated as regular block devices and
zoned will report "none".
What: /sys/block/<disk>/queue/zoned_qd1_writes
Date: January 2026
Contact: Damien Le Moal <dlemoal@kernel.org>
Description:
[RW] zoned_qd1_writes indicates if write operations to a zoned
block device are being handled using a single issuer context (a
kernel thread) operating at a maximum queue depth of 1. This
attribute is visible only for zoned block devices. The default
value for zoned block devices that are not rotational devices
(e.g. ZNS SSDs or zoned UFS devices) is 0. For rotational zoned
block devices (e.g. SMR HDDs) the default value is 1. Since
this default may not be appropriate for some devices, e.g.
remotely connected devices over high latency networks, the user
can disable this feature by setting this attribute to 0.
What: /sys/block/<disk>/hidden
Date: March 2023

View File

@@ -0,0 +1,13 @@
What: /sys/devices/virtual/nvme-fabrics/ctl/.../tls_configured_key
Date: November 2025
KernelVersion: 6.19
Contact: Linux NVMe mailing list <linux-nvme@lists.infradead.org>
Description:
The file is available when using a secure concatenation
connection to an NVMe target. Reading the file will return
the serial of the currently negotiated key.
Writing 0 to the file will trigger a PSK reauthentication
(REPLACETLSPSK) with the target. After a reauthentication
the value returned by tls_configured_key will be the new
serial.

View File

@@ -62,7 +62,7 @@ The options available for the add command can be listed by reading the
/dev/zloop-control device::
$ cat /dev/zloop-control
add id=%d,capacity_mb=%u,zone_size_mb=%u,zone_capacity_mb=%u,conv_zones=%u,base_dir=%s,nr_queues=%u,queue_depth=%u,buffered_io
add id=%d,capacity_mb=%u,zone_size_mb=%u,zone_capacity_mb=%u,conv_zones=%u,max_open_zones=%u,base_dir=%s,nr_queues=%u,queue_depth=%u,buffered_io,zone_append=%u,ordered_zone_append,discard_write_cache
remove id=%d
In more details, the options that can be used with the "add" command are as
@@ -80,6 +80,9 @@ zone_capacity_mb Device zone capacity (must always be equal to or lower
conv_zones Total number of conventional zones starting from
sector 0
Default: 8
max_open_zones Maximum number of open sequential write required zones
(0 for no limit).
Default: 0
base_dir Path to the base directory where to create the directory
containing the zone files of the device.
Default=/var/local/zloop.
@@ -104,6 +107,11 @@ ordered_zone_append Enable zloop mitigation of zone append reordering.
(extents), as when enabled, this can significantly reduce
the number of data extents needed for a file data
mapping.
discard_write_cache Discard all data that was not explicitly persisted using a
flush operation when the device is removed by truncating
each zone file to the size recorded during the last flush
operation. This simulates power fail events where
uncommitted data is lost.
=================== =========================================================
3) Deleting a Zoned Device

View File

@@ -153,7 +153,7 @@ blk-crypto-fallback completes the original bio. If the original bio is too
large, multiple bounce bios may be required; see the code for details.
For decryption, blk-crypto-fallback "wraps" the bio's completion callback
(``bi_complete``) and private data (``bi_private``) with its own, unsets the
(``bi_end_io``) and private data (``bi_private``) with its own, unsets the
bio's encryption context, then submits the bio. If the read completes
successfully, blk-crypto-fallback restores the bio's original completion
callback and private data, then decrypts the bio's data in-place using the

View File

@@ -485,6 +485,125 @@ Limitations
in case that too many ublk devices are handled by this single io_ring_ctx
and each one has very large queue depth
Shared Memory Zero Copy (UBLK_F_SHMEM_ZC)
------------------------------------------
The ``UBLK_F_SHMEM_ZC`` feature provides an alternative zero-copy path
that works by sharing physical memory pages between the client application
and the ublk server. Unlike the io_uring fixed buffer approach above,
shared memory zero copy does not require io_uring buffer registration
per I/O — instead, it relies on the kernel matching physical pages
at I/O time. This allows the ublk server to access the shared
buffer directly, which is unlikely for the io_uring fixed buffer
approach.
Motivation
~~~~~~~~~~
Shared memory zero copy takes a different approach: if the client
application and the ublk server both map the same physical memory, there is
nothing to copy. The kernel detects the shared pages automatically and
tells the server where the data already lives.
``UBLK_F_SHMEM_ZC`` can be thought of as a supplement for optimized client
applications — when the client is willing to allocate I/O buffers from
shared memory, the entire data path becomes zero-copy.
Use Cases
~~~~~~~~~
This feature is useful when the client application can be configured to
use a specific shared memory region for its I/O buffers:
- **Custom storage clients** that allocate I/O buffers from shared memory
(memfd, hugetlbfs) and issue direct I/O to the ublk device
- **Database engines** that use pre-allocated buffer pools with O_DIRECT
How It Works
~~~~~~~~~~~~
1. The ublk server and client both ``mmap()`` the same file (memfd or
hugetlbfs) with ``MAP_SHARED``. This gives both processes access to the
same physical pages.
2. The ublk server registers its mapping with the kernel::
struct ublk_shmem_buf_reg buf = { .addr = mmap_va, .len = size };
ublk_ctrl_cmd(UBLK_U_CMD_REG_BUF, .addr = &buf);
The kernel pins the pages and builds a PFN lookup tree.
3. When the client issues direct I/O (``O_DIRECT``) to ``/dev/ublkb*``,
the kernel checks whether the I/O buffer pages match any registered
pages by comparing PFNs.
4. On a match, the kernel sets ``UBLK_IO_F_SHMEM_ZC`` in the I/O
descriptor and encodes the buffer index and offset in ``addr``::
if (iod->op_flags & UBLK_IO_F_SHMEM_ZC) {
/* Data is already in our shared mapping — zero copy */
index = ublk_shmem_zc_index(iod->addr);
offset = ublk_shmem_zc_offset(iod->addr);
buf = shmem_table[index].mmap_base + offset;
}
5. If pages do not match (e.g., the client used a non-shared buffer),
the I/O falls back to the normal copy path silently.
The shared memory can be set up via two methods:
- **Socket-based**: the client sends a memfd to the ublk server via
``SCM_RIGHTS`` on a unix socket. The server mmaps and registers it.
- **Hugetlbfs-based**: both processes ``mmap(MAP_SHARED)`` the same
hugetlbfs file. No IPC needed — same file gives same physical pages.
Advantages
~~~~~~~~~~
- **Simple**: no per-I/O buffer registration or unregistration commands.
Once the shared buffer is registered, all matching I/O is zero-copy
automatically.
- **Direct buffer access**: the ublk server can read and write the shared
buffer directly via its own mmap, without going through io_uring fixed
buffer operations. This is more friendly for server implementations.
- **Fast**: PFN matching is a single maple tree lookup per bvec. No
io_uring command round-trips for buffer management.
- **Compatible**: non-matching I/O silently falls back to the copy path.
The device works normally for any client, with zero-copy as an
optimization when shared memory is available.
Limitations
~~~~~~~~~~~
- **Requires client cooperation**: the client must allocate its I/O
buffers from the shared memory region. This requires a custom or
configured client — standard applications using their own buffers
will not benefit.
- **Direct I/O only**: buffered I/O (without ``O_DIRECT``) goes through
the page cache, which allocates its own pages. These kernel-allocated
pages will never match the registered shared buffer. Only ``O_DIRECT``
puts the client's buffer pages directly into the block I/O.
- **Contiguous data only**: each I/O request's data must be contiguous
within a single registered buffer. Scatter/gather I/O that spans
multiple non-adjacent registered buffers cannot use the zero-copy path.
Control Commands
~~~~~~~~~~~~~~~~
- ``UBLK_U_CMD_REG_BUF``
Register a shared memory buffer. ``ctrl_cmd.addr`` points to a
``struct ublk_shmem_buf_reg`` containing the buffer virtual address and size.
Returns the assigned buffer index (>= 0) on success. The kernel pins
pages and builds the PFN lookup tree. Queue freeze is handled
internally.
- ``UBLK_U_CMD_UNREG_BUF``
Unregister a previously registered buffer. ``ctrl_cmd.data[0]`` is the
buffer index. Unpins pages and removes PFN entries from the lookup
tree.
References
==========

View File

@@ -27015,7 +27015,7 @@ F: Documentation/filesystems/ubifs.rst
F: fs/ubifs/
UBLK USERSPACE BLOCK DRIVER
M: Ming Lei <ming.lei@redhat.com>
M: Ming Lei <tom.leiming@gmail.com>
L: linux-block@vger.kernel.org
S: Maintained
F: Documentation/block/ublk.rst

View File

@@ -18,6 +18,7 @@
#include <linux/highmem.h>
#include <linux/blk-crypto.h>
#include <linux/xarray.h>
#include <linux/kmemleak.h>
#include <trace/events/block.h>
#include "blk.h"
@@ -34,6 +35,8 @@ struct bio_alloc_cache {
unsigned int nr_irq;
};
#define BIO_INLINE_VECS 4
static struct biovec_slab {
int nr_vecs;
char *name;
@@ -114,6 +117,11 @@ static inline unsigned int bs_bio_slab_size(struct bio_set *bs)
return bs->front_pad + sizeof(struct bio) + bs->back_pad;
}
static inline void *bio_slab_addr(struct bio *bio)
{
return (void *)bio - bio->bi_pool->front_pad;
}
static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs)
{
unsigned int size = bs_bio_slab_size(bs);
@@ -159,57 +167,16 @@ out:
mutex_unlock(&bio_slab_lock);
}
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs)
{
BUG_ON(nr_vecs > BIO_MAX_VECS);
if (nr_vecs == BIO_MAX_VECS)
mempool_free(bv, pool);
else if (nr_vecs > BIO_INLINE_VECS)
kmem_cache_free(biovec_slab(nr_vecs)->slab, bv);
}
/*
* Make the first allocation restricted and don't dump info on allocation
* failures, since we'll fall back to the mempool in case of failure.
*/
static inline gfp_t bvec_alloc_gfp(gfp_t gfp)
static inline gfp_t try_alloc_gfp(gfp_t gfp)
{
return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) |
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
}
struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
gfp_t gfp_mask)
{
struct biovec_slab *bvs = biovec_slab(*nr_vecs);
if (WARN_ON_ONCE(!bvs))
return NULL;
/*
* Upgrade the nr_vecs request to take full advantage of the allocation.
* We also rely on this in the bvec_free path.
*/
*nr_vecs = bvs->nr_vecs;
/*
* Try a slab allocation first for all smaller allocations. If that
* fails and __GFP_DIRECT_RECLAIM is set retry with the mempool.
* The mempool is sized to handle up to BIO_MAX_VECS entries.
*/
if (*nr_vecs < BIO_MAX_VECS) {
struct bio_vec *bvl;
bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask));
if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM))
return bvl;
*nr_vecs = BIO_MAX_VECS;
}
return mempool_alloc(pool, gfp_mask);
}
void bio_uninit(struct bio *bio)
{
#ifdef CONFIG_BLK_CGROUP
@@ -231,9 +198,14 @@ static void bio_free(struct bio *bio)
void *p = bio;
WARN_ON_ONCE(!bs);
WARN_ON_ONCE(bio->bi_max_vecs > BIO_MAX_VECS);
bio_uninit(bio);
bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs);
if (bio->bi_max_vecs == BIO_MAX_VECS)
mempool_free(bio->bi_io_vec, &bs->bvec_pool);
else if (bio->bi_max_vecs > BIO_INLINE_VECS)
kmem_cache_free(biovec_slab(bio->bi_max_vecs)->slab,
bio->bi_io_vec);
mempool_free(p - bs->front_pad, &bs->bio_pool);
}
@@ -430,13 +402,31 @@ static void bio_alloc_rescue(struct work_struct *work)
}
}
/*
* submit_bio_noacct() converts recursion to iteration; this means if we're
* running beneath it, any bios we allocate and submit will not be submitted
* (and thus freed) until after we return.
*
* This exposes us to a potential deadlock if we allocate multiple bios from the
* same bio_set while running underneath submit_bio_noacct(). If we were to
* allocate multiple bios (say a stacking block driver that was splitting bios),
* we would deadlock if we exhausted the mempool's reserve.
*
* We solve this, and guarantee forward progress by punting the bios on
* current->bio_list to a per bio_set rescuer workqueue before blocking to wait
* for elements being returned to the mempool.
*/
static void punt_bios_to_rescuer(struct bio_set *bs)
{
struct bio_list punt, nopunt;
struct bio *bio;
if (WARN_ON_ONCE(!bs->rescue_workqueue))
if (!current->bio_list || !bs->rescue_workqueue)
return;
if (bio_list_empty(&current->bio_list[0]) &&
bio_list_empty(&current->bio_list[1]))
return;
/*
* In order to guarantee forward progress we must punt only bios that
* were allocated from this bio_set; otherwise, if there was a bio on
@@ -483,9 +473,7 @@ static void bio_alloc_irq_cache_splice(struct bio_alloc_cache *cache)
local_irq_restore(flags);
}
static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp,
struct bio_set *bs)
static struct bio *bio_alloc_percpu_cache(struct bio_set *bs)
{
struct bio_alloc_cache *cache;
struct bio *bio;
@@ -503,12 +491,10 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
cache->free_list = bio->bi_next;
cache->nr--;
put_cpu();
if (nr_vecs)
bio_init_inline(bio, bdev, nr_vecs, opf);
else
bio_init(bio, bdev, NULL, nr_vecs, opf);
bio->bi_pool = bs;
kmemleak_alloc(bio_slab_addr(bio),
kmem_cache_size(bs->bio_slab), 1, GFP_NOIO);
return bio;
}
@@ -517,7 +503,7 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
* @bdev: block device to allocate the bio for (can be %NULL)
* @nr_vecs: number of bvecs to pre-allocate
* @opf: operation and flags for bio
* @gfp_mask: the GFP_* mask given to the slab allocator
* @gfp: the GFP_* mask given to the slab allocator
* @bs: the bio_set to allocate from.
*
* Allocate a bio from the mempools in @bs.
@@ -547,91 +533,77 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
* Returns: Pointer to new bio on success, NULL on failure.
*/
struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
blk_opf_t opf, gfp_t gfp_mask,
struct bio_set *bs)
blk_opf_t opf, gfp_t gfp, struct bio_set *bs)
{
gfp_t saved_gfp = gfp_mask;
struct bio *bio;
struct bio_vec *bvecs = NULL;
struct bio *bio = NULL;
gfp_t saved_gfp = gfp;
void *p;
/* should not use nobvec bioset for nr_vecs > 0 */
if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_vecs > 0))
return NULL;
gfp = try_alloc_gfp(gfp);
if (bs->cache && nr_vecs <= BIO_INLINE_VECS) {
opf |= REQ_ALLOC_CACHE;
bio = bio_alloc_percpu_cache(bdev, nr_vecs, opf,
gfp_mask, bs);
if (bio)
return bio;
/*
* No cached bio available, bio returned below marked with
* REQ_ALLOC_CACHE to participate in per-cpu alloc cache.
* Set REQ_ALLOC_CACHE even if no cached bio is available to
* return the allocated bio to the percpu cache when done.
*/
} else
opf &= ~REQ_ALLOC_CACHE;
/*
* submit_bio_noacct() converts recursion to iteration; this means if
* we're running beneath it, any bios we allocate and submit will not be
* submitted (and thus freed) until after we return.
*
* This exposes us to a potential deadlock if we allocate multiple bios
* from the same bio_set() while running underneath submit_bio_noacct().
* If we were to allocate multiple bios (say a stacking block driver
* that was splitting bios), we would deadlock if we exhausted the
* mempool's reserve.
*
* We solve this, and guarantee forward progress, with a rescuer
* workqueue per bio_set. If we go to allocate and there are bios on
* current->bio_list, we first try the allocation without
* __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be
* blocking to the rescuer workqueue before we retry with the original
* gfp_flags.
*/
if (current->bio_list &&
(!bio_list_empty(&current->bio_list[0]) ||
!bio_list_empty(&current->bio_list[1])) &&
bs->rescue_workqueue)
gfp_mask &= ~__GFP_DIRECT_RECLAIM;
p = mempool_alloc(&bs->bio_pool, gfp_mask);
if (!p && gfp_mask != saved_gfp) {
punt_bios_to_rescuer(bs);
gfp_mask = saved_gfp;
p = mempool_alloc(&bs->bio_pool, gfp_mask);
}
if (unlikely(!p))
return NULL;
if (!mempool_is_saturated(&bs->bio_pool))
opf &= ~REQ_ALLOC_CACHE;
bio = p + bs->front_pad;
if (nr_vecs > BIO_INLINE_VECS) {
struct bio_vec *bvl = NULL;
bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask);
if (!bvl && gfp_mask != saved_gfp) {
punt_bios_to_rescuer(bs);
gfp_mask = saved_gfp;
bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask);
}
if (unlikely(!bvl))
goto err_free;
bio_init(bio, bdev, bvl, nr_vecs, opf);
} else if (nr_vecs) {
bio_init_inline(bio, bdev, BIO_INLINE_VECS, opf);
opf |= REQ_ALLOC_CACHE;
bio = bio_alloc_percpu_cache(bs);
} else {
bio_init(bio, bdev, NULL, 0, opf);
opf &= ~REQ_ALLOC_CACHE;
p = kmem_cache_alloc(bs->bio_slab, gfp);
if (p)
bio = p + bs->front_pad;
}
if (bio && nr_vecs > BIO_INLINE_VECS) {
struct biovec_slab *bvs = biovec_slab(nr_vecs);
/*
* Upgrade nr_vecs to take full advantage of the allocation.
* We also rely on this in bio_free().
*/
nr_vecs = bvs->nr_vecs;
bvecs = kmem_cache_alloc(bvs->slab, gfp);
if (unlikely(!bvecs)) {
kmem_cache_free(bs->bio_slab, p);
bio = NULL;
}
}
if (unlikely(!bio)) {
/*
* Give up if we are not allowed to sleep as non-blocking mempool
* allocations just go back to the slab allocation.
*/
if (!(saved_gfp & __GFP_DIRECT_RECLAIM))
return NULL;
punt_bios_to_rescuer(bs);
/*
* Don't rob the mempools by returning to the per-CPU cache if
* we're tight on memory.
*/
opf &= ~REQ_ALLOC_CACHE;
p = mempool_alloc(&bs->bio_pool, saved_gfp);
bio = p + bs->front_pad;
if (nr_vecs > BIO_INLINE_VECS) {
nr_vecs = BIO_MAX_VECS;
bvecs = mempool_alloc(&bs->bvec_pool, saved_gfp);
}
}
if (nr_vecs && nr_vecs <= BIO_INLINE_VECS)
bio_init_inline(bio, bdev, nr_vecs, opf);
else
bio_init(bio, bdev, bvecs, nr_vecs, opf);
bio->bi_pool = bs;
return bio;
err_free:
mempool_free(p, &bs->bio_pool);
return NULL;
}
EXPORT_SYMBOL(bio_alloc_bioset);
@@ -765,6 +737,9 @@ static int __bio_alloc_cache_prune(struct bio_alloc_cache *cache,
while ((bio = cache->free_list) != NULL) {
cache->free_list = bio->bi_next;
cache->nr--;
kmemleak_alloc(bio_slab_addr(bio),
kmem_cache_size(bio->bi_pool->bio_slab),
1, GFP_KERNEL);
bio_free(bio);
if (++i == nr)
break;
@@ -828,6 +803,7 @@ static inline void bio_put_percpu_cache(struct bio *bio)
bio->bi_bdev = NULL;
cache->free_list = bio;
cache->nr++;
kmemleak_free(bio_slab_addr(bio));
} else if (in_hardirq()) {
lockdep_assert_irqs_disabled();
@@ -835,6 +811,7 @@ static inline void bio_put_percpu_cache(struct bio *bio)
bio->bi_next = cache->free_list_irq;
cache->free_list_irq = bio;
cache->nr_irq++;
kmemleak_free(bio_slab_addr(bio));
} else {
goto out_free;
}
@@ -897,10 +874,11 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
* @gfp: allocation priority
* @bs: bio_set to allocate from
*
* Allocate a new bio that is a clone of @bio_src. The caller owns the returned
* bio, but not the actual data it points to.
*
* The caller must ensure that the return bio is not freed before @bio_src.
* Allocate a new bio that is a clone of @bio_src. This reuses the bio_vecs
* pointed to by @bio_src->bi_io_vec, and clones the iterator pointing to
* the current position in it. The caller owns the returned bio, but not
* the bio_vecs, and must ensure the bio is freed before the memory
pointed to by @bio_src->bi_io_vec.
*/
struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src,
gfp_t gfp, struct bio_set *bs)
@@ -929,9 +907,7 @@ EXPORT_SYMBOL(bio_alloc_clone);
* @gfp: allocation priority
*
* Initialize a new bio in caller provided memory that is a clone of @bio_src.
* The caller owns the returned bio, but not the actual data it points to.
*
* The caller must ensure that @bio_src is not freed before @bio.
* The same bio_vecs reuse and bio lifetime rules as bio_alloc_clone() apply.
*/
int bio_init_clone(struct block_device *bdev, struct bio *bio,
struct bio *bio_src, gfp_t gfp)
@@ -1064,6 +1040,8 @@ int bio_add_page(struct bio *bio, struct page *page,
{
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return 0;
if (WARN_ON_ONCE(len == 0))
return 0;
if (bio->bi_iter.bi_size > BIO_MAX_SIZE - len)
return 0;
@@ -1484,11 +1462,41 @@ void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty)
bio_iov_iter_unbounce_read(bio, is_error, mark_dirty);
}
static void submit_bio_wait_endio(struct bio *bio)
static void bio_wait_end_io(struct bio *bio)
{
complete(bio->bi_private);
}
/**
* bio_await - call a function on a bio, and wait until it completes
* @bio: the bio which describes the I/O
* @submit: function called to submit the bio
* @priv: private data passed to @submit
*
* Wait for the bio as well as any bio chained off it after executing the
* passed in callback @submit. The wait for the bio is set up before calling
* @submit to ensure that the completion is captured. If @submit is %NULL,
* submit_bio() is used instead to submit the bio.
*
* Note: this overrides the bi_private and bi_end_io fields in the bio.
*/
void bio_await(struct bio *bio, void *priv,
void (*submit)(struct bio *bio, void *priv))
{
DECLARE_COMPLETION_ONSTACK_MAP(done,
bio->bi_bdev->bd_disk->lockdep_map);
bio->bi_private = &done;
bio->bi_end_io = bio_wait_end_io;
bio->bi_opf |= REQ_SYNC;
if (submit)
submit(bio, priv);
else
submit_bio(bio);
blk_wait_io(&done);
}
EXPORT_SYMBOL_GPL(bio_await);
/**
* submit_bio_wait - submit a bio, and wait until it completes
* @bio: The &struct bio which describes the I/O
@@ -1502,19 +1510,30 @@ static void submit_bio_wait_endio(struct bio *bio)
*/
int submit_bio_wait(struct bio *bio)
{
DECLARE_COMPLETION_ONSTACK_MAP(done,
bio->bi_bdev->bd_disk->lockdep_map);
bio->bi_private = &done;
bio->bi_end_io = submit_bio_wait_endio;
bio->bi_opf |= REQ_SYNC;
submit_bio(bio);
blk_wait_io(&done);
bio_await(bio, NULL, NULL);
return blk_status_to_errno(bio->bi_status);
}
EXPORT_SYMBOL(submit_bio_wait);
static void bio_endio_cb(struct bio *bio, void *priv)
{
bio_endio(bio);
}
/*
* Submit @bio synchronously, or call bio_endio on it if the current process
* is being killed.
*/
int bio_submit_or_kill(struct bio *bio, unsigned int flags)
{
if ((flags & BLKDEV_ZERO_KILLABLE) && fatal_signal_pending(current)) {
bio_await(bio, NULL, bio_endio_cb);
return -EINTR;
}
return submit_bio_wait(bio);
}
/**
* bdev_rw_virt - synchronously read into / write from kernel mapping
* @bdev: block device to access
@@ -1545,26 +1564,6 @@ int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data,
}
EXPORT_SYMBOL_GPL(bdev_rw_virt);
static void bio_wait_end_io(struct bio *bio)
{
complete(bio->bi_private);
bio_put(bio);
}
/*
* bio_await_chain - ends @bio and waits for every chained bio to complete
*/
void bio_await_chain(struct bio *bio)
{
DECLARE_COMPLETION_ONSTACK_MAP(done,
bio->bi_bdev->bd_disk->lockdep_map);
bio->bi_private = &done;
bio->bi_end_io = bio_wait_end_io;
bio_endio(bio);
blk_wait_io(&done);
}
void __bio_advance(struct bio *bio, unsigned bytes)
{
if (bio_integrity(bio))

View File

@@ -24,6 +24,7 @@
#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/wait_bit.h>
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/resume_user_mode.h>
@@ -611,6 +612,8 @@ restart:
q->root_blkg = NULL;
spin_unlock_irq(&q->queue_lock);
wake_up_var(&q->root_blkg);
}
static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
@@ -1498,6 +1501,18 @@ int blkcg_init_disk(struct gendisk *disk)
struct blkcg_gq *new_blkg, *blkg;
bool preloaded;
/*
* If the queue is shared across disk rebind (e.g., SCSI), the
* previous disk's blkcg state is cleaned up asynchronously via
* disk_release() -> blkcg_exit_disk(). Wait for that cleanup to
* finish (indicated by root_blkg becoming NULL) before setting up
* new blkcg state. Otherwise, we may overwrite q->root_blkg while
* the old one is still alive, and radix_tree_insert() in
* blkg_create() will fail with -EEXIST because the old entries
* still occupy the same queue id slot in blkcg->blkg_tree.
*/
wait_var_event(&q->root_blkg, !READ_ONCE(q->root_blkg));
new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL);
if (!new_blkg)
return -ENOMEM;
@@ -2022,6 +2037,7 @@ void blkcg_maybe_throttle_current(void)
return;
out:
rcu_read_unlock();
put_disk(disk);
}
/**

View File

@@ -18,7 +18,7 @@ struct blk_crypto_kobj {
struct blk_crypto_attr {
struct attribute attr;
ssize_t (*show)(struct blk_crypto_profile *profile,
struct blk_crypto_attr *attr, char *page);
const struct blk_crypto_attr *attr, char *page);
};
static struct blk_crypto_profile *kobj_to_crypto_profile(struct kobject *kobj)
@@ -26,39 +26,39 @@ static struct blk_crypto_profile *kobj_to_crypto_profile(struct kobject *kobj)
return container_of(kobj, struct blk_crypto_kobj, kobj)->profile;
}
static struct blk_crypto_attr *attr_to_crypto_attr(struct attribute *attr)
static const struct blk_crypto_attr *attr_to_crypto_attr(const struct attribute *attr)
{
return container_of(attr, struct blk_crypto_attr, attr);
return container_of_const(attr, struct blk_crypto_attr, attr);
}
static ssize_t hw_wrapped_keys_show(struct blk_crypto_profile *profile,
struct blk_crypto_attr *attr, char *page)
const struct blk_crypto_attr *attr, char *page)
{
/* Always show supported, since the file doesn't exist otherwise. */
return sysfs_emit(page, "supported\n");
}
static ssize_t max_dun_bits_show(struct blk_crypto_profile *profile,
struct blk_crypto_attr *attr, char *page)
const struct blk_crypto_attr *attr, char *page)
{
return sysfs_emit(page, "%u\n", 8 * profile->max_dun_bytes_supported);
}
static ssize_t num_keyslots_show(struct blk_crypto_profile *profile,
struct blk_crypto_attr *attr, char *page)
const struct blk_crypto_attr *attr, char *page)
{
return sysfs_emit(page, "%u\n", profile->num_slots);
}
static ssize_t raw_keys_show(struct blk_crypto_profile *profile,
struct blk_crypto_attr *attr, char *page)
const struct blk_crypto_attr *attr, char *page)
{
/* Always show supported, since the file doesn't exist otherwise. */
return sysfs_emit(page, "supported\n");
}
#define BLK_CRYPTO_RO_ATTR(_name) \
static struct blk_crypto_attr _name##_attr = __ATTR_RO(_name)
static const struct blk_crypto_attr _name##_attr = __ATTR_RO(_name)
BLK_CRYPTO_RO_ATTR(hw_wrapped_keys);
BLK_CRYPTO_RO_ATTR(max_dun_bits);
@@ -66,10 +66,10 @@ BLK_CRYPTO_RO_ATTR(num_keyslots);
BLK_CRYPTO_RO_ATTR(raw_keys);
static umode_t blk_crypto_is_visible(struct kobject *kobj,
struct attribute *attr, int n)
const struct attribute *attr, int n)
{
struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj);
struct blk_crypto_attr *a = attr_to_crypto_attr(attr);
const struct blk_crypto_attr *a = attr_to_crypto_attr(attr);
if (a == &hw_wrapped_keys_attr &&
!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED))
@@ -81,7 +81,7 @@ static umode_t blk_crypto_is_visible(struct kobject *kobj,
return 0444;
}
static struct attribute *blk_crypto_attrs[] = {
static const struct attribute *const blk_crypto_attrs[] = {
&hw_wrapped_keys_attr.attr,
&max_dun_bits_attr.attr,
&num_keyslots_attr.attr,
@@ -90,8 +90,8 @@ static struct attribute *blk_crypto_attrs[] = {
};
static const struct attribute_group blk_crypto_attr_group = {
.attrs = blk_crypto_attrs,
.is_visible = blk_crypto_is_visible,
.attrs_const = blk_crypto_attrs,
.is_visible_const = blk_crypto_is_visible,
};
/*
@@ -99,13 +99,13 @@ static const struct attribute_group blk_crypto_attr_group = {
* modes, these are initialized at boot time by blk_crypto_sysfs_init().
*/
static struct blk_crypto_attr __blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX];
static struct attribute *blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX + 1];
static const struct attribute *blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX + 1];
static umode_t blk_crypto_mode_is_visible(struct kobject *kobj,
struct attribute *attr, int n)
const struct attribute *attr, int n)
{
struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj);
struct blk_crypto_attr *a = attr_to_crypto_attr(attr);
const struct blk_crypto_attr *a = attr_to_crypto_attr(attr);
int mode_num = a - __blk_crypto_mode_attrs;
if (profile->modes_supported[mode_num])
@@ -114,7 +114,7 @@ static umode_t blk_crypto_mode_is_visible(struct kobject *kobj,
}
static ssize_t blk_crypto_mode_show(struct blk_crypto_profile *profile,
struct blk_crypto_attr *attr, char *page)
const struct blk_crypto_attr *attr, char *page)
{
int mode_num = attr - __blk_crypto_mode_attrs;
@@ -123,8 +123,8 @@ static ssize_t blk_crypto_mode_show(struct blk_crypto_profile *profile,
static const struct attribute_group blk_crypto_modes_attr_group = {
.name = "modes",
.attrs = blk_crypto_mode_attrs,
.is_visible = blk_crypto_mode_is_visible,
.attrs_const = blk_crypto_mode_attrs,
.is_visible_const = blk_crypto_mode_is_visible,
};
static const struct attribute_group *blk_crypto_attr_groups[] = {
@@ -137,7 +137,7 @@ static ssize_t blk_crypto_attr_show(struct kobject *kobj,
struct attribute *attr, char *page)
{
struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj);
struct blk_crypto_attr *a = attr_to_crypto_attr(attr);
const struct blk_crypto_attr *a = attr_to_crypto_attr(attr);
return a->show(profile, a, page);
}

View File

@@ -30,17 +30,17 @@ struct blk_ia_range_sysfs_entry {
ssize_t (*show)(struct blk_independent_access_range *iar, char *buf);
};
static struct blk_ia_range_sysfs_entry blk_ia_range_sector_entry = {
static const struct blk_ia_range_sysfs_entry blk_ia_range_sector_entry = {
.attr = { .name = "sector", .mode = 0444 },
.show = blk_ia_range_sector_show,
};
static struct blk_ia_range_sysfs_entry blk_ia_range_nr_sectors_entry = {
static const struct blk_ia_range_sysfs_entry blk_ia_range_nr_sectors_entry = {
.attr = { .name = "nr_sectors", .mode = 0444 },
.show = blk_ia_range_nr_sectors_show,
};
static struct attribute *blk_ia_range_attrs[] = {
static const struct attribute *const blk_ia_range_attrs[] = {
&blk_ia_range_sector_entry.attr,
&blk_ia_range_nr_sectors_entry.attr,
NULL,

View File

@@ -1596,7 +1596,8 @@ static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p,
u32 *nr_done)
{
u32 nr_met[2] = { };
u32 nr_missed[2] = { };
@@ -1633,6 +1634,8 @@ static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p
*rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
ioc->period_us * NSEC_PER_USEC);
*nr_done = nr_met[READ] + nr_met[WRITE] + nr_missed[READ] + nr_missed[WRITE];
}
/* was iocg idle this period? */
@@ -2250,12 +2253,12 @@ static void ioc_timer_fn(struct timer_list *timer)
u64 usage_us_sum = 0;
u32 ppm_rthr;
u32 ppm_wthr;
u32 missed_ppm[2], rq_wait_pct;
u32 missed_ppm[2], rq_wait_pct, nr_done;
u64 period_vtime;
int prev_busy_level;
/* how were the latencies during the period? */
ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct, &nr_done);
/* take care of active iocgs */
spin_lock_irq(&ioc->lock);
@@ -2397,9 +2400,17 @@ static void ioc_timer_fn(struct timer_list *timer)
* and should increase vtime rate.
*/
prev_busy_level = ioc->busy_level;
if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
missed_ppm[READ] > ppm_rthr ||
missed_ppm[WRITE] > ppm_wthr) {
if (!nr_done && nr_lagging) {
/*
* When there are lagging IOs but no completions, we don't
* know if the IO latency will meet the QoS targets. The
* disk might be saturated or not. We should not reset
* busy_level to 0 (which would prevent vrate from scaling
* up or down), but rather to keep it unchanged.
*/
} else if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
missed_ppm[READ] > ppm_rthr ||
missed_ppm[WRITE] > ppm_wthr) {
/* clearly missing QoS targets, slow down vrate */
ioc->busy_level = max(ioc->busy_level, 0);
ioc->busy_level++;

View File

@@ -155,13 +155,7 @@ static int blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector,
__blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp, &bio,
flags, limit);
if (bio) {
if ((flags & BLKDEV_ZERO_KILLABLE) &&
fatal_signal_pending(current)) {
bio_await_chain(bio);
blk_finish_plug(&plug);
return -EINTR;
}
ret = submit_bio_wait(bio);
ret = bio_submit_or_kill(bio, flags);
bio_put(bio);
}
blk_finish_plug(&plug);
@@ -236,13 +230,7 @@ static int blkdev_issue_zero_pages(struct block_device *bdev, sector_t sector,
blk_start_plug(&plug);
__blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp, &bio, flags);
if (bio) {
if ((flags & BLKDEV_ZERO_KILLABLE) &&
fatal_signal_pending(current)) {
bio_await_chain(bio);
blk_finish_plug(&plug);
return -EINTR;
}
ret = submit_bio_wait(bio);
ret = bio_submit_or_kill(bio, flags);
bio_put(bio);
}
blk_finish_plug(&plug);

View File

@@ -97,6 +97,7 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(NO_ELV_SWITCH),
QUEUE_FLAG_NAME(QOS_ENABLED),
QUEUE_FLAG_NAME(BIO_ISSUE_TIME),
QUEUE_FLAG_NAME(ZONED_QD1_WRITES),
};
#undef QUEUE_FLAG_NAME

View File

@@ -53,7 +53,7 @@ static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj,
struct request_queue *q;
ssize_t res;
entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr);
entry = container_of_const(attr, struct blk_mq_hw_ctx_sysfs_entry, attr);
hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj);
q = hctx->queue;
@@ -101,20 +101,20 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
return pos + ret;
}
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = {
static const struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = {
.attr = {.name = "nr_tags", .mode = 0444 },
.show = blk_mq_hw_sysfs_nr_tags_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_reserved_tags = {
static const struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_reserved_tags = {
.attr = {.name = "nr_reserved_tags", .mode = 0444 },
.show = blk_mq_hw_sysfs_nr_reserved_tags_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = {
static const struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = {
.attr = {.name = "cpu_list", .mode = 0444 },
.show = blk_mq_hw_sysfs_cpus_show,
};
static struct attribute *default_hw_ctx_attrs[] = {
static const struct attribute *const default_hw_ctx_attrs[] = {
&blk_mq_hw_sysfs_nr_tags.attr,
&blk_mq_hw_sysfs_nr_reserved_tags.attr,
&blk_mq_hw_sysfs_cpus.attr,

View File

@@ -3424,6 +3424,25 @@ EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
*/
void blk_steal_bios(struct bio_list *list, struct request *rq)
{
struct bio *bio;
for (bio = rq->bio; bio; bio = bio->bi_next) {
if (bio->bi_opf & REQ_POLLED) {
bio->bi_opf &= ~REQ_POLLED;
bio->bi_cookie = BLK_QC_T_NONE;
}
/*
* The alternate request queue that we may end up submitting
* the bio to may be frozen temporarily, in this case REQ_NOWAIT
* will fail the I/O immediately with EAGAIN to the issuer.
* We are not in the issuer context which cannot block. Clear
* the flag to avoid spurious EAGAIN I/O failures.
*/
bio->bi_opf &= ~REQ_NOWAIT;
bio_clear_flag(bio, BIO_QOS_THROTTLED);
bio_clear_flag(bio, BIO_QOS_MERGED);
}
if (rq->bio) {
if (list->tail)
list->tail->bi_next = rq->bio;

View File

@@ -189,11 +189,11 @@ static int blk_validate_integrity_limits(struct queue_limits *lim)
}
/*
* The PI generation / validation helpers do not expect intervals to
* straddle multiple bio_vecs. Enforce alignment so that those are
* Some IO controllers can not handle data intervals straddling
* multiple bio_vecs. For those, enforce alignment so that those are
* never generated, and that each buffer is aligned as expected.
*/
if (bi->csum_type) {
if (!(bi->flags & BLK_SPLIT_INTERVAL_CAPABLE) && bi->csum_type) {
lim->dma_alignment = max(lim->dma_alignment,
(1U << bi->interval_exp) - 1);
}
@@ -992,10 +992,14 @@ bool queue_limits_stack_integrity(struct queue_limits *t,
if ((ti->flags & BLK_INTEGRITY_REF_TAG) !=
(bi->flags & BLK_INTEGRITY_REF_TAG))
goto incompatible;
if ((ti->flags & BLK_SPLIT_INTERVAL_CAPABLE) &&
!(bi->flags & BLK_SPLIT_INTERVAL_CAPABLE))
ti->flags &= ~BLK_SPLIT_INTERVAL_CAPABLE;
} else {
ti->flags = BLK_INTEGRITY_STACKED;
ti->flags |= (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) |
(bi->flags & BLK_INTEGRITY_REF_TAG);
(bi->flags & BLK_INTEGRITY_REF_TAG) |
(bi->flags & BLK_SPLIT_INTERVAL_CAPABLE);
ti->csum_type = bi->csum_type;
ti->pi_tuple_size = bi->pi_tuple_size;
ti->metadata_size = bi->metadata_size;

View File

@@ -390,6 +390,36 @@ static ssize_t queue_nr_zones_show(struct gendisk *disk, char *page)
return queue_var_show(disk_nr_zones(disk), page);
}
static ssize_t queue_zoned_qd1_writes_show(struct gendisk *disk, char *page)
{
return queue_var_show(!!blk_queue_zoned_qd1_writes(disk->queue),
page);
}
static ssize_t queue_zoned_qd1_writes_store(struct gendisk *disk,
const char *page, size_t count)
{
struct request_queue *q = disk->queue;
unsigned long qd1_writes;
unsigned int memflags;
ssize_t ret;
ret = queue_var_store(&qd1_writes, page, count);
if (ret < 0)
return ret;
memflags = blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
if (qd1_writes)
blk_queue_flag_set(QUEUE_FLAG_ZONED_QD1_WRITES, q);
else
blk_queue_flag_clear(QUEUE_FLAG_ZONED_QD1_WRITES, q);
blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q, memflags);
return count;
}
static ssize_t queue_iostats_passthrough_show(struct gendisk *disk, char *page)
{
return queue_var_show(!!blk_queue_passthrough_stat(disk->queue), page);
@@ -551,27 +581,27 @@ static int queue_wc_store(struct gendisk *disk, const char *page,
return 0;
}
#define QUEUE_RO_ENTRY(_prefix, _name) \
static struct queue_sysfs_entry _prefix##_entry = { \
.attr = { .name = _name, .mode = 0444 }, \
.show = _prefix##_show, \
#define QUEUE_RO_ENTRY(_prefix, _name) \
static const struct queue_sysfs_entry _prefix##_entry = { \
.attr = { .name = _name, .mode = 0444 }, \
.show = _prefix##_show, \
};
#define QUEUE_RW_ENTRY(_prefix, _name) \
static struct queue_sysfs_entry _prefix##_entry = { \
.attr = { .name = _name, .mode = 0644 }, \
.show = _prefix##_show, \
.store = _prefix##_store, \
#define QUEUE_RW_ENTRY(_prefix, _name) \
static const struct queue_sysfs_entry _prefix##_entry = { \
.attr = { .name = _name, .mode = 0644 }, \
.show = _prefix##_show, \
.store = _prefix##_store, \
};
#define QUEUE_LIM_RO_ENTRY(_prefix, _name) \
static struct queue_sysfs_entry _prefix##_entry = { \
static const struct queue_sysfs_entry _prefix##_entry = { \
.attr = { .name = _name, .mode = 0444 }, \
.show_limit = _prefix##_show, \
}
#define QUEUE_LIM_RW_ENTRY(_prefix, _name) \
static struct queue_sysfs_entry _prefix##_entry = { \
static const struct queue_sysfs_entry _prefix##_entry = { \
.attr = { .name = _name, .mode = 0644 }, \
.show_limit = _prefix##_show, \
.store_limit = _prefix##_store, \
@@ -617,6 +647,7 @@ QUEUE_LIM_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes");
QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
QUEUE_LIM_RO_ENTRY(queue_zoned, "zoned");
QUEUE_RW_ENTRY(queue_zoned_qd1_writes, "zoned_qd1_writes");
QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones");
QUEUE_LIM_RO_ENTRY(queue_max_open_zones, "max_open_zones");
QUEUE_LIM_RO_ENTRY(queue_max_active_zones, "max_active_zones");
@@ -634,7 +665,7 @@ QUEUE_LIM_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask");
QUEUE_LIM_RO_ENTRY(queue_dma_alignment, "dma_alignment");
/* legacy alias for logical_block_size: */
static struct queue_sysfs_entry queue_hw_sector_size_entry = {
static const struct queue_sysfs_entry queue_hw_sector_size_entry = {
.attr = {.name = "hw_sector_size", .mode = 0444 },
.show_limit = queue_logical_block_size_show,
};
@@ -700,7 +731,7 @@ QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
#endif
/* Common attributes for bio-based and request-based queues. */
static struct attribute *queue_attrs[] = {
static const struct attribute *const queue_attrs[] = {
/*
* Attributes which are protected with q->limits_lock.
*/
@@ -754,12 +785,13 @@ static struct attribute *queue_attrs[] = {
&queue_nomerges_entry.attr,
&queue_poll_entry.attr,
&queue_poll_delay_entry.attr,
&queue_zoned_qd1_writes_entry.attr,
NULL,
};
/* Request-based queue attributes that are not relevant for bio-based queues. */
static struct attribute *blk_mq_queue_attrs[] = {
static const struct attribute *const blk_mq_queue_attrs[] = {
/*
* Attributes which require some form of locking other than
* q->sysfs_lock.
@@ -779,14 +811,15 @@ static struct attribute *blk_mq_queue_attrs[] = {
NULL,
};
static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr,
static umode_t queue_attr_visible(struct kobject *kobj, const struct attribute *attr,
int n)
{
struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
struct request_queue *q = disk->queue;
if ((attr == &queue_max_open_zones_entry.attr ||
attr == &queue_max_active_zones_entry.attr) &&
attr == &queue_max_active_zones_entry.attr ||
attr == &queue_zoned_qd1_writes_entry.attr) &&
!blk_queue_is_zoned(q))
return 0;
@@ -794,7 +827,7 @@ static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr,
}
static umode_t blk_mq_queue_attr_visible(struct kobject *kobj,
struct attribute *attr, int n)
const struct attribute *attr, int n)
{
struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
struct request_queue *q = disk->queue;
@@ -808,17 +841,17 @@ static umode_t blk_mq_queue_attr_visible(struct kobject *kobj,
return attr->mode;
}
static struct attribute_group queue_attr_group = {
.attrs = queue_attrs,
.is_visible = queue_attr_visible,
static const struct attribute_group queue_attr_group = {
.attrs_const = queue_attrs,
.is_visible_const = queue_attr_visible,
};
static struct attribute_group blk_mq_queue_attr_group = {
.attrs = blk_mq_queue_attrs,
.is_visible = blk_mq_queue_attr_visible,
static const struct attribute_group blk_mq_queue_attr_group = {
.attrs_const = blk_mq_queue_attrs,
.is_visible_const = blk_mq_queue_attr_visible,
};
#define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr)
#define to_queue(atr) container_of_const((atr), struct queue_sysfs_entry, attr)
static ssize_t
queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
@@ -934,6 +967,14 @@ int blk_register_queue(struct gendisk *disk)
blk_mq_debugfs_register(q);
blk_debugfs_unlock(q, memflags);
/*
* For blk-mq rotational zoned devices, default to using QD=1
* writes. For non-mq rotational zoned devices, the device driver can
* set an appropriate default.
*/
if (queue_is_mq(q) && blk_queue_rot(q) && blk_queue_is_zoned(q))
blk_queue_flag_set(QUEUE_FLAG_ZONED_QD1_WRITES, q);
ret = disk_register_independent_access_ranges(disk);
if (ret)
goto out_debugfs_remove;

View File

@@ -782,10 +782,11 @@ void wbt_init_enable_default(struct gendisk *disk)
return;
rwb = wbt_alloc();
if (WARN_ON_ONCE(!rwb))
if (!rwb)
return;
if (WARN_ON_ONCE(wbt_init(disk, rwb))) {
if (wbt_init(disk, rwb)) {
pr_warn("%s: failed to enable wbt\n", disk->disk_name);
wbt_free(rwb);
return;
}

View File

@@ -16,6 +16,8 @@
#include <linux/spinlock.h>
#include <linux/refcount.h>
#include <linux/mempool.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <trace/events/block.h>
@@ -40,6 +42,8 @@ static const char *const zone_cond_name[] = {
/*
* Per-zone write plug.
* @node: hlist_node structure for managing the plug using a hash table.
* @entry: list_head structure for listing the plug in the disk list of active
* zone write plugs.
* @bio_list: The list of BIOs that are currently plugged.
* @bio_work: Work struct to handle issuing of plugged BIOs
* @rcu_head: RCU head to free zone write plugs with an RCU grace period.
@@ -62,6 +66,7 @@ static const char *const zone_cond_name[] = {
*/
struct blk_zone_wplug {
struct hlist_node node;
struct list_head entry;
struct bio_list bio_list;
struct work_struct bio_work;
struct rcu_head rcu_head;
@@ -99,17 +104,17 @@ static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
* being executed or the zone write plug bio list is not empty.
* - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone
* write pointer offset and need to update it.
* - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed
* from the disk hash table and that the initial reference to the zone
* write plug set when the plug was first added to the hash table has been
* dropped. This flag is set when a zone is reset, finished or become full,
* to prevent new references to the zone write plug to be taken for
* newly incoming BIOs. A zone write plug flagged with this flag will be
* freed once all remaining references from BIOs or functions are dropped.
* - BLK_ZONE_WPLUG_DEAD: Indicates that the zone write plug will be
* removed from the disk hash table of zone write plugs when the last
* reference on the zone write plug is dropped. If set, this flag also
* indicates that the initial extra reference on the zone write plug was
* dropped, meaning that the reference count indicates the current number of
* active users (code context or BIOs and requests in flight). This flag is
* set when a zone is reset, finished or becomes full.
*/
#define BLK_ZONE_WPLUG_PLUGGED (1U << 0)
#define BLK_ZONE_WPLUG_NEED_WP_UPDATE (1U << 1)
#define BLK_ZONE_WPLUG_UNHASHED (1U << 2)
#define BLK_ZONE_WPLUG_DEAD (1U << 2)
/**
* blk_zone_cond_str - Return a zone condition name string
@@ -412,20 +417,32 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
return 0;
}
static int blkdev_truncate_zone_range(struct block_device *bdev,
blk_mode_t mode, const struct blk_zone_range *zrange)
static int blkdev_reset_zone(struct block_device *bdev, blk_mode_t mode,
struct blk_zone_range *zrange)
{
loff_t start, end;
int ret = -EINVAL;
inode_lock(bdev->bd_mapping->host);
filemap_invalidate_lock(bdev->bd_mapping);
if (zrange->sector + zrange->nr_sectors <= zrange->sector ||
zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk))
/* Out of range */
return -EINVAL;
goto out_unlock;
start = zrange->sector << SECTOR_SHIFT;
end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1;
return truncate_bdev_range(bdev, mode, start, end);
ret = truncate_bdev_range(bdev, mode, start, end);
if (ret)
goto out_unlock;
ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, zrange->sector,
zrange->nr_sectors);
out_unlock:
filemap_invalidate_unlock(bdev->bd_mapping);
inode_unlock(bdev->bd_mapping->host);
return ret;
}
/*
@@ -438,7 +455,6 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
void __user *argp = (void __user *)arg;
struct blk_zone_range zrange;
enum req_op op;
int ret;
if (!argp)
return -EINVAL;
@@ -454,15 +470,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
switch (cmd) {
case BLKRESETZONE:
op = REQ_OP_ZONE_RESET;
/* Invalidate the page cache, including dirty pages. */
inode_lock(bdev->bd_mapping->host);
filemap_invalidate_lock(bdev->bd_mapping);
ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
if (ret)
goto fail;
break;
return blkdev_reset_zone(bdev, mode, &zrange);
case BLKOPENZONE:
op = REQ_OP_ZONE_OPEN;
break;
@@ -476,15 +484,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
return -ENOTTY;
}
ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);
fail:
if (cmd == BLKRESETZONE) {
filemap_invalidate_unlock(bdev->bd_mapping);
inode_unlock(bdev->bd_mapping->host);
}
return ret;
return blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);
}
static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
@@ -492,18 +492,12 @@ static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
return zone->start + zone->len >= get_capacity(disk);
}
static bool disk_zone_is_full(struct gendisk *disk,
unsigned int zno, unsigned int offset_in_zone)
{
if (zno < disk->nr_zones - 1)
return offset_in_zone >= disk->zone_capacity;
return offset_in_zone >= disk->last_zone_capacity;
}
static bool disk_zone_wplug_is_full(struct gendisk *disk,
struct blk_zone_wplug *zwplug)
{
return disk_zone_is_full(disk, zwplug->zone_no, zwplug->wp_offset);
if (zwplug->zone_no < disk->nr_zones - 1)
return zwplug->wp_offset >= disk->zone_capacity;
return zwplug->wp_offset >= disk->last_zone_capacity;
}
static bool disk_insert_zone_wplug(struct gendisk *disk,
@@ -520,10 +514,11 @@ static bool disk_insert_zone_wplug(struct gendisk *disk,
* are racing with other submission context, so we may already have a
* zone write plug for the same zone.
*/
spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags);
hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) {
if (zwplg->zone_no == zwplug->zone_no) {
spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock,
flags);
return false;
}
}
@@ -535,7 +530,7 @@ static bool disk_insert_zone_wplug(struct gendisk *disk,
* necessarilly in the active condition.
*/
zones_cond = rcu_dereference_check(disk->zones_cond,
lockdep_is_held(&disk->zone_wplugs_lock));
lockdep_is_held(&disk->zone_wplugs_hash_lock));
if (zones_cond)
zwplug->cond = zones_cond[zwplug->zone_no];
else
@@ -543,7 +538,7 @@ static bool disk_insert_zone_wplug(struct gendisk *disk,
hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
atomic_inc(&disk->nr_zone_wplugs);
spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags);
return true;
}
@@ -587,105 +582,76 @@ static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
mempool_free(zwplug, zwplug->disk->zone_wplugs_pool);
}
static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
{
if (refcount_dec_and_test(&zwplug->ref)) {
WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED);
WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED));
call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
}
}
static inline bool disk_should_remove_zone_wplug(struct gendisk *disk,
struct blk_zone_wplug *zwplug)
{
lockdep_assert_held(&zwplug->lock);
/* If the zone write plug was already removed, we are done. */
if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
return false;
/* If the zone write plug is still plugged, it cannot be removed. */
if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)
return false;
/*
* Completions of BIOs with blk_zone_write_plug_bio_endio() may
* happen after handling a request completion with
* blk_zone_write_plug_finish_request() (e.g. with split BIOs
* that are chained). In such case, disk_zone_wplug_unplug_bio()
* should not attempt to remove the zone write plug until all BIO
* completions are seen. Check by looking at the zone write plug
* reference count, which is 2 when the plug is unused (one reference
* taken when the plug was allocated and another reference taken by the
* caller context).
*/
if (refcount_read(&zwplug->ref) > 2)
return false;
/* We can remove zone write plugs for zones that are empty or full. */
return !zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug);
}
static void disk_remove_zone_wplug(struct gendisk *disk,
struct blk_zone_wplug *zwplug)
static void disk_free_zone_wplug(struct blk_zone_wplug *zwplug)
{
struct gendisk *disk = zwplug->disk;
unsigned long flags;
/* If the zone write plug was already removed, we have nothing to do. */
if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
return;
WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_DEAD));
WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED);
WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
/*
* Mark the zone write plug as unhashed and drop the extra reference we
* took when the plug was inserted in the hash table. Also update the
* disk zone condition array with the current condition of the zone
* write plug.
*/
zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED;
spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags);
blk_zone_set_cond(rcu_dereference_check(disk->zones_cond,
lockdep_is_held(&disk->zone_wplugs_lock)),
lockdep_is_held(&disk->zone_wplugs_hash_lock)),
zwplug->zone_no, zwplug->cond);
hlist_del_init_rcu(&zwplug->node);
atomic_dec(&disk->nr_zone_wplugs);
spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags);
call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
}
static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
{
if (refcount_dec_and_test(&zwplug->ref))
disk_free_zone_wplug(zwplug);
}
/*
* Flag the zone write plug as dead and drop the initial reference we got when
* the zone write plug was added to the hash table. The zone write plug will be
* unhashed when its last reference is dropped.
*/
static void disk_mark_zone_wplug_dead(struct blk_zone_wplug *zwplug)
{
lockdep_assert_held(&zwplug->lock);
if (!(zwplug->flags & BLK_ZONE_WPLUG_DEAD)) {
zwplug->flags |= BLK_ZONE_WPLUG_DEAD;
disk_put_zone_wplug(zwplug);
}
}
static bool disk_zone_wplug_submit_bio(struct gendisk *disk,
struct blk_zone_wplug *zwplug);
static void blk_zone_wplug_bio_work(struct work_struct *work)
{
struct blk_zone_wplug *zwplug =
container_of(work, struct blk_zone_wplug, bio_work);
disk_zone_wplug_submit_bio(zwplug->disk, zwplug);
/* Drop the reference we took in disk_zone_wplug_schedule_work(). */
disk_put_zone_wplug(zwplug);
}
static void blk_zone_wplug_bio_work(struct work_struct *work);
/*
* Get a reference on the write plug for the zone containing @sector.
* If the plug does not exist, it is allocated and hashed.
* Return a pointer to the zone write plug with the plug spinlock held.
* Get a zone write plug for the zone containing @sector.
* If the plug does not exist, it is allocated and inserted in the disk hash
* table.
*/
static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
sector_t sector, gfp_t gfp_mask,
unsigned long *flags)
static struct blk_zone_wplug *disk_get_or_alloc_zone_wplug(struct gendisk *disk,
sector_t sector, gfp_t gfp_mask)
{
unsigned int zno = disk_zone_no(disk, sector);
struct blk_zone_wplug *zwplug;
again:
zwplug = disk_get_zone_wplug(disk, sector);
if (zwplug) {
/*
* Check that a BIO completion or a zone reset or finish
* operation has not already removed the zone write plug from
* the hash table and dropped its reference count. In such case,
* we need to get a new plug so start over from the beginning.
*/
spin_lock_irqsave(&zwplug->lock, *flags);
if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
spin_unlock_irqrestore(&zwplug->lock, *flags);
disk_put_zone_wplug(zwplug);
goto again;
}
if (zwplug)
return zwplug;
}
/*
* Allocate and initialize a zone write plug with an extra reference
@@ -704,17 +670,15 @@ again:
zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector);
bio_list_init(&zwplug->bio_list);
INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
INIT_LIST_HEAD(&zwplug->entry);
zwplug->disk = disk;
spin_lock_irqsave(&zwplug->lock, *flags);
/*
* Insert the new zone write plug in the hash table. This can fail only
* if another context already inserted a plug. Retry from the beginning
* in such case.
*/
if (!disk_insert_zone_wplug(disk, zwplug)) {
spin_unlock_irqrestore(&zwplug->lock, *flags);
mempool_free(zwplug, disk->zone_wplugs_pool);
goto again;
}
@@ -739,6 +703,7 @@ static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug,
*/
static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
{
struct gendisk *disk = zwplug->disk;
struct bio *bio;
lockdep_assert_held(&zwplug->lock);
@@ -752,6 +717,20 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
blk_zone_wplug_bio_io_error(zwplug, bio);
zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
/*
* If we are using the per disk zone write plugs worker thread, remove
* the zone write plug from the work list and drop the reference we
* took when the zone write plug was added to that list.
*/
if (blk_queue_zoned_qd1_writes(disk->queue)) {
spin_lock(&disk->zone_wplugs_list_lock);
if (!list_empty(&zwplug->entry)) {
list_del_init(&zwplug->entry);
disk_put_zone_wplug(zwplug);
}
spin_unlock(&disk->zone_wplugs_list_lock);
}
}
/*
@@ -788,14 +767,8 @@ static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
disk_zone_wplug_update_cond(disk, zwplug);
disk_zone_wplug_abort(zwplug);
/*
* The zone write plug now has no BIO plugged: remove it from the
* hash table so that it cannot be seen. The plug will be freed
* when the last reference is dropped.
*/
if (disk_should_remove_zone_wplug(disk, zwplug))
disk_remove_zone_wplug(disk, zwplug);
if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug))
disk_mark_zone_wplug_dead(zwplug);
}
static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
@@ -1192,19 +1165,24 @@ void blk_zone_mgmt_bio_endio(struct bio *bio)
}
}
static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
struct blk_zone_wplug *zwplug)
static void disk_zone_wplug_schedule_work(struct gendisk *disk,
struct blk_zone_wplug *zwplug)
{
lockdep_assert_held(&zwplug->lock);
/*
* Take a reference on the zone write plug and schedule the submission
* of the next plugged BIO. blk_zone_wplug_bio_work() will release the
* reference we take here.
* Schedule the submission of the next plugged BIO. Taking a reference
* to the zone write plug is required as the bio_work belongs to the
* plug, and thus we must ensure that the write plug does not go away
* while the work is being scheduled but has not run yet.
* blk_zone_wplug_bio_work() will release the reference we take here,
* and we also drop this reference if the work is already scheduled.
*/
WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
WARN_ON_ONCE(blk_queue_zoned_qd1_writes(disk->queue));
refcount_inc(&zwplug->ref);
queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
if (!queue_work(disk->zone_wplugs_wq, &zwplug->bio_work))
disk_put_zone_wplug(zwplug);
}
static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
@@ -1241,6 +1219,22 @@ static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
bio_list_add(&zwplug->bio_list, bio);
trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no,
bio->bi_iter.bi_sector, bio_sectors(bio));
/*
* If we are using the disk zone write plugs worker instead of the per
* zone write plug BIO work, add the zone write plug to the work list
* if it is not already there. Make sure to also get an extra reference
* on the zone write plug so that it does not go away until it is
* removed from the work list.
*/
if (blk_queue_zoned_qd1_writes(disk->queue)) {
spin_lock(&disk->zone_wplugs_list_lock);
if (list_empty(&zwplug->entry)) {
list_add_tail(&zwplug->entry, &disk->zone_wplugs_list);
refcount_inc(&zwplug->ref);
}
spin_unlock(&disk->zone_wplugs_list_lock);
}
}
/*
@@ -1438,7 +1432,7 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
if (bio->bi_opf & REQ_NOWAIT)
gfp_mask = GFP_NOWAIT;
zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags);
zwplug = disk_get_or_alloc_zone_wplug(disk, sector, gfp_mask);
if (!zwplug) {
if (bio->bi_opf & REQ_NOWAIT)
bio_wouldblock_error(bio);
@@ -1447,6 +1441,21 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
return true;
}
spin_lock_irqsave(&zwplug->lock, flags);
/*
* If we got a zone write plug marked as dead, then the user is issuing
* writes to a full zone, or without synchronizing with zone reset or
* zone finish operations. In such case, fail the BIO to signal this
* invalid usage.
*/
if (zwplug->flags & BLK_ZONE_WPLUG_DEAD) {
spin_unlock_irqrestore(&zwplug->lock, flags);
disk_put_zone_wplug(zwplug);
bio_io_error(bio);
return true;
}
/* Indicate that this BIO is being handled using zone write plugging. */
bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);
@@ -1459,6 +1468,13 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
goto queue_bio;
}
/*
* For rotational devices, we will use the gendisk zone write plugs
* work instead of the per zone write plug BIO work, so queue the BIO.
*/
if (blk_queue_zoned_qd1_writes(disk->queue))
goto queue_bio;
/* If the zone is already plugged, add the BIO to the BIO plug list. */
if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)
goto queue_bio;
@@ -1481,7 +1497,10 @@ queue_bio:
if (!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)) {
zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
disk_zone_wplug_schedule_bio_work(disk, zwplug);
if (blk_queue_zoned_qd1_writes(disk->queue))
wake_up_process(disk->zone_wplugs_worker);
else
disk_zone_wplug_schedule_work(disk, zwplug);
}
spin_unlock_irqrestore(&zwplug->lock, flags);
@@ -1527,7 +1546,7 @@ static void blk_zone_wplug_handle_native_zone_append(struct bio *bio)
disk->disk_name, zwplug->zone_no);
disk_zone_wplug_abort(zwplug);
}
disk_remove_zone_wplug(disk, zwplug);
disk_mark_zone_wplug_dead(zwplug);
spin_unlock_irqrestore(&zwplug->lock, flags);
disk_put_zone_wplug(zwplug);
@@ -1622,21 +1641,21 @@ static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
spin_lock_irqsave(&zwplug->lock, flags);
/* Schedule submission of the next plugged BIO if we have one. */
if (!bio_list_empty(&zwplug->bio_list)) {
disk_zone_wplug_schedule_bio_work(disk, zwplug);
spin_unlock_irqrestore(&zwplug->lock, flags);
return;
}
zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
/*
* If the zone is full (it was fully written or finished, or empty
* (it was reset), remove its zone write plug from the hash table.
* For rotational devices, signal the BIO completion to the zone write
* plug work. Otherwise, schedule submission of the next plugged BIO
* if we have one.
*/
if (disk_should_remove_zone_wplug(disk, zwplug))
disk_remove_zone_wplug(disk, zwplug);
if (bio_list_empty(&zwplug->bio_list))
zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
if (blk_queue_zoned_qd1_writes(disk->queue))
complete(&disk->zone_wplugs_worker_bio_done);
else if (!bio_list_empty(&zwplug->bio_list))
disk_zone_wplug_schedule_work(disk, zwplug);
if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug))
disk_mark_zone_wplug_dead(zwplug);
spin_unlock_irqrestore(&zwplug->lock, flags);
}
@@ -1727,10 +1746,9 @@ void blk_zone_write_plug_finish_request(struct request *req)
disk_put_zone_wplug(zwplug);
}
static void blk_zone_wplug_bio_work(struct work_struct *work)
static bool disk_zone_wplug_submit_bio(struct gendisk *disk,
struct blk_zone_wplug *zwplug)
{
struct blk_zone_wplug *zwplug =
container_of(work, struct blk_zone_wplug, bio_work);
struct block_device *bdev;
unsigned long flags;
struct bio *bio;
@@ -1746,7 +1764,7 @@ again:
if (!bio) {
zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
spin_unlock_irqrestore(&zwplug->lock, flags);
goto put_zwplug;
return false;
}
trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no,
@@ -1760,14 +1778,15 @@ again:
goto again;
}
bdev = bio->bi_bdev;
/*
* blk-mq devices will reuse the extra reference on the request queue
* usage counter we took when the BIO was plugged, but the submission
* path for BIO-based devices will not do that. So drop this extra
* reference here.
*/
if (blk_queue_zoned_qd1_writes(disk->queue))
reinit_completion(&disk->zone_wplugs_worker_bio_done);
bdev = bio->bi_bdev;
if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) {
bdev->bd_disk->fops->submit_bio(bio);
blk_queue_exit(bdev->bd_disk->queue);
@@ -1775,14 +1794,78 @@ again:
blk_mq_submit_bio(bio);
}
put_zwplug:
/* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */
disk_put_zone_wplug(zwplug);
return true;
}
static struct blk_zone_wplug *disk_get_zone_wplugs_work(struct gendisk *disk)
{
struct blk_zone_wplug *zwplug;
spin_lock_irq(&disk->zone_wplugs_list_lock);
zwplug = list_first_entry_or_null(&disk->zone_wplugs_list,
struct blk_zone_wplug, entry);
if (zwplug)
list_del_init(&zwplug->entry);
spin_unlock_irq(&disk->zone_wplugs_list_lock);
return zwplug;
}
static int disk_zone_wplugs_worker(void *data)
{
struct gendisk *disk = data;
struct blk_zone_wplug *zwplug;
unsigned int noio_flag;
noio_flag = memalloc_noio_save();
set_user_nice(current, MIN_NICE);
set_freezable();
for (;;) {
set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
zwplug = disk_get_zone_wplugs_work(disk);
if (zwplug) {
/*
* Process all BIOs of this zone write plug and then
* drop the reference we took when adding the zone write
* plug to the active list.
*/
set_current_state(TASK_RUNNING);
while (disk_zone_wplug_submit_bio(disk, zwplug))
blk_wait_io(&disk->zone_wplugs_worker_bio_done);
disk_put_zone_wplug(zwplug);
continue;
}
/*
* Only sleep if nothing sets the state to running. Else check
* for zone write plugs work again as a newly submitted BIO
* might have added a zone write plug to the work list.
*/
if (get_current_state() == TASK_RUNNING) {
try_to_freeze();
} else {
if (kthread_should_stop()) {
set_current_state(TASK_RUNNING);
break;
}
schedule();
}
}
WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list));
memalloc_noio_restore(noio_flag);
return 0;
}
void disk_init_zone_resources(struct gendisk *disk)
{
spin_lock_init(&disk->zone_wplugs_lock);
spin_lock_init(&disk->zone_wplugs_hash_lock);
spin_lock_init(&disk->zone_wplugs_list_lock);
INIT_LIST_HEAD(&disk->zone_wplugs_list);
init_completion(&disk->zone_wplugs_worker_bio_done);
}
/*
@@ -1798,6 +1881,7 @@ static int disk_alloc_zone_resources(struct gendisk *disk,
unsigned int pool_size)
{
unsigned int i;
int ret = -ENOMEM;
atomic_set(&disk->nr_zone_wplugs, 0);
disk->zone_wplugs_hash_bits =
@@ -1823,8 +1907,21 @@ static int disk_alloc_zone_resources(struct gendisk *disk,
if (!disk->zone_wplugs_wq)
goto destroy_pool;
disk->zone_wplugs_worker =
kthread_create(disk_zone_wplugs_worker, disk,
"%s_zwplugs_worker", disk->disk_name);
if (IS_ERR(disk->zone_wplugs_worker)) {
ret = PTR_ERR(disk->zone_wplugs_worker);
disk->zone_wplugs_worker = NULL;
goto destroy_wq;
}
wake_up_process(disk->zone_wplugs_worker);
return 0;
destroy_wq:
destroy_workqueue(disk->zone_wplugs_wq);
disk->zone_wplugs_wq = NULL;
destroy_pool:
mempool_destroy(disk->zone_wplugs_pool);
disk->zone_wplugs_pool = NULL;
@@ -1832,7 +1929,7 @@ free_hash:
kfree(disk->zone_wplugs_hash);
disk->zone_wplugs_hash = NULL;
disk->zone_wplugs_hash_bits = 0;
return -ENOMEM;
return ret;
}
static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
@@ -1848,9 +1945,9 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
while (!hlist_empty(&disk->zone_wplugs_hash[i])) {
zwplug = hlist_entry(disk->zone_wplugs_hash[i].first,
struct blk_zone_wplug, node);
refcount_inc(&zwplug->ref);
disk_remove_zone_wplug(disk, zwplug);
disk_put_zone_wplug(zwplug);
spin_lock_irq(&zwplug->lock);
disk_mark_zone_wplug_dead(zwplug);
spin_unlock_irq(&zwplug->lock);
}
}
@@ -1872,16 +1969,20 @@ static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond)
{
unsigned long flags;
spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags);
zones_cond = rcu_replace_pointer(disk->zones_cond, zones_cond,
lockdep_is_held(&disk->zone_wplugs_lock));
spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
lockdep_is_held(&disk->zone_wplugs_hash_lock));
spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags);
kfree_rcu_mightsleep(zones_cond);
}
void disk_free_zone_resources(struct gendisk *disk)
{
if (disk->zone_wplugs_worker)
kthread_stop(disk->zone_wplugs_worker);
WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list));
if (disk->zone_wplugs_wq) {
destroy_workqueue(disk->zone_wplugs_wq);
disk->zone_wplugs_wq = NULL;
@@ -1910,6 +2011,7 @@ static int disk_revalidate_zone_resources(struct gendisk *disk,
{
struct queue_limits *lim = &disk->queue->limits;
unsigned int pool_size;
int ret = 0;
args->disk = disk;
args->nr_zones =
@@ -1932,10 +2034,13 @@ static int disk_revalidate_zone_resources(struct gendisk *disk,
pool_size =
min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, args->nr_zones);
if (!disk->zone_wplugs_hash)
return disk_alloc_zone_resources(disk, pool_size);
if (!disk->zone_wplugs_hash) {
ret = disk_alloc_zone_resources(disk, pool_size);
if (ret)
kfree(args->zones_cond);
}
return 0;
return ret;
}
/*
@@ -1967,6 +2072,7 @@ static int disk_update_zone_resources(struct gendisk *disk,
disk->zone_capacity = args->zone_capacity;
disk->last_zone_capacity = args->last_zone_capacity;
disk_set_zones_cond_array(disk, args->zones_cond);
args->zones_cond = NULL;
/*
* Some devices can advertise zone resource limits that are larger than
@@ -2078,7 +2184,6 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
struct gendisk *disk = args->disk;
struct blk_zone_wplug *zwplug;
unsigned int wp_offset;
unsigned long flags;
/*
* Remember the capacity of the first sequential zone and check
@@ -2108,10 +2213,9 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
if (!wp_offset || wp_offset >= zone->capacity)
return 0;
zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags);
zwplug = disk_get_or_alloc_zone_wplug(disk, zone->wp, GFP_NOIO);
if (!zwplug)
return -ENOMEM;
spin_unlock_irqrestore(&zwplug->lock, flags);
disk_put_zone_wplug(zwplug);
return 0;
@@ -2249,21 +2353,30 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
}
memalloc_noio_restore(noio_flag);
if (ret <= 0)
goto free_resources;
/*
* If zones where reported, make sure that the entire disk capacity
* has been checked.
*/
if (ret > 0 && args.sector != capacity) {
if (args.sector != capacity) {
pr_warn("%s: Missing zones from sector %llu\n",
disk->disk_name, args.sector);
ret = -ENODEV;
goto free_resources;
}
if (ret > 0)
return disk_update_zone_resources(disk, &args);
ret = disk_update_zone_resources(disk, &args);
if (ret)
goto free_resources;
return 0;
free_resources:
pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
kfree(args.zones_cond);
memflags = blk_mq_freeze_queue(q);
disk_free_zone_resources(disk);
blk_mq_unfreeze_queue(q, memflags);

View File

@@ -55,7 +55,7 @@ bool __blk_freeze_queue_start(struct request_queue *q,
struct task_struct *owner);
int __bio_queue_enter(struct request_queue *q, struct bio *bio);
void submit_bio_noacct_nocheck(struct bio *bio, bool split);
void bio_await_chain(struct bio *bio);
int bio_submit_or_kill(struct bio *bio, unsigned int flags);
static inline bool blk_try_enter_queue(struct request_queue *q, bool pm)
{
@@ -108,11 +108,6 @@ static inline void blk_wait_io(struct completion *done)
struct block_device *blkdev_get_no_open(dev_t dev, bool autoload);
void blkdev_put_no_open(struct block_device *bdev);
#define BIO_INLINE_VECS 4
struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
gfp_t gfp_mask);
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs);
bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
struct page *page, unsigned len, unsigned offset);

View File

@@ -393,7 +393,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
bset->bd = bsg_register_queue(q, dev, name, bsg_transport_sg_io_fn);
bset->bd = bsg_register_queue(q, dev, name, bsg_transport_sg_io_fn, NULL);
if (IS_ERR(bset->bd)) {
ret = PTR_ERR(bset->bd);
goto out_cleanup_queue;

View File

@@ -12,6 +12,7 @@
#include <linux/idr.h>
#include <linux/bsg.h>
#include <linux/slab.h>
#include <linux/io_uring/cmd.h>
#include <scsi/scsi.h>
#include <scsi/scsi_ioctl.h>
@@ -28,6 +29,7 @@ struct bsg_device {
unsigned int timeout;
unsigned int reserved_size;
bsg_sg_io_fn *sg_io_fn;
bsg_uring_cmd_fn *uring_cmd_fn;
};
static inline struct bsg_device *to_bsg_device(struct inode *inode)
@@ -158,11 +160,38 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
}
}
static int bsg_check_uring_features(unsigned int issue_flags)
{
/* BSG passthrough requires big SQE/CQE support */
if ((issue_flags & (IO_URING_F_SQE128|IO_URING_F_CQE32)) !=
(IO_URING_F_SQE128|IO_URING_F_CQE32))
return -EOPNOTSUPP;
return 0;
}
static int bsg_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
{
struct bsg_device *bd = to_bsg_device(file_inode(ioucmd->file));
bool open_for_write = ioucmd->file->f_mode & FMODE_WRITE;
struct request_queue *q = bd->queue;
int ret;
ret = bsg_check_uring_features(issue_flags);
if (ret)
return ret;
if (!bd->uring_cmd_fn)
return -EOPNOTSUPP;
return bd->uring_cmd_fn(q, ioucmd, issue_flags, open_for_write);
}
static const struct file_operations bsg_fops = {
.open = bsg_open,
.release = bsg_release,
.unlocked_ioctl = bsg_ioctl,
.compat_ioctl = compat_ptr_ioctl,
.uring_cmd = bsg_uring_cmd,
.owner = THIS_MODULE,
.llseek = default_llseek,
};
@@ -187,7 +216,8 @@ void bsg_unregister_queue(struct bsg_device *bd)
EXPORT_SYMBOL_GPL(bsg_unregister_queue);
struct bsg_device *bsg_register_queue(struct request_queue *q,
struct device *parent, const char *name, bsg_sg_io_fn *sg_io_fn)
struct device *parent, const char *name, bsg_sg_io_fn *sg_io_fn,
bsg_uring_cmd_fn *uring_cmd_fn)
{
struct bsg_device *bd;
int ret;
@@ -199,6 +229,7 @@ struct bsg_device *bsg_register_queue(struct request_queue *q,
bd->reserved_size = INT_MAX;
bd->queue = q;
bd->sg_io_fn = sg_io_fn;
bd->uring_cmd_fn = uring_cmd_fn;
ret = ida_alloc_max(&bsg_minor_ida, BSG_MAX_DEVS - 1, GFP_KERNEL);
if (ret < 0) {

View File

@@ -290,13 +290,14 @@ EXPORT_SYMBOL(disk_check_media_change);
* Should be called when the media changes for @disk. Generates a uevent
* and attempts to free all dentries and inodes and invalidates all block
* device page cache entries in that case.
*
* Callers that need a partition re-scan should arrange for one explicitly.
*/
void disk_force_media_change(struct gendisk *disk)
{
disk_event_uevent(disk, DISK_EVENT_MEDIA_CHANGE);
inc_diskseq(disk);
bdev_mark_dead(disk->part0, true);
set_bit(GD_NEED_PART_SCAN, &disk->state);
}
EXPORT_SYMBOL_GPL(disk_force_media_change);

View File

@@ -153,13 +153,7 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
nr_sects = len >> SECTOR_SHIFT;
blk_start_plug(&plug);
while (1) {
if (fatal_signal_pending(current)) {
if (prev)
bio_await_chain(prev);
err = -EINTR;
goto out_unplug;
}
while (!fatal_signal_pending(current)) {
bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects,
GFP_KERNEL);
if (!bio)
@@ -167,12 +161,11 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
prev = bio_chain_and_submit(prev, bio);
}
if (prev) {
err = submit_bio_wait(prev);
err = bio_submit_or_kill(prev, BLKDEV_ZERO_KILLABLE);
if (err == -EOPNOTSUPP)
err = 0;
bio_put(prev);
}
out_unplug:
blk_finish_plug(&plug);
fail:
filemap_invalidate_unlock(bdev->bd_mapping);

View File

@@ -19,6 +19,7 @@
enum {
TCG_SECP_00 = 0,
TCG_SECP_01,
TCG_SECP_02,
};
/*
@@ -125,6 +126,7 @@ enum opal_uid {
OPAL_LOCKING_INFO_TABLE,
OPAL_ENTERPRISE_LOCKING_INFO_TABLE,
OPAL_DATASTORE,
OPAL_LOCKING_TABLE,
/* C_PIN_TABLE object ID's */
OPAL_C_PIN_MSID,
OPAL_C_PIN_SID,
@@ -154,6 +156,7 @@ enum opal_method {
OPAL_AUTHENTICATE,
OPAL_RANDOM,
OPAL_ERASE,
OPAL_REACTIVATE,
};
enum opal_token {
@@ -224,6 +227,8 @@ enum opal_lockingstate {
enum opal_parameter {
OPAL_SUM_SET_LIST = 0x060000,
OPAL_SUM_RANGE_POLICY = 0x060001,
OPAL_SUM_ADMIN1_PIN = 0x060002,
};
enum opal_revertlsp {
@@ -269,6 +274,25 @@ struct opal_header {
struct opal_data_subpacket subpkt;
};
/*
* TCG_Storage_Architecture_Core_Spec_v2.01_r1.00
* Section: 3.3.4.7.5 STACK_RESET
*/
#define OPAL_STACK_RESET 0x0002
struct opal_stack_reset {
u8 extendedComID[4];
__be32 request_code;
};
struct opal_stack_reset_response {
u8 extendedComID[4];
__be32 request_code;
u8 reserved0[2];
__be16 data_length;
__be32 response;
};
#define FC_TPER 0x0001
#define FC_LOCKING 0x0002
#define FC_GEOMETRY 0x0003

View File

@@ -40,9 +40,7 @@ adfs_partition(struct parsed_partitions *state, char *name, char *data,
(le32_to_cpu(dr->disc_size) >> 9);
if (name) {
strlcat(state->pp_buf, " [", PAGE_SIZE);
strlcat(state->pp_buf, name, PAGE_SIZE);
strlcat(state->pp_buf, "]", PAGE_SIZE);
seq_buf_printf(&state->pp_buf, " [%s]", name);
}
put_partition(state, slot, first_sector, nr_sects);
return dr;
@@ -78,14 +76,14 @@ static int riscix_partition(struct parsed_partitions *state,
if (!rr)
return -1;
strlcat(state->pp_buf, " [RISCiX]", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " [RISCiX]");
if (rr->magic == RISCIX_MAGIC) {
unsigned long size = nr_sects > 2 ? 2 : nr_sects;
int part;
strlcat(state->pp_buf, " <", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " <");
put_partition(state, slot++, first_sect, size);
for (part = 0; part < 8; part++) {
@@ -94,13 +92,11 @@ static int riscix_partition(struct parsed_partitions *state,
put_partition(state, slot++,
le32_to_cpu(rr->part[part].start),
le32_to_cpu(rr->part[part].length));
strlcat(state->pp_buf, "(", PAGE_SIZE);
strlcat(state->pp_buf, rr->part[part].name, PAGE_SIZE);
strlcat(state->pp_buf, ")", PAGE_SIZE);
seq_buf_printf(&state->pp_buf, "(%s)", rr->part[part].name);
}
}
strlcat(state->pp_buf, " >\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " >\n");
} else {
put_partition(state, slot++, first_sect, nr_sects);
}
@@ -130,7 +126,7 @@ static int linux_partition(struct parsed_partitions *state,
struct linux_part *linuxp;
unsigned long size = nr_sects > 2 ? 2 : nr_sects;
strlcat(state->pp_buf, " [Linux]", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " [Linux]");
put_partition(state, slot++, first_sect, size);
@@ -138,7 +134,7 @@ static int linux_partition(struct parsed_partitions *state,
if (!linuxp)
return -1;
strlcat(state->pp_buf, " <", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " <");
while (linuxp->magic == cpu_to_le32(LINUX_NATIVE_MAGIC) ||
linuxp->magic == cpu_to_le32(LINUX_SWAP_MAGIC)) {
if (slot == state->limit)
@@ -148,7 +144,7 @@ static int linux_partition(struct parsed_partitions *state,
le32_to_cpu(linuxp->nr_sects));
linuxp ++;
}
strlcat(state->pp_buf, " >", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " >");
put_dev_sector(sect);
return slot;
@@ -293,7 +289,7 @@ int adfspart_check_ADFS(struct parsed_partitions *state)
break;
}
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
return 1;
}
#endif
@@ -366,7 +362,7 @@ int adfspart_check_ICS(struct parsed_partitions *state)
return 0;
}
strlcat(state->pp_buf, " [ICS]", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " [ICS]");
for (slot = 1, p = (const struct ics_part *)data; p->size; p++) {
u32 start = le32_to_cpu(p->start);
@@ -400,7 +396,7 @@ int adfspart_check_ICS(struct parsed_partitions *state)
}
put_dev_sector(sect);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
return 1;
}
#endif
@@ -460,7 +456,7 @@ int adfspart_check_POWERTEC(struct parsed_partitions *state)
return 0;
}
strlcat(state->pp_buf, " [POWERTEC]", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " [POWERTEC]");
for (i = 0, p = (const struct ptec_part *)data; i < 12; i++, p++) {
u32 start = le32_to_cpu(p->start);
@@ -471,7 +467,7 @@ int adfspart_check_POWERTEC(struct parsed_partitions *state)
}
put_dev_sector(sect);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
return 1;
}
#endif
@@ -542,7 +538,7 @@ int adfspart_check_EESOX(struct parsed_partitions *state)
size = get_capacity(state->disk);
put_partition(state, slot++, start, size - start);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
}
return i ? 1 : 0;

View File

@@ -173,24 +173,22 @@ int aix_partition(struct parsed_partitions *state)
if (d) {
struct lvm_rec *p = (struct lvm_rec *)d;
u16 lvm_version = be16_to_cpu(p->version);
char tmp[64];
if (lvm_version == 1) {
int pp_size_log2 = be16_to_cpu(p->pp_size);
pp_bytes_size = 1 << pp_size_log2;
pp_blocks_size = pp_bytes_size / 512;
snprintf(tmp, sizeof(tmp),
" AIX LVM header version %u found\n",
lvm_version);
seq_buf_printf(&state->pp_buf,
" AIX LVM header version %u found\n",
lvm_version);
vgda_len = be32_to_cpu(p->vgda_len);
vgda_sector = be32_to_cpu(p->vgda_psn[0]);
} else {
snprintf(tmp, sizeof(tmp),
" unsupported AIX LVM version %d found\n",
lvm_version);
seq_buf_printf(&state->pp_buf,
" unsupported AIX LVM version %d found\n",
lvm_version);
}
strlcat(state->pp_buf, tmp, PAGE_SIZE);
put_dev_sector(sect);
}
if (vgda_sector && (d = read_part_sector(state, vgda_sector, &sect))) {
@@ -251,14 +249,11 @@ int aix_partition(struct parsed_partitions *state)
continue;
}
if (lp_ix == lvip[lv_ix].pps_per_lv) {
char tmp[70];
put_partition(state, lv_ix + 1,
(i + 1 - lp_ix) * pp_blocks_size + psn_part1,
lvip[lv_ix].pps_per_lv * pp_blocks_size);
snprintf(tmp, sizeof(tmp), " <%s>\n",
n[lv_ix].name);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf, " <%s>\n",
n[lv_ix].name);
lvip[lv_ix].lv_is_contiguous = 1;
ret = 1;
next_lp_ix = 1;

View File

@@ -81,13 +81,8 @@ int amiga_partition(struct parsed_partitions *state)
/* blksize is blocks per 512 byte standard block */
blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512;
{
char tmp[7 + 10 + 1 + 1];
/* Be more informative */
snprintf(tmp, sizeof(tmp), " RDSK (%d)", blksize * 512);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
}
/* Be more informative */
seq_buf_printf(&state->pp_buf, " RDSK (%d)", blksize * 512);
blk = be32_to_cpu(rdb->rdb_PartitionList);
put_dev_sector(sect);
for (part = 1; (s32) blk>0 && part<=16; part++, put_dev_sector(sect)) {
@@ -179,27 +174,27 @@ int amiga_partition(struct parsed_partitions *state)
{
/* Be even more informative to aid mounting */
char dostype[4];
char tmp[42];
__be32 *dt = (__be32 *)dostype;
*dt = pb->pb_Environment[16];
if (dostype[3] < ' ')
snprintf(tmp, sizeof(tmp), " (%c%c%c^%c)",
dostype[0], dostype[1],
dostype[2], dostype[3] + '@' );
seq_buf_printf(&state->pp_buf,
" (%c%c%c^%c)",
dostype[0], dostype[1],
dostype[2],
dostype[3] + '@');
else
snprintf(tmp, sizeof(tmp), " (%c%c%c%c)",
dostype[0], dostype[1],
dostype[2], dostype[3]);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
snprintf(tmp, sizeof(tmp), "(res %d spb %d)",
be32_to_cpu(pb->pb_Environment[6]),
be32_to_cpu(pb->pb_Environment[4]));
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf,
" (%c%c%c%c)",
dostype[0], dostype[1],
dostype[2], dostype[3]);
seq_buf_printf(&state->pp_buf, "(res %d spb %d)",
be32_to_cpu(pb->pb_Environment[6]),
be32_to_cpu(pb->pb_Environment[4]));
}
res = 1;
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
rdb_done:
return res;

View File

@@ -70,7 +70,7 @@ int atari_partition(struct parsed_partitions *state)
}
pi = &rs->part[0];
strlcat(state->pp_buf, " AHDI", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " AHDI");
for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) {
struct rootsector *xrs;
Sector sect2;
@@ -89,7 +89,7 @@ int atari_partition(struct parsed_partitions *state)
#ifdef ICD_PARTS
part_fmt = 1;
#endif
strlcat(state->pp_buf, " XGM<", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " XGM<");
partsect = extensect = be32_to_cpu(pi->st);
while (1) {
xrs = read_part_sector(state, partsect, &sect2);
@@ -128,14 +128,14 @@ int atari_partition(struct parsed_partitions *state)
break;
}
}
strlcat(state->pp_buf, " >", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " >");
}
#ifdef ICD_PARTS
if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */
pi = &rs->icdpart[0];
/* sanity check: no ICD format if first partition invalid */
if (OK_id(pi->id)) {
strlcat(state->pp_buf, " ICD<", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " ICD<");
for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) {
/* accept only GEM,BGM,RAW,LNX,SWP partitions */
if (!((pi->flg & 1) && OK_id(pi->id)))
@@ -144,13 +144,13 @@ int atari_partition(struct parsed_partitions *state)
be32_to_cpu(pi->st),
be32_to_cpu(pi->siz));
}
strlcat(state->pp_buf, " >", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " >");
}
}
#endif
put_dev_sector(sect);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
return 1;
}

View File

@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include <linux/seq_buf.h>
#include "../blk.h"
/*
@@ -20,7 +21,7 @@ struct parsed_partitions {
int next;
int limit;
bool access_beyond_eod;
char *pp_buf;
struct seq_buf pp_buf;
};
typedef struct {
@@ -37,12 +38,9 @@ static inline void
put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
{
if (n < p->limit) {
char tmp[1 + BDEVNAME_SIZE + 10 + 1];
p->parts[n].from = from;
p->parts[n].size = size;
snprintf(tmp, sizeof(tmp), " %s%d", p->name, n);
strlcat(p->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&p->pp_buf, " %s%d", p->name, n);
}
}

View File

@@ -229,7 +229,6 @@ static int add_part(int slot, struct cmdline_subpart *subpart,
struct parsed_partitions *state)
{
struct partition_meta_info *info;
char tmp[sizeof(info->volname) + 4];
if (slot >= state->limit)
return 1;
@@ -244,8 +243,7 @@ static int add_part(int slot, struct cmdline_subpart *subpart,
strscpy(info->volname, subpart->name, sizeof(info->volname));
snprintf(tmp, sizeof(tmp), "(%s)", info->volname);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf, "(%s)", info->volname);
state->parts[slot].has_info = true;
@@ -379,7 +377,7 @@ int cmdline_partition(struct parsed_partitions *state)
cmdline_parts_set(parts, disk_size, state);
cmdline_parts_verifier(1, state);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
return 1;
}

View File

@@ -8,6 +8,7 @@
#include <linux/major.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/sysfs.h>
#include <linux/ctype.h>
#include <linux/vmalloc.h>
#include <linux/raid/detect.h>
@@ -123,16 +124,16 @@ static struct parsed_partitions *check_partition(struct gendisk *hd)
state = allocate_partitions(hd);
if (!state)
return NULL;
state->pp_buf = (char *)__get_free_page(GFP_KERNEL);
if (!state->pp_buf) {
state->pp_buf.buffer = (char *)__get_free_page(GFP_KERNEL);
if (!state->pp_buf.buffer) {
free_partitions(state);
return NULL;
}
state->pp_buf[0] = '\0';
seq_buf_init(&state->pp_buf, state->pp_buf.buffer, PAGE_SIZE);
state->disk = hd;
strscpy(state->name, hd->disk_name);
snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name);
seq_buf_printf(&state->pp_buf, " %s:", state->name);
if (isdigit(state->name[strlen(state->name)-1]))
sprintf(state->name, "p");
@@ -151,9 +152,9 @@ static struct parsed_partitions *check_partition(struct gendisk *hd)
}
if (res > 0) {
printk(KERN_INFO "%s", state->pp_buf);
printk(KERN_INFO "%s", seq_buf_str(&state->pp_buf));
free_page((unsigned long)state->pp_buf);
free_page((unsigned long)state->pp_buf.buffer);
return state;
}
if (state->access_beyond_eod)
@@ -164,12 +165,12 @@ static struct parsed_partitions *check_partition(struct gendisk *hd)
if (err)
res = err;
if (res) {
strlcat(state->pp_buf,
" unable to read partition table\n", PAGE_SIZE);
printk(KERN_INFO "%s", state->pp_buf);
seq_buf_puts(&state->pp_buf,
" unable to read partition table\n");
printk(KERN_INFO "%s", seq_buf_str(&state->pp_buf));
}
free_page((unsigned long)state->pp_buf);
free_page((unsigned long)state->pp_buf.buffer);
free_partitions(state);
return ERR_PTR(res);
}
@@ -177,31 +178,31 @@ static struct parsed_partitions *check_partition(struct gendisk *hd)
static ssize_t part_partition_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
return sprintf(buf, "%d\n", bdev_partno(dev_to_bdev(dev)));
return sysfs_emit(buf, "%d\n", bdev_partno(dev_to_bdev(dev)));
}
static ssize_t part_start_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
return sprintf(buf, "%llu\n", dev_to_bdev(dev)->bd_start_sect);
return sysfs_emit(buf, "%llu\n", dev_to_bdev(dev)->bd_start_sect);
}
static ssize_t part_ro_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
return sprintf(buf, "%d\n", bdev_read_only(dev_to_bdev(dev)));
return sysfs_emit(buf, "%d\n", bdev_read_only(dev_to_bdev(dev)));
}
static ssize_t part_alignment_offset_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
return sprintf(buf, "%u\n", bdev_alignment_offset(dev_to_bdev(dev)));
return sysfs_emit(buf, "%u\n", bdev_alignment_offset(dev_to_bdev(dev)));
}
static ssize_t part_discard_alignment_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
return sprintf(buf, "%u\n", bdev_discard_alignment(dev_to_bdev(dev)));
return sysfs_emit(buf, "%u\n", bdev_discard_alignment(dev_to_bdev(dev)));
}
static DEVICE_ATTR(partition, 0444, part_partition_show, NULL);

View File

@@ -751,6 +751,6 @@ int efi_partition(struct parsed_partitions *state)
}
kfree(ptes);
kfree(gpt);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
return 1;
}

View File

@@ -173,15 +173,13 @@ static int find_vol1_partitions(struct parsed_partitions *state,
{
sector_t blk;
int counter;
char tmp[64];
Sector sect;
unsigned char *data;
loff_t offset, size;
struct vtoc_format1_label f1;
int secperblk;
snprintf(tmp, sizeof(tmp), "VOL1/%8s:", name);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf, "VOL1/%8s:", name);
/*
* get start of VTOC from the disk label and then search for format1
* and format8 labels
@@ -219,7 +217,7 @@ static int find_vol1_partitions(struct parsed_partitions *state,
blk++;
data = read_part_sector(state, blk * secperblk, &sect);
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
if (!data)
return -1;
@@ -237,11 +235,9 @@ static int find_lnx1_partitions(struct parsed_partitions *state,
dasd_information2_t *info)
{
loff_t offset, geo_size, size;
char tmp[64];
int secperblk;
snprintf(tmp, sizeof(tmp), "LNX1/%8s:", name);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf, "LNX1/%8s:", name);
secperblk = blocksize >> 9;
if (label->lnx.ldl_version == 0xf2) {
size = label->lnx.formatted_blocks * secperblk;
@@ -258,7 +254,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state,
size = nr_sectors;
if (size != geo_size) {
if (!info) {
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
return 1;
}
if (!strcmp(info->type, "ECKD"))
@@ -270,7 +266,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state,
/* first and only partition starts in the first block after the label */
offset = labelsect + secperblk;
put_partition(state, 1, offset, size - offset);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
return 1;
}
@@ -282,7 +278,6 @@ static int find_cms1_partitions(struct parsed_partitions *state,
sector_t labelsect)
{
loff_t offset, size;
char tmp[64];
int secperblk;
/*
@@ -291,14 +286,12 @@ static int find_cms1_partitions(struct parsed_partitions *state,
blocksize = label->cms.block_size;
secperblk = blocksize >> 9;
if (label->cms.disk_offset != 0) {
snprintf(tmp, sizeof(tmp), "CMS1/%8s(MDSK):", name);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf, "CMS1/%8s(MDSK):", name);
/* disk is reserved minidisk */
offset = label->cms.disk_offset * secperblk;
size = (label->cms.block_count - 1) * secperblk;
} else {
snprintf(tmp, sizeof(tmp), "CMS1/%8s:", name);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf, "CMS1/%8s:", name);
/*
* Special case for FBA devices:
* If an FBA device is CMS formatted with blocksize > 512 byte
@@ -314,7 +307,7 @@ static int find_cms1_partitions(struct parsed_partitions *state,
}
put_partition(state, 1, offset, size-offset);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
return 1;
}
@@ -391,11 +384,11 @@ int ibm_partition(struct parsed_partitions *state)
*/
res = 1;
if (info->format == DASD_FORMAT_LDL) {
strlcat(state->pp_buf, "(nonl)", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "(nonl)");
size = nr_sectors;
offset = (info->label_block + 1) * (blocksize >> 9);
put_partition(state, 1, offset, size-offset);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
}
} else
res = 0;

View File

@@ -53,7 +53,7 @@ int karma_partition(struct parsed_partitions *state)
}
slot++;
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
put_dev_sector(sect);
return 1;
}

View File

@@ -582,7 +582,7 @@ static bool ldm_create_data_partitions (struct parsed_partitions *pp,
return false;
}
strlcat(pp->pp_buf, " [LDM]", PAGE_SIZE);
seq_buf_puts(&pp->pp_buf, " [LDM]");
/* Create the data partitions */
list_for_each (item, &ldb->v_part) {
@@ -597,7 +597,7 @@ static bool ldm_create_data_partitions (struct parsed_partitions *pp,
part_num++;
}
strlcat(pp->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&pp->pp_buf, "\n");
return true;
}

View File

@@ -86,7 +86,7 @@ int mac_partition(struct parsed_partitions *state)
if (blocks_in_map >= state->limit)
blocks_in_map = state->limit - 1;
strlcat(state->pp_buf, " [mac]", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " [mac]");
for (slot = 1; slot <= blocks_in_map; ++slot) {
int pos = slot * secsize;
put_dev_sector(sect);
@@ -152,6 +152,6 @@ int mac_partition(struct parsed_partitions *state)
#endif
put_dev_sector(sect);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
return 1;
}

View File

@@ -263,18 +263,11 @@ static void parse_solaris_x86(struct parsed_partitions *state,
put_dev_sector(sect);
return;
}
{
char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1];
snprintf(tmp, sizeof(tmp), " %s%d: <solaris:", state->name, origin);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
}
seq_buf_printf(&state->pp_buf, " %s%d: <solaris:", state->name, origin);
if (le32_to_cpu(v->v_version) != 1) {
char tmp[64];
snprintf(tmp, sizeof(tmp), " cannot handle version %d vtoc>\n",
le32_to_cpu(v->v_version));
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf,
" cannot handle version %d vtoc>\n",
le32_to_cpu(v->v_version));
put_dev_sector(sect);
return;
}
@@ -282,12 +275,10 @@ static void parse_solaris_x86(struct parsed_partitions *state,
max_nparts = le16_to_cpu(v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8;
for (i = 0; i < max_nparts && state->next < state->limit; i++) {
struct solaris_x86_slice *s = &v->v_slice[i];
char tmp[3 + 10 + 1 + 1];
if (s->s_size == 0)
continue;
snprintf(tmp, sizeof(tmp), " [s%d]", i);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf, " [s%d]", i);
/* solaris partitions are relative to current MS-DOS
* one; must add the offset of the current partition */
put_partition(state, state->next++,
@@ -295,7 +286,7 @@ static void parse_solaris_x86(struct parsed_partitions *state,
le32_to_cpu(s->s_size));
}
put_dev_sector(sect);
strlcat(state->pp_buf, " >\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " >\n");
#endif
}
@@ -359,7 +350,6 @@ static void parse_bsd(struct parsed_partitions *state,
Sector sect;
struct bsd_disklabel *l;
struct bsd_partition *p;
char tmp[64];
l = read_part_sector(state, offset + 1, &sect);
if (!l)
@@ -369,8 +359,7 @@ static void parse_bsd(struct parsed_partitions *state,
return;
}
snprintf(tmp, sizeof(tmp), " %s%d: <%s:", state->name, origin, flavour);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf, " %s%d: <%s:", state->name, origin, flavour);
if (le16_to_cpu(l->d_npartitions) < max_partitions)
max_partitions = le16_to_cpu(l->d_npartitions);
@@ -391,18 +380,16 @@ static void parse_bsd(struct parsed_partitions *state,
/* full parent partition, we have it already */
continue;
if (offset > bsd_start || offset+size < bsd_start+bsd_size) {
strlcat(state->pp_buf, "bad subpartition - ignored\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "bad subpartition - ignored\n");
continue;
}
put_partition(state, state->next++, bsd_start, bsd_size);
}
put_dev_sector(sect);
if (le16_to_cpu(l->d_npartitions) > max_partitions) {
snprintf(tmp, sizeof(tmp), " (ignored %d more)",
le16_to_cpu(l->d_npartitions) - max_partitions);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
}
strlcat(state->pp_buf, " >\n", PAGE_SIZE);
if (le16_to_cpu(l->d_npartitions) > max_partitions)
seq_buf_printf(&state->pp_buf, " (ignored %d more)",
le16_to_cpu(l->d_npartitions) - max_partitions);
seq_buf_puts(&state->pp_buf, " >\n");
}
#endif
@@ -496,12 +483,7 @@ static void parse_unixware(struct parsed_partitions *state,
put_dev_sector(sect);
return;
}
{
char tmp[1 + BDEVNAME_SIZE + 10 + 12 + 1];
snprintf(tmp, sizeof(tmp), " %s%d: <unixware:", state->name, origin);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
}
seq_buf_printf(&state->pp_buf, " %s%d: <unixware:", state->name, origin);
p = &l->vtoc.v_slice[1];
/* I omit the 0th slice as it is the same as whole disk. */
while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) {
@@ -515,7 +497,7 @@ static void parse_unixware(struct parsed_partitions *state,
p++;
}
put_dev_sector(sect);
strlcat(state->pp_buf, " >\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " >\n");
#endif
}
@@ -546,10 +528,7 @@ static void parse_minix(struct parsed_partitions *state,
* the normal boot sector. */
if (msdos_magic_present(data + 510) &&
p->sys_ind == MINIX_PARTITION) { /* subpartition table present */
char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1];
snprintf(tmp, sizeof(tmp), " %s%d: <minix:", state->name, origin);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf, " %s%d: <minix:", state->name, origin);
for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) {
if (state->next == state->limit)
break;
@@ -558,7 +537,7 @@ static void parse_minix(struct parsed_partitions *state,
put_partition(state, state->next++,
start_sect(p), nr_sects(p));
}
strlcat(state->pp_buf, " >\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " >\n");
}
put_dev_sector(sect);
#endif /* CONFIG_MINIX_SUBPARTITION */
@@ -602,7 +581,7 @@ int msdos_partition(struct parsed_partitions *state)
#ifdef CONFIG_AIX_PARTITION
return aix_partition(state);
#else
strlcat(state->pp_buf, " [AIX]", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " [AIX]");
return 0;
#endif
}
@@ -629,7 +608,7 @@ int msdos_partition(struct parsed_partitions *state)
fb = (struct fat_boot_sector *) data;
if (slot == 1 && fb->reserved && fb->fats
&& fat_valid_media(fb->media)) {
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
put_dev_sector(sect);
return 1;
} else {
@@ -678,9 +657,9 @@ int msdos_partition(struct parsed_partitions *state)
n = min(size, max(sector_size, n));
put_partition(state, slot, start, n);
strlcat(state->pp_buf, " <", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " <");
parse_extended(state, start, size, disksig);
strlcat(state->pp_buf, " >", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " >");
continue;
}
put_partition(state, slot, start, size);
@@ -688,12 +667,12 @@ int msdos_partition(struct parsed_partitions *state)
if (p->sys_ind == LINUX_RAID_PARTITION)
state->parts[slot].flags = ADDPART_FLAG_RAID;
if (p->sys_ind == DM6_PARTITION)
strlcat(state->pp_buf, "[DM]", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "[DM]");
if (p->sys_ind == EZD_PARTITION)
strlcat(state->pp_buf, "[EZD]", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "[EZD]");
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
/* second pass - output for each on a separate line */
p = (struct msdos_partition *) (0x1be + data);

View File

@@ -36,7 +36,6 @@ static void add_of_partition(struct parsed_partitions *state, int slot,
struct device_node *np)
{
struct partition_meta_info *info;
char tmp[sizeof(info->volname) + 4];
const char *partname;
int len;
@@ -63,8 +62,7 @@ static void add_of_partition(struct parsed_partitions *state, int slot,
partname = of_get_property(np, "name", &len);
strscpy(info->volname, partname, sizeof(info->volname));
snprintf(tmp, sizeof(tmp), "(%s)", info->volname);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf, "(%s)", info->volname);
}
int of_partition(struct parsed_partitions *state)
@@ -104,7 +102,7 @@ int of_partition(struct parsed_partitions *state)
slot++;
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
return 1;
}

View File

@@ -81,7 +81,7 @@ int osf_partition(struct parsed_partitions *state)
le32_to_cpu(partition->p_size));
slot++;
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
put_dev_sector(sect);
return 1;
}

View File

@@ -79,7 +79,7 @@ int sgi_partition(struct parsed_partitions *state)
}
slot++;
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
put_dev_sector(sect);
return 1;
}

View File

@@ -121,7 +121,7 @@ int sun_partition(struct parsed_partitions *state)
}
slot++;
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
put_dev_sector(sect);
return 1;
}

View File

@@ -54,7 +54,6 @@ int sysv68_partition(struct parsed_partitions *state)
unsigned char *data;
struct dkblk0 *b;
struct slice *slice;
char tmp[64];
data = read_part_sector(state, 0, &sect);
if (!data)
@@ -74,8 +73,7 @@ int sysv68_partition(struct parsed_partitions *state)
return -1;
slices -= 1; /* last slice is the whole disk */
snprintf(tmp, sizeof(tmp), "sysV68: %s(s%u)", state->name, slices);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf, "sysV68: %s(s%u)", state->name, slices);
slice = (struct slice *)data;
for (i = 0; i < slices; i++, slice++) {
if (slot == state->limit)
@@ -84,12 +82,11 @@ int sysv68_partition(struct parsed_partitions *state)
put_partition(state, slot,
be32_to_cpu(slice->blkoff),
be32_to_cpu(slice->nblocks));
snprintf(tmp, sizeof(tmp), "(s%u)", i);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf, "(s%u)", i);
}
slot++;
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
put_dev_sector(sect);
return 1;
}

View File

@@ -39,7 +39,7 @@ int ultrix_partition(struct parsed_partitions *state)
label->pt_part[i].pi_blkoff,
label->pt_part[i].pi_nblocks);
put_dev_sector(sect);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
return 1;
} else {
put_dev_sector(sect);

View File

@@ -160,6 +160,8 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = {
{ 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x00 },
[OPAL_DATASTORE] =
{ 0x00, 0x00, 0x10, 0x01, 0x00, 0x00, 0x00, 0x00 },
[OPAL_LOCKING_TABLE] =
{ 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x00 },
/* C_PIN_TABLE object ID's */
[OPAL_C_PIN_MSID] =
@@ -218,6 +220,8 @@ static const u8 opalmethod[][OPAL_METHOD_LENGTH] = {
{ 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x06, 0x01 },
[OPAL_ERASE] =
{ 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x08, 0x03 },
[OPAL_REACTIVATE] =
{ 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x08, 0x01 },
};
static int end_opal_session_error(struct opal_dev *dev);
@@ -1514,7 +1518,7 @@ static inline int enable_global_lr(struct opal_dev *dev, u8 *uid,
return err;
}
static int setup_locking_range(struct opal_dev *dev, void *data)
static int setup_enable_range(struct opal_dev *dev, void *data)
{
u8 uid[OPAL_UID_LENGTH];
struct opal_user_lr_setup *setup = data;
@@ -1528,38 +1532,47 @@ static int setup_locking_range(struct opal_dev *dev, void *data)
if (lr == 0)
err = enable_global_lr(dev, uid, setup);
else {
err = cmd_start(dev, uid, opalmethod[OPAL_SET]);
add_token_u8(&err, dev, OPAL_STARTNAME);
add_token_u8(&err, dev, OPAL_VALUES);
add_token_u8(&err, dev, OPAL_STARTLIST);
add_token_u8(&err, dev, OPAL_STARTNAME);
add_token_u8(&err, dev, OPAL_RANGESTART);
add_token_u64(&err, dev, setup->range_start);
add_token_u8(&err, dev, OPAL_ENDNAME);
add_token_u8(&err, dev, OPAL_STARTNAME);
add_token_u8(&err, dev, OPAL_RANGELENGTH);
add_token_u64(&err, dev, setup->range_length);
add_token_u8(&err, dev, OPAL_ENDNAME);
add_token_u8(&err, dev, OPAL_STARTNAME);
add_token_u8(&err, dev, OPAL_READLOCKENABLED);
add_token_u64(&err, dev, !!setup->RLE);
add_token_u8(&err, dev, OPAL_ENDNAME);
add_token_u8(&err, dev, OPAL_STARTNAME);
add_token_u8(&err, dev, OPAL_WRITELOCKENABLED);
add_token_u64(&err, dev, !!setup->WLE);
add_token_u8(&err, dev, OPAL_ENDNAME);
add_token_u8(&err, dev, OPAL_ENDLIST);
add_token_u8(&err, dev, OPAL_ENDNAME);
}
else
err = generic_lr_enable_disable(dev, uid, !!setup->RLE, !!setup->WLE, 0, 0);
if (err) {
pr_debug("Error building Setup Locking range command.\n");
pr_debug("Failed to create enable lr command.\n");
return err;
}
return finalize_and_send(dev, parse_and_check_status);
}
static int setup_locking_range_start_length(struct opal_dev *dev, void *data)
{
int err;
u8 uid[OPAL_UID_LENGTH];
struct opal_user_lr_setup *setup = data;
err = build_locking_range(uid, sizeof(uid), setup->session.opal_key.lr);
if (err)
return err;
err = cmd_start(dev, uid, opalmethod[OPAL_SET]);
add_token_u8(&err, dev, OPAL_STARTNAME);
add_token_u8(&err, dev, OPAL_VALUES);
add_token_u8(&err, dev, OPAL_STARTLIST);
add_token_u8(&err, dev, OPAL_STARTNAME);
add_token_u8(&err, dev, OPAL_RANGESTART);
add_token_u64(&err, dev, setup->range_start);
add_token_u8(&err, dev, OPAL_ENDNAME);
add_token_u8(&err, dev, OPAL_STARTNAME);
add_token_u8(&err, dev, OPAL_RANGELENGTH);
add_token_u64(&err, dev, setup->range_length);
add_token_u8(&err, dev, OPAL_ENDNAME);
add_token_u8(&err, dev, OPAL_ENDLIST);
add_token_u8(&err, dev, OPAL_ENDNAME);
if (err) {
pr_debug("Error building Setup Locking RangeStartLength command.\n");
return err;
}
@@ -1568,7 +1581,7 @@ static int setup_locking_range(struct opal_dev *dev, void *data)
static int response_get_column(const struct parsed_resp *resp,
int *iter,
u8 column,
u64 column,
u64 *value)
{
const struct opal_resp_tok *tok;
@@ -1586,7 +1599,7 @@ static int response_get_column(const struct parsed_resp *resp,
n++;
if (response_get_u64(resp, n) != column) {
pr_debug("Token %d does not match expected column %u.\n",
pr_debug("Token %d does not match expected column %llu.\n",
n, column);
return OPAL_INVAL_PARAM;
}
@@ -1744,6 +1757,12 @@ static int start_anybodyASP_opal_session(struct opal_dev *dev, void *data)
OPAL_ADMINSP_UID, NULL, 0);
}
/*
 * Open a session to the Locking SP as the Anybody authority, i.e. with no
 * host challenge/credentials (key == NULL, key_len == 0).  Used for reads
 * that the device permits without authentication.
 */
static int start_anybodyLSP_opal_session(struct opal_dev *dev, void *data)
{
	return start_generic_opal_session(dev, OPAL_ANYBODY_UID,
					  OPAL_LOCKINGSP_UID, NULL, 0);
}
static int start_SIDASP_opal_session(struct opal_dev *dev, void *data)
{
int ret;
@@ -2285,6 +2304,74 @@ static int activate_lsp(struct opal_dev *dev, void *data)
return finalize_and_send(dev, parse_and_check_status);
}
/*
 * Build and send the Reactivate method invocation on ThisSP, controlling
 * Single User Mode (SUM) membership of the locking ranges.
 *
 * The optional parameters appended below mirror the caller-supplied
 * struct opal_lr_react: which ranges enter SUM (all, a subset, or none),
 * the range policy, and an optional new Admin1 PIN.
 */
static int reactivate_lsp(struct opal_dev *dev, void *data)
{
	struct opal_lr_react *opal_react = data;
	u8 user_lr[OPAL_UID_LENGTH];
	int err, i;

	err = cmd_start(dev, opaluid[OPAL_THISSP_UID],
			opalmethod[OPAL_REACTIVATE]);
	if (err) {
		pr_debug("Error building Reactivate LockingSP command.\n");
		return err;
	}

	/*
	 * If neither 'entire_table' nor 'num_lrs' is set, the device
	 * gets reactivated with SUM disabled. Only Admin1PIN will change
	 * if set.
	 */
	if (opal_react->entire_table) {
		/* Entire Locking table (all locking ranges) will be put in SUM. */
		add_token_u8(&err, dev, OPAL_STARTNAME);
		add_token_u64(&err, dev, OPAL_SUM_SET_LIST);
		add_token_bytestring(&err, dev, opaluid[OPAL_LOCKING_TABLE], OPAL_UID_LENGTH);
		add_token_u8(&err, dev, OPAL_ENDNAME);
	} else if (opal_react->num_lrs) {
		/* Subset of Locking table (selected locking range(s)) to be put in SUM */
		err = build_locking_range(user_lr, sizeof(user_lr),
					  opal_react->lr[0]);
		if (err)
			return err;

		add_token_u8(&err, dev, OPAL_STARTNAME);
		add_token_u64(&err, dev, OPAL_SUM_SET_LIST);
		add_token_u8(&err, dev, OPAL_STARTLIST);
		add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH);
		/* Remaining range UIDs differ from the first only in the last byte. */
		for (i = 1; i < opal_react->num_lrs; i++) {
			user_lr[7] = opal_react->lr[i];
			add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH);
		}
		add_token_u8(&err, dev, OPAL_ENDLIST);
		add_token_u8(&err, dev, OPAL_ENDNAME);
	}

	/* Skipping the range policy parameter is same as setting its value to zero */
	if (opal_react->range_policy && (opal_react->num_lrs || opal_react->entire_table)) {
		add_token_u8(&err, dev, OPAL_STARTNAME);
		add_token_u64(&err, dev, OPAL_SUM_RANGE_POLICY);
		add_token_u8(&err, dev, 1);
		add_token_u8(&err, dev, OPAL_ENDNAME);
	}

	/*
	 * Optional parameter. If set, it changes the Admin1 PIN even when SUM
	 * is being disabled.
	 */
	if (opal_react->new_admin_key.key_len) {
		add_token_u8(&err, dev, OPAL_STARTNAME);
		add_token_u64(&err, dev, OPAL_SUM_ADMIN1_PIN);
		add_token_bytestring(&err, dev, opal_react->new_admin_key.key,
				     opal_react->new_admin_key.key_len);
		add_token_u8(&err, dev, OPAL_ENDNAME);
	}

	return finalize_and_send(dev, parse_and_check_status);
}
/* Determine if we're in the Manufactured Inactive or Active state */
static int get_lsp_lifecycle(struct opal_dev *dev, void *data)
{
@@ -2955,12 +3042,92 @@ static int opal_activate_lsp(struct opal_dev *dev,
return ret;
}
/*
 * IOC_OPAL_REACTIVATE_LSP handler: validate the SUM parameters, resolve the
 * Admin1 key, and run the Reactivate step under the device lock.
 *
 * Returns 0 on success or a negative errno.
 */
static int opal_reactivate_lsp(struct opal_dev *dev,
			       struct opal_lr_react *opal_lr_react)
{
	const struct opal_step active_steps[] = {
		{ start_admin1LSP_opal_session, &opal_lr_react->key },
		{ reactivate_lsp, opal_lr_react },
		/* No end_opal_session. The controller terminates the session */
	};
	int ret;

	/* use either 'entire_table' parameter or set of locking ranges */
	if (opal_lr_react->num_lrs > OPAL_MAX_LRS ||
	    (opal_lr_react->num_lrs && opal_lr_react->entire_table))
		return -EINVAL;

	ret = opal_get_key(dev, &opal_lr_react->key);
	if (ret)
		return ret;
	mutex_lock(&dev->dev_lock);
	setup_opal_dev(dev);
	ret = execute_steps(dev, active_steps, ARRAY_SIZE(active_steps));
	mutex_unlock(&dev->dev_lock);

	return ret;
}
/*
 * Configure a locking range in one authenticated session: set its start and
 * length, then its read/write lock-enable bits.  The global locking range
 * (lr == 0) has no start/length, so only the enable step runs for it.
 *
 * Fix: the lr_steps[] table contained a leftover entry referencing
 * setup_locking_range, which was renamed to setup_enable_range; the stale
 * duplicate step is removed so the table matches the renamed helpers.
 *
 * Returns 0 on success or a negative errno.
 */
static int opal_setup_locking_range(struct opal_dev *dev,
				    struct opal_user_lr_setup *opal_lrs)
{
	const struct opal_step lr_steps[] = {
		{ start_auth_opal_session, &opal_lrs->session },
		{ setup_locking_range_start_length, opal_lrs },
		{ setup_enable_range, opal_lrs },
		{ end_opal_session, }
	}, lr_global_steps[] = {
		{ start_auth_opal_session, &opal_lrs->session },
		{ setup_enable_range, opal_lrs },
		{ end_opal_session, }
	};
	int ret;

	ret = opal_get_key(dev, &opal_lrs->session.opal_key);
	if (ret)
		return ret;
	mutex_lock(&dev->dev_lock);
	setup_opal_dev(dev);
	if (opal_lrs->session.opal_key.lr == 0)
		ret = execute_steps(dev, lr_global_steps, ARRAY_SIZE(lr_global_steps));
	else
		ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps));
	mutex_unlock(&dev->dev_lock);

	return ret;
}
/*
 * IOC_OPAL_LR_SET_START_LEN handler: set only the RangeStart/RangeLength
 * columns of a non-global locking range, leaving the lock-enable bits alone.
 *
 * Returns 0 on success, -EINVAL for the global range, or a negative errno.
 */
static int opal_setup_locking_range_start_length(struct opal_dev *dev,
						 struct opal_user_lr_setup *opal_lrs)
{
	const struct opal_step lr_steps[] = {
		{ start_auth_opal_session, &opal_lrs->session },
		{ setup_locking_range_start_length, opal_lrs },
		{ end_opal_session, }
	};
	int ret;

	/* we can not set global locking range offset or length */
	if (opal_lrs->session.opal_key.lr == 0)
		return -EINVAL;

	ret = opal_get_key(dev, &opal_lrs->session.opal_key);
	if (ret)
		return ret;
	mutex_lock(&dev->dev_lock);
	setup_opal_dev(dev);
	ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps));
	mutex_unlock(&dev->dev_lock);

	return ret;
}
static int opal_enable_disable_range(struct opal_dev *dev,
struct opal_user_lr_setup *opal_lrs)
{
const struct opal_step lr_steps[] = {
{ start_auth_opal_session, &opal_lrs->session },
{ setup_enable_range, opal_lrs },
{ end_opal_session, }
};
int ret;
@@ -3228,6 +3395,200 @@ static int opal_get_geometry(struct opal_dev *dev, void __user *data)
return 0;
}
/*
 * Read the Single User Mode columns (OPAL_SUM_SET_LIST and
 * OPAL_SUM_RANGE_POLICY) from the LockingInfo table and decode them into
 * the caller-supplied struct opal_sum_ranges.
 *
 * The response is walked token by token (tok_n starts at 2, past the
 * method status header) and validated against the expected token stream.
 *
 * Returns 0 on success, OPAL_INVAL_PARAM on an unexpected response shape,
 * or a negative errno from the transport.
 */
static int get_sum_ranges(struct opal_dev *dev, void *data)
{
	const char *lr_uid;
	size_t lr_uid_len;
	u64 val;
	const struct opal_resp_tok *tok;
	int err, tok_n = 2;
	struct opal_sum_ranges *sranges = data;
	/* All range ids 0..8, used when the whole Locking table is in SUM. */
	const __u8 lr_all[OPAL_MAX_LRS] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 };

	err = generic_get_columns(dev, opaluid[OPAL_LOCKING_INFO_TABLE], OPAL_SUM_SET_LIST,
				  OPAL_SUM_RANGE_POLICY);
	if (err) {
		pr_debug("Couldn't get locking info table columns %d to %d.\n",
			 OPAL_SUM_SET_LIST, OPAL_SUM_RANGE_POLICY);
		return err;
	}

	/* Expect STARTNAME opening the OPAL_SUM_SET_LIST name/value pair. */
	tok = response_get_token(&dev->parsed, tok_n);
	if (IS_ERR(tok))
		return PTR_ERR(tok);
	if (!response_token_matches(tok, OPAL_STARTNAME)) {
		pr_debug("Unexpected response token type %d.\n", tok_n);
		return OPAL_INVAL_PARAM;
	}
	tok_n++;

	if (response_get_u64(&dev->parsed, tok_n) != OPAL_SUM_SET_LIST) {
		pr_debug("Token %d does not match expected column %u.\n",
			 tok_n, OPAL_SUM_SET_LIST);
		return OPAL_INVAL_PARAM;
	}
	tok_n++;

	tok = response_get_token(&dev->parsed, tok_n);
	if (IS_ERR(tok))
		return PTR_ERR(tok);

	/*
	 * The OPAL_SUM_SET_LIST response contains two distinct values:
	 *
	 * - the list of individual locking ranges (UIDs) put in SUM. The list
	 *   may also be empty signaling the SUM is disabled.
	 *
	 * - the Locking table UID if the entire Locking table is put in SUM.
	 */
	if (response_token_matches(tok, OPAL_STARTLIST)) {
		sranges->num_lrs = 0;
		tok_n++;
		tok = response_get_token(&dev->parsed, tok_n);
		if (IS_ERR(tok))
			return PTR_ERR(tok);
		/* Collect range UIDs until the closing ENDLIST. */
		while (!response_token_matches(tok, OPAL_ENDLIST)) {
			lr_uid_len = response_get_string(&dev->parsed, tok_n, &lr_uid);
			if (lr_uid_len != OPAL_UID_LENGTH) {
				pr_debug("Unexpected response token type %d.\n", tok_n);
				return OPAL_INVAL_PARAM;
			}
			if (memcmp(lr_uid, opaluid[OPAL_LOCKINGRANGE_GLOBAL], OPAL_UID_LENGTH)) {
				/* Non-global range: byte 7 of the UID is the range number. */
				if (lr_uid[5] != LOCKING_RANGE_NON_GLOBAL) {
					pr_debug("Unexpected byte %d at LR UUID position 5.\n",
						 lr_uid[5]);
					return OPAL_INVAL_PARAM;
				}
				sranges->lr[sranges->num_lrs++] = lr_uid[7];
			} else
				sranges->lr[sranges->num_lrs++] = 0;
			tok_n++;
			tok = response_get_token(&dev->parsed, tok_n);
			if (IS_ERR(tok))
				return PTR_ERR(tok);
		}
	} else {
		/* Only OPAL_LOCKING_TABLE UID is an alternative to OPAL_STARTLIST here. */
		lr_uid_len = response_get_string(&dev->parsed, tok_n, &lr_uid);
		if (lr_uid_len != OPAL_UID_LENGTH) {
			pr_debug("Unexpected response token type %d.\n", tok_n);
			return OPAL_INVAL_PARAM;
		}
		if (memcmp(lr_uid, opaluid[OPAL_LOCKING_TABLE], OPAL_UID_LENGTH)) {
			pr_debug("Unexpected response UID.\n");
			return OPAL_INVAL_PARAM;
		}
		/* sed-opal kernel API already enforces this limit in the Activate command */
		sranges->num_lrs = OPAL_MAX_LRS;
		memcpy(sranges->lr, lr_all, OPAL_MAX_LRS);
	}
	tok_n++;

	/* ENDNAME closing the OPAL_SUM_SET_LIST pair. */
	tok = response_get_token(&dev->parsed, tok_n);
	if (IS_ERR(tok))
		return PTR_ERR(tok);
	if (!response_token_matches(tok, OPAL_ENDNAME)) {
		pr_debug("Unexpected response token type %d.\n", tok_n);
		return OPAL_INVAL_PARAM;
	}
	tok_n++;

	/* Second column: the range policy, normalized to 0/1. */
	err = response_get_column(&dev->parsed, &tok_n, OPAL_SUM_RANGE_POLICY, &val);
	if (err)
		return err;
	sranges->range_policy = val ? 1 : 0;

	return 0;
}
/*
 * IOC_OPAL_GET_SUM_STATUS handler: read the SUM range list and policy,
 * then copy the result (minus the session/key portion) back to userspace.
 *
 * If a key was supplied, use an authenticated Admin1 session; otherwise
 * fall back to an unauthenticated Anybody session.
 *
 * Returns 0 on success, -EFAULT on copyout failure, or a negative errno.
 */
static int opal_get_sum_ranges(struct opal_dev *dev, struct opal_sum_ranges *opal_sum_rngs,
			       void __user *data)
{
	const struct opal_step admin_steps[] = {
		{ start_admin1LSP_opal_session, &opal_sum_rngs->key },
		{ get_sum_ranges, opal_sum_rngs },
		{ end_opal_session, }
	}, anybody_steps[] = {
		{ start_anybodyLSP_opal_session, NULL },
		{ get_sum_ranges, opal_sum_rngs },
		{ end_opal_session, }
	};
	int ret;

	mutex_lock(&dev->dev_lock);
	setup_opal_dev(dev);
	if (opal_sum_rngs->key.key_len)
		/* Use Admin1 session (authenticated by PIN) to retrieve LockingInfo columns */
		ret = execute_steps(dev, admin_steps, ARRAY_SIZE(admin_steps));
	else
		/* Use Anybody session (no key) to retrieve LockingInfo columns */
		ret = execute_steps(dev, anybody_steps, ARRAY_SIZE(anybody_steps));
	mutex_unlock(&dev->dev_lock);

	/* skip session info when copying back to uspace */
	if (!ret && copy_to_user(data + offsetof(struct opal_sum_ranges, num_lrs),
				 (void *)opal_sum_rngs + offsetof(struct opal_sum_ranges, num_lrs),
				 sizeof(*opal_sum_rngs) - offsetof(struct opal_sum_ranges, num_lrs))) {
		pr_debug("Error copying SUM ranges info to userspace\n");
		return -EFAULT;
	}

	return ret;
}
/*
 * IOC_OPAL_STACK_RESET handler: issue a STACK_RESET ComID management
 * command over security protocol 2 (TCG_SECP_02) and check the response.
 *
 * Returns 0 on success, -EBUSY when the device reports the reset as still
 * pending (response payload not yet 4 bytes), -EIO when the device reports
 * a non-zero response code, or the transport's negative errno.
 */
static int opal_stack_reset(struct opal_dev *dev)
{
	struct opal_stack_reset *req;
	struct opal_stack_reset_response *resp;
	int ret;

	mutex_lock(&dev->dev_lock);

	/* Build the request in place over the shared command buffer. */
	memset(dev->cmd, 0, IO_BUFFER_LENGTH);
	req = (struct opal_stack_reset *)dev->cmd;
	/* Extended ComID is the 16-bit ComID in big-endian byte order. */
	req->extendedComID[0] = dev->comid >> 8;
	req->extendedComID[1] = dev->comid & 0xFF;
	req->request_code = cpu_to_be32(OPAL_STACK_RESET);
	ret = dev->send_recv(dev->data, dev->comid, TCG_SECP_02,
			     dev->cmd, IO_BUFFER_LENGTH, true);
	if (ret) {
		pr_debug("Error sending stack reset: %d\n", ret);
		goto out;
	}

	memset(dev->resp, 0, IO_BUFFER_LENGTH);
	ret = dev->send_recv(dev->data, dev->comid, TCG_SECP_02,
			     dev->resp, IO_BUFFER_LENGTH, false);
	if (ret) {
		pr_debug("Error receiving stack reset response: %d\n", ret);
		goto out;
	}

	resp = (struct opal_stack_reset_response *)dev->resp;
	if (be16_to_cpu(resp->data_length) != 4) {
		pr_debug("Stack reset pending\n");
		ret = -EBUSY;
		goto out;
	}
	if (be32_to_cpu(resp->response) != 0) {
		pr_debug("Stack reset failed: %u\n", be32_to_cpu(resp->response));
		ret = -EIO;
	}

out:
	mutex_unlock(&dev->dev_lock);
	return ret;
}
int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
{
void *p;
@@ -3313,6 +3674,21 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
case IOC_OPAL_SET_SID_PW:
ret = opal_set_new_sid_pw(dev, p);
break;
case IOC_OPAL_REACTIVATE_LSP:
ret = opal_reactivate_lsp(dev, p);
break;
case IOC_OPAL_LR_SET_START_LEN:
ret = opal_setup_locking_range_start_length(dev, p);
break;
case IOC_OPAL_ENABLE_DISABLE_LR:
ret = opal_enable_disable_range(dev, p);
break;
case IOC_OPAL_GET_SUM_STATUS:
ret = opal_get_sum_ranges(dev, p, arg);
break;
case IOC_OPAL_STACK_RESET:
ret = opal_stack_reset(dev);
break;
default:
break;

View File

@@ -12,230 +12,115 @@
#include <linux/unaligned.h>
#include "blk.h"
struct blk_integrity_iter {
void *prot_buf;
void *data_buf;
sector_t seed;
unsigned int data_size;
unsigned short interval;
const char *disk_name;
#define APP_TAG_ESCAPE 0xffff
#define REF_TAG_ESCAPE 0xffffffff
/*
 * This union is used for on-stack allocations when the PI field is split
 * across segments.  blk_validate_integrity_limits() guarantees that
 * pi_tuple_size matches the sizeof() of one of these two types.
 */
union pi_tuple {
	struct crc64_pi_tuple crc64_pi;	/* 64-bit guard (NVMe ext. PI) */
	struct t10_pi_tuple t10_pi;	/* 16-bit guard (classic T10 PI) */
};
static __be16 t10_pi_csum(__be16 csum, void *data, unsigned int len,
unsigned char csum_type)
/* Iteration state for generating/verifying PI across bio and bip bvecs. */
struct blk_integrity_iter {
	struct bio *bio;			/* data bio being processed */
	struct bio_integrity_payload *bip;	/* its integrity payload */
	struct blk_integrity *bi;		/* queue integrity profile */
	struct bvec_iter data_iter;		/* position in data bvecs */
	struct bvec_iter prot_iter;		/* position in protection bvecs */
	unsigned int interval_remaining;	/* bytes left in current interval */
	u64 seed;				/* current reference tag value */
	u64 csum;				/* running guard checksum */
};
static void blk_calculate_guard(struct blk_integrity_iter *iter, void *data,
unsigned int len)
{
if (csum_type == BLK_INTEGRITY_CSUM_IP)
return (__force __be16)ip_compute_csum(data, len);
return cpu_to_be16(crc_t10dif_update(be16_to_cpu(csum), data, len));
switch (iter->bi->csum_type) {
case BLK_INTEGRITY_CSUM_CRC64:
iter->csum = crc64_nvme(iter->csum, data, len);
break;
case BLK_INTEGRITY_CSUM_CRC:
iter->csum = crc_t10dif_update(iter->csum, data, len);
break;
case BLK_INTEGRITY_CSUM_IP:
iter->csum = (__force u32)csum_partial(data, len,
(__force __wsum)iter->csum);
break;
default:
WARN_ON_ONCE(1);
iter->csum = U64_MAX;
break;
}
}
/*
 * Finalize the running guard checksum after all bytes of an interval have
 * been folded in.  Only the IP checksum needs a finishing step (folding the
 * 32-bit partial sum down to 16 bits); CRC variants are already final.
 */
static void blk_integrity_csum_finish(struct blk_integrity_iter *iter)
{
	switch (iter->bi->csum_type) {
	case BLK_INTEGRITY_CSUM_IP:
		iter->csum = (__force u16)csum_fold((__force __wsum)iter->csum);
		break;
	default:
		break;
	}
}
/*
* Type 1 and Type 2 protection use the same format: 16 bit guard tag,
* 16 bit app tag, 32 bit reference tag. Type 3 does not define the ref
* tag.
* Update the csum for formats that have metadata padding in front of the data
* integrity field
*/
static void t10_pi_generate(struct blk_integrity_iter *iter,
struct blk_integrity *bi)
static void blk_integrity_csum_offset(struct blk_integrity_iter *iter)
{
u8 offset = bi->pi_offset;
unsigned int i;
unsigned int offset = iter->bi->pi_offset;
struct bio_vec *bvec = iter->bip->bip_vec;
for (i = 0 ; i < iter->data_size ; i += iter->interval) {
struct t10_pi_tuple *pi = iter->prot_buf + offset;
while (offset > 0) {
struct bio_vec pbv = bvec_iter_bvec(bvec, iter->prot_iter);
unsigned int len = min(pbv.bv_len, offset);
void *prot_buf = bvec_kmap_local(&pbv);
pi->guard_tag = t10_pi_csum(0, iter->data_buf, iter->interval,
bi->csum_type);
if (offset)
pi->guard_tag = t10_pi_csum(pi->guard_tag,
iter->prot_buf, offset, bi->csum_type);
pi->app_tag = 0;
blk_calculate_guard(iter, prot_buf, len);
kunmap_local(prot_buf);
offset -= len;
bvec_iter_advance_single(bvec, &iter->prot_iter, len);
}
blk_integrity_csum_finish(iter);
}
if (bi->flags & BLK_INTEGRITY_REF_TAG)
pi->ref_tag = cpu_to_be32(lower_32_bits(iter->seed));
else
pi->ref_tag = 0;
static void blk_integrity_copy_from_tuple(struct bio_integrity_payload *bip,
struct bvec_iter *iter, void *tuple,
unsigned int tuple_size)
{
while (tuple_size) {
struct bio_vec pbv = bvec_iter_bvec(bip->bip_vec, *iter);
unsigned int len = min(tuple_size, pbv.bv_len);
void *prot_buf = bvec_kmap_local(&pbv);
iter->data_buf += iter->interval;
iter->prot_buf += bi->metadata_size;
iter->seed++;
memcpy(prot_buf, tuple, len);
kunmap_local(prot_buf);
bvec_iter_advance_single(bip->bip_vec, iter, len);
tuple_size -= len;
tuple += len;
}
}
static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter,
struct blk_integrity *bi)
static void blk_integrity_copy_to_tuple(struct bio_integrity_payload *bip,
struct bvec_iter *iter, void *tuple,
unsigned int tuple_size)
{
u8 offset = bi->pi_offset;
unsigned int i;
while (tuple_size) {
struct bio_vec pbv = bvec_iter_bvec(bip->bip_vec, *iter);
unsigned int len = min(tuple_size, pbv.bv_len);
void *prot_buf = bvec_kmap_local(&pbv);
for (i = 0 ; i < iter->data_size ; i += iter->interval) {
struct t10_pi_tuple *pi = iter->prot_buf + offset;
__be16 csum;
if (bi->flags & BLK_INTEGRITY_REF_TAG) {
if (pi->app_tag == T10_PI_APP_ESCAPE)
goto next;
if (be32_to_cpu(pi->ref_tag) !=
lower_32_bits(iter->seed)) {
pr_err("%s: ref tag error at location %llu " \
"(rcvd %u)\n", iter->disk_name,
(unsigned long long)
iter->seed, be32_to_cpu(pi->ref_tag));
return BLK_STS_PROTECTION;
}
} else {
if (pi->app_tag == T10_PI_APP_ESCAPE &&
pi->ref_tag == T10_PI_REF_ESCAPE)
goto next;
}
csum = t10_pi_csum(0, iter->data_buf, iter->interval,
bi->csum_type);
if (offset)
csum = t10_pi_csum(csum, iter->prot_buf, offset,
bi->csum_type);
if (pi->guard_tag != csum) {
pr_err("%s: guard tag error at sector %llu " \
"(rcvd %04x, want %04x)\n", iter->disk_name,
(unsigned long long)iter->seed,
be16_to_cpu(pi->guard_tag), be16_to_cpu(csum));
return BLK_STS_PROTECTION;
}
next:
iter->data_buf += iter->interval;
iter->prot_buf += bi->metadata_size;
iter->seed++;
}
return BLK_STS_OK;
}
/**
* t10_pi_type1_prepare - prepare PI prior submitting request to device
* @rq: request with PI that should be prepared
*
* For Type 1/Type 2, the virtual start sector is the one that was
* originally submitted by the block layer for the ref_tag usage. Due to
* partitioning, MD/DM cloning, etc. the actual physical start sector is
* likely to be different. Remap protection information to match the
* physical LBA.
*/
static void t10_pi_type1_prepare(struct request *rq)
{
	struct blk_integrity *bi = &rq->q->limits.integrity;
	const int tuple_sz = bi->metadata_size;
	u32 ref_tag = t10_pi_ref_tag(rq);	/* physical ref tag to write */
	u8 offset = bi->pi_offset;		/* PI offset within each tuple */
	struct bio *bio;

	__rq_for_each_bio(bio, rq) {
		struct bio_integrity_payload *bip = bio_integrity(bio);
		/* virtual (as-submitted) ref tag this bio's PI was generated with */
		u32 virt = bip_get_seed(bip) & 0xffffffff;
		struct bio_vec iv;
		struct bvec_iter iter;

		/* Already remapped? */
		if (bip->bip_flags & BIP_MAPPED_INTEGRITY)
			break;

		bip_for_each_vec(iv, bip, iter) {
			unsigned int j;
			void *p;

			p = bvec_kmap_local(&iv);
			/* Walk one PI tuple per interval within this bvec. */
			for (j = 0; j < iv.bv_len; j += tuple_sz) {
				struct t10_pi_tuple *pi = p + offset;

				/* Only remap tuples that still carry the virtual tag. */
				if (be32_to_cpu(pi->ref_tag) == virt)
					pi->ref_tag = cpu_to_be32(ref_tag);
				virt++;
				ref_tag++;
				p += tuple_sz;
			}
			kunmap_local(p);
		}

		/* Mark so a requeue/resubmit does not remap twice. */
		bip->bip_flags |= BIP_MAPPED_INTEGRITY;
	}
}
/**
* t10_pi_type1_complete - prepare PI prior returning request to the blk layer
* @rq: request with PI that should be prepared
* @nr_bytes: total bytes to prepare
*
* For Type 1/Type 2, the virtual start sector is the one that was
* originally submitted by the block layer for the ref_tag usage. Due to
* partitioning, MD/DM cloning, etc. the actual physical start sector is
* likely to be different. Since the physical start sector was submitted
* to the device, we should remap it back to virtual values expected by the
* block layer.
*/
static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
{
	struct blk_integrity *bi = &rq->q->limits.integrity;
	/* Only remap as many intervals as were actually completed. */
	unsigned intervals = nr_bytes >> bi->interval_exp;
	const int tuple_sz = bi->metadata_size;
	u32 ref_tag = t10_pi_ref_tag(rq);	/* physical ref tag to undo */
	u8 offset = bi->pi_offset;		/* PI offset within each tuple */
	struct bio *bio;

	__rq_for_each_bio(bio, rq) {
		struct bio_integrity_payload *bip = bio_integrity(bio);
		/* virtual (as-submitted) ref tag expected by the block layer */
		u32 virt = bip_get_seed(bip) & 0xffffffff;
		struct bio_vec iv;
		struct bvec_iter iter;

		bip_for_each_vec(iv, bip, iter) {
			unsigned int j;
			void *p;

			p = bvec_kmap_local(&iv);
			for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) {
				struct t10_pi_tuple *pi = p + offset;

				/* Only remap tuples that carry the physical tag. */
				if (be32_to_cpu(pi->ref_tag) == ref_tag)
					pi->ref_tag = cpu_to_be32(virt);
				virt++;
				ref_tag++;
				intervals--;
				p += tuple_sz;
			}
			kunmap_local(p);
		}
	}
}
/*
 * Extend the NVMe CRC64 @crc over @len bytes of @data and return the
 * result in big-endian wire format.
 */
static __be64 ext_pi_crc64(u64 crc, void *data, unsigned int len)
{
	u64 csum = crc64_nvme(crc, data, len);

	return cpu_to_be64(csum);
}
static void ext_pi_crc64_generate(struct blk_integrity_iter *iter,
struct blk_integrity *bi)
{
u8 offset = bi->pi_offset;
unsigned int i;
for (i = 0 ; i < iter->data_size ; i += iter->interval) {
struct crc64_pi_tuple *pi = iter->prot_buf + offset;
pi->guard_tag = ext_pi_crc64(0, iter->data_buf, iter->interval);
if (offset)
pi->guard_tag = ext_pi_crc64(be64_to_cpu(pi->guard_tag),
iter->prot_buf, offset);
pi->app_tag = 0;
if (bi->flags & BLK_INTEGRITY_REF_TAG)
put_unaligned_be48(iter->seed, pi->ref_tag);
else
put_unaligned_be48(0ULL, pi->ref_tag);
iter->data_buf += iter->interval;
iter->prot_buf += bi->metadata_size;
iter->seed++;
memcpy(tuple, prot_buf, len);
kunmap_local(prot_buf);
bvec_iter_advance_single(bip->bip_vec, iter, len);
tuple_size -= len;
tuple += len;
}
}
@@ -246,228 +131,437 @@ static bool ext_pi_ref_escape(const u8 ref_tag[6])
return memcmp(ref_tag, ref_escape, sizeof(ref_escape)) == 0;
}
static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter,
struct blk_integrity *bi)
static blk_status_t blk_verify_ext_pi(struct blk_integrity_iter *iter,
struct crc64_pi_tuple *pi)
{
u8 offset = bi->pi_offset;
unsigned int i;
u64 seed = lower_48_bits(iter->seed);
u64 guard = get_unaligned_be64(&pi->guard_tag);
u64 ref = get_unaligned_be48(pi->ref_tag);
u16 app = get_unaligned_be16(&pi->app_tag);
for (i = 0; i < iter->data_size; i += iter->interval) {
struct crc64_pi_tuple *pi = iter->prot_buf + offset;
u64 ref, seed;
__be64 csum;
if (bi->flags & BLK_INTEGRITY_REF_TAG) {
if (pi->app_tag == T10_PI_APP_ESCAPE)
goto next;
ref = get_unaligned_be48(pi->ref_tag);
seed = lower_48_bits(iter->seed);
if (ref != seed) {
pr_err("%s: ref tag error at location %llu (rcvd %llu)\n",
iter->disk_name, seed, ref);
return BLK_STS_PROTECTION;
}
} else {
if (pi->app_tag == T10_PI_APP_ESCAPE &&
ext_pi_ref_escape(pi->ref_tag))
goto next;
}
csum = ext_pi_crc64(0, iter->data_buf, iter->interval);
if (offset)
csum = ext_pi_crc64(be64_to_cpu(csum), iter->prot_buf,
offset);
if (pi->guard_tag != csum) {
pr_err("%s: guard tag error at sector %llu " \
"(rcvd %016llx, want %016llx)\n",
iter->disk_name, (unsigned long long)iter->seed,
be64_to_cpu(pi->guard_tag), be64_to_cpu(csum));
if (iter->bi->flags & BLK_INTEGRITY_REF_TAG) {
if (app == APP_TAG_ESCAPE)
return BLK_STS_OK;
if (ref != seed) {
pr_err("%s: ref tag error at location %llu (rcvd %llu)\n",
iter->bio->bi_bdev->bd_disk->disk_name, seed,
ref);
return BLK_STS_PROTECTION;
}
} else if (app == APP_TAG_ESCAPE && ext_pi_ref_escape(pi->ref_tag)) {
return BLK_STS_OK;
}
next:
iter->data_buf += iter->interval;
iter->prot_buf += bi->metadata_size;
iter->seed++;
if (guard != iter->csum) {
pr_err("%s: guard tag error at sector %llu (rcvd %016llx, want %016llx)\n",
iter->bio->bi_bdev->bd_disk->disk_name, iter->seed,
guard, iter->csum);
return BLK_STS_PROTECTION;
}
return BLK_STS_OK;
}
static void ext_pi_type1_prepare(struct request *rq)
static blk_status_t blk_verify_pi(struct blk_integrity_iter *iter,
struct t10_pi_tuple *pi, u16 guard)
{
struct blk_integrity *bi = &rq->q->limits.integrity;
const int tuple_sz = bi->metadata_size;
u64 ref_tag = ext_pi_ref_tag(rq);
u8 offset = bi->pi_offset;
struct bio *bio;
u32 seed = lower_32_bits(iter->seed);
u32 ref = get_unaligned_be32(&pi->ref_tag);
u16 app = get_unaligned_be16(&pi->app_tag);
__rq_for_each_bio(bio, rq) {
struct bio_integrity_payload *bip = bio_integrity(bio);
u64 virt = lower_48_bits(bip_get_seed(bip));
struct bio_vec iv;
struct bvec_iter iter;
/* Already remapped? */
if (bip->bip_flags & BIP_MAPPED_INTEGRITY)
break;
bip_for_each_vec(iv, bip, iter) {
unsigned int j;
void *p;
p = bvec_kmap_local(&iv);
for (j = 0; j < iv.bv_len; j += tuple_sz) {
struct crc64_pi_tuple *pi = p + offset;
u64 ref = get_unaligned_be48(pi->ref_tag);
if (ref == virt)
put_unaligned_be48(ref_tag, pi->ref_tag);
virt++;
ref_tag++;
p += tuple_sz;
}
kunmap_local(p);
if (iter->bi->flags & BLK_INTEGRITY_REF_TAG) {
if (app == APP_TAG_ESCAPE)
return BLK_STS_OK;
if (ref != seed) {
pr_err("%s: ref tag error at location %u (rcvd %u)\n",
iter->bio->bi_bdev->bd_disk->disk_name, seed,
ref);
return BLK_STS_PROTECTION;
}
} else if (app == APP_TAG_ESCAPE && ref == REF_TAG_ESCAPE) {
return BLK_STS_OK;
}
bip->bip_flags |= BIP_MAPPED_INTEGRITY;
if (guard != (u16)iter->csum) {
pr_err("%s: guard tag error at sector %llu (rcvd %04x, want %04x)\n",
iter->bio->bi_bdev->bd_disk->disk_name, iter->seed,
guard, (u16)iter->csum);
return BLK_STS_PROTECTION;
}
return BLK_STS_OK;
}
/* Verify a T10-DIF tuple whose guard is stored as a big-endian CRC16. */
static blk_status_t blk_verify_t10_pi(struct blk_integrity_iter *iter,
				      struct t10_pi_tuple *pi)
{
	return blk_verify_pi(iter, pi, get_unaligned_be16(&pi->guard_tag));
}
/*
 * Verify a tuple whose guard is an IP checksum stored in native byte
 * order (no endianness conversion on read).
 */
static blk_status_t blk_verify_ip_pi(struct blk_integrity_iter *iter,
				     struct t10_pi_tuple *pi)
{
	u16 rcvd = get_unaligned((u16 *)&pi->guard_tag);

	return blk_verify_pi(iter, pi, rcvd);
}
/*
 * Dispatch verification of one protection tuple according to the
 * checksum type of the integrity profile. Profiles without a supported
 * checksum verify as OK.
 */
static blk_status_t blk_integrity_verify(struct blk_integrity_iter *iter,
					 union pi_tuple *tuple)
{
	if (iter->bi->csum_type == BLK_INTEGRITY_CSUM_CRC64)
		return blk_verify_ext_pi(iter, &tuple->crc64_pi);
	if (iter->bi->csum_type == BLK_INTEGRITY_CSUM_CRC)
		return blk_verify_t10_pi(iter, &tuple->t10_pi);
	if (iter->bi->csum_type == BLK_INTEGRITY_CSUM_IP)
		return blk_verify_ip_pi(iter, &tuple->t10_pi);
	return BLK_STS_OK;
}
static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
static void blk_set_ext_pi(struct blk_integrity_iter *iter,
struct crc64_pi_tuple *pi)
{
struct blk_integrity *bi = &rq->q->limits.integrity;
unsigned intervals = nr_bytes >> bi->interval_exp;
const int tuple_sz = bi->metadata_size;
u64 ref_tag = ext_pi_ref_tag(rq);
u8 offset = bi->pi_offset;
struct bio *bio;
put_unaligned_be64(iter->csum, &pi->guard_tag);
put_unaligned_be16(0, &pi->app_tag);
put_unaligned_be48(iter->seed, &pi->ref_tag);
}
__rq_for_each_bio(bio, rq) {
struct bio_integrity_payload *bip = bio_integrity(bio);
u64 virt = lower_48_bits(bip_get_seed(bip));
struct bio_vec iv;
struct bvec_iter iter;
/*
 * Fill one T10 PI tuple: the pre-computed guard @csum, a zero app tag,
 * and the low 32 bits of the current seed as the ref tag.
 */
static void blk_set_pi(struct blk_integrity_iter *iter,
		struct t10_pi_tuple *pi, __be16 csum)
{
	put_unaligned(csum, &pi->guard_tag);
	put_unaligned_be16(0, &pi->app_tag);
	put_unaligned_be32(iter->seed, &pi->ref_tag);
}
bip_for_each_vec(iv, bip, iter) {
unsigned int j;
void *p;
static void blk_set_t10_pi(struct blk_integrity_iter *iter,
struct t10_pi_tuple *pi)
{
blk_set_pi(iter, pi, cpu_to_be16((u16)iter->csum));
}
p = bvec_kmap_local(&iv);
for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) {
struct crc64_pi_tuple *pi = p + offset;
u64 ref = get_unaligned_be48(pi->ref_tag);
/*
 * Generate a tuple for the IP-checksum format. The guard is kept in
 * native byte order, hence the __force cast instead of a byte swap.
 */
static void blk_set_ip_pi(struct blk_integrity_iter *iter,
		struct t10_pi_tuple *pi)
{
	blk_set_pi(iter, pi, (__force __be16)(u16)iter->csum);
}
if (ref == ref_tag)
put_unaligned_be48(virt, pi->ref_tag);
virt++;
ref_tag++;
intervals--;
p += tuple_sz;
}
kunmap_local(p);
}
/*
 * Dispatch generation of one protection tuple according to the checksum
 * type of the integrity profile.
 */
static void blk_integrity_set(struct blk_integrity_iter *iter,
		union pi_tuple *tuple)
{
	switch (iter->bi->csum_type) {
	case BLK_INTEGRITY_CSUM_CRC64:
		return blk_set_ext_pi(iter, &tuple->crc64_pi);
	case BLK_INTEGRITY_CSUM_CRC:
		return blk_set_t10_pi(iter, &tuple->t10_pi);
	case BLK_INTEGRITY_CSUM_IP:
		return blk_set_ip_pi(iter, &tuple->t10_pi);
	default:
		/* Generation is only called for checksummed profiles. */
		WARN_ON_ONCE(1);
		return;
	}
}
/*
 * Finish one protection interval: locate its PI tuple in the bip vector
 * and either verify it (@verify true) or generate it in place.
 *
 * Fast path: when the tuple is contiguous within one bio_vec it is
 * kmapped and operated on directly. Otherwise a stack copy (@tuple) is
 * used: for verify the bytes are copied in first; for generate they are
 * written to the stack tuple and copied back out afterwards.
 */
static blk_status_t blk_integrity_interval(struct blk_integrity_iter *iter,
		bool verify)
{
	blk_status_t ret = BLK_STS_OK;
	union pi_tuple tuple;
	void *ptuple = &tuple;
	struct bio_vec pbv;

	/* Fold any pi_offset padding bytes into the running checksum/iter. */
	blk_integrity_csum_offset(iter);
	pbv = bvec_iter_bvec(iter->bip->bip_vec, iter->prot_iter);
	if (pbv.bv_len >= iter->bi->pi_tuple_size) {
		/* Tuple is contiguous: map it and step past this element. */
		ptuple = bvec_kmap_local(&pbv);
		bvec_iter_advance_single(iter->bip->bip_vec, &iter->prot_iter,
			iter->bi->metadata_size - iter->bi->pi_offset);
	} else if (verify) {
		/* Tuple straddles vectors: assemble a stack copy to check. */
		blk_integrity_copy_to_tuple(iter->bip, &iter->prot_iter,
				ptuple, iter->bi->pi_tuple_size);
	}

	if (verify)
		ret = blk_integrity_verify(iter, ptuple);
	else
		blk_integrity_set(iter, ptuple);

	if (likely(ptuple != &tuple)) {
		kunmap_local(ptuple);
	} else if (!verify) {
		/* Generated into the stack copy: write it back out. */
		blk_integrity_copy_from_tuple(iter->bip, &iter->prot_iter,
				ptuple, iter->bi->pi_tuple_size);
	}

	/* Reset per-interval state for the next interval. */
	iter->interval_remaining = 1 << iter->bi->interval_exp;
	iter->csum = 0;
	iter->seed++;
	return ret;
}
/*
 * Walk the bio's data described by @data_iter, accumulating the guard
 * checksum across arbitrarily aligned data segments, and handle each
 * completed protection interval via blk_integrity_interval().
 *
 * @verify selects verification (read completion) vs. generation
 * (submission). Returns BLK_STS_OK or the first verification error.
 */
static blk_status_t blk_integrity_iterate(struct bio *bio,
					  struct bvec_iter *data_iter,
					  bool verify)
{
	struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
	struct bio_integrity_payload *bip = bio_integrity(bio);
	struct blk_integrity_iter iter = {
		.bio = bio,
		.bip = bip,
		.bi = bi,
		.data_iter = *data_iter,
		.prot_iter = bip->bip_iter,
		.interval_remaining = 1 << bi->interval_exp,
		.seed = data_iter->bi_sector,
		.csum = 0,
	};
	blk_status_t ret = BLK_STS_OK;

	while (iter.data_iter.bi_size && ret == BLK_STS_OK) {
		struct bio_vec bv = bvec_iter_bvec(iter.bio->bi_io_vec,
						   iter.data_iter);
		void *kaddr = bvec_kmap_local(&bv);
		void *data = kaddr;
		unsigned int len;

		bvec_iter_advance_single(iter.bio->bi_io_vec, &iter.data_iter,
					 bv.bv_len);
		/*
		 * A bio_vec may span several intervals, or an interval may
		 * span several bio_vecs; process the overlap piecewise.
		 */
		while (bv.bv_len && ret == BLK_STS_OK) {
			len = min(iter.interval_remaining, bv.bv_len);
			blk_calculate_guard(&iter, data, len);
			bv.bv_len -= len;
			data += len;
			iter.interval_remaining -= len;
			/* Interval complete: verify or emit its tuple. */
			if (!iter.interval_remaining)
				ret = blk_integrity_interval(&iter, verify);
		}
		kunmap_local(kaddr);
	}
	return ret;
}
void bio_integrity_generate(struct bio *bio)
{
struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
struct bio_integrity_payload *bip = bio_integrity(bio);
struct blk_integrity_iter iter;
struct bvec_iter bviter;
struct bio_vec bv;
iter.disk_name = bio->bi_bdev->bd_disk->disk_name;
iter.interval = 1 << bi->interval_exp;
iter.seed = bio->bi_iter.bi_sector;
iter.prot_buf = bvec_virt(bip->bip_vec);
bio_for_each_segment(bv, bio, bviter) {
void *kaddr = bvec_kmap_local(&bv);
iter.data_buf = kaddr;
iter.data_size = bv.bv_len;
switch (bi->csum_type) {
case BLK_INTEGRITY_CSUM_CRC64:
ext_pi_crc64_generate(&iter, bi);
break;
case BLK_INTEGRITY_CSUM_CRC:
case BLK_INTEGRITY_CSUM_IP:
t10_pi_generate(&iter, bi);
break;
default:
break;
}
kunmap_local(kaddr);
switch (bi->csum_type) {
case BLK_INTEGRITY_CSUM_CRC64:
case BLK_INTEGRITY_CSUM_CRC:
case BLK_INTEGRITY_CSUM_IP:
blk_integrity_iterate(bio, &bio->bi_iter, false);
break;
default:
break;
}
}
blk_status_t bio_integrity_verify(struct bio *bio, struct bvec_iter *saved_iter)
{
struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
struct bio_integrity_payload *bip = bio_integrity(bio);
struct blk_integrity_iter iter;
struct bvec_iter bviter;
struct bio_vec bv;
/*
* At the moment verify is called bi_iter has been advanced during split
* and completion, so use the copy created during submission here.
*/
iter.disk_name = bio->bi_bdev->bd_disk->disk_name;
iter.interval = 1 << bi->interval_exp;
iter.seed = saved_iter->bi_sector;
iter.prot_buf = bvec_virt(bip->bip_vec);
__bio_for_each_segment(bv, bio, bviter, *saved_iter) {
void *kaddr = bvec_kmap_local(&bv);
blk_status_t ret = BLK_STS_OK;
iter.data_buf = kaddr;
iter.data_size = bv.bv_len;
switch (bi->csum_type) {
case BLK_INTEGRITY_CSUM_CRC64:
ret = ext_pi_crc64_verify(&iter, bi);
break;
case BLK_INTEGRITY_CSUM_CRC:
case BLK_INTEGRITY_CSUM_IP:
ret = t10_pi_verify(&iter, bi);
break;
default:
break;
}
kunmap_local(kaddr);
if (ret)
return ret;
switch (bi->csum_type) {
case BLK_INTEGRITY_CSUM_CRC64:
case BLK_INTEGRITY_CSUM_CRC:
case BLK_INTEGRITY_CSUM_IP:
return blk_integrity_iterate(bio, saved_iter, true);
default:
break;
}
return BLK_STS_OK;
}
void blk_integrity_prepare(struct request *rq)
/*
* Advance @iter past the protection offset for protection formats that
* contain front padding on the metadata region.
*/
/*
 * Advance @iter past the protection information offset for protection
 * formats that carry front padding in the metadata region.
 */
static void blk_pi_advance_offset(struct blk_integrity *bi,
				  struct bio_integrity_payload *bip,
				  struct bvec_iter *iter)
{
	unsigned int remaining = bi->pi_offset;

	/* The padding may straddle bvec boundaries; consume it piecewise. */
	while (remaining) {
		struct bio_vec bv = mp_bvec_iter_bvec(bip->bip_vec, *iter);
		unsigned int step = min(bv.bv_len, remaining);

		bvec_iter_advance_single(bip->bip_vec, iter, step);
		remaining -= step;
	}
}
/*
 * Return a kernel address for the PI tuple at @iter's current position.
 *
 * If the tuple is contiguous within one bio_vec, it is kmapped directly
 * (caller unmaps via blk_tuple_remap_end()). Otherwise the tuple bytes
 * are gathered into the caller-provided stack copy @tuple and its
 * address is returned; @iter itself is left pointing at the tuple start
 * so blk_tuple_remap_end() can scatter any modification back.
 */
static void *blk_tuple_remap_begin(union pi_tuple *tuple,
				struct blk_integrity *bi,
				struct bio_integrity_payload *bip,
				struct bvec_iter *iter)
{
	struct bvec_iter titer;
	struct bio_vec pbv;

	blk_pi_advance_offset(bi, bip, iter);
	pbv = bvec_iter_bvec(bip->bip_vec, *iter);
	if (likely(pbv.bv_len >= bi->pi_tuple_size))
		return bvec_kmap_local(&pbv);

	/*
	 * We need to preserve the state of the original iter for the
	 * copy_from_tuple at the end, so make a temp iter for here.
	 */
	titer = *iter;
	blk_integrity_copy_to_tuple(bip, &titer, tuple, bi->pi_tuple_size);
	return tuple;
}
/*
 * Counterpart to blk_tuple_remap_begin(): unmap a directly mapped tuple,
 * or scatter a stack-copied tuple back into the bip vector, then advance
 * @iter to the end of this metadata element.
 */
static void blk_tuple_remap_end(union pi_tuple *tuple, void *ptuple,
				struct blk_integrity *bi,
				struct bio_integrity_payload *bip,
				struct bvec_iter *iter)
{
	/* Bytes of this metadata element not yet consumed by @iter. */
	unsigned int len = bi->metadata_size - bi->pi_offset;

	if (likely(ptuple != tuple)) {
		kunmap_local(ptuple);
	} else {
		/* copy_from_tuple advances @iter by pi_tuple_size itself. */
		blk_integrity_copy_from_tuple(bip, iter, ptuple,
					      bi->pi_tuple_size);
		len -= bi->pi_tuple_size;
	}
	bvec_iter_advance(bip->bip_vec, iter, len);
}
/*
 * Completion-side remap for the 64-bit ext PI format: if the tuple still
 * carries the physical @ref_tag, restore the virtual value @virt.
 *
 * Fix: pass the u8[6] ref_tag array itself to get_unaligned_be48()
 * rather than its address (&pi->ref_tag has type u8 (*)[6], not the
 * expected const u8 *), matching the put_unaligned_be48() call below and
 * every other get_unaligned_be48() user in this file.
 */
static void blk_set_ext_unmap_ref(struct crc64_pi_tuple *pi, u64 virt,
				  u64 ref_tag)
{
	u64 ref = get_unaligned_be48(pi->ref_tag);

	/* Ref tags are 48 bits on the wire, so compare the low 48 bits. */
	if (ref == lower_48_bits(ref_tag) && ref != lower_48_bits(virt))
		put_unaligned_be48(virt, pi->ref_tag);
}
/*
 * Completion-side remap for the 32-bit T10 PI format: if the tuple still
 * carries the physical @ref_tag, restore the virtual value @virt.
 */
static void blk_set_t10_unmap_ref(struct t10_pi_tuple *pi, u32 virt,
				  u32 ref_tag)
{
	u32 ref = get_unaligned_be32(&pi->ref_tag);

	if (ref != ref_tag || ref == virt)
		return;
	put_unaligned_be32(virt, &pi->ref_tag);
}
/*
 * On request completion, remap one tuple's ref tag from the physical
 * value @ref back to the virtual value @virt, dispatched on the
 * profile's checksum type.
 */
static void blk_reftag_remap_complete(struct blk_integrity *bi,
		union pi_tuple *tuple, u64 virt, u64 ref)
{
	switch (bi->csum_type) {
	case BLK_INTEGRITY_CSUM_CRC64:
		blk_set_ext_unmap_ref(&tuple->crc64_pi, virt, ref);
		break;
	case BLK_INTEGRITY_CSUM_CRC:
	case BLK_INTEGRITY_CSUM_IP:
		blk_set_t10_unmap_ref(&tuple->t10_pi, virt, ref);
		break;
	default:
		/* Only reached for profiles with BLK_INTEGRITY_REF_TAG. */
		WARN_ON_ONCE(1);
		break;
	}
}
/*
 * Submission-side remap for the 64-bit ext PI format: if the tuple still
 * carries the virtual value @virt, install the physical @ref_tag.
 *
 * Fix: pass the u8[6] ref_tag array itself to get_unaligned_be48()
 * rather than its address (&pi->ref_tag has type u8 (*)[6], not the
 * expected const u8 *), matching the put_unaligned_be48() call below and
 * every other get_unaligned_be48() user in this file.
 */
static void blk_set_ext_map_ref(struct crc64_pi_tuple *pi, u64 virt,
				u64 ref_tag)
{
	u64 ref = get_unaligned_be48(pi->ref_tag);

	/* Ref tags are 48 bits on the wire, so compare the low 48 bits. */
	if (ref == lower_48_bits(virt) && ref != ref_tag)
		put_unaligned_be48(ref_tag, pi->ref_tag);
}
/*
 * Submission-side remap for the 32-bit T10 PI format: if the tuple still
 * carries the virtual value @virt, install the physical @ref_tag.
 */
static void blk_set_t10_map_ref(struct t10_pi_tuple *pi, u32 virt, u32 ref_tag)
{
	u32 ref = get_unaligned_be32(&pi->ref_tag);

	if (ref != virt || ref == ref_tag)
		return;
	put_unaligned_be32(ref_tag, &pi->ref_tag);
}
/*
 * On request submission, remap one tuple's ref tag from the virtual
 * value @virt to the physical value @ref, dispatched on the profile's
 * checksum type.
 */
static void blk_reftag_remap_prepare(struct blk_integrity *bi,
				     union pi_tuple *tuple,
				     u64 virt, u64 ref)
{
	switch (bi->csum_type) {
	case BLK_INTEGRITY_CSUM_CRC64:
		blk_set_ext_map_ref(&tuple->crc64_pi, virt, ref);
		break;
	case BLK_INTEGRITY_CSUM_CRC:
	case BLK_INTEGRITY_CSUM_IP:
		blk_set_t10_map_ref(&tuple->t10_pi, virt, ref);
		break;
	default:
		/* Only reached for profiles with BLK_INTEGRITY_REF_TAG. */
		WARN_ON_ONCE(1);
		break;
	}
}
/*
 * Remap the ref tags of every tuple of @bio, walking at most *intervals
 * tuples and advancing *ref (the physical ref tag) as it goes.
 *
 * @prep selects virtual->physical (submission) vs. physical->virtual
 * (completion) direction. On the prepare side, a bio that was already
 * remapped (BIP_MAPPED_INTEGRITY, e.g. after a requeue) is skipped, but
 * *ref is still advanced past its intervals to keep callers in sync.
 */
static void __blk_reftag_remap(struct bio *bio, struct blk_integrity *bi,
			       unsigned *intervals, u64 *ref, bool prep)
{
	struct bio_integrity_payload *bip = bio_integrity(bio);
	struct bvec_iter iter = bip->bip_iter;
	/* Virtual start value the block layer stored at submission time. */
	u64 virt = bip_get_seed(bip);
	union pi_tuple *ptuple;
	union pi_tuple tuple;

	if (prep && bip->bip_flags & BIP_MAPPED_INTEGRITY) {
		*ref += bio->bi_iter.bi_size >> bi->interval_exp;
		return;
	}

	while (iter.bi_size && *intervals) {
		ptuple = blk_tuple_remap_begin(&tuple, bi, bip, &iter);
		if (prep)
			blk_reftag_remap_prepare(bi, ptuple, virt, *ref);
		else
			blk_reftag_remap_complete(bi, ptuple, virt, *ref);
		blk_tuple_remap_end(&tuple, ptuple, bi, bip, &iter);
		(*intervals)--;
		(*ref)++;
		virt++;
	}

	if (prep)
		bip->bip_flags |= BIP_MAPPED_INTEGRITY;
}
static void blk_integrity_remap(struct request *rq, unsigned int nr_bytes,
bool prep)
{
struct blk_integrity *bi = &rq->q->limits.integrity;
u64 ref = blk_rq_pos(rq) >> (bi->interval_exp - SECTOR_SHIFT);
unsigned intervals = nr_bytes >> bi->interval_exp;
struct bio *bio;
if (!(bi->flags & BLK_INTEGRITY_REF_TAG))
return;
if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC64)
ext_pi_type1_prepare(rq);
else
t10_pi_type1_prepare(rq);
__rq_for_each_bio(bio, rq) {
__blk_reftag_remap(bio, bi, &intervals, &ref, prep);
if (!intervals)
break;
}
}
/*
 * Remap virtual ref tags to physical values for the whole request before
 * it is handed to the low-level driver.
 */
void blk_integrity_prepare(struct request *rq)
{
	blk_integrity_remap(rq, blk_rq_bytes(rq), true);
}
void blk_integrity_complete(struct request *rq, unsigned int nr_bytes)
{
struct blk_integrity *bi = &rq->q->limits.integrity;
if (!(bi->flags & BLK_INTEGRITY_REF_TAG))
return;
if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC64)
ext_pi_type1_complete(rq, nr_bytes);
else
t10_pi_type1_complete(rq, nr_bytes);
blk_integrity_remap(rq, nr_bytes, false);
}

View File

@@ -141,12 +141,6 @@ config CRYPTO_ACOMP
select CRYPTO_ALGAPI
select CRYPTO_ACOMP2
config CRYPTO_HKDF
tristate
select CRYPTO_SHA256 if CRYPTO_SELFTESTS
select CRYPTO_SHA512 if CRYPTO_SELFTESTS
select CRYPTO_HASH2
config CRYPTO_MANAGER
tristate
default CRYPTO_ALGAPI if CRYPTO_SELFTESTS

View File

@@ -36,7 +36,6 @@ obj-$(CONFIG_CRYPTO_HASH2) += crypto_hash.o
obj-$(CONFIG_CRYPTO_AKCIPHER2) += akcipher.o
obj-$(CONFIG_CRYPTO_SIG2) += sig.o
obj-$(CONFIG_CRYPTO_KPP2) += kpp.o
obj-$(CONFIG_CRYPTO_HKDF) += hkdf.o
dh_generic-y := dh.o
dh_generic-y += dh_helper.o

View File

@@ -1,573 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation
* Function"), aka RFC 5869. See also the original paper (Krawczyk 2010):
* "Cryptographic Extraction and Key Derivation: The HKDF Scheme".
*
* Copyright 2019 Google LLC
*/
#include <crypto/internal/hash.h>
#include <crypto/sha2.h>
#include <crypto/hkdf.h>
#include <linux/module.h>
/*
* HKDF consists of two steps:
*
* 1. HKDF-Extract: extract a pseudorandom key from the input keying material
* and optional salt.
* 2. HKDF-Expand: expand the pseudorandom key into output keying material of
* any length, parameterized by an application-specific info string.
*
*/
/**
* hkdf_extract - HKDF-Extract (RFC 5869 section 2.2)
* @hmac_tfm: an HMAC transform using the hash function desired for HKDF. The
* caller is responsible for setting the @prk afterwards.
* @ikm: input keying material
* @ikmlen: length of @ikm
* @salt: input salt value
* @saltlen: length of @salt
* @prk: resulting pseudorandom key
*
* Extracts a pseudorandom key @prk from the input keying material
* @ikm with length @ikmlen and salt @salt with length @saltlen.
* The length of @prk is given by the digest size of @hmac_tfm.
* For an 'unsalted' version of HKDF-Extract @salt must be set
* to all zeroes and @saltlen must be set to the length of @prk.
*
* Returns 0 on success with the pseudorandom key stored in @prk,
* or a negative errno value otherwise.
*/
int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm,
		 unsigned int ikmlen, const u8 *salt, unsigned int saltlen,
		 u8 *prk)
{
	int err;

	/* HKDF-Extract is HMAC(salt, IKM): key with the salt, digest IKM. */
	err = crypto_shash_setkey(hmac_tfm, salt, saltlen);
	if (err)
		return err;

	return crypto_shash_tfm_digest(hmac_tfm, ikm, ikmlen, prk);
}
EXPORT_SYMBOL_GPL(hkdf_extract);
/**
* hkdf_expand - HKDF-Expand (RFC 5869 section 2.3)
* @hmac_tfm: hash context keyed with pseudorandom key
* @info: application-specific information
* @infolen: length of @info
* @okm: output keying material
* @okmlen: length of @okm
*
* This expands the pseudorandom key, which was already keyed into @hmac_tfm,
* into @okmlen bytes of output keying material parameterized by the
* application-specific @info of length @infolen bytes.
* This is thread-safe and may be called by multiple threads in parallel.
*
* Returns 0 on success with output keying material stored in @okm,
* or a negative errno value otherwise.
*/
int hkdf_expand(struct crypto_shash *hmac_tfm,
		const u8 *info, unsigned int infolen,
		u8 *okm, unsigned int okmlen)
{
	SHASH_DESC_ON_STACK(desc, hmac_tfm);
	unsigned int i, hashlen = crypto_shash_digestsize(hmac_tfm);
	int err;
	/* T(i-1) from the previous round; NULL on the first round. */
	const u8 *prev = NULL;
	u8 counter = 1;
	/* Scratch for the final, possibly partial, block. */
	u8 tmp[HASH_MAX_DIGESTSIZE] = {};

	/* RFC 5869: output length is capped at 255 * HashLen. */
	if (WARN_ON(okmlen > 255 * hashlen))
		return -EINVAL;

	desc->tfm = hmac_tfm;

	/* Each round emits T(i) = HMAC-Hash(PRK, T(i-1) | info | counter). */
	for (i = 0; i < okmlen; i += hashlen) {
		err = crypto_shash_init(desc);
		if (err)
			goto out;

		if (prev) {
			err = crypto_shash_update(desc, prev, hashlen);
			if (err)
				goto out;
		}

		if (infolen) {
			err = crypto_shash_update(desc, info, infolen);
			if (err)
				goto out;
		}

		BUILD_BUG_ON(sizeof(counter) != 1);
		if (okmlen - i < hashlen) {
			/* Last round: digest to scratch, copy the tail. */
			err = crypto_shash_finup(desc, &counter, 1, tmp);
			if (err)
				goto out;
			memcpy(&okm[i], tmp, okmlen - i);
			memzero_explicit(tmp, sizeof(tmp));
		} else {
			err = crypto_shash_finup(desc, &counter, 1, &okm[i]);
			if (err)
				goto out;
		}
		counter++;
		prev = &okm[i];
	}
	err = 0;
out:
	if (unlikely(err))
		memzero_explicit(okm, okmlen); /* so caller doesn't need to */
	/* Zeroize key-derived state before returning. */
	shash_desc_zero(desc);
	memzero_explicit(tmp, HASH_MAX_DIGESTSIZE);
	return err;
}
EXPORT_SYMBOL_GPL(hkdf_expand);
struct hkdf_testvec {
const char *test;
const u8 *ikm;
const u8 *salt;
const u8 *info;
const u8 *prk;
const u8 *okm;
u16 ikm_size;
u16 salt_size;
u16 info_size;
u16 prk_size;
u16 okm_size;
};
/*
* HKDF test vectors from RFC5869
*
* Additional HKDF test vectors from
* https://github.com/brycx/Test-Vector-Generation/blob/master/HKDF/hkdf-hmac-sha2-test-vectors.md
*/
static const struct hkdf_testvec hkdf_sha256_tv[] = {
{
.test = "basic hdkf test",
.ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
"\x0b\x0b\x0b\x0b\x0b\x0b",
.ikm_size = 22,
.salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c",
.salt_size = 13,
.info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
.info_size = 10,
.prk = "\x07\x77\x09\x36\x2c\x2e\x32\xdf\x0d\xdc\x3f\x0d\xc4\x7b\xba\x63"
"\x90\xb6\xc7\x3b\xb5\x0f\x9c\x31\x22\xec\x84\x4a\xd7\xc2\xb3\xe5",
.prk_size = 32,
.okm = "\x3c\xb2\x5f\x25\xfa\xac\xd5\x7a\x90\x43\x4f\x64\xd0\x36\x2f\x2a"
"\x2d\x2d\x0a\x90\xcf\x1a\x5a\x4c\x5d\xb0\x2d\x56\xec\xc4\xc5\xbf"
"\x34\x00\x72\x08\xd5\xb8\x87\x18\x58\x65",
.okm_size = 42,
}, {
.test = "hkdf test with long input",
.ikm = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
"\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f"
"\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f"
"\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f",
.ikm_size = 80,
.salt = "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f"
"\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f"
"\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
"\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
"\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf",
.salt_size = 80,
.info = "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"
"\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
"\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"
"\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
"\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff",
.info_size = 80,
.prk = "\x06\xa6\xb8\x8c\x58\x53\x36\x1a\x06\x10\x4c\x9c\xeb\x35\xb4\x5c"
"\xef\x76\x00\x14\x90\x46\x71\x01\x4a\x19\x3f\x40\xc1\x5f\xc2\x44",
.prk_size = 32,
.okm = "\xb1\x1e\x39\x8d\xc8\x03\x27\xa1\xc8\xe7\xf7\x8c\x59\x6a\x49\x34"
"\x4f\x01\x2e\xda\x2d\x4e\xfa\xd8\xa0\x50\xcc\x4c\x19\xaf\xa9\x7c"
"\x59\x04\x5a\x99\xca\xc7\x82\x72\x71\xcb\x41\xc6\x5e\x59\x0e\x09"
"\xda\x32\x75\x60\x0c\x2f\x09\xb8\x36\x77\x93\xa9\xac\xa3\xdb\x71"
"\xcc\x30\xc5\x81\x79\xec\x3e\x87\xc1\x4c\x01\xd5\xc1\xf3\x43\x4f"
"\x1d\x87",
.okm_size = 82,
}, {
.test = "hkdf test with zero salt and info",
.ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
"\x0b\x0b\x0b\x0b\x0b\x0b",
.ikm_size = 22,
.salt = NULL,
.salt_size = 0,
.info = NULL,
.info_size = 0,
.prk = "\x19\xef\x24\xa3\x2c\x71\x7b\x16\x7f\x33\xa9\x1d\x6f\x64\x8b\xdf"
"\x96\x59\x67\x76\xaf\xdb\x63\x77\xac\x43\x4c\x1c\x29\x3c\xcb\x04",
.prk_size = 32,
.okm = "\x8d\xa4\xe7\x75\xa5\x63\xc1\x8f\x71\x5f\x80\x2a\x06\x3c\x5a\x31"
"\xb8\xa1\x1f\x5c\x5e\xe1\x87\x9e\xc3\x45\x4e\x5f\x3c\x73\x8d\x2d"
"\x9d\x20\x13\x95\xfa\xa4\xb6\x1a\x96\xc8",
.okm_size = 42,
}, {
.test = "hkdf test with short input",
.ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b",
.ikm_size = 11,
.salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c",
.salt_size = 13,
.info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
.info_size = 10,
.prk = "\x82\x65\xf6\x9d\x7f\xf7\xe5\x01\x37\x93\x01\x5c\xa0\xef\x92\x0c"
"\xb1\x68\x21\x99\xc8\xbc\x3a\x00\xda\x0c\xab\x47\xb7\xb0\x0f\xdf",
.prk_size = 32,
.okm = "\x58\xdc\xe1\x0d\x58\x01\xcd\xfd\xa8\x31\x72\x6b\xfe\xbc\xb7\x43"
"\xd1\x4a\x7e\xe8\x3a\xa0\x57\xa9\x3d\x59\xb0\xa1\x31\x7f\xf0\x9d"
"\x10\x5c\xce\xcf\x53\x56\x92\xb1\x4d\xd5",
.okm_size = 42,
}, {
.test = "unsalted hkdf test with zero info",
.ikm = "\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c"
"\x0c\x0c\x0c\x0c\x0c\x0c",
.ikm_size = 22,
.salt = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
.salt_size = 32,
.info = NULL,
.info_size = 0,
.prk = "\xaa\x84\x1e\x1f\x35\x74\xf3\x2d\x13\xfb\xa8\x00\x5f\xcd\x9b\x8d"
"\x77\x67\x82\xa5\xdf\xa1\x92\x38\x92\xfd\x8b\x63\x5d\x3a\x89\xdf",
.prk_size = 32,
.okm = "\x59\x68\x99\x17\x9a\xb1\xbc\x00\xa7\xc0\x37\x86\xff\x43\xee\x53"
"\x50\x04\xbe\x2b\xb9\xbe\x68\xbc\x14\x06\x63\x6f\x54\xbd\x33\x8a"
"\x66\xa2\x37\xba\x2a\xcb\xce\xe3\xc9\xa7",
.okm_size = 42,
}
};
static const struct hkdf_testvec hkdf_sha384_tv[] = {
{
.test = "basic hkdf test",
.ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
"\x0b\x0b\x0b\x0b\x0b\x0b",
.ikm_size = 22,
.salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c",
.salt_size = 13,
.info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
.info_size = 10,
.prk = "\x70\x4b\x39\x99\x07\x79\xce\x1d\xc5\x48\x05\x2c\x7d\xc3\x9f\x30"
"\x35\x70\xdd\x13\xfb\x39\xf7\xac\xc5\x64\x68\x0b\xef\x80\xe8\xde"
"\xc7\x0e\xe9\xa7\xe1\xf3\xe2\x93\xef\x68\xec\xeb\x07\x2a\x5a\xde",
.prk_size = 48,
.okm = "\x9b\x50\x97\xa8\x60\x38\xb8\x05\x30\x90\x76\xa4\x4b\x3a\x9f\x38"
"\x06\x3e\x25\xb5\x16\xdc\xbf\x36\x9f\x39\x4c\xfa\xb4\x36\x85\xf7"
"\x48\xb6\x45\x77\x63\xe4\xf0\x20\x4f\xc5",
.okm_size = 42,
}, {
.test = "hkdf test with long input",
.ikm = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
"\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f"
"\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f"
"\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f",
.ikm_size = 80,
.salt = "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f"
"\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f"
"\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
"\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
"\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf",
.salt_size = 80,
.info = "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"
"\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
"\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"
"\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
"\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff",
.info_size = 80,
.prk = "\xb3\x19\xf6\x83\x1d\xff\x93\x14\xef\xb6\x43\xba\xa2\x92\x63\xb3"
"\x0e\x4a\x8d\x77\x9f\xe3\x1e\x9c\x90\x1e\xfd\x7d\xe7\x37\xc8\x5b"
"\x62\xe6\x76\xd4\xdc\x87\xb0\x89\x5c\x6a\x7d\xc9\x7b\x52\xce\xbb",
.prk_size = 48,
.okm = "\x48\x4c\xa0\x52\xb8\xcc\x72\x4f\xd1\xc4\xec\x64\xd5\x7b\x4e\x81"
"\x8c\x7e\x25\xa8\xe0\xf4\x56\x9e\xd7\x2a\x6a\x05\xfe\x06\x49\xee"
"\xbf\x69\xf8\xd5\xc8\x32\x85\x6b\xf4\xe4\xfb\xc1\x79\x67\xd5\x49"
"\x75\x32\x4a\x94\x98\x7f\x7f\x41\x83\x58\x17\xd8\x99\x4f\xdb\xd6"
"\xf4\xc0\x9c\x55\x00\xdc\xa2\x4a\x56\x22\x2f\xea\x53\xd8\x96\x7a"
"\x8b\x2e",
.okm_size = 82,
}, {
.test = "hkdf test with zero salt and info",
.ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
"\x0b\x0b\x0b\x0b\x0b\x0b",
.ikm_size = 22,
.salt = NULL,
.salt_size = 0,
.info = NULL,
.info_size = 0,
.prk = "\x10\xe4\x0c\xf0\x72\xa4\xc5\x62\x6e\x43\xdd\x22\xc1\xcf\x72\x7d"
"\x4b\xb1\x40\x97\x5c\x9a\xd0\xcb\xc8\xe4\x5b\x40\x06\x8f\x8f\x0b"
"\xa5\x7c\xdb\x59\x8a\xf9\xdf\xa6\x96\x3a\x96\x89\x9a\xf0\x47\xe5",
.prk_size = 48,
.okm = "\xc8\xc9\x6e\x71\x0f\x89\xb0\xd7\x99\x0b\xca\x68\xbc\xde\xc8\xcf"
"\x85\x40\x62\xe5\x4c\x73\xa7\xab\xc7\x43\xfa\xde\x9b\x24\x2d\xaa"
"\xcc\x1c\xea\x56\x70\x41\x5b\x52\x84\x9c",
.okm_size = 42,
}, {
.test = "hkdf test with short input",
.ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b",
.ikm_size = 11,
.salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c",
.salt_size = 13,
.info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
.info_size = 10,
.prk = "\x6d\x31\x69\x98\x28\x79\x80\x88\xb3\x59\xda\xd5\x0b\x8f\x01\xb0"
"\x15\xf1\x7a\xa3\xbd\x4e\x27\xa6\xe9\xf8\x73\xb7\x15\x85\xca\x6a"
"\x00\xd1\xf0\x82\x12\x8a\xdb\x3c\xf0\x53\x0b\x57\xc0\xf9\xac\x72",
.prk_size = 48,
.okm = "\xfb\x7e\x67\x43\xeb\x42\xcd\xe9\x6f\x1b\x70\x77\x89\x52\xab\x75"
"\x48\xca\xfe\x53\x24\x9f\x7f\xfe\x14\x97\xa1\x63\x5b\x20\x1f\xf1"
"\x85\xb9\x3e\x95\x19\x92\xd8\x58\xf1\x1a",
.okm_size = 42,
}, {
.test = "unsalted hkdf test with zero info",
.ikm = "\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c"
"\x0c\x0c\x0c\x0c\x0c\x0c",
.ikm_size = 22,
.salt = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
.salt_size = 48,
.info = NULL,
.info_size = 0,
.prk = "\x9d\x2d\xa5\x06\x6f\x05\xd1\x6c\x59\xfe\xdf\x6c\x5f\x32\xc7\x5e"
"\xda\x9a\x47\xa7\x9c\x93\x6a\xa4\x4c\xb7\x63\xa8\xe2\x2f\xfb\xfc"
"\xd8\xfe\x55\x43\x58\x53\x47\x21\x90\x39\xd1\x68\x28\x36\x33\xf5",
.prk_size = 48,
.okm = "\x6a\xd7\xc7\x26\xc8\x40\x09\x54\x6a\x76\xe0\x54\x5d\xf2\x66\x78"
"\x7e\x2b\x2c\xd6\xca\x43\x73\xa1\xf3\x14\x50\xa7\xbd\xf9\x48\x2b"
"\xfa\xb8\x11\xf5\x54\x20\x0e\xad\x8f\x53",
.okm_size = 42,
}
};
static const struct hkdf_testvec hkdf_sha512_tv[] = {
{
.test = "basic hkdf test",
.ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
"\x0b\x0b\x0b\x0b\x0b\x0b",
.ikm_size = 22,
.salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c",
.salt_size = 13,
.info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
.info_size = 10,
.prk = "\x66\x57\x99\x82\x37\x37\xde\xd0\x4a\x88\xe4\x7e\x54\xa5\x89\x0b"
"\xb2\xc3\xd2\x47\xc7\xa4\x25\x4a\x8e\x61\x35\x07\x23\x59\x0a\x26"
"\xc3\x62\x38\x12\x7d\x86\x61\xb8\x8c\xf8\x0e\xf8\x02\xd5\x7e\x2f"
"\x7c\xeb\xcf\x1e\x00\xe0\x83\x84\x8b\xe1\x99\x29\xc6\x1b\x42\x37",
.prk_size = 64,
.okm = "\x83\x23\x90\x08\x6c\xda\x71\xfb\x47\x62\x5b\xb5\xce\xb1\x68\xe4"
"\xc8\xe2\x6a\x1a\x16\xed\x34\xd9\xfc\x7f\xe9\x2c\x14\x81\x57\x93"
"\x38\xda\x36\x2c\xb8\xd9\xf9\x25\xd7\xcb",
.okm_size = 42,
}, {
.test = "hkdf test with long input",
.ikm = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
"\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f"
"\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f"
"\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f",
.ikm_size = 80,
.salt = "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f"
"\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f"
"\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
"\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
"\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf",
.salt_size = 80,
.info = "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"
"\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
"\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"
"\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
"\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff",
.info_size = 80,
.prk = "\x35\x67\x25\x42\x90\x7d\x4e\x14\x2c\x00\xe8\x44\x99\xe7\x4e\x1d"
"\xe0\x8b\xe8\x65\x35\xf9\x24\xe0\x22\x80\x4a\xd7\x75\xdd\xe2\x7e"
"\xc8\x6c\xd1\xe5\xb7\xd1\x78\xc7\x44\x89\xbd\xbe\xb3\x07\x12\xbe"
"\xb8\x2d\x4f\x97\x41\x6c\x5a\x94\xea\x81\xeb\xdf\x3e\x62\x9e\x4a",
.prk_size = 64,
.okm = "\xce\x6c\x97\x19\x28\x05\xb3\x46\xe6\x16\x1e\x82\x1e\xd1\x65\x67"
"\x3b\x84\xf4\x00\xa2\xb5\x14\xb2\xfe\x23\xd8\x4c\xd1\x89\xdd\xf1"
"\xb6\x95\xb4\x8c\xbd\x1c\x83\x88\x44\x11\x37\xb3\xce\x28\xf1\x6a"
"\xa6\x4b\xa3\x3b\xa4\x66\xb2\x4d\xf6\xcf\xcb\x02\x1e\xcf\xf2\x35"
"\xf6\xa2\x05\x6c\xe3\xaf\x1d\xe4\x4d\x57\x20\x97\xa8\x50\x5d\x9e"
"\x7a\x93",
.okm_size = 82,
}, {
.test = "hkdf test with zero salt and info",
.ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
"\x0b\x0b\x0b\x0b\x0b\x0b",
.ikm_size = 22,
.salt = NULL,
.salt_size = 0,
.info = NULL,
.info_size = 0,
.prk = "\xfd\x20\x0c\x49\x87\xac\x49\x13\x13\xbd\x4a\x2a\x13\x28\x71\x21"
"\x24\x72\x39\xe1\x1c\x9e\xf8\x28\x02\x04\x4b\x66\xef\x35\x7e\x5b"
"\x19\x44\x98\xd0\x68\x26\x11\x38\x23\x48\x57\x2a\x7b\x16\x11\xde"
"\x54\x76\x40\x94\x28\x63\x20\x57\x8a\x86\x3f\x36\x56\x2b\x0d\xf6",
.prk_size = 64,
.okm = "\xf5\xfa\x02\xb1\x82\x98\xa7\x2a\x8c\x23\x89\x8a\x87\x03\x47\x2c"
"\x6e\xb1\x79\xdc\x20\x4c\x03\x42\x5c\x97\x0e\x3b\x16\x4b\xf9\x0f"
"\xff\x22\xd0\x48\x36\xd0\xe2\x34\x3b\xac",
.okm_size = 42,
}, {
.test = "hkdf test with short input",
.ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b",
.ikm_size = 11,
.salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c",
.salt_size = 13,
.info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
.info_size = 10,
.prk = "\x67\x40\x9c\x9c\xac\x28\xb5\x2e\xe9\xfa\xd9\x1c\x2f\xda\x99\x9f"
"\x7c\xa2\x2e\x34\x34\xf0\xae\x77\x28\x63\x83\x65\x68\xad\x6a\x7f"
"\x10\xcf\x11\x3b\xfd\xdd\x56\x01\x29\xa5\x94\xa8\xf5\x23\x85\xc2"
"\xd6\x61\xd7\x85\xd2\x9c\xe9\x3a\x11\x40\x0c\x92\x06\x83\x18\x1d",
.prk_size = 64,
.okm = "\x74\x13\xe8\x99\x7e\x02\x06\x10\xfb\xf6\x82\x3f\x2c\xe1\x4b\xff"
"\x01\x87\x5d\xb1\xca\x55\xf6\x8c\xfc\xf3\x95\x4d\xc8\xaf\xf5\x35"
"\x59\xbd\x5e\x30\x28\xb0\x80\xf7\xc0\x68",
.okm_size = 42,
}, {
.test = "unsalted hkdf test with zero info",
.ikm = "\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c"
"\x0c\x0c\x0c\x0c\x0c\x0c",
.ikm_size = 22,
.salt = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
.salt_size = 64,
.info = NULL,
.info_size = 0,
.prk = "\x53\x46\xb3\x76\xbf\x3a\xa9\xf8\x4f\x8f\x6e\xd5\xb1\xc4\xf4\x89"
"\x17\x2e\x24\x4d\xac\x30\x3d\x12\xf6\x8e\xcc\x76\x6e\xa6\x00\xaa"
"\x88\x49\x5e\x7f\xb6\x05\x80\x31\x22\xfa\x13\x69\x24\xa8\x40\xb1"
"\xf0\x71\x9d\x2d\x5f\x68\xe2\x9b\x24\x22\x99\xd7\x58\xed\x68\x0c",
.prk_size = 64,
.okm = "\x14\x07\xd4\x60\x13\xd9\x8b\xc6\xde\xce\xfc\xfe\xe5\x5f\x0f\x90"
"\xb0\xc7\xf6\x3d\x68\xeb\x1a\x80\xea\xf0\x7e\x95\x3c\xfc\x0a\x3a"
"\x52\x40\xa1\x55\xd6\xe4\xda\xa9\x65\xbb",
.okm_size = 42,
}
};
/*
 * Run one HKDF (RFC 5869) test vector against the given HMAC transform.
 *
 * @shash: HMAC algorithm name, e.g. "hmac(sha256)"
 * @tv:    test vector holding IKM/salt/info inputs and expected PRK/OKM
 *
 * Verifies both phases: hkdf_extract() must produce tv->prk, and
 * hkdf_expand() (keyed with tv->prk) must produce tv->okm.
 *
 * Returns 0 on success, or a negative errno on allocation failure,
 * crypto API error, or output mismatch.
 */
static int hkdf_test(const char *shash, const struct hkdf_testvec *tv)
{
	struct crypto_shash *tfm = NULL;
	u8 *prk = NULL, *okm = NULL;
	unsigned int prk_size;
	const char *driver;
	int err;

	tfm = crypto_alloc_shash(shash, 0, 0);
	if (IS_ERR(tfm)) {
		pr_err("%s(%s): failed to allocate transform: %ld\n",
		       tv->test, shash, PTR_ERR(tfm));
		return PTR_ERR(tfm);
	}
	driver = crypto_shash_driver_name(tfm);

	/*
	 * The PRK of HKDF is always exactly one digest long; reject a
	 * mismatched vector before bothering to allocate anything.
	 */
	prk_size = crypto_shash_digestsize(tfm);
	if (tv->prk_size != prk_size) {
		pr_err("%s(%s): prk size mismatch (vec %u, digest %u)\n",
		       tv->test, driver, tv->prk_size, prk_size);
		err = -EINVAL;
		goto out_free;
	}

	prk = kzalloc(prk_size, GFP_KERNEL);
	if (!prk) {
		err = -ENOMEM;
		goto out_free;
	}

	err = hkdf_extract(tfm, tv->ikm, tv->ikm_size,
			   tv->salt, tv->salt_size, prk);
	if (err) {
		pr_err("%s(%s): hkdf_extract failed with %d\n",
		       tv->test, driver, err);
		goto out_free;
	}

	if (memcmp(prk, tv->prk, tv->prk_size)) {
		pr_err("%s(%s): hkdf_extract prk mismatch\n",
		       tv->test, driver);
		print_hex_dump(KERN_ERR, "prk: ", DUMP_PREFIX_NONE,
			       16, 1, prk, tv->prk_size, false);
		err = -EINVAL;
		goto out_free;
	}

	okm = kzalloc(tv->okm_size, GFP_KERNEL);
	if (!okm) {
		err = -ENOMEM;
		goto out_free;
	}

	/* Expand is keyed with the expected PRK, not the computed one. */
	err = crypto_shash_setkey(tfm, tv->prk, tv->prk_size);
	if (err) {
		pr_err("%s(%s): failed to set prk, error %d\n",
		       tv->test, driver, err);
		goto out_free;
	}

	err = hkdf_expand(tfm, tv->info, tv->info_size,
			  okm, tv->okm_size);
	if (err) {
		pr_err("%s(%s): hkdf_expand() failed with %d\n",
		       tv->test, driver, err);
	} else if (memcmp(okm, tv->okm, tv->okm_size)) {
		pr_err("%s(%s): hkdf_expand() okm mismatch\n",
		       tv->test, driver);
		print_hex_dump(KERN_ERR, "okm: ", DUMP_PREFIX_NONE,
			       16, 1, okm, tv->okm_size, false);
		err = -EINVAL;
	}

out_free:
	kfree(okm);
	kfree(prk);
	crypto_free_shash(tfm);
	return err;
}
/*
 * Run the compiled-in HKDF self-test vectors at init time.
 * Returns 0 if every vector passes (or self-tests are disabled),
 * otherwise the first failing vector's error code.
 */
static int __init crypto_hkdf_module_init(void)
{
	/* One entry per supported HMAC digest, paired with its vectors. */
	static const struct {
		const char *alg;
		const struct hkdf_testvec *vecs;
		size_t count;
	} suites[] = {
		{ "hmac(sha256)", hkdf_sha256_tv, ARRAY_SIZE(hkdf_sha256_tv) },
		{ "hmac(sha384)", hkdf_sha384_tv, ARRAY_SIZE(hkdf_sha384_tv) },
		{ "hmac(sha512)", hkdf_sha512_tv, ARRAY_SIZE(hkdf_sha512_tv) },
	};
	size_t s, v;
	int ret;

	if (!IS_ENABLED(CONFIG_CRYPTO_SELFTESTS))
		return 0;

	for (s = 0; s < ARRAY_SIZE(suites); s++) {
		for (v = 0; v < suites[s].count; v++) {
			ret = hkdf_test(suites[s].alg, &suites[s].vecs[v]);
			if (ret)
				return ret;
		}
	}

	return 0;
}
/* Nothing to tear down: the self-tests run once at init and hold no state. */
static void __exit crypto_hkdf_module_exit(void) {}

/* late_initcall so the crypto core and HMAC drivers are registered first. */
late_initcall(crypto_hkdf_module_init);
module_exit(crypto_hkdf_module_exit);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("HMAC-based Key Derivation Function (HKDF)");

View File

@@ -3,7 +3,6 @@ drbd-y := drbd_buildtag.o drbd_bitmap.o drbd_proc.o
drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
drbd-y += drbd_interval.o drbd_state.o
drbd-y += drbd_nla.o
drbd-$(CONFIG_DEBUG_FS) += drbd_debugfs.o
obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o

View File

@@ -874,7 +874,7 @@ void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device)
if (uuid && uuid != UUID_JUST_CREATED)
uuid = uuid + UUID_NEW_BM_OFFSET;
else
get_random_bytes(&uuid, sizeof(u64));
uuid = get_random_u64();
drbd_uuid_set(device, UI_BITMAP, uuid);
drbd_print_uuids(device, "updated sync UUID");
drbd_md_sync(device);
@@ -3337,7 +3337,7 @@ void drbd_uuid_new_current(struct drbd_device *device) __must_hold(local)
u64 val;
unsigned long long bm_uuid;
get_random_bytes(&val, sizeof(u64));
val = get_random_u64();
spin_lock_irq(&device->ldev->md.uuid_lock);
bm_uuid = device->ldev->md.uuid[UI_BITMAP];

File diff suppressed because it is too large Load Diff

View File

@@ -1,56 +0,0 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <net/netlink.h>
#include <linux/drbd_genl_api.h>
#include "drbd_nla.h"
static int drbd_nla_check_mandatory(int maxtype, struct nlattr *nla)
{
struct nlattr *head = nla_data(nla);
int len = nla_len(nla);
int rem;
/*
* validate_nla (called from nla_parse_nested) ignores attributes
* beyond maxtype, and does not understand the DRBD_GENLA_F_MANDATORY flag.
* In order to have it validate attributes with the DRBD_GENLA_F_MANDATORY
* flag set also, check and remove that flag before calling
* nla_parse_nested.
*/
nla_for_each_attr(nla, head, len, rem) {
if (nla->nla_type & DRBD_GENLA_F_MANDATORY) {
nla->nla_type &= ~DRBD_GENLA_F_MANDATORY;
if (nla_type(nla) > maxtype)
return -EOPNOTSUPP;
}
}
return 0;
}
int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
const struct nla_policy *policy)
{
int err;
err = drbd_nla_check_mandatory(maxtype, nla);
if (!err)
err = nla_parse_nested_deprecated(tb, maxtype, nla, policy,
NULL);
return err;
}
struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype)
{
int err;
/*
* If any nested attribute has the DRBD_GENLA_F_MANDATORY flag set and
* we don't know about that attribute, reject all the nested
* attributes.
*/
err = drbd_nla_check_mandatory(maxtype, nla);
if (err)
return ERR_PTR(err);
return nla_find_nested(nla, attrtype);
}

View File

@@ -1,9 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef __DRBD_NLA_H
#define __DRBD_NLA_H
extern int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
const struct nla_policy *policy);
extern struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype);
#endif /* __DRBD_NLA_H */

View File

@@ -46,6 +46,8 @@
#include <linux/kref.h>
#include <linux/kfifo.h>
#include <linux/blk-integrity.h>
#include <linux/maple_tree.h>
#include <linux/xarray.h>
#include <uapi/linux/fs.h>
#include <uapi/linux/ublk_cmd.h>
@@ -58,6 +60,11 @@
#define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
#define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
#define UBLK_CMD_TRY_STOP_DEV _IOC_NR(UBLK_U_CMD_TRY_STOP_DEV)
#define UBLK_CMD_REG_BUF _IOC_NR(UBLK_U_CMD_REG_BUF)
#define UBLK_CMD_UNREG_BUF _IOC_NR(UBLK_U_CMD_UNREG_BUF)
/* Default max shmem buffer size: 4GB (may be increased in future) */
#define UBLK_SHMEM_BUF_SIZE_MAX (1ULL << 32)
#define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
#define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
@@ -81,7 +88,8 @@
| (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) \
| UBLK_F_SAFE_STOP_DEV \
| UBLK_F_BATCH_IO \
| UBLK_F_NO_AUTO_PART_SCAN)
| UBLK_F_NO_AUTO_PART_SCAN \
| UBLK_F_SHMEM_ZC)
#define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
| UBLK_F_USER_RECOVERY_REISSUE \
@@ -289,6 +297,13 @@ struct ublk_queue {
struct ublk_io ios[] __counted_by(q_depth);
};
/* Maple tree value: maps a PFN range to buffer location */
struct ublk_buf_range {
unsigned short buf_index;
unsigned short flags;
unsigned int base_offset; /* byte offset within buffer */
};
struct ublk_device {
struct gendisk *ub_disk;
@@ -323,6 +338,10 @@ struct ublk_device {
bool block_open; /* protected by open_mutex */
/* shared memory zero copy */
struct maple_tree buf_tree;
struct ida buf_ida;
struct ublk_queue *queues[];
};
@@ -334,6 +353,9 @@ struct ublk_params_header {
static void ublk_io_release(void *priv);
static void ublk_stop_dev_unlocked(struct ublk_device *ub);
static bool ublk_try_buf_match(struct ublk_device *ub, struct request *rq,
u32 *buf_idx, u32 *buf_off);
static void ublk_buf_cleanup(struct ublk_device *ub);
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
u16 q_id, u16 tag, struct ublk_io *io);
@@ -398,6 +420,22 @@ static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub)
return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY;
}
static inline bool ublk_support_shmem_zc(const struct ublk_queue *ubq)
{
return ubq->flags & UBLK_F_SHMEM_ZC;
}
static inline bool ublk_iod_is_shmem_zc(const struct ublk_queue *ubq,
unsigned int tag)
{
return ublk_get_iod(ubq, tag)->op_flags & UBLK_IO_F_SHMEM_ZC;
}
static inline bool ublk_dev_support_shmem_zc(const struct ublk_device *ub)
{
return ub->dev_info.flags & UBLK_F_SHMEM_ZC;
}
static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
{
return ubq->flags & UBLK_F_AUTO_BUF_REG;
@@ -808,7 +846,7 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub)
static int ublk_integrity_flags(u32 flags)
{
int ret_flags = 0;
int ret_flags = BLK_SPLIT_INTERVAL_CAPABLE;
if (flags & LBMD_PI_CAP_INTEGRITY) {
flags &= ~LBMD_PI_CAP_INTEGRITY;
@@ -1460,6 +1498,19 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
iod->op_flags = ublk_op | ublk_req_build_flags(req);
iod->nr_sectors = blk_rq_sectors(req);
iod->start_sector = blk_rq_pos(req);
/* Try shmem zero-copy match before setting addr */
if (ublk_support_shmem_zc(ubq) && ublk_rq_has_data(req)) {
u32 buf_idx, buf_off;
if (ublk_try_buf_match(ubq->dev, req,
&buf_idx, &buf_off)) {
iod->op_flags |= UBLK_IO_F_SHMEM_ZC;
iod->addr = ublk_shmem_zc_addr(buf_idx, buf_off);
return BLK_STS_OK;
}
}
iod->addr = io->buf.addr;
return BLK_STS_OK;
@@ -1505,6 +1556,10 @@ static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
req_op(req) != REQ_OP_DRV_IN)
goto exit;
/* shmem zero copy: no data to unmap, pages already shared */
if (ublk_iod_is_shmem_zc(req->mq_hctx->driver_data, req->tag))
goto exit;
/* for READ request, writing data in iod->addr to rq buffers */
unmapped_bytes = ublk_unmap_io(need_map, req, io);
@@ -1663,7 +1718,13 @@ static void ublk_auto_buf_dispatch(const struct ublk_queue *ubq,
static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
struct ublk_io *io)
{
unsigned mapped_bytes = ublk_map_io(ubq, req, io);
unsigned mapped_bytes;
/* shmem zero copy: skip data copy, pages already shared */
if (ublk_iod_is_shmem_zc(ubq, req->tag))
return true;
mapped_bytes = ublk_map_io(ubq, req, io);
/* partially mapped, update io descriptor */
if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
@@ -1789,7 +1850,7 @@ static bool ublk_batch_prep_dispatch(struct ublk_queue *ubq,
* Filter out UBLK_BATCH_IO_UNUSED_TAG entries from tag_buf.
* Returns the new length after filtering.
*/
static unsigned int ublk_filter_unused_tags(unsigned short *tag_buf,
static noinline unsigned int ublk_filter_unused_tags(unsigned short *tag_buf,
unsigned int len)
{
unsigned int i, j;
@@ -1805,6 +1866,41 @@ static unsigned int ublk_filter_unused_tags(unsigned short *tag_buf,
return j;
}
static noinline void ublk_batch_dispatch_fail(struct ublk_queue *ubq,
const struct ublk_batch_io_data *data,
unsigned short *tag_buf, size_t len, int ret)
{
int i, res;
/*
* Undo prep state for all IOs since userspace never received them.
* This restores IOs to pre-prepared state so they can be cleanly
* re-prepared when tags are pulled from FIFO again.
*/
for (i = 0; i < len; i++) {
struct ublk_io *io = &ubq->ios[tag_buf[i]];
int index = -1;
ublk_io_lock(io);
if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG)
index = io->buf.auto_reg.index;
io->flags &= ~(UBLK_IO_FLAG_OWNED_BY_SRV | UBLK_IO_FLAG_AUTO_BUF_REG);
io->flags |= UBLK_IO_FLAG_ACTIVE;
ublk_io_unlock(io);
if (index != -1)
io_buffer_unregister_bvec(data->cmd, index,
data->issue_flags);
}
res = kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo,
tag_buf, len, &ubq->evts_lock);
pr_warn_ratelimited("%s: copy tags or post CQE failure, move back "
"tags(%d %zu) ret %d\n", __func__, res, len,
ret);
}
#define MAX_NR_TAG 128
static int __ublk_batch_dispatch(struct ublk_queue *ubq,
const struct ublk_batch_io_data *data,
@@ -1848,37 +1944,8 @@ static int __ublk_batch_dispatch(struct ublk_queue *ubq,
sel.val = ublk_batch_copy_io_tags(fcmd, sel.addr, tag_buf, len * tag_sz);
ret = ublk_batch_fetch_post_cqe(fcmd, &sel, data->issue_flags);
if (unlikely(ret < 0)) {
int i, res;
/*
* Undo prep state for all IOs since userspace never received them.
* This restores IOs to pre-prepared state so they can be cleanly
* re-prepared when tags are pulled from FIFO again.
*/
for (i = 0; i < len; i++) {
struct ublk_io *io = &ubq->ios[tag_buf[i]];
int index = -1;
ublk_io_lock(io);
if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG)
index = io->buf.auto_reg.index;
io->flags &= ~(UBLK_IO_FLAG_OWNED_BY_SRV | UBLK_IO_FLAG_AUTO_BUF_REG);
io->flags |= UBLK_IO_FLAG_ACTIVE;
ublk_io_unlock(io);
if (index != -1)
io_buffer_unregister_bvec(data->cmd, index,
data->issue_flags);
}
res = kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo,
tag_buf, len, &ubq->evts_lock);
pr_warn_ratelimited("%s: copy tags or post CQE failure, move back "
"tags(%d %zu) ret %d\n", __func__, res, len,
ret);
}
if (unlikely(ret < 0))
ublk_batch_dispatch_fail(ubq, data, tag_buf, len, ret);
return ret;
}
@@ -2910,22 +2977,26 @@ static void ublk_stop_dev(struct ublk_device *ub)
ublk_cancel_dev(ub);
}
static void ublk_reset_io_flags(struct ublk_queue *ubq, struct ublk_io *io)
{
/* UBLK_IO_FLAG_CANCELED can be cleared now */
spin_lock(&ubq->cancel_lock);
io->flags &= ~UBLK_IO_FLAG_CANCELED;
spin_unlock(&ubq->cancel_lock);
}
/* reset per-queue io flags */
static void ublk_queue_reset_io_flags(struct ublk_queue *ubq)
{
int j;
/* UBLK_IO_FLAG_CANCELED can be cleared now */
spin_lock(&ubq->cancel_lock);
for (j = 0; j < ubq->q_depth; j++)
ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED;
ubq->canceling = false;
spin_unlock(&ubq->cancel_lock);
ubq->fail_io = false;
}
/* device can only be started after all IOs are ready */
static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id)
static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id,
struct ublk_io *io)
__must_hold(&ub->mutex)
{
struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
@@ -2934,6 +3005,7 @@ static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id)
ub->unprivileged_daemons = true;
ubq->nr_io_ready++;
ublk_reset_io_flags(ubq, io);
/* Check if this specific queue is now fully ready */
if (ublk_queue_ready(ubq)) {
@@ -3196,7 +3268,7 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
if (!ret)
ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL);
if (!ret)
ublk_mark_io_ready(ub, q_id);
ublk_mark_io_ready(ub, q_id, io);
mutex_unlock(&ub->mutex);
return ret;
}
@@ -3604,7 +3676,7 @@ static int ublk_batch_prep_io(struct ublk_queue *ubq,
ublk_io_unlock(io);
if (!ret)
ublk_mark_io_ready(data->ub, ubq->q_id);
ublk_mark_io_ready(data->ub, ubq->q_id, io);
return ret;
}
@@ -4200,6 +4272,7 @@ static void ublk_cdev_rel(struct device *dev)
{
struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);
ublk_buf_cleanup(ub);
blk_mq_free_tag_set(&ub->tag_set);
ublk_deinit_queues(ub);
ublk_free_dev_number(ub);
@@ -4621,6 +4694,8 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
mutex_init(&ub->mutex);
spin_lock_init(&ub->lock);
mutex_init(&ub->cancel_mutex);
mt_init(&ub->buf_tree);
ida_init(&ub->buf_ida);
INIT_WORK(&ub->partition_scan_work, ublk_partition_scan_work);
ret = ublk_alloc_dev_number(ub, header->dev_id);
@@ -5171,6 +5246,314 @@ exit:
return err;
}
/*
* Lock for maple tree modification: acquire ub->mutex, then freeze queue
* if device is started. If device is not yet started, only mutex is
* needed since no I/O path can access the tree.
*
* This ordering (mutex -> freeze) is safe because ublk_stop_dev_unlocked()
* already holds ub->mutex when calling del_gendisk() which freezes the queue.
*/
static unsigned int ublk_lock_buf_tree(struct ublk_device *ub)
{
unsigned int memflags = 0;
mutex_lock(&ub->mutex);
if (ub->ub_disk)
memflags = blk_mq_freeze_queue(ub->ub_disk->queue);
return memflags;
}
static void ublk_unlock_buf_tree(struct ublk_device *ub, unsigned int memflags)
{
if (ub->ub_disk)
blk_mq_unfreeze_queue(ub->ub_disk->queue, memflags);
mutex_unlock(&ub->mutex);
}
/* Erase coalesced PFN ranges from the maple tree matching buf_index */
static void ublk_buf_erase_ranges(struct ublk_device *ub, int buf_index)
{
MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
struct ublk_buf_range *range;
mas_lock(&mas);
mas_for_each(&mas, range, ULONG_MAX) {
if (range->buf_index == buf_index) {
mas_erase(&mas);
kfree(range);
}
}
mas_unlock(&mas);
}
static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
struct page **pages, unsigned long nr_pages,
int index, unsigned short flags)
{
unsigned long i;
int ret;
for (i = 0; i < nr_pages; i++) {
unsigned long pfn = page_to_pfn(pages[i]);
unsigned long start = i;
struct ublk_buf_range *range;
/* Find run of consecutive PFNs */
while (i + 1 < nr_pages &&
page_to_pfn(pages[i + 1]) == pfn + (i - start) + 1)
i++;
range = kzalloc(sizeof(*range), GFP_KERNEL);
if (!range) {
ret = -ENOMEM;
goto unwind;
}
range->buf_index = index;
range->flags = flags;
range->base_offset = start << PAGE_SHIFT;
ret = mtree_insert_range(&ub->buf_tree, pfn,
pfn + (i - start),
range, GFP_KERNEL);
if (ret) {
kfree(range);
goto unwind;
}
}
return 0;
unwind:
ublk_buf_erase_ranges(ub, index);
return ret;
}
/*
* Register a shared memory buffer for zero-copy I/O.
* Pins pages, builds PFN maple tree, freezes/unfreezes the queue
* internally. Returns buffer index (>= 0) on success.
*/
static int ublk_ctrl_reg_buf(struct ublk_device *ub,
struct ublksrv_ctrl_cmd *header)
{
void __user *argp = (void __user *)(unsigned long)header->addr;
struct ublk_shmem_buf_reg buf_reg;
unsigned long nr_pages;
struct page **pages = NULL;
unsigned int gup_flags;
unsigned int memflags;
long pinned;
int index;
int ret;
if (!ublk_dev_support_shmem_zc(ub))
return -EOPNOTSUPP;
memset(&buf_reg, 0, sizeof(buf_reg));
if (copy_from_user(&buf_reg, argp,
min_t(size_t, header->len, sizeof(buf_reg))))
return -EFAULT;
if (buf_reg.flags & ~UBLK_SHMEM_BUF_READ_ONLY)
return -EINVAL;
if (buf_reg.reserved)
return -EINVAL;
if (!buf_reg.len || buf_reg.len > UBLK_SHMEM_BUF_SIZE_MAX ||
!PAGE_ALIGNED(buf_reg.len) || !PAGE_ALIGNED(buf_reg.addr))
return -EINVAL;
nr_pages = buf_reg.len >> PAGE_SHIFT;
/* Pin pages before any locks (may sleep) */
pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
if (!pages)
return -ENOMEM;
gup_flags = FOLL_LONGTERM;
if (!(buf_reg.flags & UBLK_SHMEM_BUF_READ_ONLY))
gup_flags |= FOLL_WRITE;
pinned = pin_user_pages_fast(buf_reg.addr, nr_pages, gup_flags, pages);
if (pinned < 0) {
ret = pinned;
goto err_free_pages;
}
if (pinned != nr_pages) {
ret = -EFAULT;
goto err_unpin;
}
memflags = ublk_lock_buf_tree(ub);
index = ida_alloc_max(&ub->buf_ida, USHRT_MAX, GFP_KERNEL);
if (index < 0) {
ret = index;
goto err_unlock;
}
ret = __ublk_ctrl_reg_buf(ub, pages, nr_pages, index, buf_reg.flags);
if (ret) {
ida_free(&ub->buf_ida, index);
goto err_unlock;
}
ublk_unlock_buf_tree(ub, memflags);
kvfree(pages);
return index;
err_unlock:
ublk_unlock_buf_tree(ub, memflags);
err_unpin:
unpin_user_pages(pages, pinned);
err_free_pages:
kvfree(pages);
return ret;
}
static int __ublk_ctrl_unreg_buf(struct ublk_device *ub, int buf_index)
{
MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
struct ublk_buf_range *range;
struct page *pages[32];
int ret = -ENOENT;
mas_lock(&mas);
mas_for_each(&mas, range, ULONG_MAX) {
unsigned long base, nr, off;
if (range->buf_index != buf_index)
continue;
ret = 0;
base = mas.index;
nr = mas.last - base + 1;
mas_erase(&mas);
for (off = 0; off < nr; ) {
unsigned int batch = min_t(unsigned long,
nr - off, 32);
unsigned int j;
for (j = 0; j < batch; j++)
pages[j] = pfn_to_page(base + off + j);
unpin_user_pages(pages, batch);
off += batch;
}
kfree(range);
}
mas_unlock(&mas);
return ret;
}
static int ublk_ctrl_unreg_buf(struct ublk_device *ub,
struct ublksrv_ctrl_cmd *header)
{
int index = (int)header->data[0];
unsigned int memflags;
int ret;
if (!ublk_dev_support_shmem_zc(ub))
return -EOPNOTSUPP;
if (index < 0 || index > USHRT_MAX)
return -EINVAL;
memflags = ublk_lock_buf_tree(ub);
ret = __ublk_ctrl_unreg_buf(ub, index);
if (!ret)
ida_free(&ub->buf_ida, index);
ublk_unlock_buf_tree(ub, memflags);
return ret;
}
static void ublk_buf_cleanup(struct ublk_device *ub)
{
MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
struct ublk_buf_range *range;
struct page *pages[32];
mas_for_each(&mas, range, ULONG_MAX) {
unsigned long base = mas.index;
unsigned long nr = mas.last - base + 1;
unsigned long off;
for (off = 0; off < nr; ) {
unsigned int batch = min_t(unsigned long,
nr - off, 32);
unsigned int j;
for (j = 0; j < batch; j++)
pages[j] = pfn_to_page(base + off + j);
unpin_user_pages(pages, batch);
off += batch;
}
kfree(range);
}
mtree_destroy(&ub->buf_tree);
ida_destroy(&ub->buf_ida);
}
/* Check if request pages match a registered shared memory buffer */
static bool ublk_try_buf_match(struct ublk_device *ub,
struct request *rq,
u32 *buf_idx, u32 *buf_off)
{
struct req_iterator iter;
struct bio_vec bv;
int index = -1;
unsigned long expected_offset = 0;
bool first = true;
rq_for_each_bvec(bv, rq, iter) {
unsigned long pfn = page_to_pfn(bv.bv_page);
unsigned long end_pfn = pfn +
((bv.bv_offset + bv.bv_len - 1) >> PAGE_SHIFT);
struct ublk_buf_range *range;
unsigned long off;
MA_STATE(mas, &ub->buf_tree, pfn, pfn);
range = mas_walk(&mas);
if (!range)
return false;
/* verify all pages in this bvec fall within the range */
if (end_pfn > mas.last)
return false;
off = range->base_offset +
(pfn - mas.index) * PAGE_SIZE + bv.bv_offset;
if (first) {
/* Read-only buffer can't serve READ (kernel writes) */
if ((range->flags & UBLK_SHMEM_BUF_READ_ONLY) &&
req_op(rq) != REQ_OP_WRITE)
return false;
index = range->buf_index;
expected_offset = off;
*buf_off = off;
first = false;
} else {
if (range->buf_index != index)
return false;
if (off != expected_offset)
return false;
}
expected_offset += bv.bv_len;
}
if (first)
return false;
*buf_idx = index;
return true;
}
static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
u32 cmd_op, struct ublksrv_ctrl_cmd *header)
{
@@ -5228,6 +5611,8 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
case UBLK_CMD_UPDATE_SIZE:
case UBLK_CMD_QUIESCE_DEV:
case UBLK_CMD_TRY_STOP_DEV:
case UBLK_CMD_REG_BUF:
case UBLK_CMD_UNREG_BUF:
mask = MAY_READ | MAY_WRITE;
break;
default:
@@ -5352,6 +5737,12 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
case UBLK_CMD_TRY_STOP_DEV:
ret = ublk_ctrl_try_stop_dev(ub);
break;
case UBLK_CMD_REG_BUF:
ret = ublk_ctrl_reg_buf(ub, &header);
break;
case UBLK_CMD_UNREG_BUF:
ret = ublk_ctrl_unreg_buf(ub, &header);
break;
default:
ret = -EOPNOTSUPP;
break;

View File

@@ -17,6 +17,7 @@
#include <linux/mutex.h>
#include <linux/parser.h>
#include <linux/seq_file.h>
#include <linux/xattr.h>
/*
* Options for adding (and removing) a device.
@@ -34,6 +35,8 @@ enum {
ZLOOP_OPT_BUFFERED_IO = (1 << 8),
ZLOOP_OPT_ZONE_APPEND = (1 << 9),
ZLOOP_OPT_ORDERED_ZONE_APPEND = (1 << 10),
ZLOOP_OPT_DISCARD_WRITE_CACHE = (1 << 11),
ZLOOP_OPT_MAX_OPEN_ZONES = (1 << 12),
};
static const match_table_t zloop_opt_tokens = {
@@ -48,6 +51,8 @@ static const match_table_t zloop_opt_tokens = {
{ ZLOOP_OPT_BUFFERED_IO, "buffered_io" },
{ ZLOOP_OPT_ZONE_APPEND, "zone_append=%u" },
{ ZLOOP_OPT_ORDERED_ZONE_APPEND, "ordered_zone_append" },
{ ZLOOP_OPT_DISCARD_WRITE_CACHE, "discard_write_cache" },
{ ZLOOP_OPT_MAX_OPEN_ZONES, "max_open_zones=%u" },
{ ZLOOP_OPT_ERR, NULL }
};
@@ -56,6 +61,7 @@ static const match_table_t zloop_opt_tokens = {
#define ZLOOP_DEF_ZONE_SIZE ((256ULL * SZ_1M) >> SECTOR_SHIFT)
#define ZLOOP_DEF_NR_ZONES 64
#define ZLOOP_DEF_NR_CONV_ZONES 8
#define ZLOOP_DEF_MAX_OPEN_ZONES 0
#define ZLOOP_DEF_BASE_DIR "/var/local/zloop"
#define ZLOOP_DEF_NR_QUEUES 1
#define ZLOOP_DEF_QUEUE_DEPTH 128
@@ -73,12 +79,14 @@ struct zloop_options {
sector_t zone_size;
sector_t zone_capacity;
unsigned int nr_conv_zones;
unsigned int max_open_zones;
char *base_dir;
unsigned int nr_queues;
unsigned int queue_depth;
bool buffered_io;
bool zone_append;
bool ordered_zone_append;
bool discard_write_cache;
};
/*
@@ -95,7 +103,12 @@ enum zloop_zone_flags {
ZLOOP_ZONE_SEQ_ERROR,
};
/*
* Zone descriptor.
* Locking order: z.lock -> z.wp_lock -> zlo.open_zones_lock
*/
struct zloop_zone {
struct list_head open_zone_entry;
struct file *file;
unsigned long flags;
@@ -119,6 +132,7 @@ struct zloop_device {
bool buffered_io;
bool zone_append;
bool ordered_zone_append;
bool discard_write_cache;
const char *base_dir;
struct file *data_dir;
@@ -128,8 +142,13 @@ struct zloop_device {
sector_t zone_capacity;
unsigned int nr_zones;
unsigned int nr_conv_zones;
unsigned int max_open_zones;
unsigned int block_size;
spinlock_t open_zones_lock;
struct list_head open_zones_lru_list;
unsigned int nr_open_zones;
struct zloop_zone zones[] __counted_by(nr_zones);
};
@@ -153,6 +172,122 @@ static unsigned int rq_zone_no(struct request *rq)
return blk_rq_pos(rq) >> zlo->zone_shift;
}
/*
* Open an already open zone. This is mostly a no-op, except for the imp open ->
* exp open condition change that may happen. We also move a zone at the tail of
* the list of open zones so that if we need to
* implicitly close one open zone, we can do so in LRU order.
*/
static inline void zloop_lru_rotate_open_zone(struct zloop_device *zlo,
struct zloop_zone *zone)
{
if (zlo->max_open_zones) {
spin_lock(&zlo->open_zones_lock);
list_move_tail(&zone->open_zone_entry,
&zlo->open_zones_lru_list);
spin_unlock(&zlo->open_zones_lock);
}
}
static inline void zloop_lru_remove_open_zone(struct zloop_device *zlo,
struct zloop_zone *zone)
{
if (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
zone->cond == BLK_ZONE_COND_EXP_OPEN) {
spin_lock(&zlo->open_zones_lock);
list_del_init(&zone->open_zone_entry);
zlo->nr_open_zones--;
spin_unlock(&zlo->open_zones_lock);
}
}
static inline bool zloop_can_open_zone(struct zloop_device *zlo)
{
return !zlo->max_open_zones || zlo->nr_open_zones < zlo->max_open_zones;
}
/*
* If we have reached the maximum open zones limit, attempt to close an
* implicitly open zone (if we have any) so that we can implicitly open another
* zone without exceeding the maximum number of open zones.
*/
static bool zloop_close_imp_open_zone(struct zloop_device *zlo)
{
struct zloop_zone *zone;
lockdep_assert_held(&zlo->open_zones_lock);
if (zloop_can_open_zone(zlo))
return true;
list_for_each_entry(zone, &zlo->open_zones_lru_list, open_zone_entry) {
if (zone->cond == BLK_ZONE_COND_IMP_OPEN) {
zone->cond = BLK_ZONE_COND_CLOSED;
list_del_init(&zone->open_zone_entry);
zlo->nr_open_zones--;
return true;
}
}
return false;
}
static bool zloop_open_closed_or_empty_zone(struct zloop_device *zlo,
struct zloop_zone *zone,
bool explicit)
{
spin_lock(&zlo->open_zones_lock);
if (explicit) {
/*
* Explicit open: we cannot allow this if we have reached the
* maximum open zones limit.
*/
if (!zloop_can_open_zone(zlo))
goto fail;
zone->cond = BLK_ZONE_COND_EXP_OPEN;
} else {
/*
* Implicit open case: if we have reached the maximum open zones
* limit, try to close an implicitly open zone first.
*/
if (!zloop_close_imp_open_zone(zlo))
goto fail;
zone->cond = BLK_ZONE_COND_IMP_OPEN;
}
zlo->nr_open_zones++;
list_add_tail(&zone->open_zone_entry,
&zlo->open_zones_lru_list);
spin_unlock(&zlo->open_zones_lock);
return true;
fail:
spin_unlock(&zlo->open_zones_lock);
return false;
}
static bool zloop_do_open_zone(struct zloop_device *zlo,
struct zloop_zone *zone, bool explicit)
{
switch (zone->cond) {
case BLK_ZONE_COND_IMP_OPEN:
case BLK_ZONE_COND_EXP_OPEN:
if (explicit)
zone->cond = BLK_ZONE_COND_EXP_OPEN;
zloop_lru_rotate_open_zone(zlo, zone);
return true;
case BLK_ZONE_COND_EMPTY:
case BLK_ZONE_COND_CLOSED:
return zloop_open_closed_or_empty_zone(zlo, zone, explicit);
default:
return false;
}
}
static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no)
{
struct zloop_zone *zone = &zlo->zones[zone_no];
@@ -186,13 +321,17 @@ static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no)
spin_lock_irqsave(&zone->wp_lock, flags);
if (!file_sectors) {
zloop_lru_remove_open_zone(zlo, zone);
zone->cond = BLK_ZONE_COND_EMPTY;
zone->wp = zone->start;
} else if (file_sectors == zlo->zone_capacity) {
zloop_lru_remove_open_zone(zlo, zone);
zone->cond = BLK_ZONE_COND_FULL;
zone->wp = ULLONG_MAX;
} else {
zone->cond = BLK_ZONE_COND_CLOSED;
if (zone->cond != BLK_ZONE_COND_IMP_OPEN &&
zone->cond != BLK_ZONE_COND_EXP_OPEN)
zone->cond = BLK_ZONE_COND_CLOSED;
zone->wp = zone->start + file_sectors;
}
spin_unlock_irqrestore(&zone->wp_lock, flags);
@@ -216,19 +355,8 @@ static int zloop_open_zone(struct zloop_device *zlo, unsigned int zone_no)
goto unlock;
}
switch (zone->cond) {
case BLK_ZONE_COND_EXP_OPEN:
break;
case BLK_ZONE_COND_EMPTY:
case BLK_ZONE_COND_CLOSED:
case BLK_ZONE_COND_IMP_OPEN:
zone->cond = BLK_ZONE_COND_EXP_OPEN;
break;
case BLK_ZONE_COND_FULL:
default:
if (!zloop_do_open_zone(zlo, zone, true))
ret = -EIO;
break;
}
unlock:
mutex_unlock(&zone->lock);
@@ -259,6 +387,7 @@ static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no)
case BLK_ZONE_COND_IMP_OPEN:
case BLK_ZONE_COND_EXP_OPEN:
spin_lock_irqsave(&zone->wp_lock, flags);
zloop_lru_remove_open_zone(zlo, zone);
if (zone->wp == zone->start)
zone->cond = BLK_ZONE_COND_EMPTY;
else
@@ -300,6 +429,7 @@ static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no)
}
spin_lock_irqsave(&zone->wp_lock, flags);
zloop_lru_remove_open_zone(zlo, zone);
zone->cond = BLK_ZONE_COND_EMPTY;
zone->wp = zone->start;
clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
@@ -347,6 +477,7 @@ static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no)
}
spin_lock_irqsave(&zone->wp_lock, flags);
zloop_lru_remove_open_zone(zlo, zone);
zone->cond = BLK_ZONE_COND_FULL;
zone->wp = ULLONG_MAX;
clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
@@ -378,125 +509,22 @@ static void zloop_rw_complete(struct kiocb *iocb, long ret)
zloop_put_cmd(cmd);
}
static void zloop_rw(struct zloop_cmd *cmd)
static int zloop_do_rw(struct zloop_cmd *cmd)
{
struct request *rq = blk_mq_rq_from_pdu(cmd);
int rw = req_op(rq) == REQ_OP_READ ? ITER_DEST : ITER_SOURCE;
unsigned int nr_bvec = blk_rq_nr_bvec(rq);
struct zloop_device *zlo = rq->q->queuedata;
unsigned int zone_no = rq_zone_no(rq);
sector_t sector = blk_rq_pos(rq);
sector_t nr_sectors = blk_rq_sectors(rq);
bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND;
bool is_write = req_op(rq) == REQ_OP_WRITE || is_append;
int rw = is_write ? ITER_SOURCE : ITER_DEST;
struct zloop_zone *zone = &zlo->zones[rq_zone_no(rq)];
struct req_iterator rq_iter;
struct zloop_zone *zone;
struct iov_iter iter;
struct bio_vec tmp;
unsigned long flags;
sector_t zone_end;
unsigned int nr_bvec;
int ret;
atomic_set(&cmd->ref, 2);
cmd->sector = sector;
cmd->nr_sectors = nr_sectors;
cmd->ret = 0;
if (WARN_ON_ONCE(is_append && !zlo->zone_append)) {
ret = -EIO;
goto out;
}
/* We should never get an I/O beyond the device capacity. */
if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) {
ret = -EIO;
goto out;
}
zone = &zlo->zones[zone_no];
zone_end = zone->start + zlo->zone_capacity;
/*
* The block layer should never send requests that are not fully
* contained within the zone.
*/
if (WARN_ON_ONCE(sector + nr_sectors > zone->start + zlo->zone_size)) {
ret = -EIO;
goto out;
}
if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
mutex_lock(&zone->lock);
ret = zloop_update_seq_zone(zlo, zone_no);
mutex_unlock(&zone->lock);
if (ret)
goto out;
}
if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) {
mutex_lock(&zone->lock);
spin_lock_irqsave(&zone->wp_lock, flags);
/*
* Zone append operations always go at the current write
* pointer, but regular write operations must already be
* aligned to the write pointer when submitted.
*/
if (is_append) {
/*
* If ordered zone append is in use, we already checked
* and set the target sector in zloop_queue_rq().
*/
if (!zlo->ordered_zone_append) {
if (zone->cond == BLK_ZONE_COND_FULL ||
zone->wp + nr_sectors > zone_end) {
spin_unlock_irqrestore(&zone->wp_lock,
flags);
ret = -EIO;
goto unlock;
}
sector = zone->wp;
}
cmd->sector = sector;
} else if (sector != zone->wp) {
spin_unlock_irqrestore(&zone->wp_lock, flags);
pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n",
zone_no, sector, zone->wp);
ret = -EIO;
goto unlock;
}
/* Implicitly open the target zone. */
if (zone->cond == BLK_ZONE_COND_CLOSED ||
zone->cond == BLK_ZONE_COND_EMPTY)
zone->cond = BLK_ZONE_COND_IMP_OPEN;
/*
* Advance the write pointer, unless ordered zone append is in
* use. If the write fails, the write pointer position will be
* corrected when the next I/O starts execution.
*/
if (!is_append || !zlo->ordered_zone_append) {
zone->wp += nr_sectors;
if (zone->wp == zone_end) {
zone->cond = BLK_ZONE_COND_FULL;
zone->wp = ULLONG_MAX;
}
}
spin_unlock_irqrestore(&zone->wp_lock, flags);
}
nr_bvec = blk_rq_nr_bvec(rq);
if (rq->bio != rq->biotail) {
struct bio_vec *bvec;
struct bio_vec tmp, *bvec;
cmd->bvec = kmalloc_objs(*cmd->bvec, nr_bvec, GFP_NOIO);
if (!cmd->bvec) {
ret = -EIO;
goto unlock;
}
if (!cmd->bvec)
return -EIO;
/*
* The bios of the request may be started from the middle of
@@ -522,7 +550,7 @@ static void zloop_rw(struct zloop_cmd *cmd)
iter.iov_offset = rq->bio->bi_iter.bi_bvec_done;
}
cmd->iocb.ki_pos = (sector - zone->start) << SECTOR_SHIFT;
cmd->iocb.ki_pos = (cmd->sector - zone->start) << SECTOR_SHIFT;
cmd->iocb.ki_filp = zone->file;
cmd->iocb.ki_complete = zloop_rw_complete;
if (!zlo->buffered_io)
@@ -530,18 +558,166 @@ static void zloop_rw(struct zloop_cmd *cmd)
cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
if (rw == ITER_SOURCE)
ret = zone->file->f_op->write_iter(&cmd->iocb, &iter);
else
ret = zone->file->f_op->read_iter(&cmd->iocb, &iter);
unlock:
if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write)
return zone->file->f_op->write_iter(&cmd->iocb, &iter);
return zone->file->f_op->read_iter(&cmd->iocb, &iter);
}
/*
 * Prepare a write or zone append command targeting a sequential zone:
 * validate the target sector against the zone write pointer, implicitly
 * open the zone, and advance the write pointer.
 *
 * Called with the zone mutex held (see zloop_rw()); takes zone->wp_lock
 * internally to serialize write pointer updates.
 * Returns 0 on success or -EIO for an invalid write.
 */
static int zloop_seq_write_prep(struct zloop_cmd *cmd)
{
	struct request *rq = blk_mq_rq_from_pdu(cmd);
	struct zloop_device *zlo = rq->q->queuedata;
	unsigned int zone_no = rq_zone_no(rq);
	sector_t nr_sectors = blk_rq_sectors(rq);
	bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND;
	struct zloop_zone *zone = &zlo->zones[zone_no];
	sector_t zone_end = zone->start + zlo->zone_capacity;
	unsigned long flags;
	int ret = 0;

	spin_lock_irqsave(&zone->wp_lock, flags);

	/*
	 * Zone append operations always go at the current write pointer, but
	 * regular write operations must already be aligned to the write pointer
	 * when submitted.
	 */
	if (is_append) {
		/*
		 * If ordered zone append is in use, we already checked and set
		 * the target sector in zloop_queue_rq().
		 */
		if (!zlo->ordered_zone_append) {
			/* Reject appends to a full zone or past its capacity. */
			if (zone->cond == BLK_ZONE_COND_FULL ||
			    zone->wp + nr_sectors > zone_end) {
				ret = -EIO;
				goto out_unlock;
			}
			cmd->sector = zone->wp;
		}
	} else {
		if (cmd->sector != zone->wp) {
			pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n",
			       zone_no, cmd->sector, zone->wp);
			ret = -EIO;
			goto out_unlock;
		}
	}

	/* Implicitly open the target zone. */
	if (!zloop_do_open_zone(zlo, zone, false)) {
		ret = -EIO;
		goto out_unlock;
	}

	/*
	 * Advance the write pointer, unless ordered zone append is in use. If
	 * the write fails, the write pointer position will be corrected when
	 * the next I/O starts execution.
	 */
	if (!is_append || !zlo->ordered_zone_append) {
		zone->wp += nr_sectors;
		if (zone->wp == zone_end) {
			/* Zone is now full: drop it from the open-zone LRU. */
			zloop_lru_remove_open_zone(zlo, zone);
			zone->cond = BLK_ZONE_COND_FULL;
			zone->wp = ULLONG_MAX;
		}
	}

out_unlock:
	spin_unlock_irqrestore(&zone->wp_lock, flags);

	return ret;
}
/*
 * Execute a read, write or zone append request against the backing zone file.
 *
 * The command reference count is set to 2: one reference for this submission
 * path (dropped by zloop_put_cmd() below) and one for the AIO completion
 * (dropped through zloop_rw_complete()).
 */
static void zloop_rw(struct zloop_cmd *cmd)
{
	struct request *rq = blk_mq_rq_from_pdu(cmd);
	struct zloop_device *zlo = rq->q->queuedata;
	unsigned int zone_no = rq_zone_no(rq);
	sector_t nr_sectors = blk_rq_sectors(rq);
	bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND;
	bool is_write = req_op(rq) == REQ_OP_WRITE || is_append;
	struct zloop_zone *zone;
	int ret = -EIO;

	atomic_set(&cmd->ref, 2);
	cmd->sector = blk_rq_pos(rq);
	cmd->nr_sectors = nr_sectors;
	cmd->ret = 0;

	/* Zone append must not be seen if the device did not advertise it. */
	if (WARN_ON_ONCE(is_append && !zlo->zone_append))
		goto out;

	/* We should never get an I/O beyond the device capacity. */
	if (WARN_ON_ONCE(zone_no >= zlo->nr_zones))
		goto out;

	zone = &zlo->zones[zone_no];

	/*
	 * The block layer should never send requests that are not fully
	 * contained within the zone.
	 */
	if (WARN_ON_ONCE(cmd->sector + nr_sectors >
			 zone->start + zlo->zone_size))
		goto out;

	/* After a write error, re-sync zone state from the backing file. */
	if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
		mutex_lock(&zone->lock);
		ret = zloop_update_seq_zone(zlo, zone_no);
		mutex_unlock(&zone->lock);
		if (ret)
			goto out;
	}

	/*
	 * Writes to sequential zones must be prepared (write pointer check and
	 * advance) and issued under the zone mutex to serialize them.
	 */
	if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) {
		mutex_lock(&zone->lock);
		ret = zloop_seq_write_prep(cmd);
		if (!ret)
			ret = zloop_do_rw(cmd);
		mutex_unlock(&zone->lock);
	} else {
		ret = zloop_do_rw(cmd);
	}

out:
	/* -EIOCBQUEUED means the AIO is in flight; completion comes later. */
	if (ret != -EIOCBQUEUED)
		zloop_rw_complete(&cmd->iocb, ret);
	zloop_put_cmd(cmd);
}
/*
 * A zone is active when it is implicitly open, explicitly open or closed,
 * i.e. when it has a valid write pointer that must be preserved.
 */
static inline bool zloop_zone_is_active(struct zloop_zone *zone)
{
	return zone->cond == BLK_ZONE_COND_EXP_OPEN ||
	       zone->cond == BLK_ZONE_COND_IMP_OPEN ||
	       zone->cond == BLK_ZONE_COND_CLOSED;
}
/*
 * Persist the current write pointer of every active zone as an extended
 * attribute on the zone file, so that it can be recovered after the
 * volatile write cache is discarded.
 * Returns 0 on success or the first vfs_setxattr() error.
 */
static int zloop_record_safe_wps(struct zloop_device *zlo)
{
	unsigned int zone_no;

	for (zone_no = 0; zone_no < zlo->nr_zones; zone_no++) {
		struct zloop_zone *zone = &zlo->zones[zone_no];
		struct file *zfile = zone->file;
		int err;

		/* Only active zones have a write pointer worth saving. */
		if (!zloop_zone_is_active(zone))
			continue;

		err = vfs_setxattr(file_mnt_idmap(zfile), file_dentry(zfile),
				   "user.zloop.wp", &zone->wp,
				   sizeof(zone->wp), 0);
		if (err) {
			pr_err("%pg: failed to record write pointer (%d)\n",
			       zlo->disk->part0, err);
			return err;
		}
	}

	return 0;
}
/*
* Sync the entire FS containing the zone files instead of walking all files.
*/
@@ -550,6 +726,12 @@ static int zloop_flush(struct zloop_device *zlo)
struct super_block *sb = file_inode(zlo->data_dir)->i_sb;
int ret;
if (zlo->discard_write_cache) {
ret = zloop_record_safe_wps(zlo);
if (ret)
return ret;
}
down_read(&sb->s_umount);
ret = sync_filesystem(sb);
up_read(&sb->s_umount);
@@ -692,6 +874,7 @@ static bool zloop_set_zone_append_sector(struct request *rq)
rq->__sector = zone->wp;
zone->wp += blk_rq_sectors(rq);
if (zone->wp >= zone_end) {
zloop_lru_remove_open_zone(zlo, zone);
zone->cond = BLK_ZONE_COND_FULL;
zone->wp = ULLONG_MAX;
}
@@ -889,6 +1072,7 @@ static int zloop_init_zone(struct zloop_device *zlo, struct zloop_options *opts,
int ret;
mutex_init(&zone->lock);
INIT_LIST_HEAD(&zone->open_zone_entry);
spin_lock_init(&zone->wp_lock);
zone->start = (sector_t)zone_no << zlo->zone_shift;
@@ -1009,12 +1193,20 @@ static int zloop_ctl_add(struct zloop_options *opts)
goto out;
}
if (opts->max_open_zones > nr_zones - opts->nr_conv_zones) {
pr_err("Invalid maximum number of open zones %u\n",
opts->max_open_zones);
goto out;
}
zlo = kvzalloc_flex(*zlo, zones, nr_zones);
if (!zlo) {
ret = -ENOMEM;
goto out;
}
WRITE_ONCE(zlo->state, Zlo_creating);
spin_lock_init(&zlo->open_zones_lock);
INIT_LIST_HEAD(&zlo->open_zones_lru_list);
ret = mutex_lock_killable(&zloop_ctl_mutex);
if (ret)
@@ -1042,10 +1234,12 @@ static int zloop_ctl_add(struct zloop_options *opts)
zlo->zone_capacity = zlo->zone_size;
zlo->nr_zones = nr_zones;
zlo->nr_conv_zones = opts->nr_conv_zones;
zlo->max_open_zones = opts->max_open_zones;
zlo->buffered_io = opts->buffered_io;
zlo->zone_append = opts->zone_append;
if (zlo->zone_append)
zlo->ordered_zone_append = opts->ordered_zone_append;
zlo->discard_write_cache = opts->discard_write_cache;
zlo->workqueue = alloc_workqueue("zloop%d", WQ_UNBOUND | WQ_FREEZABLE,
opts->nr_queues * opts->queue_depth, zlo->id);
@@ -1088,6 +1282,7 @@ static int zloop_ctl_add(struct zloop_options *opts)
lim.logical_block_size = zlo->block_size;
if (zlo->zone_append)
lim.max_hw_zone_append_sectors = lim.max_hw_sectors;
lim.max_open_zones = zlo->max_open_zones;
zlo->tag_set.ops = &zloop_mq_ops;
zlo->tag_set.nr_hw_queues = opts->nr_queues;
@@ -1168,6 +1363,49 @@ out:
return ret;
}
/*
 * Truncate a zone backing file to @pos bytes.
 *
 * Best effort: a failure cannot be propagated to the caller (the device is
 * being torn down), but it must not be silently ignored either, so log it.
 * Fix: the original discarded the notify_change() return value.
 */
static void zloop_truncate(struct file *file, loff_t pos)
{
	struct mnt_idmap *idmap = file_mnt_idmap(file);
	struct dentry *dentry = file_dentry(file);
	struct iattr newattrs;
	int ret;

	newattrs.ia_size = pos;
	newattrs.ia_valid = ATTR_SIZE;

	inode_lock(dentry->d_inode);
	ret = notify_change(idmap, dentry, &newattrs, NULL);
	inode_unlock(dentry->d_inode);

	if (ret)
		pr_warn("zloop: failed to truncate zone file to %lld (%d)\n",
			(long long)pos, ret);
}
static void zloop_forget_cache(struct zloop_device *zlo)
{
unsigned int i;
int ret;
pr_info("%pg: discarding volatile write cache\n", zlo->disk->part0);
for (i = 0; i < zlo->nr_zones; i++) {
struct zloop_zone *zone = &zlo->zones[i];
struct file *file = zone->file;
sector_t old_wp;
if (!zloop_zone_is_active(zone))
continue;
ret = vfs_getxattr(file_mnt_idmap(file), file_dentry(file),
"user.zloop.wp", &old_wp, sizeof(old_wp));
if (ret == -ENODATA) {
old_wp = 0;
} else if (ret != sizeof(old_wp)) {
pr_err("%pg: failed to retrieve write pointer (%d)\n",
zlo->disk->part0, ret);
continue;
}
if (old_wp < zone->wp)
zloop_truncate(file, old_wp);
}
}
static int zloop_ctl_remove(struct zloop_options *opts)
{
struct zloop_device *zlo;
@@ -1202,6 +1440,10 @@ static int zloop_ctl_remove(struct zloop_options *opts)
return ret;
del_gendisk(zlo->disk);
if (zlo->discard_write_cache)
zloop_forget_cache(zlo);
put_disk(zlo->disk);
pr_info("Removed device %d\n", opts->id);
@@ -1224,6 +1466,7 @@ static int zloop_parse_options(struct zloop_options *opts, const char *buf)
opts->capacity = ZLOOP_DEF_ZONE_SIZE * ZLOOP_DEF_NR_ZONES;
opts->zone_size = ZLOOP_DEF_ZONE_SIZE;
opts->nr_conv_zones = ZLOOP_DEF_NR_CONV_ZONES;
opts->max_open_zones = ZLOOP_DEF_MAX_OPEN_ZONES;
opts->nr_queues = ZLOOP_DEF_NR_QUEUES;
opts->queue_depth = ZLOOP_DEF_QUEUE_DEPTH;
opts->buffered_io = ZLOOP_DEF_BUFFERED_IO;
@@ -1302,6 +1545,13 @@ static int zloop_parse_options(struct zloop_options *opts, const char *buf)
}
opts->nr_conv_zones = token;
break;
case ZLOOP_OPT_MAX_OPEN_ZONES:
if (match_uint(args, &token)) {
ret = -EINVAL;
goto out;
}
opts->max_open_zones = token;
break;
case ZLOOP_OPT_BASE_DIR:
p = match_strdup(args);
if (!p) {
@@ -1353,6 +1603,9 @@ static int zloop_parse_options(struct zloop_options *opts, const char *buf)
case ZLOOP_OPT_ORDERED_ZONE_APPEND:
opts->ordered_zone_append = true;
break;
case ZLOOP_OPT_DISCARD_WRITE_CACHE:
opts->discard_write_cache = true;
break;
case ZLOOP_OPT_ERR:
default:
pr_warn("unknown parameter or missing value '%s'\n", p);

View File

@@ -1373,6 +1373,14 @@ static CLOSURE_CALLBACK(cached_dev_free)
mutex_unlock(&bch_register_lock);
/*
* Wait for any pending sb_write to complete before free.
* The sb_bio is embedded in struct cached_dev, so we must
* ensure no I/O is in progress.
*/
down(&dc->sb_write_mutex);
up(&dc->sb_write_mutex);
if (dc->sb_disk)
folio_put(virt_to_folio(dc->sb_disk));

View File

@@ -208,6 +208,20 @@ enum llbitmap_state {
BitNeedSync,
/* data is synchronizing */
BitSyncing,
/*
* Proactive sync requested for unwritten region (raid456 only).
* Triggered via sysfs when user wants to pre-build XOR parity
* for regions that have never been written.
*/
BitNeedSyncUnwritten,
/* Proactive sync in progress for unwritten region */
BitSyncingUnwritten,
/*
* XOR parity has been pre-built for a region that has never had
* user data written. When user writes to this region, it transitions
* to BitDirty.
*/
BitCleanUnwritten,
BitStateCount,
BitNone = 0xff,
};
@@ -232,6 +246,12 @@ enum llbitmap_action {
* BitNeedSync.
*/
BitmapActionStale,
/*
* Proactive sync trigger for raid456 - builds XOR parity for
* Unwritten regions without requiring user data write first.
*/
BitmapActionProactiveSync,
BitmapActionClearUnwritten,
BitmapActionCount,
/* Init state is BitUnwritten */
BitmapActionInit,
@@ -304,6 +324,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
[BitmapActionDaemon] = BitNone,
[BitmapActionDiscard] = BitNone,
[BitmapActionStale] = BitNone,
[BitmapActionProactiveSync] = BitNeedSyncUnwritten,
[BitmapActionClearUnwritten] = BitNone,
},
[BitClean] = {
[BitmapActionStartwrite] = BitDirty,
@@ -314,6 +336,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
[BitmapActionDaemon] = BitNone,
[BitmapActionDiscard] = BitUnwritten,
[BitmapActionStale] = BitNeedSync,
[BitmapActionProactiveSync] = BitNone,
[BitmapActionClearUnwritten] = BitNone,
},
[BitDirty] = {
[BitmapActionStartwrite] = BitNone,
@@ -324,6 +348,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
[BitmapActionDaemon] = BitClean,
[BitmapActionDiscard] = BitUnwritten,
[BitmapActionStale] = BitNeedSync,
[BitmapActionProactiveSync] = BitNone,
[BitmapActionClearUnwritten] = BitNone,
},
[BitNeedSync] = {
[BitmapActionStartwrite] = BitNone,
@@ -334,6 +360,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
[BitmapActionDaemon] = BitNone,
[BitmapActionDiscard] = BitUnwritten,
[BitmapActionStale] = BitNone,
[BitmapActionProactiveSync] = BitNone,
[BitmapActionClearUnwritten] = BitNone,
},
[BitSyncing] = {
[BitmapActionStartwrite] = BitNone,
@@ -344,6 +372,44 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
[BitmapActionDaemon] = BitNone,
[BitmapActionDiscard] = BitUnwritten,
[BitmapActionStale] = BitNeedSync,
[BitmapActionProactiveSync] = BitNone,
[BitmapActionClearUnwritten] = BitNone,
},
[BitNeedSyncUnwritten] = {
[BitmapActionStartwrite] = BitNeedSync,
[BitmapActionStartsync] = BitSyncingUnwritten,
[BitmapActionEndsync] = BitNone,
[BitmapActionAbortsync] = BitUnwritten,
[BitmapActionReload] = BitUnwritten,
[BitmapActionDaemon] = BitNone,
[BitmapActionDiscard] = BitUnwritten,
[BitmapActionStale] = BitUnwritten,
[BitmapActionProactiveSync] = BitNone,
[BitmapActionClearUnwritten] = BitUnwritten,
},
[BitSyncingUnwritten] = {
[BitmapActionStartwrite] = BitSyncing,
[BitmapActionStartsync] = BitSyncingUnwritten,
[BitmapActionEndsync] = BitCleanUnwritten,
[BitmapActionAbortsync] = BitUnwritten,
[BitmapActionReload] = BitUnwritten,
[BitmapActionDaemon] = BitNone,
[BitmapActionDiscard] = BitUnwritten,
[BitmapActionStale] = BitUnwritten,
[BitmapActionProactiveSync] = BitNone,
[BitmapActionClearUnwritten] = BitUnwritten,
},
[BitCleanUnwritten] = {
[BitmapActionStartwrite] = BitDirty,
[BitmapActionStartsync] = BitNone,
[BitmapActionEndsync] = BitNone,
[BitmapActionAbortsync] = BitNone,
[BitmapActionReload] = BitNone,
[BitmapActionDaemon] = BitNone,
[BitmapActionDiscard] = BitUnwritten,
[BitmapActionStale] = BitUnwritten,
[BitmapActionProactiveSync] = BitNone,
[BitmapActionClearUnwritten] = BitUnwritten,
},
};
@@ -376,6 +442,7 @@ static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap,
pctl->state[pos] = level_456 ? BitNeedSync : BitDirty;
break;
case BitClean:
case BitCleanUnwritten:
pctl->state[pos] = BitDirty;
break;
}
@@ -383,7 +450,7 @@ static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap,
}
static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx,
int offset)
int offset, bool infect)
{
struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
unsigned int io_size = llbitmap->io_size;
@@ -398,7 +465,7 @@ static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx,
* resync all the dirty bits, hence skip infect new dirty bits to
* prevent resync unnecessary data.
*/
if (llbitmap->mddev->degraded) {
if (llbitmap->mddev->degraded || !infect) {
set_bit(block, pctl->dirty);
return;
}
@@ -438,7 +505,9 @@ static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state,
llbitmap->pctl[idx]->state[bit] = state;
if (state == BitDirty || state == BitNeedSync)
llbitmap_set_page_dirty(llbitmap, idx, bit);
llbitmap_set_page_dirty(llbitmap, idx, bit, true);
else if (state == BitNeedSyncUnwritten)
llbitmap_set_page_dirty(llbitmap, idx, bit, false);
}
static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx)
@@ -459,7 +528,8 @@ static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx)
rdev_for_each(rdev, mddev) {
sector_t sector;
if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags) ||
!test_bit(In_sync, &rdev->flags))
continue;
sector = mddev->bitmap_info.offset +
@@ -584,13 +654,73 @@ static int llbitmap_cache_pages(struct llbitmap *llbitmap)
return 0;
}
/*
* Check if all underlying disks support write_zeroes with unmap.
*/
/*
 * Check if all underlying disks support write_zeroes with unmap.
 * Faulty members and spares (raid_disk < 0) are ignored.
 */
static bool llbitmap_all_disks_support_wzeroes_unmap(struct llbitmap *llbitmap)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, llbitmap->mddev) {
		if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
			continue;

		if (!bdev_write_zeroes_unmap_sectors(rdev->bdev))
			return false;
	}

	return true;
}
/*
* Issue write_zeroes to all underlying disks to zero their data regions.
* This ensures parity consistency for RAID-456 (0 XOR 0 = 0).
* Returns true if all disks were successfully zeroed.
*/
/*
 * Issue write_zeroes to all underlying disks to zero their data regions.
 * This ensures parity consistency for RAID-456 (0 XOR 0 = 0).
 * Returns true if all disks were successfully zeroed.
 */
static bool llbitmap_zero_all_disks(struct llbitmap *llbitmap)
{
	sector_t nr_sectors = llbitmap->mddev->dev_sectors;
	struct md_rdev *rdev;

	rdev_for_each(rdev, llbitmap->mddev) {
		int err;

		if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
			continue;

		err = blkdev_issue_zeroout(rdev->bdev, rdev->data_offset,
					   nr_sectors, GFP_KERNEL, 0);
		if (err) {
			pr_warn("md/llbitmap: failed to zero disk %pg: %d\n",
				rdev->bdev, err);
			return false;
		}
	}

	return true;
}
static void llbitmap_init_state(struct llbitmap *llbitmap)
{
struct mddev *mddev = llbitmap->mddev;
enum llbitmap_state state = BitUnwritten;
unsigned long i;
if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags))
if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags)) {
state = BitClean;
} else if (raid_is_456(mddev) &&
llbitmap_all_disks_support_wzeroes_unmap(llbitmap)) {
/*
* All disks support write_zeroes with unmap. Zero all disks
* to ensure parity consistency, then set BitCleanUnwritten
* to skip initial sync.
*/
if (llbitmap_zero_all_disks(llbitmap))
state = BitCleanUnwritten;
}
for (i = 0; i < llbitmap->chunks; i++)
llbitmap_write(llbitmap, state, i);
@@ -626,11 +756,10 @@ static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap,
goto write_bitmap;
}
if (c == BitNeedSync)
if (c == BitNeedSync || c == BitNeedSyncUnwritten)
need_resync = !mddev->degraded;
state = state_machine[c][action];
write_bitmap:
if (unlikely(mddev->degraded)) {
/* For degraded array, mark new data as need sync. */
@@ -657,8 +786,7 @@ write_bitmap:
}
llbitmap_write(llbitmap, state, start);
if (state == BitNeedSync)
if (state == BitNeedSync || state == BitNeedSyncUnwritten)
need_resync = !mddev->degraded;
else if (state == BitDirty &&
!timer_pending(&llbitmap->pending_timer))
@@ -1069,12 +1197,12 @@ static void llbitmap_start_write(struct mddev *mddev, sector_t offset,
int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite);
while (page_start <= page_end) {
llbitmap_raise_barrier(llbitmap, page_start);
page_start++;
}
llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite);
}
static void llbitmap_end_write(struct mddev *mddev, sector_t offset,
@@ -1101,12 +1229,12 @@ static void llbitmap_start_discard(struct mddev *mddev, sector_t offset,
int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard);
while (page_start <= page_end) {
llbitmap_raise_barrier(llbitmap, page_start);
page_start++;
}
llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard);
}
static void llbitmap_end_discard(struct mddev *mddev, sector_t offset,
@@ -1228,7 +1356,7 @@ static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset)
unsigned long p = offset >> llbitmap->chunkshift;
enum llbitmap_state c = llbitmap_read(llbitmap, p);
return c == BitClean || c == BitDirty;
return c == BitClean || c == BitDirty || c == BitCleanUnwritten;
}
static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset)
@@ -1242,6 +1370,10 @@ static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset)
if (c == BitUnwritten)
return blocks;
/* Skip CleanUnwritten - no user data, will be reset after recovery */
if (c == BitCleanUnwritten)
return blocks;
/* For degraded array, don't skip */
if (mddev->degraded)
return 0;
@@ -1260,14 +1392,25 @@ static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset,
{
struct llbitmap *llbitmap = mddev->bitmap;
unsigned long p = offset >> llbitmap->chunkshift;
enum llbitmap_state state;
/*
* Before recovery starts, convert CleanUnwritten to Unwritten.
* This ensures the new disk won't have stale parity data.
*/
if (offset == 0 && test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
!test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery))
llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
BitmapActionClearUnwritten);
/*
* Handle one bit at a time, this is much simpler. And it doesn't matter
* if md_do_sync() loop more times.
*/
*blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
return llbitmap_state_machine(llbitmap, p, p,
BitmapActionStartsync) == BitSyncing;
state = llbitmap_state_machine(llbitmap, p, p, BitmapActionStartsync);
return state == BitSyncing || state == BitSyncingUnwritten;
}
/* Something is wrong, sync_thread stop at @offset */
@@ -1473,9 +1616,15 @@ static ssize_t bits_show(struct mddev *mddev, char *page)
}
mutex_unlock(&mddev->bitmap_info.mutex);
return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n",
return sprintf(page,
"unwritten %d\nclean %d\ndirty %d\n"
"need sync %d\nsyncing %d\n"
"need sync unwritten %d\nsyncing unwritten %d\n"
"clean unwritten %d\n",
bits[BitUnwritten], bits[BitClean], bits[BitDirty],
bits[BitNeedSync], bits[BitSyncing]);
bits[BitNeedSync], bits[BitSyncing],
bits[BitNeedSyncUnwritten], bits[BitSyncingUnwritten],
bits[BitCleanUnwritten]);
}
static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits);
@@ -1548,11 +1697,39 @@ barrier_idle_store(struct mddev *mddev, const char *buf, size_t len)
static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle);
/*
 * Sysfs write handler: request proactive parity building for all Unwritten
 * regions of a RAID-456 array. Any write to the attribute triggers it.
 */
static ssize_t
proactive_sync_store(struct mddev *mddev, const char *buf, size_t len)
{
	struct llbitmap *llbitmap;
	ssize_t ret = len;

	/* Only for RAID-456 */
	if (!raid_is_456(mddev))
		return -EINVAL;

	mutex_lock(&mddev->bitmap_info.mutex);

	llbitmap = mddev->bitmap;
	if (!llbitmap || !llbitmap->pctl) {
		ret = -ENODEV;
	} else {
		/* Trigger proactive sync on all Unwritten regions */
		llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
				       BitmapActionProactiveSync);
	}

	mutex_unlock(&mddev->bitmap_info.mutex);

	return ret;
}
static struct md_sysfs_entry llbitmap_proactive_sync =
__ATTR(proactive_sync, 0200, NULL, proactive_sync_store);
static struct attribute *md_llbitmap_attrs[] = {
&llbitmap_bits.attr,
&llbitmap_metadata.attr,
&llbitmap_daemon_sleep.attr,
&llbitmap_barrier_idle.attr,
&llbitmap_proactive_sync.attr,
NULL
};

View File

@@ -84,7 +84,6 @@ static DEFINE_XARRAY(md_submodule);
static const struct kobj_type md_ktype;
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
/*
* This workqueue is used for sync_work to register new sync_thread, and for
@@ -98,7 +97,7 @@ static struct workqueue_struct *md_misc_wq;
static int remove_and_add_spares(struct mddev *mddev,
struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);
static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
static void export_rdev(struct md_rdev *rdev);
static void md_wakeup_thread_directly(struct md_thread __rcu **thread);
/*
@@ -188,7 +187,6 @@ static int rdev_init_serial(struct md_rdev *rdev)
spin_lock_init(&serial_tmp->serial_lock);
serial_tmp->serial_rb = RB_ROOT_CACHED;
init_waitqueue_head(&serial_tmp->serial_io_wait);
}
rdev->serial = serial;
@@ -489,6 +487,17 @@ int mddev_suspend(struct mddev *mddev, bool interruptible)
}
percpu_ref_kill(&mddev->active_io);
/*
* RAID456 IO can sleep in wait_for_reshape while still holding an
* active_io reference. If reshape is already interrupted or frozen,
* wake those waiters so they can abort and drop the reference instead
* of deadlocking suspend.
*/
if (mddev->pers && mddev->pers->prepare_suspend &&
reshape_interrupted(mddev))
mddev->pers->prepare_suspend(mddev);
if (interruptible)
err = wait_event_interruptible(mddev->sb_wait,
percpu_ref_is_zero(&mddev->active_io));
@@ -959,7 +968,7 @@ void mddev_unlock(struct mddev *mddev)
list_for_each_entry_safe(rdev, tmp, &delete, same_set) {
list_del_init(&rdev->same_set);
kobject_del(&rdev->kobj);
export_rdev(rdev, mddev);
export_rdev(rdev);
}
if (!legacy_async_del_gendisk) {
@@ -2632,7 +2641,7 @@ void md_autodetect_dev(dev_t dev);
/* just for claiming the bdev */
static struct md_rdev claim_rdev;
static void export_rdev(struct md_rdev *rdev, struct mddev *mddev)
static void export_rdev(struct md_rdev *rdev)
{
pr_debug("md: export_rdev(%pg)\n", rdev->bdev);
md_rdev_clear(rdev);
@@ -2788,7 +2797,9 @@ void md_update_sb(struct mddev *mddev, int force_change)
if (!md_is_rdwr(mddev)) {
if (force_change)
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
pr_err("%s: can't update sb for read-only array %s\n", __func__, mdname(mddev));
if (!mddev_is_dm(mddev))
pr_err_ratelimited("%s: can't update sb for read-only array %s\n",
__func__, mdname(mddev));
return;
}
@@ -4848,7 +4859,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
err = bind_rdev_to_array(rdev, mddev);
out:
if (err)
export_rdev(rdev, mddev);
export_rdev(rdev);
mddev_unlock_and_resume(mddev);
if (!err)
md_new_event();
@@ -6128,10 +6139,16 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
}
spin_unlock(&all_mddevs_lock);
rv = entry->store(mddev, page, length);
mddev_put(mddev);
/*
* For "array_state=clear", dropping the extra kobject reference from
* sysfs_break_active_protection() can trigger md kobject deletion.
* Restore active protection before mddev_put() so deletion happens
* after the sysfs write path fully unwinds.
*/
if (kn)
sysfs_unbreak_active_protection(kn);
mddev_put(mddev);
return rv;
}
@@ -6447,15 +6464,124 @@ static void md_safemode_timeout(struct timer_list *t)
static int start_dirty_degraded;
/*
* Read bitmap superblock and return the bitmap_id based on disk version.
* This is used as fallback when default bitmap version and on-disk version
* doesn't match, and mdadm is not the latest version to set bitmap_type.
*/
static enum md_submodule_id md_bitmap_get_id_from_sb(struct mddev *mddev)
{
	struct md_rdev *rdev;
	struct page *sb_page;
	bitmap_super_t *sb;
	enum md_submodule_id id = ID_BITMAP_NONE;
	sector_t sector;
	u32 version;

	/* No internal bitmap superblock to read from. */
	if (!mddev->bitmap_info.offset)
		return ID_BITMAP_NONE;

	sb_page = alloc_page(GFP_KERNEL);
	if (!sb_page) {
		pr_warn("md: %s: failed to allocate memory for bitmap\n",
			mdname(mddev));
		return ID_BITMAP_NONE;
	}

	sector = mddev->bitmap_info.offset;
	/* Try each usable in-sync member until one read succeeds. */
	rdev_for_each(rdev, mddev) {
		u32 iosize;

		if (!test_bit(In_sync, &rdev->flags) ||
		    test_bit(Faulty, &rdev->flags) ||
		    test_bit(Bitmap_sync, &rdev->flags))
			continue;

		/* Round the read up to the device logical block size. */
		iosize = roundup(sizeof(bitmap_super_t),
				 bdev_logical_block_size(rdev->bdev));
		if (sync_page_io(rdev, sector, iosize, sb_page, REQ_OP_READ,
				 true))
			goto read_ok;
	}
	pr_warn("md: %s: failed to read bitmap from any device\n",
		mdname(mddev));
	goto out;

read_ok:
	sb = kmap_local_page(sb_page);
	if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) {
		pr_warn("md: %s: invalid bitmap magic 0x%x\n",
			mdname(mddev), le32_to_cpu(sb->magic));
		goto out_unmap;
	}

	/* Map the on-disk superblock version to a bitmap submodule id. */
	version = le32_to_cpu(sb->version);
	switch (version) {
	case BITMAP_MAJOR_LO:
	case BITMAP_MAJOR_HI:
	case BITMAP_MAJOR_CLUSTERED:
		id = ID_BITMAP;
		break;
	case BITMAP_MAJOR_LOCKLESS:
		id = ID_LLBITMAP;
		break;
	default:
		pr_warn("md: %s: unknown bitmap version %u\n",
			mdname(mddev), version);
		break;
	}

out_unmap:
	kunmap_local(sb);
out:
	__free_page(sb_page);
	return id;
}
/*
 * Create the bitmap for @mddev using the configured bitmap_id.
 *
 * If creation with the default bitmap version fails, fall back to the
 * version found in the on-disk bitmap superblock (covers the case where
 * mdadm is too old to set bitmap_type).
 *
 * Fix: the original contained a stray early
 * "return mddev->bitmap_ops->create(mddev);" before the error check,
 * which made the entire version-mismatch fallback unreachable.
 *
 * Returns 0 on success or a negative errno.
 */
static int md_bitmap_create(struct mddev *mddev)
{
	enum md_submodule_id orig_id = mddev->bitmap_id;
	enum md_submodule_id sb_id;
	int err;

	if (mddev->bitmap_id == ID_BITMAP_NONE)
		return -EINVAL;

	if (!mddev_set_bitmap_ops(mddev))
		return -ENOENT;

	err = mddev->bitmap_ops->create(mddev);
	if (!err)
		return 0;

	/*
	 * Create failed, if default bitmap version and on-disk version
	 * doesn't match, and mdadm is not the latest version to set
	 * bitmap_type, set bitmap_ops based on the disk version.
	 */
	mddev_clear_bitmap_ops(mddev);
	sb_id = md_bitmap_get_id_from_sb(mddev);
	if (sb_id == ID_BITMAP_NONE || sb_id == orig_id)
		return err;

	pr_info("md: %s: bitmap version mismatch, switching from %d to %d\n",
		mdname(mddev), orig_id, sb_id);

	mddev->bitmap_id = sb_id;
	if (!mddev_set_bitmap_ops(mddev)) {
		/* Restore the original id so later retries are consistent. */
		mddev->bitmap_id = orig_id;
		return -ENOENT;
	}

	err = mddev->bitmap_ops->create(mddev);
	if (err) {
		mddev_clear_bitmap_ops(mddev);
		mddev->bitmap_id = orig_id;
	}

	return err;
}
static void md_bitmap_destroy(struct mddev *mddev)
@@ -7140,7 +7266,7 @@ static void autorun_devices(int part)
rdev_for_each_list(rdev, tmp, &candidates) {
list_del_init(&rdev->same_set);
if (bind_rdev_to_array(rdev, mddev))
export_rdev(rdev, mddev);
export_rdev(rdev);
}
autorun_array(mddev);
mddev_unlock_and_resume(mddev);
@@ -7150,7 +7276,7 @@ static void autorun_devices(int part)
*/
rdev_for_each_list(rdev, tmp, &candidates) {
list_del_init(&rdev->same_set);
export_rdev(rdev, mddev);
export_rdev(rdev);
}
mddev_put(mddev);
}
@@ -7338,13 +7464,13 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
pr_warn("md: %pg has different UUID to %pg\n",
rdev->bdev,
rdev0->bdev);
export_rdev(rdev, mddev);
export_rdev(rdev);
return -EINVAL;
}
}
err = bind_rdev_to_array(rdev, mddev);
if (err)
export_rdev(rdev, mddev);
export_rdev(rdev);
return err;
}
@@ -7387,7 +7513,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
/* This was a hot-add request, but events doesn't
* match, so reject it.
*/
export_rdev(rdev, mddev);
export_rdev(rdev);
return -EINVAL;
}
@@ -7413,7 +7539,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
}
}
if (has_journal || mddev->bitmap) {
export_rdev(rdev, mddev);
export_rdev(rdev);
return -EBUSY;
}
set_bit(Journal, &rdev->flags);
@@ -7428,7 +7554,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
/* --add initiated by this node */
err = mddev->cluster_ops->add_new_disk(mddev, rdev);
if (err) {
export_rdev(rdev, mddev);
export_rdev(rdev);
return err;
}
}
@@ -7438,7 +7564,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
err = bind_rdev_to_array(rdev, mddev);
if (err)
export_rdev(rdev, mddev);
export_rdev(rdev);
if (mddev_is_clustered(mddev)) {
if (info->state & (1 << MD_DISK_CANDIDATE)) {
@@ -7501,7 +7627,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
err = bind_rdev_to_array(rdev, mddev);
if (err) {
export_rdev(rdev, mddev);
export_rdev(rdev);
return err;
}
}
@@ -7613,7 +7739,7 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
return 0;
abort_export:
export_rdev(rdev, mddev);
export_rdev(rdev);
return err;
}
@@ -10503,10 +10629,6 @@ static int __init md_init(void)
goto err_bitmap;
ret = -ENOMEM;
md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!md_wq)
goto err_wq;
md_misc_wq = alloc_workqueue("md_misc", WQ_PERCPU, 0);
if (!md_misc_wq)
goto err_misc_wq;
@@ -10531,8 +10653,6 @@ err_mdp:
err_md:
destroy_workqueue(md_misc_wq);
err_misc_wq:
destroy_workqueue(md_wq);
err_wq:
md_llbitmap_exit();
err_bitmap:
md_bitmap_exit();
@@ -10841,7 +10961,6 @@ static __exit void md_exit(void)
spin_unlock(&all_mddevs_lock);
destroy_workqueue(md_misc_wq);
destroy_workqueue(md_wq);
md_bitmap_exit();
}

View File

@@ -126,7 +126,6 @@ enum sync_action {
struct serial_in_rdev {
struct rb_root_cached serial_rb;
spinlock_t serial_lock;
wait_queue_head_t serial_io_wait;
};
/*
@@ -381,7 +380,11 @@ struct serial_info {
struct rb_node node;
sector_t start; /* start sector of rb node */
sector_t last; /* end sector of rb node */
sector_t wnode_start; /* address of waiting nodes on the same list */
sector_t _subtree_last; /* highest sector in subtree of rb node */
struct list_head list_node;
struct list_head waiters;
struct completion ready;
};
/*

View File

@@ -143,13 +143,13 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
}
err = -ENOMEM;
conf->strip_zone = kzalloc_objs(struct strip_zone, conf->nr_strip_zones);
conf->strip_zone = kvzalloc_objs(struct strip_zone, conf->nr_strip_zones);
if (!conf->strip_zone)
goto abort;
conf->devlist = kzalloc(array3_size(sizeof(struct md_rdev *),
conf->nr_strip_zones,
mddev->raid_disks),
GFP_KERNEL);
conf->devlist = kvzalloc(array3_size(sizeof(struct md_rdev *),
conf->nr_strip_zones,
mddev->raid_disks),
GFP_KERNEL);
if (!conf->devlist)
goto abort;
@@ -291,8 +291,8 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
return 0;
abort:
kfree(conf->strip_zone);
kfree(conf->devlist);
kvfree(conf->strip_zone);
kvfree(conf->devlist);
kfree(conf);
*private_conf = ERR_PTR(err);
return err;
@@ -373,8 +373,8 @@ static void raid0_free(struct mddev *mddev, void *priv)
{
struct r0conf *conf = priv;
kfree(conf->strip_zone);
kfree(conf->devlist);
kvfree(conf->strip_zone);
kvfree(conf->devlist);
kfree(conf);
}

View File

@@ -57,21 +57,29 @@ INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last,
START, LAST, static inline, raid1_rb);
static int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio,
struct serial_info *si, int idx)
struct serial_info *si)
{
unsigned long flags;
int ret = 0;
sector_t lo = r1_bio->sector;
sector_t hi = lo + r1_bio->sectors;
sector_t hi = lo + r1_bio->sectors - 1;
int idx = sector_to_idx(r1_bio->sector);
struct serial_in_rdev *serial = &rdev->serial[idx];
struct serial_info *head_si;
spin_lock_irqsave(&serial->serial_lock, flags);
/* collision happened */
if (raid1_rb_iter_first(&serial->serial_rb, lo, hi))
ret = -EBUSY;
else {
head_si = raid1_rb_iter_first(&serial->serial_rb, lo, hi);
if (head_si && head_si != si) {
si->start = lo;
si->last = hi;
si->wnode_start = head_si->wnode_start;
list_add_tail(&si->list_node, &head_si->waiters);
ret = -EBUSY;
} else if (!head_si) {
si->start = lo;
si->last = hi;
si->wnode_start = si->start;
raid1_rb_insert(si, &serial->serial_rb);
}
spin_unlock_irqrestore(&serial->serial_lock, flags);
@@ -83,19 +91,22 @@ static void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio)
{
struct mddev *mddev = rdev->mddev;
struct serial_info *si;
int idx = sector_to_idx(r1_bio->sector);
struct serial_in_rdev *serial = &rdev->serial[idx];
if (WARN_ON(!mddev->serial_info_pool))
return;
si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO);
wait_event(serial->serial_io_wait,
check_and_add_serial(rdev, r1_bio, si, idx) == 0);
INIT_LIST_HEAD(&si->waiters);
INIT_LIST_HEAD(&si->list_node);
init_completion(&si->ready);
while (check_and_add_serial(rdev, r1_bio, si)) {
wait_for_completion(&si->ready);
reinit_completion(&si->ready);
}
}
static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
{
struct serial_info *si;
struct serial_info *si, *iter_si;
unsigned long flags;
int found = 0;
struct mddev *mddev = rdev->mddev;
@@ -106,16 +117,28 @@ static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi);
si; si = raid1_rb_iter_next(si, lo, hi)) {
if (si->start == lo && si->last == hi) {
raid1_rb_remove(si, &serial->serial_rb);
mempool_free(si, mddev->serial_info_pool);
found = 1;
break;
}
}
if (!found)
if (found) {
raid1_rb_remove(si, &serial->serial_rb);
if (!list_empty(&si->waiters)) {
list_for_each_entry(iter_si, &si->waiters, list_node) {
if (iter_si->wnode_start == si->wnode_start) {
list_del_init(&iter_si->list_node);
list_splice_init(&si->waiters, &iter_si->waiters);
raid1_rb_insert(iter_si, &serial->serial_rb);
complete(&iter_si->ready);
break;
}
}
}
mempool_free(si, mddev->serial_info_pool);
} else {
WARN(1, "The write IO is not recorded for serialization\n");
}
spin_unlock_irqrestore(&serial->serial_lock, flags);
wake_up(&serial->serial_io_wait);
}
/*
@@ -452,7 +475,7 @@ static void raid1_end_write_request(struct bio *bio)
int mirror = find_bio_disk(r1_bio, bio);
struct md_rdev *rdev = conf->mirrors[mirror].rdev;
sector_t lo = r1_bio->sector;
sector_t hi = r1_bio->sector + r1_bio->sectors;
sector_t hi = r1_bio->sector + r1_bio->sectors - 1;
bool ignore_error = !raid1_should_handle_error(bio) ||
(bio->bi_status && bio_op(bio) == REQ_OP_DISCARD);
@@ -1878,7 +1901,7 @@ static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev, int disk,
if (info->rdev)
return false;
if (bdev_nonrot(rdev->bdev)) {
if (!bdev_rot(rdev->bdev)) {
set_bit(Nonrot, &rdev->flags);
WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + 1);
}

View File

@@ -806,7 +806,7 @@ static struct md_rdev *read_balance(struct r10conf *conf,
if (!do_balance)
break;
nonrot = bdev_nonrot(rdev->bdev);
nonrot = !bdev_rot(rdev->bdev);
has_nonrot_disk |= nonrot;
pending = atomic_read(&rdev->nr_pending);
if (min_pending > pending && nonrot) {
@@ -1184,7 +1184,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
}
if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors)) {
raid_end_bio_io(r10_bio);
free_r10bio(r10_bio);
return;
}
@@ -1372,7 +1372,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
sectors = r10_bio->sectors;
if (!regular_request_wait(mddev, conf, bio, sectors)) {
raid_end_bio_io(r10_bio);
free_r10bio(r10_bio);
return;
}

View File

@@ -2002,15 +2002,27 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
return -ENOMEM;
while (mb_offset < le32_to_cpu(mb->meta_size)) {
sector_t payload_len;
payload = (void *)mb + mb_offset;
payload_flush = (void *)mb + mb_offset;
if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
payload_len = sizeof(struct r5l_payload_data_parity) +
(sector_t)sizeof(__le32) *
(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
if (mb_offset + payload_len > le32_to_cpu(mb->meta_size))
goto mismatch;
if (r5l_recovery_verify_data_checksum(
log, ctx, page, log_offset,
payload->checksum[0]) < 0)
goto mismatch;
} else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) {
payload_len = sizeof(struct r5l_payload_data_parity) +
(sector_t)sizeof(__le32) *
(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
if (mb_offset + payload_len > le32_to_cpu(mb->meta_size))
goto mismatch;
if (r5l_recovery_verify_data_checksum(
log, ctx, page, log_offset,
payload->checksum[0]) < 0)
@@ -2023,22 +2035,18 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
payload->checksum[1]) < 0)
goto mismatch;
} else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
/* nothing to do for R5LOG_PAYLOAD_FLUSH here */
payload_len = sizeof(struct r5l_payload_flush) +
(sector_t)le32_to_cpu(payload_flush->size);
if (mb_offset + payload_len > le32_to_cpu(mb->meta_size))
goto mismatch;
} else /* not R5LOG_PAYLOAD_DATA/PARITY/FLUSH */
goto mismatch;
if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
mb_offset += sizeof(struct r5l_payload_flush) +
le32_to_cpu(payload_flush->size);
} else {
/* DATA or PARITY payload */
if (le16_to_cpu(payload->header.type) != R5LOG_PAYLOAD_FLUSH) {
log_offset = r5l_ring_add(log, log_offset,
le32_to_cpu(payload->size));
mb_offset += sizeof(struct r5l_payload_data_parity) +
sizeof(__le32) *
(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
}
mb_offset += payload_len;
}
put_page(page);
@@ -2089,6 +2097,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
while (mb_offset < le32_to_cpu(mb->meta_size)) {
sector_t payload_len;
int dd;
payload = (void *)mb + mb_offset;
@@ -2097,6 +2106,12 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
int i, count;
payload_len = sizeof(struct r5l_payload_flush) +
(sector_t)le32_to_cpu(payload_flush->size);
if (mb_offset + payload_len >
le32_to_cpu(mb->meta_size))
return -EINVAL;
count = le32_to_cpu(payload_flush->size) / sizeof(__le64);
for (i = 0; i < count; ++i) {
stripe_sect = le64_to_cpu(payload_flush->flush_stripes[i]);
@@ -2110,12 +2125,17 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
}
}
mb_offset += sizeof(struct r5l_payload_flush) +
le32_to_cpu(payload_flush->size);
mb_offset += payload_len;
continue;
}
/* DATA or PARITY payload */
payload_len = sizeof(struct r5l_payload_data_parity) +
(sector_t)sizeof(__le32) *
(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
if (mb_offset + payload_len > le32_to_cpu(mb->meta_size))
return -EINVAL;
stripe_sect = (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) ?
raid5_compute_sector(
conf, le64_to_cpu(payload->location), 0, &dd,
@@ -2180,9 +2200,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
log_offset = r5l_ring_add(log, log_offset,
le32_to_cpu(payload->size));
mb_offset += sizeof(struct r5l_payload_data_parity) +
sizeof(__le32) *
(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
mb_offset += payload_len;
}
return 0;

View File

@@ -3916,6 +3916,8 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
break;
}
BUG_ON(other < 0);
if (test_bit(R5_LOCKED, &sh->dev[other].flags))
return 0;
pr_debug("Computing stripe %llu blocks %d,%d\n",
(unsigned long long)sh->sector,
disk_idx, other);
@@ -4594,20 +4596,6 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
async_tx_quiesce(&tx);
}
/*
* handle_stripe - do things to a stripe.
*
* We lock the stripe by setting STRIPE_ACTIVE and then examine the
* state of various bits to see what needs to be done.
* Possible results:
* return some read requests which now have data
* return some write requests which are safely on storage
* schedule a read on some buffers
* schedule a write of some buffers
* return confirmation of parity correctness
*
*/
static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
{
struct r5conf *conf = sh->raid_conf;
@@ -4901,6 +4889,18 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
set_bit(STRIPE_HANDLE, &head_sh->state);
}
/*
* handle_stripe - do things to a stripe.
*
* We lock the stripe by setting STRIPE_ACTIVE and then examine the
* state of various bits to see what needs to be done.
* Possible results:
* return some read requests which now have data
* return some write requests which are safely on storage
* schedule a read on some buffers
* schedule a write of some buffers
* return confirmation of parity correctness
*/
static void handle_stripe(struct stripe_head *sh)
{
struct stripe_head_state s;
@@ -6641,7 +6641,13 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
}
if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
raid5_release_stripe(sh);
int hash;
spin_lock_irq(&conf->device_lock);
hash = sh->hash_lock_index;
__release_stripe(conf, sh,
&conf->temp_inactive_list[hash]);
spin_unlock_irq(&conf->device_lock);
conf->retry_read_aligned = raid_bio;
conf->retry_read_offset = scnt;
return handled;
@@ -7541,7 +7547,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
rdev_for_each(rdev, mddev) {
if (test_bit(Journal, &rdev->flags))
continue;
if (bdev_nonrot(rdev->bdev)) {
if (!bdev_rot(rdev->bdev)) {
conf->batch_bio_dispatch = false;
break;
}
@@ -7780,6 +7786,7 @@ static int raid5_set_limits(struct mddev *mddev)
lim.logical_block_size = mddev->logical_block_size;
lim.io_min = mddev->chunk_sectors << 9;
lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded);
lim.chunk_sectors = lim.io_opt >> 9;
lim.features |= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE;
lim.discard_granularity = stripe;
lim.max_write_zeroes_sectors = 0;

View File

@@ -801,7 +801,6 @@ raid5_get_dev_page(struct stripe_head *sh, int disk_idx)
}
#endif
void md_raid5_kick_device(struct r5conf *conf);
int raid5_set_cache_size(struct mddev *mddev, int size);
sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous);
void raid5_release_stripe(struct stripe_head *sh);

View File

@@ -0,0 +1,6 @@
CONFIG_KUNIT=y
CONFIG_PCI=y
CONFIG_BLOCK=y
CONFIG_BLK_DEV_NVME=y
CONFIG_NVME_HOST_AUTH=y
CONFIG_NVME_AUTH_KUNIT_TEST=y

View File

@@ -7,9 +7,15 @@ config NVME_KEYRING
config NVME_AUTH
tristate
select CRYPTO
select CRYPTO_HMAC
select CRYPTO_SHA256
select CRYPTO_SHA512
select CRYPTO_DH
select CRYPTO_DH_RFC7919_GROUPS
select CRYPTO_HKDF
select CRYPTO_LIB_SHA256
select CRYPTO_LIB_SHA512
config NVME_AUTH_KUNIT_TEST
tristate "KUnit tests for NVMe authentication" if !KUNIT_ALL_TESTS
depends on KUNIT && NVME_AUTH
default KUNIT_ALL_TESTS
help
Enable KUnit tests for some of the common code for NVMe over Fabrics
In-Band Authentication.

View File

@@ -7,3 +7,5 @@ obj-$(CONFIG_NVME_KEYRING) += nvme-keyring.o
nvme-auth-y += auth.o
nvme-keyring-y += keyring.o
obj-$(CONFIG_NVME_AUTH_KUNIT_TEST) += tests/auth_kunit.o

View File

@@ -9,14 +9,11 @@
#include <linux/prandom.h>
#include <linux/scatterlist.h>
#include <linux/unaligned.h>
#include <crypto/hash.h>
#include <crypto/dh.h>
#include <crypto/hkdf.h>
#include <crypto/sha2.h>
#include <linux/nvme.h>
#include <linux/nvme-auth.h>
#define HKDF_MAX_HASHLEN 64
static u32 nvme_dhchap_seqnum;
static DEFINE_MUTEX(nvme_dhchap_mutex);
@@ -38,9 +35,9 @@ u32 nvme_auth_get_seqnum(void)
}
EXPORT_SYMBOL_GPL(nvme_auth_get_seqnum);
static struct nvme_auth_dhgroup_map {
const char name[16];
const char kpp[16];
static const struct nvme_auth_dhgroup_map {
char name[16];
char kpp[16];
} dhgroup_map[] = {
[NVME_AUTH_DHGROUP_NULL] = {
.name = "null", .kpp = "null" },
@@ -89,25 +86,21 @@ u8 nvme_auth_dhgroup_id(const char *dhgroup_name)
}
EXPORT_SYMBOL_GPL(nvme_auth_dhgroup_id);
static struct nvme_dhchap_hash_map {
static const struct nvme_dhchap_hash_map {
int len;
const char hmac[15];
const char digest[8];
char hmac[15];
} hash_map[] = {
[NVME_AUTH_HASH_SHA256] = {
.len = 32,
.hmac = "hmac(sha256)",
.digest = "sha256",
},
[NVME_AUTH_HASH_SHA384] = {
.len = 48,
.hmac = "hmac(sha384)",
.digest = "sha384",
},
[NVME_AUTH_HASH_SHA512] = {
.len = 64,
.hmac = "hmac(sha512)",
.digest = "sha512",
},
};
@@ -119,14 +112,6 @@ const char *nvme_auth_hmac_name(u8 hmac_id)
}
EXPORT_SYMBOL_GPL(nvme_auth_hmac_name);
const char *nvme_auth_digest_name(u8 hmac_id)
{
if (hmac_id >= ARRAY_SIZE(hash_map))
return NULL;
return hash_map[hmac_id].digest;
}
EXPORT_SYMBOL_GPL(nvme_auth_digest_name);
u8 nvme_auth_hmac_id(const char *hmac_name)
{
int i;
@@ -161,11 +146,10 @@ u32 nvme_auth_key_struct_size(u32 key_len)
}
EXPORT_SYMBOL_GPL(nvme_auth_key_struct_size);
struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
u8 key_hash)
struct nvme_dhchap_key *nvme_auth_extract_key(const char *secret, u8 key_hash)
{
struct nvme_dhchap_key *key;
unsigned char *p;
const char *p;
u32 crc;
int ret, key_len;
size_t allocated_len = strlen(secret);
@@ -183,14 +167,14 @@ struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
pr_debug("base64 key decoding error %d\n",
key_len);
ret = key_len;
goto out_free_secret;
goto out_free_key;
}
if (key_len != 36 && key_len != 52 &&
key_len != 68) {
pr_err("Invalid key len %d\n", key_len);
ret = -EINVAL;
goto out_free_secret;
goto out_free_key;
}
/* The last four bytes is the CRC in little-endian format */
@@ -205,12 +189,12 @@ struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
pr_err("key crc mismatch (key %08x, crc %08x)\n",
get_unaligned_le32(key->key + key_len), crc);
ret = -EKEYREJECTED;
goto out_free_secret;
goto out_free_key;
}
key->len = key_len;
key->hash = key_hash;
return key;
out_free_secret:
out_free_key:
nvme_auth_free_key(key);
return ERR_PTR(ret);
}
@@ -237,12 +221,106 @@ void nvme_auth_free_key(struct nvme_dhchap_key *key)
}
EXPORT_SYMBOL_GPL(nvme_auth_free_key);
struct nvme_dhchap_key *nvme_auth_transform_key(
struct nvme_dhchap_key *key, char *nqn)
/*
* Start computing an HMAC value, given the algorithm ID and raw key.
*
* The context should be zeroized at the end of its lifetime. The caller can do
* that implicitly by calling nvme_auth_hmac_final(), or explicitly (needed when
* a context is abandoned without finalizing it) by calling memzero_explicit().
*/
int nvme_auth_hmac_init(struct nvme_auth_hmac_ctx *hmac, u8 hmac_id,
const u8 *key, size_t key_len)
{
const char *hmac_name;
struct crypto_shash *key_tfm;
SHASH_DESC_ON_STACK(shash, key_tfm);
hmac->hmac_id = hmac_id;
switch (hmac_id) {
case NVME_AUTH_HASH_SHA256:
hmac_sha256_init_usingrawkey(&hmac->sha256, key, key_len);
return 0;
case NVME_AUTH_HASH_SHA384:
hmac_sha384_init_usingrawkey(&hmac->sha384, key, key_len);
return 0;
case NVME_AUTH_HASH_SHA512:
hmac_sha512_init_usingrawkey(&hmac->sha512, key, key_len);
return 0;
}
pr_warn("%s: invalid hash algorithm %d\n", __func__, hmac_id);
return -EINVAL;
}
EXPORT_SYMBOL_GPL(nvme_auth_hmac_init);
void nvme_auth_hmac_update(struct nvme_auth_hmac_ctx *hmac, const u8 *data,
size_t data_len)
{
switch (hmac->hmac_id) {
case NVME_AUTH_HASH_SHA256:
hmac_sha256_update(&hmac->sha256, data, data_len);
return;
case NVME_AUTH_HASH_SHA384:
hmac_sha384_update(&hmac->sha384, data, data_len);
return;
case NVME_AUTH_HASH_SHA512:
hmac_sha512_update(&hmac->sha512, data, data_len);
return;
}
/* Unreachable because nvme_auth_hmac_init() validated hmac_id */
WARN_ON_ONCE(1);
}
EXPORT_SYMBOL_GPL(nvme_auth_hmac_update);
/* Finish computing an HMAC value. Note that this zeroizes the HMAC context. */
void nvme_auth_hmac_final(struct nvme_auth_hmac_ctx *hmac, u8 *out)
{
switch (hmac->hmac_id) {
case NVME_AUTH_HASH_SHA256:
hmac_sha256_final(&hmac->sha256, out);
return;
case NVME_AUTH_HASH_SHA384:
hmac_sha384_final(&hmac->sha384, out);
return;
case NVME_AUTH_HASH_SHA512:
hmac_sha512_final(&hmac->sha512, out);
return;
}
/* Unreachable because nvme_auth_hmac_init() validated hmac_id */
WARN_ON_ONCE(1);
}
EXPORT_SYMBOL_GPL(nvme_auth_hmac_final);
static int nvme_auth_hmac(u8 hmac_id, const u8 *key, size_t key_len,
const u8 *data, size_t data_len, u8 *out)
{
struct nvme_auth_hmac_ctx hmac;
int ret;
ret = nvme_auth_hmac_init(&hmac, hmac_id, key, key_len);
if (ret == 0) {
nvme_auth_hmac_update(&hmac, data, data_len);
nvme_auth_hmac_final(&hmac, out);
}
return ret;
}
static int nvme_auth_hash(u8 hmac_id, const u8 *data, size_t data_len, u8 *out)
{
switch (hmac_id) {
case NVME_AUTH_HASH_SHA256:
sha256(data, data_len, out);
return 0;
case NVME_AUTH_HASH_SHA384:
sha384(data, data_len, out);
return 0;
case NVME_AUTH_HASH_SHA512:
sha512(data, data_len, out);
return 0;
}
pr_warn("%s: invalid hash algorithm %d\n", __func__, hmac_id);
return -EINVAL;
}
struct nvme_dhchap_key *nvme_auth_transform_key(
const struct nvme_dhchap_key *key, const char *nqn)
{
struct nvme_auth_hmac_ctx hmac;
struct nvme_dhchap_key *transformed_key;
int ret, key_len;
@@ -257,118 +335,33 @@ struct nvme_dhchap_key *nvme_auth_transform_key(
return ERR_PTR(-ENOMEM);
return transformed_key;
}
hmac_name = nvme_auth_hmac_name(key->hash);
if (!hmac_name) {
pr_warn("Invalid key hash id %d\n", key->hash);
return ERR_PTR(-EINVAL);
}
key_tfm = crypto_alloc_shash(hmac_name, 0, 0);
if (IS_ERR(key_tfm))
return ERR_CAST(key_tfm);
key_len = crypto_shash_digestsize(key_tfm);
ret = nvme_auth_hmac_init(&hmac, key->hash, key->key, key->len);
if (ret)
return ERR_PTR(ret);
key_len = nvme_auth_hmac_hash_len(key->hash);
transformed_key = nvme_auth_alloc_key(key_len, key->hash);
if (!transformed_key) {
ret = -ENOMEM;
goto out_free_key;
memzero_explicit(&hmac, sizeof(hmac));
return ERR_PTR(-ENOMEM);
}
shash->tfm = key_tfm;
ret = crypto_shash_setkey(key_tfm, key->key, key->len);
if (ret < 0)
goto out_free_transformed_key;
ret = crypto_shash_init(shash);
if (ret < 0)
goto out_free_transformed_key;
ret = crypto_shash_update(shash, nqn, strlen(nqn));
if (ret < 0)
goto out_free_transformed_key;
ret = crypto_shash_update(shash, "NVMe-over-Fabrics", 17);
if (ret < 0)
goto out_free_transformed_key;
ret = crypto_shash_final(shash, transformed_key->key);
if (ret < 0)
goto out_free_transformed_key;
crypto_free_shash(key_tfm);
nvme_auth_hmac_update(&hmac, nqn, strlen(nqn));
nvme_auth_hmac_update(&hmac, "NVMe-over-Fabrics", 17);
nvme_auth_hmac_final(&hmac, transformed_key->key);
return transformed_key;
out_free_transformed_key:
nvme_auth_free_key(transformed_key);
out_free_key:
crypto_free_shash(key_tfm);
return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(nvme_auth_transform_key);
static int nvme_auth_hash_skey(int hmac_id, u8 *skey, size_t skey_len, u8 *hkey)
int nvme_auth_augmented_challenge(u8 hmac_id, const u8 *skey, size_t skey_len,
const u8 *challenge, u8 *aug, size_t hlen)
{
const char *digest_name;
struct crypto_shash *tfm;
u8 hashed_key[NVME_AUTH_MAX_DIGEST_SIZE];
int ret;
digest_name = nvme_auth_digest_name(hmac_id);
if (!digest_name) {
pr_debug("%s: failed to get digest for %d\n", __func__,
hmac_id);
return -EINVAL;
}
tfm = crypto_alloc_shash(digest_name, 0, 0);
if (IS_ERR(tfm))
return -ENOMEM;
ret = crypto_shash_tfm_digest(tfm, skey, skey_len, hkey);
if (ret < 0)
pr_debug("%s: Failed to hash digest len %zu\n", __func__,
skey_len);
crypto_free_shash(tfm);
return ret;
}
int nvme_auth_augmented_challenge(u8 hmac_id, u8 *skey, size_t skey_len,
u8 *challenge, u8 *aug, size_t hlen)
{
struct crypto_shash *tfm;
u8 *hashed_key;
const char *hmac_name;
int ret;
hashed_key = kmalloc(hlen, GFP_KERNEL);
if (!hashed_key)
return -ENOMEM;
ret = nvme_auth_hash_skey(hmac_id, skey,
skey_len, hashed_key);
if (ret < 0)
goto out_free_key;
hmac_name = nvme_auth_hmac_name(hmac_id);
if (!hmac_name) {
pr_warn("%s: invalid hash algorithm %d\n",
__func__, hmac_id);
ret = -EINVAL;
goto out_free_key;
}
tfm = crypto_alloc_shash(hmac_name, 0, 0);
if (IS_ERR(tfm)) {
ret = PTR_ERR(tfm);
goto out_free_key;
}
ret = crypto_shash_setkey(tfm, hashed_key, hlen);
ret = nvme_auth_hash(hmac_id, skey, skey_len, hashed_key);
if (ret)
goto out_free_hash;
ret = crypto_shash_tfm_digest(tfm, challenge, hlen, aug);
out_free_hash:
crypto_free_shash(tfm);
out_free_key:
kfree_sensitive(hashed_key);
return ret;
ret = nvme_auth_hmac(hmac_id, hashed_key, hlen, challenge, hlen, aug);
memzero_explicit(hashed_key, sizeof(hashed_key));
return ret;
}
EXPORT_SYMBOL_GPL(nvme_auth_augmented_challenge);
@@ -411,7 +404,7 @@ int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm,
EXPORT_SYMBOL_GPL(nvme_auth_gen_pubkey);
int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm,
u8 *ctrl_key, size_t ctrl_key_len,
const u8 *ctrl_key, size_t ctrl_key_len,
u8 *sess_key, size_t sess_key_len)
{
struct kpp_request *req;
@@ -438,7 +431,7 @@ int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm,
}
EXPORT_SYMBOL_GPL(nvme_auth_gen_shared_secret);
int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key)
int nvme_auth_parse_key(const char *secret, struct nvme_dhchap_key **ret_key)
{
struct nvme_dhchap_key *key;
u8 key_hash;
@@ -461,7 +454,7 @@ int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key)
*ret_key = key;
return 0;
}
EXPORT_SYMBOL_GPL(nvme_auth_generate_key);
EXPORT_SYMBOL_GPL(nvme_auth_parse_key);
/**
* nvme_auth_generate_psk - Generate a PSK for TLS
@@ -486,66 +479,32 @@ EXPORT_SYMBOL_GPL(nvme_auth_generate_key);
* Returns 0 on success with a valid generated PSK pointer in @ret_psk and
* the length of @ret_psk in @ret_len, or a negative error number otherwise.
*/
int nvme_auth_generate_psk(u8 hmac_id, u8 *skey, size_t skey_len,
u8 *c1, u8 *c2, size_t hash_len, u8 **ret_psk, size_t *ret_len)
int nvme_auth_generate_psk(u8 hmac_id, const u8 *skey, size_t skey_len,
const u8 *c1, const u8 *c2, size_t hash_len,
u8 **ret_psk, size_t *ret_len)
{
struct crypto_shash *tfm;
SHASH_DESC_ON_STACK(shash, tfm);
size_t psk_len = nvme_auth_hmac_hash_len(hmac_id);
struct nvme_auth_hmac_ctx hmac;
u8 *psk;
const char *hmac_name;
int ret, psk_len;
int ret;
if (!c1 || !c2)
return -EINVAL;
hmac_name = nvme_auth_hmac_name(hmac_id);
if (!hmac_name) {
pr_warn("%s: invalid hash algorithm %d\n",
__func__, hmac_id);
return -EINVAL;
}
tfm = crypto_alloc_shash(hmac_name, 0, 0);
if (IS_ERR(tfm))
return PTR_ERR(tfm);
psk_len = crypto_shash_digestsize(tfm);
ret = nvme_auth_hmac_init(&hmac, hmac_id, skey, skey_len);
if (ret)
return ret;
psk = kzalloc(psk_len, GFP_KERNEL);
if (!psk) {
ret = -ENOMEM;
goto out_free_tfm;
memzero_explicit(&hmac, sizeof(hmac));
return -ENOMEM;
}
shash->tfm = tfm;
ret = crypto_shash_setkey(tfm, skey, skey_len);
if (ret)
goto out_free_psk;
ret = crypto_shash_init(shash);
if (ret)
goto out_free_psk;
ret = crypto_shash_update(shash, c1, hash_len);
if (ret)
goto out_free_psk;
ret = crypto_shash_update(shash, c2, hash_len);
if (ret)
goto out_free_psk;
ret = crypto_shash_final(shash, psk);
if (!ret) {
*ret_psk = psk;
*ret_len = psk_len;
}
out_free_psk:
if (ret)
kfree_sensitive(psk);
out_free_tfm:
crypto_free_shash(tfm);
return ret;
nvme_auth_hmac_update(&hmac, c1, hash_len);
nvme_auth_hmac_update(&hmac, c2, hash_len);
nvme_auth_hmac_final(&hmac, psk);
*ret_psk = psk;
*ret_len = psk_len;
return 0;
}
EXPORT_SYMBOL_GPL(nvme_auth_generate_psk);
@@ -584,158 +543,70 @@ EXPORT_SYMBOL_GPL(nvme_auth_generate_psk);
* Returns 0 on success with a valid digest pointer in @ret_digest, or a
* negative error number on failure.
*/
int nvme_auth_generate_digest(u8 hmac_id, u8 *psk, size_t psk_len,
char *subsysnqn, char *hostnqn, u8 **ret_digest)
int nvme_auth_generate_digest(u8 hmac_id, const u8 *psk, size_t psk_len,
const char *subsysnqn, const char *hostnqn,
char **ret_digest)
{
struct crypto_shash *tfm;
SHASH_DESC_ON_STACK(shash, tfm);
u8 *digest, *enc;
const char *hmac_name;
size_t digest_len, hmac_len;
struct nvme_auth_hmac_ctx hmac;
u8 digest[NVME_AUTH_MAX_DIGEST_SIZE];
size_t hash_len = nvme_auth_hmac_hash_len(hmac_id);
char *enc;
size_t enc_len;
int ret;
if (WARN_ON(!subsysnqn || !hostnqn))
return -EINVAL;
hmac_name = nvme_auth_hmac_name(hmac_id);
if (!hmac_name) {
if (hash_len == 0) {
pr_warn("%s: invalid hash algorithm %d\n",
__func__, hmac_id);
return -EINVAL;
}
switch (nvme_auth_hmac_hash_len(hmac_id)) {
switch (hash_len) {
case 32:
hmac_len = 44;
enc_len = 44;
break;
case 48:
hmac_len = 64;
enc_len = 64;
break;
default:
pr_warn("%s: invalid hash algorithm '%s'\n",
__func__, hmac_name);
__func__, nvme_auth_hmac_name(hmac_id));
return -EINVAL;
}
enc = kzalloc(hmac_len + 1, GFP_KERNEL);
if (!enc)
return -ENOMEM;
tfm = crypto_alloc_shash(hmac_name, 0, 0);
if (IS_ERR(tfm)) {
ret = PTR_ERR(tfm);
goto out_free_enc;
}
digest_len = crypto_shash_digestsize(tfm);
digest = kzalloc(digest_len, GFP_KERNEL);
if (!digest) {
enc = kzalloc(enc_len + 1, GFP_KERNEL);
if (!enc) {
ret = -ENOMEM;
goto out_free_tfm;
goto out;
}
shash->tfm = tfm;
ret = crypto_shash_setkey(tfm, psk, psk_len);
ret = nvme_auth_hmac_init(&hmac, hmac_id, psk, psk_len);
if (ret)
goto out_free_digest;
goto out;
nvme_auth_hmac_update(&hmac, hostnqn, strlen(hostnqn));
nvme_auth_hmac_update(&hmac, " ", 1);
nvme_auth_hmac_update(&hmac, subsysnqn, strlen(subsysnqn));
nvme_auth_hmac_update(&hmac, " NVMe-over-Fabrics", 18);
nvme_auth_hmac_final(&hmac, digest);
ret = crypto_shash_init(shash);
if (ret)
goto out_free_digest;
ret = crypto_shash_update(shash, hostnqn, strlen(hostnqn));
if (ret)
goto out_free_digest;
ret = crypto_shash_update(shash, " ", 1);
if (ret)
goto out_free_digest;
ret = crypto_shash_update(shash, subsysnqn, strlen(subsysnqn));
if (ret)
goto out_free_digest;
ret = crypto_shash_update(shash, " NVMe-over-Fabrics", 18);
if (ret)
goto out_free_digest;
ret = crypto_shash_final(shash, digest);
if (ret)
goto out_free_digest;
ret = base64_encode(digest, digest_len, enc, true, BASE64_STD);
if (ret < hmac_len) {
ret = base64_encode(digest, hash_len, enc, true, BASE64_STD);
if (ret < enc_len) {
ret = -ENOKEY;
goto out_free_digest;
goto out;
}
*ret_digest = enc;
ret = 0;
out_free_digest:
kfree_sensitive(digest);
out_free_tfm:
crypto_free_shash(tfm);
out_free_enc:
out:
if (ret)
kfree_sensitive(enc);
memzero_explicit(digest, sizeof(digest));
return ret;
}
EXPORT_SYMBOL_GPL(nvme_auth_generate_digest);
/**
* hkdf_expand_label - HKDF-Expand-Label (RFC 8846 section 7.1)
* @hmac_tfm: hash context keyed with pseudorandom key
* @label: ASCII label without "tls13 " prefix
* @labellen: length of @label
* @context: context bytes
* @contextlen: length of @context
* @okm: output keying material
* @okmlen: length of @okm
*
* Build the TLS 1.3 HkdfLabel structure and invoke hkdf_expand().
*
* Returns 0 on success with output keying material stored in @okm,
* or a negative errno value otherwise.
*/
static int hkdf_expand_label(struct crypto_shash *hmac_tfm,
const u8 *label, unsigned int labellen,
const u8 *context, unsigned int contextlen,
u8 *okm, unsigned int okmlen)
{
int err;
u8 *info;
unsigned int infolen;
const char *tls13_prefix = "tls13 ";
unsigned int prefixlen = strlen(tls13_prefix);
if (WARN_ON(labellen > (255 - prefixlen)))
return -EINVAL;
if (WARN_ON(contextlen > 255))
return -EINVAL;
infolen = 2 + (1 + prefixlen + labellen) + (1 + contextlen);
info = kzalloc(infolen, GFP_KERNEL);
if (!info)
return -ENOMEM;
/* HkdfLabel.Length */
put_unaligned_be16(okmlen, info);
/* HkdfLabel.Label */
info[2] = prefixlen + labellen;
memcpy(info + 3, tls13_prefix, prefixlen);
memcpy(info + 3 + prefixlen, label, labellen);
/* HkdfLabel.Context */
info[3 + prefixlen + labellen] = contextlen;
memcpy(info + 4 + prefixlen + labellen, context, contextlen);
err = hkdf_expand(hmac_tfm, info, infolen, okm, okmlen);
kfree_sensitive(info);
return err;
}
/**
* nvme_auth_derive_tls_psk - Derive TLS PSK
* @hmac_id: Hash function identifier
@@ -763,82 +634,92 @@ static int hkdf_expand_label(struct crypto_shash *hmac_tfm,
* Returns 0 on success with a valid psk pointer in @ret_psk or a negative
* error number otherwise.
*/
int nvme_auth_derive_tls_psk(int hmac_id, u8 *psk, size_t psk_len,
u8 *psk_digest, u8 **ret_psk)
int nvme_auth_derive_tls_psk(int hmac_id, const u8 *psk, size_t psk_len,
const char *psk_digest, u8 **ret_psk)
{
struct crypto_shash *hmac_tfm;
const char *hmac_name;
const char *label = "nvme-tls-psk";
static const char default_salt[HKDF_MAX_HASHLEN];
size_t prk_len;
const char *ctx;
unsigned char *prk, *tls_key;
static const u8 default_salt[NVME_AUTH_MAX_DIGEST_SIZE];
static const char label[] = "tls13 nvme-tls-psk";
const size_t label_len = sizeof(label) - 1;
u8 prk[NVME_AUTH_MAX_DIGEST_SIZE];
size_t hash_len, ctx_len;
u8 *hmac_data = NULL, *tls_key;
size_t i;
int ret;
hmac_name = nvme_auth_hmac_name(hmac_id);
if (!hmac_name) {
hash_len = nvme_auth_hmac_hash_len(hmac_id);
if (hash_len == 0) {
pr_warn("%s: invalid hash algorithm %d\n",
__func__, hmac_id);
return -EINVAL;
}
if (hmac_id == NVME_AUTH_HASH_SHA512) {
pr_warn("%s: unsupported hash algorithm %s\n",
__func__, hmac_name);
__func__, nvme_auth_hmac_name(hmac_id));
return -EINVAL;
}
hmac_tfm = crypto_alloc_shash(hmac_name, 0, 0);
if (IS_ERR(hmac_tfm))
return PTR_ERR(hmac_tfm);
prk_len = crypto_shash_digestsize(hmac_tfm);
prk = kzalloc(prk_len, GFP_KERNEL);
if (!prk) {
ret = -ENOMEM;
goto out_free_shash;
if (psk_len != hash_len) {
pr_warn("%s: unexpected psk_len %zu\n", __func__, psk_len);
return -EINVAL;
}
if (WARN_ON(prk_len > HKDF_MAX_HASHLEN)) {
/* HKDF-Extract */
ret = nvme_auth_hmac(hmac_id, default_salt, hash_len, psk, psk_len,
prk);
if (ret)
goto out;
/*
* HKDF-Expand-Label (RFC 8446 section 7.1), with output length equal to
* the hash length (so only a single HMAC operation is needed)
*/
hmac_data = kmalloc(/* output length */ 2 +
/* label */ 1 + label_len +
/* context (max) */ 1 + 3 + 1 + strlen(psk_digest) +
/* counter */ 1,
GFP_KERNEL);
if (!hmac_data) {
ret = -ENOMEM;
goto out;
}
/* output length */
i = 0;
hmac_data[i++] = hash_len >> 8;
hmac_data[i++] = hash_len;
/* label */
static_assert(label_len <= 255);
hmac_data[i] = label_len;
memcpy(&hmac_data[i + 1], label, label_len);
i += 1 + label_len;
/* context */
ctx_len = sprintf(&hmac_data[i + 1], "%02d %s", hmac_id, psk_digest);
if (ctx_len > 255) {
ret = -EINVAL;
goto out_free_prk;
goto out;
}
ret = hkdf_extract(hmac_tfm, psk, psk_len,
default_salt, prk_len, prk);
if (ret)
goto out_free_prk;
hmac_data[i] = ctx_len;
i += 1 + ctx_len;
ret = crypto_shash_setkey(hmac_tfm, prk, prk_len);
if (ret)
goto out_free_prk;
ctx = kasprintf(GFP_KERNEL, "%02d %s", hmac_id, psk_digest);
if (!ctx) {
ret = -ENOMEM;
goto out_free_prk;
}
/* counter (this overwrites the NUL terminator written by sprintf) */
hmac_data[i++] = 1;
tls_key = kzalloc(psk_len, GFP_KERNEL);
if (!tls_key) {
ret = -ENOMEM;
goto out_free_ctx;
goto out;
}
ret = hkdf_expand_label(hmac_tfm,
label, strlen(label),
ctx, strlen(ctx),
tls_key, psk_len);
ret = nvme_auth_hmac(hmac_id, prk, hash_len, hmac_data, i, tls_key);
if (ret) {
kfree(tls_key);
goto out_free_ctx;
kfree_sensitive(tls_key);
goto out;
}
*ret_psk = tls_key;
out_free_ctx:
kfree(ctx);
out_free_prk:
kfree(prk);
out_free_shash:
crypto_free_shash(hmac_tfm);
out:
kfree_sensitive(hmac_data);
memzero_explicit(prk, sizeof(prk));
return ret;
}
EXPORT_SYMBOL_GPL(nvme_auth_derive_tls_psk);

View File

@@ -0,0 +1,175 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Unit tests for NVMe authentication functions
*
* Copyright 2026 Google LLC
*/
#include <crypto/sha2.h>
#include <kunit/test.h>
#include <linux/nvme.h>
#include <linux/nvme-auth.h>
#include <linux/slab.h>
/*
 * Expected outputs for one HMAC algorithm when deriving a TLS PSK from the
 * fixed inputs set up in test_nvme_auth_derive_tls_psk().
 */
struct nvme_auth_test_values {
	u8 hmac_id;		/* NVME_AUTH_HASH_* identifier under test */
	size_t hash_len;	/* digest length of that hash, in bytes */
	/* expected intermediate PSK from nvme_auth_generate_psk() */
	u8 expected_psk[NVME_AUTH_MAX_DIGEST_SIZE];
	/*
	 * Expected base64 digest string from nvme_auth_generate_digest(), or
	 * NULL if that function does not support this hash (the test then
	 * only checks for -EINVAL and stops early).
	 */
	char *expected_psk_digest;
	/* expected final TLS PSK from nvme_auth_derive_tls_psk() */
	u8 expected_tls_psk[NVME_AUTH_MAX_DIGEST_SIZE];
};
/* KUnit deferred-action callback: signature-compatible wrapper for kfree(). */
static void kfree_action(void *ptr)
{
	kfree(ptr);
}
/*
 * Arrange for ptr to be kfree()d when the test finishes (on any exit path).
 * Aborts the test if the action cannot be registered; in that case
 * kunit_add_action_or_reset() has already freed ptr itself.
 */
static void kunit_add_kfree_action(struct kunit *test, void *ptr)
{
	int err;

	err = kunit_add_action_or_reset(test, kfree_action, ptr);
	KUNIT_ASSERT_EQ(test, 0, err);
}
/*
 * Test the derivation of a TLS PSK from the initial skey. The vals parameter
 * gives the expected value of tls_psk as well as the intermediate values psk
 * and psk_digest. The inputs are implicitly the fixed values set below.
 *
 * Exercises the full chain:
 *   nvme_auth_generate_psk() -> nvme_auth_generate_digest() ->
 *   nvme_auth_derive_tls_psk()
 */
static void
test_nvme_auth_derive_tls_psk(struct kunit *test,
			      const struct nvme_auth_test_values *vals)
{
	const u8 hmac_id = vals->hmac_id;
	const size_t hash_len = vals->hash_len;
	/* The session key is one digest long for these tests. */
	const size_t skey_len = hash_len;
	u8 skey[NVME_AUTH_MAX_DIGEST_SIZE];
	u8 c1[NVME_AUTH_MAX_DIGEST_SIZE];
	u8 c2[NVME_AUTH_MAX_DIGEST_SIZE];
	const char *subsysnqn = "subsysnqn";
	const char *hostnqn = "hostnqn";
	u8 *psk = NULL, *tls_psk = NULL;
	char *psk_digest = NULL;
	size_t psk_len;
	int ret;

	/*
	 * Deterministic input patterns: ascending bytes from 'A' for the
	 * session key, ascending from 0 for challenge c1, descending from
	 * 0xff for challenge c2. These fix the expected outputs in vals.
	 */
	for (int i = 0; i < NVME_AUTH_MAX_DIGEST_SIZE; i++) {
		skey[i] = 'A' + i;
		c1[i] = i;
		c2[i] = 0xff - i;
	}

	/* Step 1: derive the intermediate PSK from skey and both challenges. */
	ret = nvme_auth_generate_psk(hmac_id, skey, skey_len, c1, c2, hash_len,
				     &psk, &psk_len);
	/*
	 * Register cleanup before the first assert so a failing assertion
	 * (which exits the test) does not leak the allocation.
	 */
	kunit_add_kfree_action(test, psk);
	KUNIT_ASSERT_EQ(test, 0, ret);
	KUNIT_ASSERT_EQ(test, hash_len, psk_len);
	KUNIT_ASSERT_MEMEQ(test, vals->expected_psk, psk, psk_len);

	/* Step 2: compute the PSK digest over the fixed NQN pair. */
	ret = nvme_auth_generate_digest(hmac_id, psk, psk_len, subsysnqn,
					hostnqn, &psk_digest);
	kunit_add_kfree_action(test, psk_digest);
	if (vals->expected_psk_digest == NULL) {
		/*
		 * Algorithm has an ID assigned but is not supported by
		 * nvme_auth_generate_digest().
		 */
		KUNIT_ASSERT_EQ(test, -EINVAL, ret);
		return;
	}
	KUNIT_ASSERT_EQ(test, 0, ret);
	KUNIT_ASSERT_STREQ(test, vals->expected_psk_digest, psk_digest);

	/* Step 3: derive the final TLS PSK from the PSK and its digest. */
	ret = nvme_auth_derive_tls_psk(hmac_id, psk, psk_len, psk_digest,
				       &tls_psk);
	kunit_add_kfree_action(test, tls_psk);
	KUNIT_ASSERT_EQ(test, 0, ret);
	KUNIT_ASSERT_MEMEQ(test, vals->expected_tls_psk, tls_psk, psk_len);
}
/* TLS PSK derivation with HMAC-SHA-256: full chain, known-answer vectors. */
static void test_nvme_auth_derive_tls_psk_hmac_sha256(struct kunit *test)
{
	static const struct nvme_auth_test_values vals = {
		.hmac_id = NVME_AUTH_HASH_SHA256,
		.hash_len = SHA256_DIGEST_SIZE,
		.expected_psk = {
			0x17, 0x33, 0xc5, 0x9f, 0xa7, 0xf4, 0x8f, 0xcf,
			0x37, 0xf5, 0xf2, 0x6f, 0xc4, 0xff, 0x02, 0x68,
			0xad, 0x4f, 0x78, 0xe0, 0x30, 0xf4, 0xf3, 0xb0,
			0xbf, 0xd1, 0xd4, 0x7e, 0x7b, 0xb1, 0x44, 0x7a,
		},
		.expected_psk_digest = "OldoKuTfKddMuyCznAZojkWD7P4D9/AtzDzLimtOxqI=",
		.expected_tls_psk = {
			0x3c, 0x17, 0xda, 0x62, 0x84, 0x74, 0xa0, 0x4d,
			0x22, 0x47, 0xc4, 0xca, 0xb4, 0x79, 0x68, 0xc9,
			0x15, 0x38, 0x81, 0x93, 0xf7, 0xc0, 0x71, 0xbd,
			0x94, 0x89, 0xcc, 0x36, 0x66, 0xcd, 0x7c, 0xc8,
		},
	};

	test_nvme_auth_derive_tls_psk(test, &vals);
}
/* TLS PSK derivation with HMAC-SHA-384: full chain, known-answer vectors. */
static void test_nvme_auth_derive_tls_psk_hmac_sha384(struct kunit *test)
{
	static const struct nvme_auth_test_values vals = {
		.hmac_id = NVME_AUTH_HASH_SHA384,
		.hash_len = SHA384_DIGEST_SIZE,
		.expected_psk = {
			0xf1, 0x4b, 0x2d, 0xd3, 0x23, 0x4c, 0x45, 0x96,
			0x94, 0xd3, 0xbc, 0x63, 0xf8, 0x96, 0x8b, 0xd6,
			0xb3, 0x7c, 0x2c, 0x6d, 0xe8, 0x49, 0xe2, 0x2e,
			0x11, 0x87, 0x49, 0x00, 0x1c, 0xe4, 0xbb, 0xe8,
			0x64, 0x0b, 0x9e, 0x3a, 0x74, 0x8c, 0xb1, 0x1c,
			0xe4, 0xb1, 0xd7, 0x1d, 0x35, 0x9c, 0xce, 0x39,
		},
		.expected_psk_digest = "cffMWk8TSS7HOQebjgYEIkrPrjWPV4JE5cdPB8WhEvY4JBW5YynKyv66XscN4A9n",
		.expected_tls_psk = {
			0x27, 0x74, 0x75, 0x32, 0x33, 0x53, 0x7b, 0x3f,
			0xa5, 0x0e, 0xb7, 0xd1, 0x6a, 0x8e, 0x43, 0x45,
			0x7d, 0x85, 0xf4, 0x90, 0x6c, 0x00, 0x5b, 0x22,
			0x36, 0x61, 0x6c, 0x5d, 0x80, 0x93, 0x9d, 0x08,
			0x98, 0xff, 0xf1, 0x5b, 0xb8, 0xb7, 0x71, 0x19,
			0xd2, 0xbe, 0x0a, 0xac, 0x42, 0x3e, 0x75, 0x90,
		},
	};

	test_nvme_auth_derive_tls_psk(test, &vals);
}
/*
 * HMAC-SHA-512: only the first step (PSK generation) has an expected value.
 * expected_psk_digest is NULL, so the shared helper verifies that
 * nvme_auth_generate_digest() rejects the hash with -EINVAL and stops there.
 */
static void test_nvme_auth_derive_tls_psk_hmac_sha512(struct kunit *test)
{
	static const struct nvme_auth_test_values vals = {
		.hmac_id = NVME_AUTH_HASH_SHA512,
		.hash_len = SHA512_DIGEST_SIZE,
		.expected_psk = {
			0x9c, 0x9f, 0x08, 0x9a, 0x61, 0x8b, 0x47, 0xd2,
			0xd7, 0x5f, 0x4b, 0x6c, 0x28, 0x07, 0x04, 0x24,
			0x48, 0x7b, 0x44, 0x5d, 0xd9, 0x6e, 0x70, 0xc4,
			0xc0, 0x9b, 0x55, 0xe8, 0xb6, 0x00, 0x01, 0x52,
			0xa3, 0x36, 0x3c, 0x34, 0x54, 0x04, 0x3f, 0x38,
			0xf0, 0xb8, 0x50, 0x36, 0xde, 0xd4, 0x06, 0x55,
			0x35, 0x0a, 0xa8, 0x7b, 0x8b, 0x6a, 0x28, 0x2b,
			0x5c, 0x1a, 0xca, 0xe1, 0x62, 0x33, 0xdd, 0x5b,
		},
		/* nvme_auth_generate_digest() doesn't support SHA-512 yet. */
		.expected_psk_digest = NULL,
	};

	test_nvme_auth_derive_tls_psk(test, &vals);
}
/* One case per DH-HMAC-CHAP hash algorithm with an assigned ID. */
static struct kunit_case nvme_auth_test_cases[] = {
	KUNIT_CASE(test_nvme_auth_derive_tls_psk_hmac_sha256),
	KUNIT_CASE(test_nvme_auth_derive_tls_psk_hmac_sha384),
	KUNIT_CASE(test_nvme_auth_derive_tls_psk_hmac_sha512),
	{},
};

static struct kunit_suite nvme_auth_test_suite = {
	.name = "nvme-auth",
	.test_cases = nvme_auth_test_cases,
};
/* Registers the suite with the KUnit executor / module init machinery. */
kunit_test_suite(nvme_auth_test_suite);

MODULE_DESCRIPTION("Unit tests for NVMe authentication functions");
MODULE_LICENSE("GPL");

View File

@@ -7,7 +7,6 @@
#include <linux/base64.h>
#include <linux/prandom.h>
#include <linux/unaligned.h>
#include <crypto/hash.h>
#include <crypto/dh.h>
#include "nvme.h"
#include "fabrics.h"
@@ -22,7 +21,6 @@ struct nvme_dhchap_queue_context {
struct list_head entry;
struct work_struct auth_work;
struct nvme_ctrl *ctrl;
struct crypto_shash *shash_tfm;
struct crypto_kpp *dh_tfm;
struct nvme_dhchap_key *transformed_key;
void *buf;
@@ -38,9 +36,9 @@ struct nvme_dhchap_queue_context {
u8 hash_id;
u8 sc_c;
size_t hash_len;
u8 c1[64];
u8 c2[64];
u8 response[64];
u8 c1[NVME_AUTH_MAX_DIGEST_SIZE];
u8 c2[NVME_AUTH_MAX_DIGEST_SIZE];
u8 response[NVME_AUTH_MAX_DIGEST_SIZE];
u8 *ctrl_key;
u8 *host_key;
u8 *sess_key;
@@ -125,6 +123,8 @@ static int nvme_auth_set_dhchap_negotiate_data(struct nvme_ctrl *ctrl,
{
struct nvmf_auth_dhchap_negotiate_data *data = chap->buf;
size_t size = sizeof(*data) + sizeof(union nvmf_auth_protocol);
u8 dh_list_offset = NVME_AUTH_DHCHAP_MAX_DH_IDS;
u8 *idlist = data->auth_protocol[0].dhchap.idlist;
if (size > CHAP_BUF_SIZE) {
chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
@@ -141,21 +141,22 @@ static int nvme_auth_set_dhchap_negotiate_data(struct nvme_ctrl *ctrl,
data->sc_c = NVME_AUTH_SECP_NEWTLSPSK;
} else
data->sc_c = NVME_AUTH_SECP_NOSC;
chap->sc_c = data->sc_c;
data->napd = 1;
data->auth_protocol[0].dhchap.authid = NVME_AUTH_DHCHAP_AUTH_ID;
data->auth_protocol[0].dhchap.halen = 3;
data->auth_protocol[0].dhchap.dhlen = 6;
data->auth_protocol[0].dhchap.idlist[0] = NVME_AUTH_HASH_SHA256;
data->auth_protocol[0].dhchap.idlist[1] = NVME_AUTH_HASH_SHA384;
data->auth_protocol[0].dhchap.idlist[2] = NVME_AUTH_HASH_SHA512;
data->auth_protocol[0].dhchap.idlist[30] = NVME_AUTH_DHGROUP_NULL;
data->auth_protocol[0].dhchap.idlist[31] = NVME_AUTH_DHGROUP_2048;
data->auth_protocol[0].dhchap.idlist[32] = NVME_AUTH_DHGROUP_3072;
data->auth_protocol[0].dhchap.idlist[33] = NVME_AUTH_DHGROUP_4096;
data->auth_protocol[0].dhchap.idlist[34] = NVME_AUTH_DHGROUP_6144;
data->auth_protocol[0].dhchap.idlist[35] = NVME_AUTH_DHGROUP_8192;
chap->sc_c = data->sc_c;
idlist[0] = NVME_AUTH_HASH_SHA256;
idlist[1] = NVME_AUTH_HASH_SHA384;
idlist[2] = NVME_AUTH_HASH_SHA512;
if (chap->sc_c == NVME_AUTH_SECP_NOSC)
idlist[dh_list_offset++] = NVME_AUTH_DHGROUP_NULL;
idlist[dh_list_offset++] = NVME_AUTH_DHGROUP_2048;
idlist[dh_list_offset++] = NVME_AUTH_DHGROUP_3072;
idlist[dh_list_offset++] = NVME_AUTH_DHGROUP_4096;
idlist[dh_list_offset++] = NVME_AUTH_DHGROUP_6144;
idlist[dh_list_offset++] = NVME_AUTH_DHGROUP_8192;
data->auth_protocol[0].dhchap.dhlen =
dh_list_offset - NVME_AUTH_DHCHAP_MAX_DH_IDS;
return size;
}
@@ -183,38 +184,17 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl,
return -EPROTO;
}
if (chap->hash_id == data->hashid && chap->shash_tfm &&
!strcmp(crypto_shash_alg_name(chap->shash_tfm), hmac_name) &&
crypto_shash_digestsize(chap->shash_tfm) == data->hl) {
if (chap->hash_id == data->hashid && chap->hash_len == data->hl) {
dev_dbg(ctrl->device,
"qid %d: reuse existing hash %s\n",
chap->qid, hmac_name);
goto select_kpp;
}
/* Reset if hash cannot be reused */
if (chap->shash_tfm) {
crypto_free_shash(chap->shash_tfm);
chap->hash_id = 0;
chap->hash_len = 0;
}
chap->shash_tfm = crypto_alloc_shash(hmac_name, 0,
CRYPTO_ALG_ALLOCATES_MEMORY);
if (IS_ERR(chap->shash_tfm)) {
dev_warn(ctrl->device,
"qid %d: failed to allocate hash %s, error %ld\n",
chap->qid, hmac_name, PTR_ERR(chap->shash_tfm));
chap->shash_tfm = NULL;
chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
return -ENOMEM;
}
if (crypto_shash_digestsize(chap->shash_tfm) != data->hl) {
if (nvme_auth_hmac_hash_len(data->hashid) != data->hl) {
dev_warn(ctrl->device,
"qid %d: invalid hash length %d\n",
chap->qid, data->hl);
crypto_free_shash(chap->shash_tfm);
chap->shash_tfm = NULL;
chap->status = NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE;
return -EPROTO;
}
@@ -434,7 +414,7 @@ static int nvme_auth_set_dhchap_failure2_data(struct nvme_ctrl *ctrl,
static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl,
struct nvme_dhchap_queue_context *chap)
{
SHASH_DESC_ON_STACK(shash, chap->shash_tfm);
struct nvme_auth_hmac_ctx hmac;
u8 buf[4], *challenge = chap->c1;
int ret;
@@ -454,13 +434,11 @@ static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl,
__func__, chap->qid);
}
ret = crypto_shash_setkey(chap->shash_tfm,
chap->transformed_key->key, chap->transformed_key->len);
if (ret) {
dev_warn(ctrl->device, "qid %d: failed to set key, error %d\n",
chap->qid, ret);
ret = nvme_auth_hmac_init(&hmac, chap->hash_id,
chap->transformed_key->key,
chap->transformed_key->len);
if (ret)
goto out;
}
if (chap->dh_tfm) {
challenge = kmalloc(chap->hash_len, GFP_KERNEL);
@@ -477,51 +455,36 @@ static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl,
goto out;
}
shash->tfm = chap->shash_tfm;
ret = crypto_shash_init(shash);
if (ret)
goto out;
ret = crypto_shash_update(shash, challenge, chap->hash_len);
if (ret)
goto out;
nvme_auth_hmac_update(&hmac, challenge, chap->hash_len);
put_unaligned_le32(chap->s1, buf);
ret = crypto_shash_update(shash, buf, 4);
if (ret)
goto out;
nvme_auth_hmac_update(&hmac, buf, 4);
put_unaligned_le16(chap->transaction, buf);
ret = crypto_shash_update(shash, buf, 2);
if (ret)
goto out;
nvme_auth_hmac_update(&hmac, buf, 2);
*buf = chap->sc_c;
ret = crypto_shash_update(shash, buf, 1);
if (ret)
goto out;
ret = crypto_shash_update(shash, "HostHost", 8);
if (ret)
goto out;
ret = crypto_shash_update(shash, ctrl->opts->host->nqn,
strlen(ctrl->opts->host->nqn));
if (ret)
goto out;
nvme_auth_hmac_update(&hmac, buf, 1);
nvme_auth_hmac_update(&hmac, "HostHost", 8);
nvme_auth_hmac_update(&hmac, ctrl->opts->host->nqn,
strlen(ctrl->opts->host->nqn));
memset(buf, 0, sizeof(buf));
ret = crypto_shash_update(shash, buf, 1);
if (ret)
goto out;
ret = crypto_shash_update(shash, ctrl->opts->subsysnqn,
strlen(ctrl->opts->subsysnqn));
if (ret)
goto out;
ret = crypto_shash_final(shash, chap->response);
nvme_auth_hmac_update(&hmac, buf, 1);
nvme_auth_hmac_update(&hmac, ctrl->opts->subsysnqn,
strlen(ctrl->opts->subsysnqn));
nvme_auth_hmac_final(&hmac, chap->response);
ret = 0;
out:
if (challenge != chap->c1)
kfree(challenge);
memzero_explicit(&hmac, sizeof(hmac));
return ret;
}
static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl,
struct nvme_dhchap_queue_context *chap)
{
SHASH_DESC_ON_STACK(shash, chap->shash_tfm);
struct nvme_auth_hmac_ctx hmac;
struct nvme_dhchap_key *transformed_key;
u8 buf[4], *challenge = chap->c2;
int ret;
@@ -533,10 +496,10 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl,
return ret;
}
ret = crypto_shash_setkey(chap->shash_tfm,
transformed_key->key, transformed_key->len);
ret = nvme_auth_hmac_init(&hmac, chap->hash_id, transformed_key->key,
transformed_key->len);
if (ret) {
dev_warn(ctrl->device, "qid %d: failed to set key, error %d\n",
dev_warn(ctrl->device, "qid %d: failed to init hmac, error %d\n",
chap->qid, ret);
goto out;
}
@@ -563,43 +526,29 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl,
__func__, chap->qid, ctrl->opts->subsysnqn);
dev_dbg(ctrl->device, "%s: qid %d hostnqn %s\n",
__func__, chap->qid, ctrl->opts->host->nqn);
shash->tfm = chap->shash_tfm;
ret = crypto_shash_init(shash);
if (ret)
goto out;
ret = crypto_shash_update(shash, challenge, chap->hash_len);
if (ret)
goto out;
nvme_auth_hmac_update(&hmac, challenge, chap->hash_len);
put_unaligned_le32(chap->s2, buf);
ret = crypto_shash_update(shash, buf, 4);
if (ret)
goto out;
nvme_auth_hmac_update(&hmac, buf, 4);
put_unaligned_le16(chap->transaction, buf);
ret = crypto_shash_update(shash, buf, 2);
if (ret)
goto out;
nvme_auth_hmac_update(&hmac, buf, 2);
memset(buf, 0, 4);
ret = crypto_shash_update(shash, buf, 1);
if (ret)
goto out;
ret = crypto_shash_update(shash, "Controller", 10);
if (ret)
goto out;
ret = crypto_shash_update(shash, ctrl->opts->subsysnqn,
strlen(ctrl->opts->subsysnqn));
if (ret)
goto out;
ret = crypto_shash_update(shash, buf, 1);
if (ret)
goto out;
ret = crypto_shash_update(shash, ctrl->opts->host->nqn,
strlen(ctrl->opts->host->nqn));
if (ret)
goto out;
ret = crypto_shash_final(shash, chap->response);
nvme_auth_hmac_update(&hmac, buf, 1);
nvme_auth_hmac_update(&hmac, "Controller", 10);
nvme_auth_hmac_update(&hmac, ctrl->opts->subsysnqn,
strlen(ctrl->opts->subsysnqn));
nvme_auth_hmac_update(&hmac, buf, 1);
nvme_auth_hmac_update(&hmac, ctrl->opts->host->nqn,
strlen(ctrl->opts->host->nqn));
nvme_auth_hmac_final(&hmac, chap->response);
ret = 0;
out:
if (challenge != chap->c2)
kfree(challenge);
memzero_explicit(&hmac, sizeof(hmac));
nvme_auth_free_key(transformed_key);
return ret;
}
@@ -689,8 +638,6 @@ static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap)
{
nvme_auth_reset_dhchap(chap);
chap->authenticated = false;
if (chap->shash_tfm)
crypto_free_shash(chap->shash_tfm);
if (chap->dh_tfm)
crypto_free_kpp(chap->dh_tfm);
}
@@ -708,7 +655,8 @@ EXPORT_SYMBOL_GPL(nvme_auth_revoke_tls_key);
static int nvme_auth_secure_concat(struct nvme_ctrl *ctrl,
struct nvme_dhchap_queue_context *chap)
{
u8 *psk, *digest, *tls_psk;
u8 *psk, *tls_psk;
char *digest;
struct key *tls_key;
size_t psk_len;
int ret = 0;
@@ -1071,12 +1019,11 @@ int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl)
INIT_WORK(&ctrl->dhchap_auth_work, nvme_ctrl_auth_work);
if (!ctrl->opts)
return 0;
ret = nvme_auth_generate_key(ctrl->opts->dhchap_secret,
&ctrl->host_key);
ret = nvme_auth_parse_key(ctrl->opts->dhchap_secret, &ctrl->host_key);
if (ret)
return ret;
ret = nvme_auth_generate_key(ctrl->opts->dhchap_ctrl_secret,
&ctrl->ctrl_key);
ret = nvme_auth_parse_key(ctrl->opts->dhchap_ctrl_secret,
&ctrl->ctrl_key);
if (ret)
goto err_free_dhchap_secret;

View File

@@ -1875,6 +1875,7 @@ static bool nvme_init_integrity(struct nvme_ns_head *head,
break;
}
bi->flags |= BLK_SPLIT_INTERVAL_CAPABLE;
bi->metadata_size = head->ms;
if (bi->csum_type) {
bi->pi_tuple_size = head->pi_size;
@@ -1883,26 +1884,6 @@ static bool nvme_init_integrity(struct nvme_ns_head *head,
return true;
}
static void nvme_config_discard(struct nvme_ns *ns, struct queue_limits *lim)
{
struct nvme_ctrl *ctrl = ns->ctrl;
if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns->head, UINT_MAX))
lim->max_hw_discard_sectors =
nvme_lba_to_sect(ns->head, ctrl->dmrsl);
else if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
lim->max_hw_discard_sectors = UINT_MAX;
else
lim->max_hw_discard_sectors = 0;
lim->discard_granularity = lim->logical_block_size;
if (ctrl->dmrl)
lim->max_discard_segments = ctrl->dmrl;
else
lim->max_discard_segments = NVME_DSM_MAX_RANGES;
}
static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
{
return uuid_equal(&a->uuid, &b->uuid) &&
@@ -2078,12 +2059,15 @@ static void nvme_set_ctrl_limits(struct nvme_ctrl *ctrl,
}
static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
struct queue_limits *lim)
struct nvme_id_ns_nvm *nvm, struct queue_limits *lim)
{
struct nvme_ns_head *head = ns->head;
struct nvme_ctrl *ctrl = ns->ctrl;
u32 bs = 1U << head->lba_shift;
u32 atomic_bs, phys_bs, io_opt = 0;
u32 npdg = 1, npda = 1;
bool valid = true;
u8 optperf;
/*
* The block layer can't support LBA sizes larger than the page size
@@ -2098,7 +2082,12 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
phys_bs = bs;
atomic_bs = nvme_configure_atomic_write(ns, id, lim, bs);
if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
optperf = id->nsfeat >> NVME_NS_FEAT_OPTPERF_SHIFT;
if (ctrl->vs >= NVME_VS(2, 1, 0))
optperf &= NVME_NS_FEAT_OPTPERF_MASK_2_1;
else
optperf &= NVME_NS_FEAT_OPTPERF_MASK;
if (optperf) {
/* NPWG = Namespace Preferred Write Granularity */
phys_bs = bs * (1 + le16_to_cpu(id->npwg));
/* NOWS = Namespace Optimal Write Size */
@@ -2115,11 +2104,54 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
lim->physical_block_size = min(phys_bs, atomic_bs);
lim->io_min = phys_bs;
lim->io_opt = io_opt;
if ((ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) &&
(ns->ctrl->oncs & NVME_CTRL_ONCS_DSM))
if ((ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) &&
(ctrl->oncs & NVME_CTRL_ONCS_DSM))
lim->max_write_zeroes_sectors = UINT_MAX;
else
lim->max_write_zeroes_sectors = ns->ctrl->max_zeroes_sectors;
lim->max_write_zeroes_sectors = ctrl->max_zeroes_sectors;
if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns->head, UINT_MAX))
lim->max_hw_discard_sectors =
nvme_lba_to_sect(ns->head, ctrl->dmrsl);
else if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
lim->max_hw_discard_sectors = UINT_MAX;
else
lim->max_hw_discard_sectors = 0;
/*
* NVMe namespaces advertise both a preferred deallocate granularity
* (for a discard length) and alignment (for a discard starting offset).
* However, Linux block devices advertise a single discard_granularity.
* From NVM Command Set specification 1.1 section 5.2.2, the NPDGL/NPDAL
* fields in the NVM Command Set Specific Identify Namespace structure
* are preferred to NPDG/NPDA in the Identify Namespace structure since
* they can represent larger values. However, NPDGL or NPDAL may be 0 if
* unsupported. NPDG and NPDA are 0's based.
* From Figure 115 of NVM Command Set specification 1.1, NPDGL and NPDAL
* are supported if the high bit of OPTPERF is set. NPDG is supported if
* the low bit of OPTPERF is set. NPDA is supported if either is set.
* NPDG should be a multiple of NPDA, and likewise NPDGL should be a
* multiple of NPDAL, but the spec doesn't say anything about NPDG vs.
* NPDAL or NPDGL vs. NPDA. So compute the maximum instead of assuming
* NPDG(L) is the larger. If neither NPDG, NPDGL, NPDA, nor NPDAL are
* supported, default the discard_granularity to the logical block size.
*/
if (optperf & 0x2 && nvm && nvm->npdgl)
npdg = le32_to_cpu(nvm->npdgl);
else if (optperf & 0x1)
npdg = from0based(id->npdg);
if (optperf & 0x2 && nvm && nvm->npdal)
npda = le32_to_cpu(nvm->npdal);
else if (optperf)
npda = from0based(id->npda);
if (check_mul_overflow(max(npdg, npda), lim->logical_block_size,
&lim->discard_granularity))
lim->discard_granularity = lim->logical_block_size;
if (ctrl->dmrl)
lim->max_discard_segments = ctrl->dmrl;
else
lim->max_discard_segments = NVME_DSM_MAX_RANGES;
return valid;
}
@@ -2353,7 +2385,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
}
lbaf = nvme_lbaf_index(id->flbas);
if (ns->ctrl->ctratt & NVME_CTRL_ATTR_ELBAS) {
if (nvme_id_cns_ok(ns->ctrl, NVME_ID_CNS_CS_NS)) {
ret = nvme_identify_ns_nvm(ns->ctrl, info->nsid, &nvm);
if (ret < 0)
goto out;
@@ -2381,10 +2413,9 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
nvme_set_ctrl_limits(ns->ctrl, &lim, false);
nvme_configure_metadata(ns->ctrl, ns->head, id, nvm, info);
nvme_set_chunk_sectors(ns, id, &lim);
if (!nvme_update_disk_info(ns, id, &lim))
if (!nvme_update_disk_info(ns, id, nvm, &lim))
capacity = 0;
nvme_config_discard(ns, &lim);
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
ns->head->ids.csi == NVME_CSI_ZNS)
nvme_update_zone_info(ns, &lim, &zi);
@@ -3388,7 +3419,7 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
ctrl->dmrl = id->dmrl;
ctrl->dmrsl = le32_to_cpu(id->dmrsl);
if (id->wzsl)
if (id->wzsl && !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl);
free_data:

View File

@@ -154,21 +154,8 @@ void nvme_failover_req(struct request *req)
}
spin_lock_irqsave(&ns->head->requeue_lock, flags);
for (bio = req->bio; bio; bio = bio->bi_next) {
for (bio = req->bio; bio; bio = bio->bi_next)
bio_set_dev(bio, ns->head->disk->part0);
if (bio->bi_opf & REQ_POLLED) {
bio->bi_opf &= ~REQ_POLLED;
bio->bi_cookie = BLK_QC_T_NONE;
}
/*
* The alternate request queue that we may end up submitting
* the bio to may be frozen temporarily, in this case REQ_NOWAIT
* will fail the I/O immediately with EAGAIN to the issuer.
* We are not in the issuer context which cannot block. Clear
* the flag to avoid spurious EAGAIN I/O failures.
*/
bio->bi_opf &= ~REQ_NOWAIT;
}
blk_steal_bios(&ns->head->requeue_list, req);
spin_unlock_irqrestore(&ns->head->requeue_lock, flags);

View File

@@ -762,6 +762,12 @@ static inline u32 nvme_bytes_to_numd(size_t len)
return (len >> 2) - 1;
}
/*
 * Decode a 2-byte "0's based"/"0-based" field: the raw little-endian value
 * stores N - 1, so add one to recover the usable count. Returns u32 so the
 * maximum raw value (0xffff) does not wrap when incremented.
 */
static inline u32 from0based(__le16 value)
{
	return (u32)le16_to_cpu(value) + 1;
}
static inline bool nvme_is_ana_error(u16 status)
{
switch (status & NVME_SCT_SC_MASK) {

View File

@@ -4178,6 +4178,8 @@ static const struct pci_device_id nvme_id_table[] = {
.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
{ PCI_DEVICE(0x2646, 0x501E), /* KINGSTON OM3PGP4xxxxQ OS21011 NVMe SSD */
.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
{ PCI_DEVICE(0x2646, 0x502F), /* KINGSTON OM3SGP4xxxxK NVMe SSD */
.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
{ PCI_DEVICE(0x1f40, 0x1202), /* Netac Technologies Co. NV3000 NVMe SSD */
.driver_data = NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(0x1f40, 0x5236), /* Netac Technologies Co. NV7000 NVMe SSD */

View File

@@ -658,7 +658,7 @@ static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev,
struct nvme_dhchap_key *key, *host_key;
int ret;
ret = nvme_auth_generate_key(dhchap_secret, &key);
ret = nvme_auth_parse_key(dhchap_secret, &key);
if (ret) {
kfree(dhchap_secret);
return ret;
@@ -716,7 +716,7 @@ static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev,
struct nvme_dhchap_key *key, *ctrl_key;
int ret;
ret = nvme_auth_generate_key(dhchap_secret, &key);
ret = nvme_auth_parse_key(dhchap_secret, &key);
if (ret) {
kfree(dhchap_secret);
return ret;
@@ -829,7 +829,49 @@ static ssize_t tls_configured_key_show(struct device *dev,
return sysfs_emit(buf, "%08x\n", key_serial(key));
}
static DEVICE_ATTR_RO(tls_configured_key);
static ssize_t tls_configured_key_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
int error, qid;
error = kstrtoint(buf, 10, &qid);
if (error)
return error;
/*
* We currently only allow userspace to write a `0` indicating
* generate a new key.
*/
if (qid)
return -EINVAL;
if (!ctrl->opts || !ctrl->opts->concat)
return -EOPNOTSUPP;
error = nvme_auth_negotiate(ctrl, 0);
if (error < 0) {
nvme_reset_ctrl(ctrl);
return error;
}
error = nvme_auth_wait(ctrl, 0);
if (error < 0) {
nvme_reset_ctrl(ctrl);
return error;
}
/*
* We need to reset the TLS connection, so let's just
* reset the controller.
*/
nvme_reset_ctrl(ctrl);
return count;
}
static DEVICE_ATTR_RW(tls_configured_key);
static ssize_t tls_keyring_show(struct device *dev,
struct device_attribute *attr, char *buf)
@@ -861,7 +903,7 @@ static umode_t nvme_tls_attrs_are_visible(struct kobject *kobj,
!ctrl->opts->tls && !ctrl->opts->concat)
return 0;
if (a == &dev_attr_tls_configured_key.attr &&
(!ctrl->opts->tls_key || ctrl->opts->concat))
!ctrl->opts->concat)
return 0;
if (a == &dev_attr_tls_keyring.attr &&
!ctrl->opts->keyring)

View File

@@ -1057,6 +1057,8 @@ static void nvme_execute_identify_ns_nvm(struct nvmet_req *req)
status = NVME_SC_INTERNAL;
goto out;
}
if (req->ns->bdev)
nvmet_bdev_set_nvm_limits(req->ns->bdev, id);
status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
kfree(id);
out:
@@ -1603,7 +1605,7 @@ void nvmet_execute_keep_alive(struct nvmet_req *req)
pr_debug("ctrl %d update keep-alive timer for %d secs\n",
ctrl->cntlid, ctrl->kato);
mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ);
mod_delayed_work(system_percpu_wq, &ctrl->ka_work, ctrl->kato * HZ);
out:
nvmet_req_complete(req, status);
}

View File

@@ -9,7 +9,6 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <crypto/hash.h>
#include <linux/crc32.h>
#include <linux/base64.h>
#include <linux/ctype.h>
@@ -45,15 +44,6 @@ int nvmet_auth_set_key(struct nvmet_host *host, const char *secret,
key_hash);
return -EINVAL;
}
if (key_hash > 0) {
/* Validate selected hash algorithm */
const char *hmac = nvme_auth_hmac_name(key_hash);
if (!crypto_has_shash(hmac, 0, 0)) {
pr_err("DH-HMAC-CHAP hash %s unsupported\n", hmac);
return -ENOTSUPP;
}
}
dhchap_secret = kstrdup(secret, GFP_KERNEL);
if (!dhchap_secret)
return -ENOMEM;
@@ -140,7 +130,7 @@ int nvmet_setup_dhgroup(struct nvmet_ctrl *ctrl, u8 dhgroup_id)
return ret;
}
u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq)
u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, bool reset)
{
int ret = 0;
struct nvmet_host_link *p;
@@ -166,7 +156,7 @@ u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq)
goto out_unlock;
}
if (nvmet_queue_tls_keyid(sq)) {
if (!reset && nvmet_queue_tls_keyid(sq)) {
pr_debug("host %s tls enabled\n", ctrl->hostnqn);
goto out_unlock;
}
@@ -292,47 +282,30 @@ bool nvmet_check_auth_status(struct nvmet_req *req)
int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
unsigned int shash_len)
{
struct crypto_shash *shash_tfm;
SHASH_DESC_ON_STACK(shash, shash_tfm);
struct nvme_auth_hmac_ctx hmac;
struct nvmet_ctrl *ctrl = req->sq->ctrl;
const char *hash_name;
u8 *challenge = req->sq->dhchap_c1;
struct nvme_dhchap_key *transformed_key;
u8 buf[4];
int ret;
hash_name = nvme_auth_hmac_name(ctrl->shash_id);
if (!hash_name) {
pr_warn("Hash ID %d invalid\n", ctrl->shash_id);
return -EINVAL;
}
shash_tfm = crypto_alloc_shash(hash_name, 0, 0);
if (IS_ERR(shash_tfm)) {
pr_err("failed to allocate shash %s\n", hash_name);
return PTR_ERR(shash_tfm);
}
if (shash_len != crypto_shash_digestsize(shash_tfm)) {
pr_err("%s: hash len mismatch (len %d digest %d)\n",
__func__, shash_len,
crypto_shash_digestsize(shash_tfm));
ret = -EINVAL;
goto out_free_tfm;
}
transformed_key = nvme_auth_transform_key(ctrl->host_key,
ctrl->hostnqn);
if (IS_ERR(transformed_key)) {
ret = PTR_ERR(transformed_key);
goto out_free_tfm;
}
if (IS_ERR(transformed_key))
return PTR_ERR(transformed_key);
ret = crypto_shash_setkey(shash_tfm, transformed_key->key,
ret = nvme_auth_hmac_init(&hmac, ctrl->shash_id, transformed_key->key,
transformed_key->len);
if (ret)
goto out_free_response;
if (shash_len != nvme_auth_hmac_hash_len(ctrl->shash_id)) {
pr_err("%s: hash len mismatch (len %u digest %zu)\n", __func__,
shash_len, nvme_auth_hmac_hash_len(ctrl->shash_id));
ret = -EINVAL;
goto out_free_response;
}
if (ctrl->dh_gid != NVME_AUTH_DHGROUP_NULL) {
challenge = kmalloc(shash_len, GFP_KERNEL);
if (!challenge) {
@@ -345,101 +318,67 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
req->sq->dhchap_c1,
challenge, shash_len);
if (ret)
goto out;
goto out_free_challenge;
}
pr_debug("ctrl %d qid %d host response seq %u transaction %d\n",
ctrl->cntlid, req->sq->qid, req->sq->dhchap_s1,
req->sq->dhchap_tid);
shash->tfm = shash_tfm;
ret = crypto_shash_init(shash);
if (ret)
goto out;
ret = crypto_shash_update(shash, challenge, shash_len);
if (ret)
goto out;
nvme_auth_hmac_update(&hmac, challenge, shash_len);
put_unaligned_le32(req->sq->dhchap_s1, buf);
ret = crypto_shash_update(shash, buf, 4);
if (ret)
goto out;
nvme_auth_hmac_update(&hmac, buf, 4);
put_unaligned_le16(req->sq->dhchap_tid, buf);
ret = crypto_shash_update(shash, buf, 2);
if (ret)
goto out;
nvme_auth_hmac_update(&hmac, buf, 2);
*buf = req->sq->sc_c;
ret = crypto_shash_update(shash, buf, 1);
if (ret)
goto out;
ret = crypto_shash_update(shash, "HostHost", 8);
if (ret)
goto out;
nvme_auth_hmac_update(&hmac, buf, 1);
nvme_auth_hmac_update(&hmac, "HostHost", 8);
memset(buf, 0, 4);
ret = crypto_shash_update(shash, ctrl->hostnqn, strlen(ctrl->hostnqn));
if (ret)
goto out;
ret = crypto_shash_update(shash, buf, 1);
if (ret)
goto out;
ret = crypto_shash_update(shash, ctrl->subsys->subsysnqn,
strlen(ctrl->subsys->subsysnqn));
if (ret)
goto out;
ret = crypto_shash_final(shash, response);
out:
nvme_auth_hmac_update(&hmac, ctrl->hostnqn, strlen(ctrl->hostnqn));
nvme_auth_hmac_update(&hmac, buf, 1);
nvme_auth_hmac_update(&hmac, ctrl->subsys->subsysnqn,
strlen(ctrl->subsys->subsysnqn));
nvme_auth_hmac_final(&hmac, response);
ret = 0;
out_free_challenge:
if (challenge != req->sq->dhchap_c1)
kfree(challenge);
out_free_response:
memzero_explicit(&hmac, sizeof(hmac));
nvme_auth_free_key(transformed_key);
out_free_tfm:
crypto_free_shash(shash_tfm);
return ret;
}
int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response,
unsigned int shash_len)
{
struct crypto_shash *shash_tfm;
struct shash_desc *shash;
struct nvme_auth_hmac_ctx hmac;
struct nvmet_ctrl *ctrl = req->sq->ctrl;
const char *hash_name;
u8 *challenge = req->sq->dhchap_c2;
struct nvme_dhchap_key *transformed_key;
u8 buf[4];
int ret;
hash_name = nvme_auth_hmac_name(ctrl->shash_id);
if (!hash_name) {
pr_warn("Hash ID %d invalid\n", ctrl->shash_id);
return -EINVAL;
}
shash_tfm = crypto_alloc_shash(hash_name, 0, 0);
if (IS_ERR(shash_tfm)) {
pr_err("failed to allocate shash %s\n", hash_name);
return PTR_ERR(shash_tfm);
}
if (shash_len != crypto_shash_digestsize(shash_tfm)) {
pr_debug("%s: hash len mismatch (len %d digest %d)\n",
__func__, shash_len,
crypto_shash_digestsize(shash_tfm));
ret = -EINVAL;
goto out_free_tfm;
}
transformed_key = nvme_auth_transform_key(ctrl->ctrl_key,
ctrl->subsys->subsysnqn);
if (IS_ERR(transformed_key)) {
ret = PTR_ERR(transformed_key);
goto out_free_tfm;
}
if (IS_ERR(transformed_key))
return PTR_ERR(transformed_key);
ret = crypto_shash_setkey(shash_tfm, transformed_key->key,
ret = nvme_auth_hmac_init(&hmac, ctrl->shash_id, transformed_key->key,
transformed_key->len);
if (ret)
goto out_free_response;
if (shash_len != nvme_auth_hmac_hash_len(ctrl->shash_id)) {
pr_err("%s: hash len mismatch (len %u digest %zu)\n", __func__,
shash_len, nvme_auth_hmac_hash_len(ctrl->shash_id));
ret = -EINVAL;
goto out_free_response;
}
if (ctrl->dh_gid != NVME_AUTH_DHGROUP_NULL) {
challenge = kmalloc(shash_len, GFP_KERNEL);
if (!challenge) {
@@ -455,55 +394,29 @@ int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response,
goto out_free_challenge;
}
shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(shash_tfm),
GFP_KERNEL);
if (!shash) {
ret = -ENOMEM;
goto out_free_challenge;
}
shash->tfm = shash_tfm;
nvme_auth_hmac_update(&hmac, challenge, shash_len);
ret = crypto_shash_init(shash);
if (ret)
goto out;
ret = crypto_shash_update(shash, challenge, shash_len);
if (ret)
goto out;
put_unaligned_le32(req->sq->dhchap_s2, buf);
ret = crypto_shash_update(shash, buf, 4);
if (ret)
goto out;
nvme_auth_hmac_update(&hmac, buf, 4);
put_unaligned_le16(req->sq->dhchap_tid, buf);
ret = crypto_shash_update(shash, buf, 2);
if (ret)
goto out;
nvme_auth_hmac_update(&hmac, buf, 2);
memset(buf, 0, 4);
ret = crypto_shash_update(shash, buf, 1);
if (ret)
goto out;
ret = crypto_shash_update(shash, "Controller", 10);
if (ret)
goto out;
ret = crypto_shash_update(shash, ctrl->subsys->subsysnqn,
strlen(ctrl->subsys->subsysnqn));
if (ret)
goto out;
ret = crypto_shash_update(shash, buf, 1);
if (ret)
goto out;
ret = crypto_shash_update(shash, ctrl->hostnqn, strlen(ctrl->hostnqn));
if (ret)
goto out;
ret = crypto_shash_final(shash, response);
out:
kfree(shash);
nvme_auth_hmac_update(&hmac, buf, 1);
nvme_auth_hmac_update(&hmac, "Controller", 10);
nvme_auth_hmac_update(&hmac, ctrl->subsys->subsysnqn,
strlen(ctrl->subsys->subsysnqn));
nvme_auth_hmac_update(&hmac, buf, 1);
nvme_auth_hmac_update(&hmac, ctrl->hostnqn, strlen(ctrl->hostnqn));
nvme_auth_hmac_final(&hmac, response);
ret = 0;
out_free_challenge:
if (challenge != req->sq->dhchap_c2)
kfree(challenge);
out_free_response:
memzero_explicit(&hmac, sizeof(hmac));
nvme_auth_free_key(transformed_key);
out_free_tfm:
crypto_free_shash(shash_tfm);
return ret;
}
@@ -531,7 +444,7 @@ int nvmet_auth_ctrl_exponential(struct nvmet_req *req,
}
int nvmet_auth_ctrl_sesskey(struct nvmet_req *req,
u8 *pkey, int pkey_size)
const u8 *pkey, int pkey_size)
{
struct nvmet_ctrl *ctrl = req->sq->ctrl;
int ret;
@@ -557,7 +470,8 @@ int nvmet_auth_ctrl_sesskey(struct nvmet_req *req,
void nvmet_auth_insert_psk(struct nvmet_sq *sq)
{
int hash_len = nvme_auth_hmac_hash_len(sq->ctrl->shash_id);
u8 *psk, *digest, *tls_psk;
u8 *psk, *tls_psk;
char *digest;
size_t psk_len;
int ret;
#ifdef CONFIG_NVME_TARGET_TCP_TLS

View File

@@ -17,7 +17,6 @@
#include <linux/nvme-auth.h>
#endif
#include <linux/nvme-keyring.h>
#include <crypto/hash.h>
#include <crypto/kpp.h>
#include <linux/nospec.h>
@@ -2181,8 +2180,6 @@ static ssize_t nvmet_host_dhchap_hash_store(struct config_item *item,
hmac_id = nvme_auth_hmac_id(page);
if (hmac_id == NVME_AUTH_HASH_INVALID)
return -EINVAL;
if (!crypto_has_shash(nvme_auth_hmac_name(hmac_id), 0, 0))
return -ENOTSUPP;
host->dhchap_hash_id = hmac_id;
return count;
}

View File

@@ -1688,7 +1688,7 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args)
if (args->hostid)
uuid_copy(&ctrl->hostid, args->hostid);
dhchap_status = nvmet_setup_auth(ctrl, args->sq);
dhchap_status = nvmet_setup_auth(ctrl, args->sq, false);
if (dhchap_status) {
pr_err("Failed to setup authentication, dhchap status %u\n",
dhchap_status);
@@ -1944,12 +1944,13 @@ static int __init nvmet_init(void)
if (!nvmet_bvec_cache)
return -ENOMEM;
zbd_wq = alloc_workqueue("nvmet-zbd-wq", WQ_MEM_RECLAIM, 0);
zbd_wq = alloc_workqueue("nvmet-zbd-wq", WQ_MEM_RECLAIM | WQ_PERCPU,
0);
if (!zbd_wq)
goto out_destroy_bvec_cache;
buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq",
WQ_MEM_RECLAIM, 0);
WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!buffered_io_wq)
goto out_free_zbd_work_queue;

View File

@@ -8,7 +8,6 @@
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/nvme-auth.h>
#include <crypto/hash.h>
#include <crypto/kpp.h>
#include "nvmet.h"
@@ -75,8 +74,7 @@ static u8 nvmet_auth_negotiate(struct nvmet_req *req, void *d)
for (i = 0; i < data->auth_protocol[0].dhchap.halen; i++) {
u8 host_hmac_id = data->auth_protocol[0].dhchap.idlist[i];
if (!fallback_hash_id &&
crypto_has_shash(nvme_auth_hmac_name(host_hmac_id), 0, 0))
if (!fallback_hash_id && nvme_auth_hmac_hash_len(host_hmac_id))
fallback_hash_id = host_hmac_id;
if (ctrl->shash_id != host_hmac_id)
continue;
@@ -293,7 +291,8 @@ void nvmet_execute_auth_send(struct nvmet_req *req)
pr_debug("%s: ctrl %d qid %d reset negotiation\n",
__func__, ctrl->cntlid, req->sq->qid);
if (!req->sq->qid) {
dhchap_status = nvmet_setup_auth(ctrl, req->sq);
dhchap_status = nvmet_setup_auth(ctrl, req->sq,
true);
if (dhchap_status) {
pr_err("ctrl %d qid 0 failed to setup re-authentication\n",
ctrl->cntlid);
@@ -391,14 +390,15 @@ done:
req->sq->dhchap_step != NVME_AUTH_DHCHAP_MESSAGE_FAILURE2) {
unsigned long auth_expire_secs = ctrl->kato ? ctrl->kato : 120;
mod_delayed_work(system_wq, &req->sq->auth_expired_work,
mod_delayed_work(system_percpu_wq, &req->sq->auth_expired_work,
auth_expire_secs * HZ);
goto complete;
}
/* Final states, clear up variables */
nvmet_auth_sq_free(req->sq);
if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE2)
if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE2) {
nvmet_auth_sq_free(req->sq);
nvmet_ctrl_fatal_error(ctrl);
}
complete:
nvmet_req_complete(req, status);
@@ -574,9 +574,7 @@ void nvmet_execute_auth_receive(struct nvmet_req *req)
status = nvmet_copy_to_sgl(req, 0, d, al);
kfree(d);
done:
if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2)
nvmet_auth_sq_free(req->sq);
else if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE1) {
if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE1) {
nvmet_auth_sq_free(req->sq);
nvmet_ctrl_fatal_error(ctrl);
}

View File

@@ -792,9 +792,9 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc,
if (!queue)
return NULL;
queue->work_q = alloc_workqueue("ntfc%d.%d.%d", 0, 0,
assoc->tgtport->fc_target_port.port_num,
assoc->a_id, qid);
queue->work_q = alloc_workqueue("ntfc%d.%d.%d", WQ_PERCPU, 0,
assoc->tgtport->fc_target_port.port_num,
assoc->a_id, qid);
if (!queue->work_q)
goto out_free_queue;

View File

@@ -30,11 +30,11 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id)
id->nacwu = lpp0b;
/*
* Bit 4 indicates that the fields NPWG, NPWA, NPDG, NPDA, and
* NOWS are defined for this namespace and should be used by
* the host for I/O optimization.
* OPTPERF = 11b indicates that the fields NPWG, NPWA, NPDG, NPDA,
* NPDGL, NPDAL, and NOWS are defined for this namespace and should be
* used by the host for I/O optimization.
*/
id->nsfeat |= 1 << 4;
id->nsfeat |= 0x3 << NVME_NS_FEAT_OPTPERF_SHIFT;
/* NPWG = Namespace Preferred Write Granularity. 0's based */
id->npwg = to0based(bdev_io_min(bdev) / bdev_logical_block_size(bdev));
/* NPWA = Namespace Preferred Write Alignment. 0's based */
@@ -52,6 +52,17 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id)
id->dlfeat = (1 << 3) | 0x1;
}
void nvmet_bdev_set_nvm_limits(struct block_device *bdev,
struct nvme_id_ns_nvm *id)
{
/*
* NPDGL = Namespace Preferred Deallocate Granularity Large
* NPDAL = Namespace Preferred Deallocate Alignment Large
*/
id->npdgl = id->npdal = cpu_to_le32(bdev_discard_granularity(bdev) /
bdev_logical_block_size(bdev));
}
void nvmet_bdev_ns_disable(struct nvmet_ns *ns)
{
if (ns->bdev_file) {

View File

@@ -419,7 +419,6 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl)
{
if (ctrl->ctrl.queue_count > 1) {
nvme_quiesce_io_queues(&ctrl->ctrl);
nvme_cancel_tagset(&ctrl->ctrl);
nvme_loop_destroy_io_queues(ctrl);
}
@@ -427,7 +426,6 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl)
if (nvme_ctrl_state(&ctrl->ctrl) == NVME_CTRL_LIVE)
nvme_disable_ctrl(&ctrl->ctrl, true);
nvme_cancel_admin_tagset(&ctrl->ctrl);
nvme_loop_destroy_admin_queue(ctrl);
}

View File

@@ -550,6 +550,8 @@ void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl);
u16 nvmet_parse_connect_cmd(struct nvmet_req *req);
u32 nvmet_connect_cmd_data_len(struct nvmet_req *req);
void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id);
void nvmet_bdev_set_nvm_limits(struct block_device *bdev,
struct nvme_id_ns_nvm *id);
u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req);
u16 nvmet_file_parse_io_cmd(struct nvmet_req *req);
u16 nvmet_bdev_zns_parse_io_cmd(struct nvmet_req *req);
@@ -896,7 +898,7 @@ void nvmet_execute_auth_receive(struct nvmet_req *req);
int nvmet_auth_set_key(struct nvmet_host *host, const char *secret,
bool set_ctrl);
int nvmet_auth_set_host_hash(struct nvmet_host *host, const char *hash);
u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq);
u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, bool reset);
void nvmet_auth_sq_init(struct nvmet_sq *sq);
void nvmet_destroy_auth(struct nvmet_ctrl *ctrl);
void nvmet_auth_sq_free(struct nvmet_sq *sq);
@@ -913,11 +915,11 @@ static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq)
int nvmet_auth_ctrl_exponential(struct nvmet_req *req,
u8 *buf, int buf_size);
int nvmet_auth_ctrl_sesskey(struct nvmet_req *req,
u8 *buf, int buf_size);
const u8 *pkey, int pkey_size);
void nvmet_auth_insert_psk(struct nvmet_sq *sq);
#else
static inline u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl,
struct nvmet_sq *sq)
struct nvmet_sq *sq, bool reset)
{
return 0;
}

View File

@@ -2225,7 +2225,7 @@ static int __init nvmet_tcp_init(void)
int ret;
nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq",
WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_PERCPU, 0);
if (!nvmet_tcp_wq)
return -ENOMEM;

View File

@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/bsg.h>
#include <linux/io_uring/cmd.h>
#include <scsi/scsi.h>
#include <scsi/scsi_ioctl.h>
#include <scsi/scsi_cmnd.h>
@@ -9,6 +10,178 @@
#define uptr64(val) ((void __user *)(uintptr_t)(val))
/*
* Per-command BSG SCSI PDU stored in io_uring_cmd.pdu[32].
* Holds temporary state between submission, completion and task_work.
*/
struct scsi_bsg_uring_cmd_pdu {
struct bio *bio; /* mapped user buffer, unmap in task work */
struct request *req; /* block request, freed in task work */
u64 response_addr; /* user space response buffer address */
};
static_assert(sizeof(struct scsi_bsg_uring_cmd_pdu) <= sizeof_field(struct io_uring_cmd, pdu));
static inline struct scsi_bsg_uring_cmd_pdu *scsi_bsg_uring_cmd_pdu(
struct io_uring_cmd *ioucmd)
{
return io_uring_cmd_to_pdu(ioucmd, struct scsi_bsg_uring_cmd_pdu);
}
/* Task work: build res2 (layout in uapi/linux/bsg.h) and copy sense to user. */
static void scsi_bsg_uring_task_cb(struct io_tw_req tw_req, io_tw_token_t tw)
{
struct io_uring_cmd *ioucmd = io_uring_cmd_from_tw(tw_req);
struct scsi_bsg_uring_cmd_pdu *pdu = scsi_bsg_uring_cmd_pdu(ioucmd);
struct request *rq = pdu->req;
struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq);
u64 res2;
int ret = 0;
u8 driver_status = 0;
u8 sense_len_wr = 0;
if (pdu->bio)
blk_rq_unmap_user(pdu->bio);
if (scsi_status_is_check_condition(scmd->result)) {
driver_status = DRIVER_SENSE;
if (pdu->response_addr)
sense_len_wr = min_t(u8, scmd->sense_len,
SCSI_SENSE_BUFFERSIZE);
}
if (sense_len_wr) {
if (copy_to_user(uptr64(pdu->response_addr), scmd->sense_buffer,
sense_len_wr))
ret = -EFAULT;
}
res2 = bsg_scsi_res2_build(status_byte(scmd->result), driver_status,
host_byte(scmd->result), sense_len_wr,
scmd->resid_len);
blk_mq_free_request(rq);
io_uring_cmd_done32(ioucmd, ret, res2,
IO_URING_CMD_TASK_WORK_ISSUE_FLAGS);
}
static enum rq_end_io_ret scsi_bsg_uring_cmd_done(struct request *req,
blk_status_t status,
const struct io_comp_batch *iocb)
{
struct io_uring_cmd *ioucmd = req->end_io_data;
io_uring_cmd_do_in_task_lazy(ioucmd, scsi_bsg_uring_task_cb);
return RQ_END_IO_NONE;
}
static int scsi_bsg_map_user_buffer(struct request *req,
struct io_uring_cmd *ioucmd,
unsigned int issue_flags, gfp_t gfp_mask)
{
const struct bsg_uring_cmd *cmd = io_uring_sqe128_cmd(ioucmd->sqe, struct bsg_uring_cmd);
bool is_write = cmd->dout_xfer_len > 0;
u64 buf_addr = is_write ? cmd->dout_xferp : cmd->din_xferp;
unsigned long buf_len = is_write ? cmd->dout_xfer_len : cmd->din_xfer_len;
struct iov_iter iter;
int ret;
if (ioucmd->flags & IORING_URING_CMD_FIXED) {
ret = io_uring_cmd_import_fixed(buf_addr, buf_len,
is_write ? WRITE : READ,
&iter, ioucmd, issue_flags);
if (ret < 0)
return ret;
ret = blk_rq_map_user_iov(req->q, req, NULL, &iter, gfp_mask);
} else {
ret = blk_rq_map_user(req->q, req, NULL, uptr64(buf_addr),
buf_len, gfp_mask);
}
return ret;
}
static int scsi_bsg_uring_cmd(struct request_queue *q, struct io_uring_cmd *ioucmd,
unsigned int issue_flags, bool open_for_write)
{
struct scsi_bsg_uring_cmd_pdu *pdu = scsi_bsg_uring_cmd_pdu(ioucmd);
const struct bsg_uring_cmd *cmd = io_uring_sqe128_cmd(ioucmd->sqe, struct bsg_uring_cmd);
struct scsi_cmnd *scmd;
struct request *req;
blk_mq_req_flags_t blk_flags = 0;
gfp_t gfp_mask = GFP_KERNEL;
int ret;
if (cmd->protocol != BSG_PROTOCOL_SCSI ||
cmd->subprotocol != BSG_SUB_PROTOCOL_SCSI_CMD)
return -EINVAL;
if (!cmd->request || cmd->request_len == 0)
return -EINVAL;
if (cmd->dout_xfer_len && cmd->din_xfer_len) {
pr_warn_once("BIDI support in bsg has been removed.\n");
return -EOPNOTSUPP;
}
if (cmd->dout_iovec_count > 0 || cmd->din_iovec_count > 0)
return -EOPNOTSUPP;
if (issue_flags & IO_URING_F_NONBLOCK) {
blk_flags = BLK_MQ_REQ_NOWAIT;
gfp_mask = GFP_NOWAIT;
}
req = scsi_alloc_request(q, cmd->dout_xfer_len ?
REQ_OP_DRV_OUT : REQ_OP_DRV_IN, blk_flags);
if (IS_ERR(req))
return PTR_ERR(req);
scmd = blk_mq_rq_to_pdu(req);
if (cmd->request_len > sizeof(scmd->cmnd)) {
ret = -EINVAL;
goto out_free_req;
}
scmd->cmd_len = cmd->request_len;
scmd->allowed = SG_DEFAULT_RETRIES;
if (copy_from_user(scmd->cmnd, uptr64(cmd->request), cmd->request_len)) {
ret = -EFAULT;
goto out_free_req;
}
if (!scsi_cmd_allowed(scmd->cmnd, open_for_write)) {
ret = -EPERM;
goto out_free_req;
}
pdu->response_addr = cmd->response;
scmd->sense_len = cmd->max_response_len ?
min(cmd->max_response_len, SCSI_SENSE_BUFFERSIZE) : SCSI_SENSE_BUFFERSIZE;
if (cmd->dout_xfer_len || cmd->din_xfer_len) {
ret = scsi_bsg_map_user_buffer(req, ioucmd, issue_flags, gfp_mask);
if (ret)
goto out_free_req;
pdu->bio = req->bio;
} else {
pdu->bio = NULL;
}
req->timeout = cmd->timeout_ms ?
msecs_to_jiffies(cmd->timeout_ms) : BLK_DEFAULT_SG_TIMEOUT;
req->end_io = scsi_bsg_uring_cmd_done;
req->end_io_data = ioucmd;
pdu->req = req;
blk_execute_rq_nowait(req, false);
return -EIOCBQUEUED;
out_free_req:
blk_mq_free_request(req);
return ret;
}
static int scsi_bsg_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
bool open_for_write, unsigned int timeout)
{
@@ -99,5 +272,6 @@ out_put_request:
struct bsg_device *scsi_bsg_register_queue(struct scsi_device *sdev)
{
return bsg_register_queue(sdev->request_queue, &sdev->sdev_gendev,
dev_name(&sdev->sdev_gendev), scsi_bsg_sg_io_fn);
dev_name(&sdev->sdev_gendev), scsi_bsg_sg_io_fn,
scsi_bsg_uring_cmd);
}

View File

@@ -173,7 +173,7 @@ static int fd_configure_device(struct se_device *dev)
*/
dev->dev_attrib.max_write_same_len = 0xFFFF;
if (bdev_nonrot(bdev))
if (!bdev_rot(bdev))
dev->dev_attrib.is_nonrot = 1;
} else {
if (!(fd_dev->fbd_flags & FBDF_HAS_SIZE)) {

View File

@@ -148,7 +148,7 @@ static int iblock_configure_device(struct se_device *dev)
else
dev->dev_attrib.max_write_same_len = 0xFFFF;
if (bdev_nonrot(bd))
if (!bdev_rot(bd))
dev->dev_attrib.is_nonrot = 1;
target_configure_write_atomic_from_bdev(&dev->dev_attrib, bd);

View File

@@ -694,7 +694,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
}
if (!bdev_nonrot(file_bdev(bdev_file)))
if (bdev_rot(file_bdev(bdev_file)))
fs_devices->rotating = true;
if (bdev_max_discard_sectors(file_bdev(bdev_file)))
@@ -2919,7 +2919,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
if (!bdev_nonrot(device->bdev))
if (bdev_rot(device->bdev))
fs_devices->rotating = true;
orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);

View File

@@ -73,7 +73,7 @@ static int mbt_mb_init(struct super_block *sb)
ext4_fsblk_t block;
int ret;
/* needed by ext4_mb_init->bdev_nonrot(sb->s_bdev) */
/* needed by ext4_mb_init->bdev_rot(sb->s_bdev) */
sb->s_bdev = kzalloc_obj(*sb->s_bdev);
if (sb->s_bdev == NULL)
return -ENOMEM;

View File

@@ -3840,7 +3840,7 @@ int ext4_mb_init(struct super_block *sb)
spin_lock_init(&lg->lg_prealloc_lock);
}
if (bdev_nonrot(sb->s_bdev))
if (!bdev_rot(sb->s_bdev))
sbi->s_mb_max_linear_groups = 0;
else
sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT;

View File

@@ -670,7 +670,6 @@ xfs_zone_gc_start_chunk(
struct xfs_inode *ip;
struct bio *bio;
xfs_daddr_t daddr;
unsigned int len;
bool is_seq;
if (xfs_is_shutdown(mp))
@@ -685,15 +684,16 @@ xfs_zone_gc_start_chunk(
return false;
}
len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
bio = bio_alloc_bioset(bdev,
min(howmany(len, XFS_GC_BUF_SIZE) + 1, XFS_GC_NR_BUFS),
REQ_OP_READ, GFP_NOFS, &data->bio_set);
/*
* Scratch allocation can wrap around to the same buffer again,
* provision an extra bvec for that case.
*/
bio = bio_alloc_bioset(bdev, XFS_GC_NR_BUFS + 1, REQ_OP_READ, GFP_NOFS,
&data->bio_set);
chunk = container_of(bio, struct xfs_gc_bio, bio);
chunk->ip = ip;
chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
chunk->len = len;
chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
chunk->old_startblock =
xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
chunk->new_daddr = daddr;
@@ -707,8 +707,9 @@ xfs_zone_gc_start_chunk(
bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
bio->bi_end_io = xfs_zone_gc_end_io;
xfs_zone_gc_add_data(chunk);
data->scratch_head = (data->scratch_head + len) % data->scratch_size;
data->scratch_available -= len;
data->scratch_head =
(data->scratch_head + chunk->len) % data->scratch_size;
data->scratch_available -= chunk->len;
XFS_STATS_INC(mp, xs_gc_read_calls);
@@ -899,9 +900,10 @@ out:
static void
xfs_submit_zone_reset_bio(
struct xfs_rtgroup *rtg,
struct bio *bio)
struct bio *bio,
void *priv)
{
struct xfs_rtgroup *rtg = priv;
struct xfs_mount *mp = rtg_mount(rtg);
trace_xfs_zone_reset(rtg);
@@ -933,26 +935,16 @@ xfs_submit_zone_reset_bio(
submit_bio(bio);
}
static void xfs_bio_wait_endio(struct bio *bio)
{
complete(bio->bi_private);
}
int
xfs_zone_gc_reset_sync(
struct xfs_rtgroup *rtg)
{
DECLARE_COMPLETION_ONSTACK(done);
struct bio bio;
int error;
bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
REQ_OP_ZONE_RESET | REQ_SYNC);
bio.bi_private = &done;
bio.bi_end_io = xfs_bio_wait_endio;
xfs_submit_zone_reset_bio(rtg, &bio);
wait_for_completion_io(&done);
bio_await(&bio, rtg, xfs_submit_zone_reset_bio);
error = blk_status_to_errno(bio.bi_status);
bio_uninit(&bio);
return error;
@@ -989,7 +981,7 @@ xfs_zone_gc_reset_zones(
chunk->data = data;
WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
list_add_tail(&chunk->entry, &data->resetting);
xfs_submit_zone_reset_bio(rtg, bio);
xfs_submit_zone_reset_bio(bio, rtg);
} while (next);
}

View File

@@ -1,20 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* HKDF: HMAC-based Key Derivation Function (HKDF), RFC 5869
*
* Extracted from fs/crypto/hkdf.c, which has
* Copyright 2019 Google LLC
*/
#ifndef _CRYPTO_HKDF_H
#define _CRYPTO_HKDF_H
#include <crypto/hash.h>
int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm,
unsigned int ikmlen, const u8 *salt, unsigned int saltlen,
u8 *prk);
int hkdf_expand(struct crypto_shash *hmac_tfm,
const u8 *info, unsigned int infolen,
u8 *okm, unsigned int okmlen);
#endif

View File

@@ -350,8 +350,7 @@ extern void bioset_exit(struct bio_set *);
extern int biovec_init_pool(mempool_t *pool, int pool_entries);
struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
blk_opf_t opf, gfp_t gfp_mask,
struct bio_set *bs);
blk_opf_t opf, gfp_t gfp, struct bio_set *bs);
struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask);
extern void bio_put(struct bio *);
@@ -433,6 +432,8 @@ extern void bio_uninit(struct bio *);
void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf);
void bio_reuse(struct bio *bio, blk_opf_t opf);
void bio_chain(struct bio *, struct bio *);
void bio_await(struct bio *bio, void *priv,
void (*submit)(struct bio *bio, void *priv));
int __must_check bio_add_page(struct bio *bio, struct page *page, unsigned len,
unsigned off);

View File

@@ -14,6 +14,7 @@ enum blk_integrity_flags {
BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2,
BLK_INTEGRITY_REF_TAG = 1 << 3,
BLK_INTEGRITY_STACKED = 1 << 4,
BLK_SPLIT_INTERVAL_CAPABLE = 1 << 5,
};
const char *blk_integrity_profile_name(struct blk_integrity *bi);

View File

@@ -13,6 +13,7 @@
#include <linux/minmax.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/completion.h>
#include <linux/wait.h>
#include <linux/bio.h>
#include <linux/gfp.h>
@@ -201,10 +202,14 @@ struct gendisk {
u8 __rcu *zones_cond;
unsigned int zone_wplugs_hash_bits;
atomic_t nr_zone_wplugs;
spinlock_t zone_wplugs_lock;
spinlock_t zone_wplugs_hash_lock;
struct mempool *zone_wplugs_pool;
struct hlist_head *zone_wplugs_hash;
struct workqueue_struct *zone_wplugs_wq;
spinlock_t zone_wplugs_list_lock;
struct list_head zone_wplugs_list;
struct task_struct *zone_wplugs_worker;
struct completion zone_wplugs_worker_bio_done;
#endif /* CONFIG_BLK_DEV_ZONED */
#if IS_ENABLED(CONFIG_CDROM)
@@ -503,7 +508,7 @@ struct request_queue {
/* hw dispatch queues */
unsigned int nr_hw_queues;
struct blk_mq_hw_ctx * __rcu *queue_hw_ctx;
struct blk_mq_hw_ctx * __rcu *queue_hw_ctx __counted_by_ptr(nr_hw_queues);
struct percpu_ref q_usage_counter;
struct lock_class_key io_lock_cls_key;
@@ -669,6 +674,7 @@ enum {
QUEUE_FLAG_NO_ELV_SWITCH, /* can't switch elevator any more */
QUEUE_FLAG_QOS_ENABLED, /* qos is enabled */
QUEUE_FLAG_BIO_ISSUE_TIME, /* record bio->issue_time_ns */
QUEUE_FLAG_ZONED_QD1_WRITES, /* Limit zoned devices writes to QD=1 */
QUEUE_FLAG_MAX
};
@@ -708,6 +714,8 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
test_bit(QUEUE_FLAG_DISABLE_WBT_DEF, &(q)->queue_flags)
#define blk_queue_no_elv_switch(q) \
test_bit(QUEUE_FLAG_NO_ELV_SWITCH, &(q)->queue_flags)
#define blk_queue_zoned_qd1_writes(q) \
test_bit(QUEUE_FLAG_ZONED_QD1_WRITES, &(q)->queue_flags)
extern void blk_set_pm_only(struct request_queue *q);
extern void blk_clear_pm_only(struct request_queue *q);
@@ -1468,11 +1476,6 @@ static inline bool bdev_rot(struct block_device *bdev)
return blk_queue_rot(bdev_get_queue(bdev));
}
static inline bool bdev_nonrot(struct block_device *bdev)
{
return !bdev_rot(bdev);
}
static inline bool bdev_synchronous(struct block_device *bdev)
{
return bdev->bd_disk->queue->limits.features & BLK_FEAT_SYNCHRONOUS;

View File

@@ -7,13 +7,17 @@
struct bsg_device;
struct device;
struct request_queue;
struct io_uring_cmd;
typedef int (bsg_sg_io_fn)(struct request_queue *, struct sg_io_v4 *hdr,
bool open_for_write, unsigned int timeout);
typedef int (bsg_uring_cmd_fn)(struct request_queue *q, struct io_uring_cmd *ioucmd,
unsigned int issue_flags, bool open_for_write);
struct bsg_device *bsg_register_queue(struct request_queue *q,
struct device *parent, const char *name,
bsg_sg_io_fn *sg_io_fn);
bsg_sg_io_fn *sg_io_fn, bsg_uring_cmd_fn *uring_cmd_fn);
void bsg_unregister_queue(struct bsg_device *bcd);
#endif /* _LINUX_BSG_H */

View File

@@ -203,15 +203,6 @@ static inline void bvec_iter_advance_single(const struct bio_vec *bv,
((bvl = mp_bvec_iter_bvec((bio_vec), (iter))), 1); \
bvec_iter_advance_single((bio_vec), &(iter), (bvl).bv_len))
/* for iterating one bio from start to end */
#define BVEC_ITER_ALL_INIT (struct bvec_iter) \
{ \
.bi_sector = 0, \
.bi_size = UINT_MAX, \
.bi_idx = 0, \
.bi_bvec_done = 0, \
}
static inline struct bio_vec *bvec_init_iter_all(struct bvec_iter_all *iter_all)
{
iter_all->done = 0;

Some files were not shown because too many files have changed in this diff Show More