Merge tag 'for-7.1/block-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull block updates from Jens Axboe:

 - Add shared memory zero-copy I/O support for ublk, bypassing per-I/O
   copies between kernel and userspace by matching registered buffer
   PFNs at I/O time. Includes selftests.

 - Refactor bio integrity to support filesystem initiated integrity
   operations and arbitrary buffer alignment.

 - Clean up bio allocation, splitting bio_alloc_bioset() into clear fast
   and slow paths. Add bio_await() and bio_submit_or_kill() helpers,
   unify synchronous bi_end_io callbacks.

 - Fix zone write plug refcount handling and plug removal races. Add
   support for serializing zone writes at QD=1 for rotational zoned
   devices, yielding significant throughput improvements.

 - Add SED-OPAL ioctls for Single User Mode management and a STACK_RESET
   command.

 - Add io_uring passthrough (uring_cmd) support to the BSG layer.

 - Replace pp_buf in partition scanning with struct seq_buf.

 - zloop improvements and cleanups.

 - drbd genl cleanup, switching to pre_doit/post_doit.

 - NVMe pull request via Keith:
      - Fabrics authentication updates
      - Enhanced block queue limits support
      - Workqueue usage updates
      - A new write zeroes device quirk
      - Tagset cleanup fix for loop device

 - MD pull requests via Yu Kuai:
      - Fix raid5 soft lockup in retry_aligned_read()
      - Fix raid10 deadlock with check operation and nowait requests
      - Fix raid1 overlapping writes on writemostly disks
      - Fix sysfs deadlock on array_state=clear
      - Proactive RAID-5 parity building with llbitmap, with
        write_zeroes_unmap optimization for initial sync
      - Fix llbitmap barrier ordering, rdev skipping, and bitmap_ops
        version mismatch fallback
      - Fix bcache use-after-free and uninitialized closure
      - Validate raid5 journal metadata payload size
      - Various cleanups

 - Various other fixes, improvements, and cleanups

* tag 'for-7.1/block-20260411' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (146 commits)
  ublk: fix tautological comparison warning in ublk_ctrl_reg_buf
  scsi: bsg: fix buffer overflow in scsi_bsg_uring_cmd()
  block: refactor blkdev_zone_mgmt_ioctl
  MAINTAINERS: update ublk driver maintainer email
  Documentation: ublk: address review comments for SHMEM_ZC docs
  ublk: allow buffer registration before device is started
  ublk: replace xarray with IDA for shmem buffer index allocation
  ublk: simplify PFN range loop in __ublk_ctrl_reg_buf
  ublk: verify all pages in multi-page bvec fall within registered range
  ublk: widen ublk_shmem_buf_reg.len to __u64 for 4GB buffer support
  xfs: use bio_await in xfs_zone_gc_reset_sync
  block: add a bio_submit_or_kill helper
  block: factor out a bio_await helper
  block: unify the synchronous bi_end_io callbacks
  xfs: fix number of GC bvecs
  selftests/ublk: add read-only buffer registration test
  selftests/ublk: add filesystem fio verify test for shmem_zc
  selftests/ublk: add hugetlbfs shmem_zc test for loop target
  selftests/ublk: add shared memory zero-copy test
  selftests/ublk: add UBLK_F_SHMEM_ZC support for loop target
  ...
This commit is contained in:
Linus Torvalds
2026-04-13 15:51:31 -07:00
121 changed files with 5504 additions and 3124 deletions

View File

@@ -886,6 +886,21 @@ Description:
zone commands, they will be treated as regular block devices and
zoned will report "none".
What: /sys/block/<disk>/queue/zoned_qd1_writes
Date: January 2026
Contact: Damien Le Moal <dlemoal@kernel.org>
Description:
[RW] zoned_qd1_writes indicates if write operations to a zoned
block device are being handled using a single issuer context (a
kernel thread) operating at a maximum queue depth of 1. This
attribute is visible only for zoned block devices. The default
value for zoned block devices that are not rotational devices
(e.g. ZNS SSDs or zoned UFS devices) is 0. For rotational zoned
block devices (e.g. SMR HDDs) the default value is 1. Since
this default may not be appropriate for some devices, e.g.
remotely connected devices over high latency networks, the user
can disable this feature by setting this attribute to 0.
What: /sys/block/<disk>/hidden
Date: March 2023

View File

@@ -0,0 +1,13 @@
What: /sys/devices/virtual/nvme-fabrics/ctl/.../tls_configured_key
Date: November 2025
KernelVersion: 6.19
Contact: Linux NVMe mailing list <linux-nvme@lists.infradead.org>
Description:
The file is available when using a secure concatenation
connection to an NVMe target. Reading the file will return
the serial of the currently negotiated key.
Writing 0 to the file will trigger a PSK reauthentication
(REPLACETLSPSK) with the target. After a reauthentication
the value returned by tls_configured_key will be the new
serial.

View File

@@ -62,7 +62,7 @@ The options available for the add command can be listed by reading the
/dev/zloop-control device::
$ cat /dev/zloop-control
add id=%d,capacity_mb=%u,zone_size_mb=%u,zone_capacity_mb=%u,conv_zones=%u,base_dir=%s,nr_queues=%u,queue_depth=%u,buffered_io
add id=%d,capacity_mb=%u,zone_size_mb=%u,zone_capacity_mb=%u,conv_zones=%u,max_open_zones=%u,base_dir=%s,nr_queues=%u,queue_depth=%u,buffered_io,zone_append=%u,ordered_zone_append,discard_write_cache
remove id=%d
In more details, the options that can be used with the "add" command are as
@@ -80,6 +80,9 @@ zone_capacity_mb Device zone capacity (must always be equal to or lower
conv_zones Total number of conventional zones starting from
sector 0
Default: 8
max_open_zones Maximum number of open sequential write required zones
(0 for no limit).
Default: 0
base_dir Path to the base directory where to create the directory
containing the zone files of the device.
Default=/var/local/zloop.
@@ -104,6 +107,11 @@ ordered_zone_append Enable zloop mitigation of zone append reordering.
(extents), as when enabled, this can significantly reduce
the number of data extents needed for a file data
mapping.
discard_write_cache Discard all data that was not explicitly persisted using a
flush operation when the device is removed by truncating
each zone file to the size recorded during the last flush
operation. This simulates power fail events where
uncommitted data is lost.
=================== =========================================================
3) Deleting a Zoned Device

View File

@@ -153,7 +153,7 @@ blk-crypto-fallback completes the original bio. If the original bio is too
large, multiple bounce bios may be required; see the code for details.
For decryption, blk-crypto-fallback "wraps" the bio's completion callback
(``bi_complete``) and private data (``bi_private``) with its own, unsets the
(``bi_end_io``) and private data (``bi_private``) with its own, unsets the
bio's encryption context, then submits the bio. If the read completes
successfully, blk-crypto-fallback restores the bio's original completion
callback and private data, then decrypts the bio's data in-place using the

View File

@@ -485,6 +485,125 @@ Limitations
in case that too many ublk devices are handled by this single io_ring_ctx
and each one has very large queue depth
Shared Memory Zero Copy (UBLK_F_SHMEM_ZC)
------------------------------------------
The ``UBLK_F_SHMEM_ZC`` feature provides an alternative zero-copy path
that works by sharing physical memory pages between the client application
and the ublk server. Unlike the io_uring fixed buffer approach above,
shared memory zero copy does not require io_uring buffer registration
per I/O — instead, it relies on the kernel matching physical pages
at I/O time. This allows the ublk server to access the shared
buffer directly, which is unlikely for the io_uring fixed buffer
approach.
Motivation
~~~~~~~~~~
Shared memory zero copy takes a different approach: if the client
application and the ublk server both map the same physical memory, there is
nothing to copy. The kernel detects the shared pages automatically and
tells the server where the data already lives.
``UBLK_F_SHMEM_ZC`` can be thought of as a supplement for optimized client
applications — when the client is willing to allocate I/O buffers from
shared memory, the entire data path becomes zero-copy.
Use Cases
~~~~~~~~~
This feature is useful when the client application can be configured to
use a specific shared memory region for its I/O buffers:
- **Custom storage clients** that allocate I/O buffers from shared memory
(memfd, hugetlbfs) and issue direct I/O to the ublk device
- **Database engines** that use pre-allocated buffer pools with O_DIRECT
How It Works
~~~~~~~~~~~~
1. The ublk server and client both ``mmap()`` the same file (memfd or
hugetlbfs) with ``MAP_SHARED``. This gives both processes access to the
same physical pages.
2. The ublk server registers its mapping with the kernel::
struct ublk_shmem_buf_reg buf = { .addr = mmap_va, .len = size };
ublk_ctrl_cmd(UBLK_U_CMD_REG_BUF, .addr = &buf);
The kernel pins the pages and builds a PFN lookup tree.
3. When the client issues direct I/O (``O_DIRECT``) to ``/dev/ublkb*``,
the kernel checks whether the I/O buffer pages match any registered
pages by comparing PFNs.
4. On a match, the kernel sets ``UBLK_IO_F_SHMEM_ZC`` in the I/O
descriptor and encodes the buffer index and offset in ``addr``::
if (iod->op_flags & UBLK_IO_F_SHMEM_ZC) {
/* Data is already in our shared mapping — zero copy */
index = ublk_shmem_zc_index(iod->addr);
offset = ublk_shmem_zc_offset(iod->addr);
buf = shmem_table[index].mmap_base + offset;
}
5. If pages do not match (e.g., the client used a non-shared buffer),
the I/O falls back to the normal copy path silently.
The shared memory can be set up via two methods:
- **Socket-based**: the client sends a memfd to the ublk server via
``SCM_RIGHTS`` on a unix socket. The server mmaps and registers it.
- **Hugetlbfs-based**: both processes ``mmap(MAP_SHARED)`` the same
hugetlbfs file. No IPC needed — same file gives same physical pages.
Advantages
~~~~~~~~~~
- **Simple**: no per-I/O buffer registration or unregistration commands.
Once the shared buffer is registered, all matching I/O is zero-copy
automatically.
- **Direct buffer access**: the ublk server can read and write the shared
buffer directly via its own mmap, without going through io_uring fixed
buffer operations. This is more friendly for server implementations.
- **Fast**: PFN matching is a single maple tree lookup per bvec. No
io_uring command round-trips for buffer management.
- **Compatible**: non-matching I/O silently falls back to the copy path.
The device works normally for any client, with zero-copy as an
optimization when shared memory is available.
Limitations
~~~~~~~~~~~
- **Requires client cooperation**: the client must allocate its I/O
buffers from the shared memory region. This requires a custom or
configured client — standard applications using their own buffers
will not benefit.
- **Direct I/O only**: buffered I/O (without ``O_DIRECT``) goes through
the page cache, which allocates its own pages. These kernel-allocated
pages will never match the registered shared buffer. Only ``O_DIRECT``
puts the client's buffer pages directly into the block I/O.
- **Contiguous data only**: each I/O request's data must be contiguous
within a single registered buffer. Scatter/gather I/O that spans
multiple non-adjacent registered buffers cannot use the zero-copy path.
Control Commands
~~~~~~~~~~~~~~~~
- ``UBLK_U_CMD_REG_BUF``
Register a shared memory buffer. ``ctrl_cmd.addr`` points to a
``struct ublk_shmem_buf_reg`` containing the buffer virtual address and size.
Returns the assigned buffer index (>= 0) on success. The kernel pins
pages and builds the PFN lookup tree. Queue freeze is handled
internally.
- ``UBLK_U_CMD_UNREG_BUF``
Unregister a previously registered buffer. ``ctrl_cmd.data[0]`` is the
buffer index. Unpins pages and removes PFN entries from the lookup
tree.
References
==========

View File

@@ -27015,7 +27015,7 @@ F: Documentation/filesystems/ubifs.rst
F: fs/ubifs/
UBLK USERSPACE BLOCK DRIVER
M: Ming Lei <ming.lei@redhat.com>
M: Ming Lei <tom.leiming@gmail.com>
L: linux-block@vger.kernel.org
S: Maintained
F: Documentation/block/ublk.rst

View File

@@ -18,6 +18,7 @@
#include <linux/highmem.h>
#include <linux/blk-crypto.h>
#include <linux/xarray.h>
#include <linux/kmemleak.h>
#include <trace/events/block.h>
#include "blk.h"
@@ -34,6 +35,8 @@ struct bio_alloc_cache {
unsigned int nr_irq;
};
#define BIO_INLINE_VECS 4
static struct biovec_slab {
int nr_vecs;
char *name;
@@ -114,6 +117,11 @@ static inline unsigned int bs_bio_slab_size(struct bio_set *bs)
return bs->front_pad + sizeof(struct bio) + bs->back_pad;
}
static inline void *bio_slab_addr(struct bio *bio)
{
return (void *)bio - bio->bi_pool->front_pad;
}
static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs)
{
unsigned int size = bs_bio_slab_size(bs);
@@ -159,57 +167,16 @@ out:
mutex_unlock(&bio_slab_lock);
}
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs)
{
BUG_ON(nr_vecs > BIO_MAX_VECS);
if (nr_vecs == BIO_MAX_VECS)
mempool_free(bv, pool);
else if (nr_vecs > BIO_INLINE_VECS)
kmem_cache_free(biovec_slab(nr_vecs)->slab, bv);
}
/*
* Make the first allocation restricted and don't dump info on allocation
* failures, since we'll fall back to the mempool in case of failure.
*/
static inline gfp_t bvec_alloc_gfp(gfp_t gfp)
static inline gfp_t try_alloc_gfp(gfp_t gfp)
{
return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) |
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
}
struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
gfp_t gfp_mask)
{
struct biovec_slab *bvs = biovec_slab(*nr_vecs);
if (WARN_ON_ONCE(!bvs))
return NULL;
/*
* Upgrade the nr_vecs request to take full advantage of the allocation.
* We also rely on this in the bvec_free path.
*/
*nr_vecs = bvs->nr_vecs;
/*
* Try a slab allocation first for all smaller allocations. If that
* fails and __GFP_DIRECT_RECLAIM is set retry with the mempool.
* The mempool is sized to handle up to BIO_MAX_VECS entries.
*/
if (*nr_vecs < BIO_MAX_VECS) {
struct bio_vec *bvl;
bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask));
if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM))
return bvl;
*nr_vecs = BIO_MAX_VECS;
}
return mempool_alloc(pool, gfp_mask);
}
void bio_uninit(struct bio *bio)
{
#ifdef CONFIG_BLK_CGROUP
@@ -231,9 +198,14 @@ static void bio_free(struct bio *bio)
void *p = bio;
WARN_ON_ONCE(!bs);
WARN_ON_ONCE(bio->bi_max_vecs > BIO_MAX_VECS);
bio_uninit(bio);
bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs);
if (bio->bi_max_vecs == BIO_MAX_VECS)
mempool_free(bio->bi_io_vec, &bs->bvec_pool);
else if (bio->bi_max_vecs > BIO_INLINE_VECS)
kmem_cache_free(biovec_slab(bio->bi_max_vecs)->slab,
bio->bi_io_vec);
mempool_free(p - bs->front_pad, &bs->bio_pool);
}
@@ -430,13 +402,31 @@ static void bio_alloc_rescue(struct work_struct *work)
}
}
/*
* submit_bio_noacct() converts recursion to iteration; this means if we're
* running beneath it, any bios we allocate and submit will not be submitted
* (and thus freed) until after we return.
*
* This exposes us to a potential deadlock if we allocate multiple bios from the
* same bio_set while running underneath submit_bio_noacct(). If we were to
* allocate multiple bios (say a stacking block driver that was splitting bios),
* we would deadlock if we exhausted the mempool's reserve.
*
* We solve this, and guarantee forward progress by punting the bios on
* current->bio_list to a per bio_set rescuer workqueue before blocking to wait
* for elements being returned to the mempool.
*/
static void punt_bios_to_rescuer(struct bio_set *bs)
{
struct bio_list punt, nopunt;
struct bio *bio;
if (WARN_ON_ONCE(!bs->rescue_workqueue))
if (!current->bio_list || !bs->rescue_workqueue)
return;
if (bio_list_empty(&current->bio_list[0]) &&
bio_list_empty(&current->bio_list[1]))
return;
/*
* In order to guarantee forward progress we must punt only bios that
* were allocated from this bio_set; otherwise, if there was a bio on
@@ -483,9 +473,7 @@ static void bio_alloc_irq_cache_splice(struct bio_alloc_cache *cache)
local_irq_restore(flags);
}
static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp,
struct bio_set *bs)
static struct bio *bio_alloc_percpu_cache(struct bio_set *bs)
{
struct bio_alloc_cache *cache;
struct bio *bio;
@@ -503,12 +491,10 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
cache->free_list = bio->bi_next;
cache->nr--;
put_cpu();
if (nr_vecs)
bio_init_inline(bio, bdev, nr_vecs, opf);
else
bio_init(bio, bdev, NULL, nr_vecs, opf);
bio->bi_pool = bs;
kmemleak_alloc(bio_slab_addr(bio),
kmem_cache_size(bs->bio_slab), 1, GFP_NOIO);
return bio;
}
@@ -517,7 +503,7 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
* @bdev: block device to allocate the bio for (can be %NULL)
* @nr_vecs: number of bvecs to pre-allocate
* @opf: operation and flags for bio
* @gfp_mask: the GFP_* mask given to the slab allocator
* @gfp: the GFP_* mask given to the slab allocator
* @bs: the bio_set to allocate from.
*
* Allocate a bio from the mempools in @bs.
@@ -547,91 +533,77 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
* Returns: Pointer to new bio on success, NULL on failure.
*/
struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
blk_opf_t opf, gfp_t gfp_mask,
struct bio_set *bs)
blk_opf_t opf, gfp_t gfp, struct bio_set *bs)
{
gfp_t saved_gfp = gfp_mask;
struct bio *bio;
struct bio_vec *bvecs = NULL;
struct bio *bio = NULL;
gfp_t saved_gfp = gfp;
void *p;
/* should not use nobvec bioset for nr_vecs > 0 */
if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_vecs > 0))
return NULL;
gfp = try_alloc_gfp(gfp);
if (bs->cache && nr_vecs <= BIO_INLINE_VECS) {
opf |= REQ_ALLOC_CACHE;
bio = bio_alloc_percpu_cache(bdev, nr_vecs, opf,
gfp_mask, bs);
if (bio)
return bio;
/*
* No cached bio available, bio returned below marked with
* REQ_ALLOC_CACHE to participate in per-cpu alloc cache.
* Set REQ_ALLOC_CACHE even if no cached bio is available to
* return the allocated bio to the percpu cache when done.
*/
} else
opf &= ~REQ_ALLOC_CACHE;
/*
* submit_bio_noacct() converts recursion to iteration; this means if
* we're running beneath it, any bios we allocate and submit will not be
* submitted (and thus freed) until after we return.
*
* This exposes us to a potential deadlock if we allocate multiple bios
* from the same bio_set() while running underneath submit_bio_noacct().
* If we were to allocate multiple bios (say a stacking block driver
* that was splitting bios), we would deadlock if we exhausted the
* mempool's reserve.
*
* We solve this, and guarantee forward progress, with a rescuer
* workqueue per bio_set. If we go to allocate and there are bios on
* current->bio_list, we first try the allocation without
* __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be
* blocking to the rescuer workqueue before we retry with the original
* gfp_flags.
*/
if (current->bio_list &&
(!bio_list_empty(&current->bio_list[0]) ||
!bio_list_empty(&current->bio_list[1])) &&
bs->rescue_workqueue)
gfp_mask &= ~__GFP_DIRECT_RECLAIM;
p = mempool_alloc(&bs->bio_pool, gfp_mask);
if (!p && gfp_mask != saved_gfp) {
punt_bios_to_rescuer(bs);
gfp_mask = saved_gfp;
p = mempool_alloc(&bs->bio_pool, gfp_mask);
}
if (unlikely(!p))
return NULL;
if (!mempool_is_saturated(&bs->bio_pool))
opf &= ~REQ_ALLOC_CACHE;
bio = p + bs->front_pad;
if (nr_vecs > BIO_INLINE_VECS) {
struct bio_vec *bvl = NULL;
bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask);
if (!bvl && gfp_mask != saved_gfp) {
punt_bios_to_rescuer(bs);
gfp_mask = saved_gfp;
bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask);
}
if (unlikely(!bvl))
goto err_free;
bio_init(bio, bdev, bvl, nr_vecs, opf);
} else if (nr_vecs) {
bio_init_inline(bio, bdev, BIO_INLINE_VECS, opf);
opf |= REQ_ALLOC_CACHE;
bio = bio_alloc_percpu_cache(bs);
} else {
bio_init(bio, bdev, NULL, 0, opf);
opf &= ~REQ_ALLOC_CACHE;
p = kmem_cache_alloc(bs->bio_slab, gfp);
if (p)
bio = p + bs->front_pad;
}
if (bio && nr_vecs > BIO_INLINE_VECS) {
struct biovec_slab *bvs = biovec_slab(nr_vecs);
/*
* Upgrade nr_vecs to take full advantage of the allocation.
* We also rely on this in bio_free().
*/
nr_vecs = bvs->nr_vecs;
bvecs = kmem_cache_alloc(bvs->slab, gfp);
if (unlikely(!bvecs)) {
kmem_cache_free(bs->bio_slab, p);
bio = NULL;
}
}
if (unlikely(!bio)) {
/*
* Give up if we are not allowed to sleep as non-blocking mempool
* allocations just go back to the slab allocation.
*/
if (!(saved_gfp & __GFP_DIRECT_RECLAIM))
return NULL;
punt_bios_to_rescuer(bs);
/*
* Don't rob the mempools by returning to the per-CPU cache if
* we're tight on memory.
*/
opf &= ~REQ_ALLOC_CACHE;
p = mempool_alloc(&bs->bio_pool, saved_gfp);
bio = p + bs->front_pad;
if (nr_vecs > BIO_INLINE_VECS) {
nr_vecs = BIO_MAX_VECS;
bvecs = mempool_alloc(&bs->bvec_pool, saved_gfp);
}
}
if (nr_vecs && nr_vecs <= BIO_INLINE_VECS)
bio_init_inline(bio, bdev, nr_vecs, opf);
else
bio_init(bio, bdev, bvecs, nr_vecs, opf);
bio->bi_pool = bs;
return bio;
err_free:
mempool_free(p, &bs->bio_pool);
return NULL;
}
EXPORT_SYMBOL(bio_alloc_bioset);
@@ -765,6 +737,9 @@ static int __bio_alloc_cache_prune(struct bio_alloc_cache *cache,
while ((bio = cache->free_list) != NULL) {
cache->free_list = bio->bi_next;
cache->nr--;
kmemleak_alloc(bio_slab_addr(bio),
kmem_cache_size(bio->bi_pool->bio_slab),
1, GFP_KERNEL);
bio_free(bio);
if (++i == nr)
break;
@@ -828,6 +803,7 @@ static inline void bio_put_percpu_cache(struct bio *bio)
bio->bi_bdev = NULL;
cache->free_list = bio;
cache->nr++;
kmemleak_free(bio_slab_addr(bio));
} else if (in_hardirq()) {
lockdep_assert_irqs_disabled();
@@ -835,6 +811,7 @@ static inline void bio_put_percpu_cache(struct bio *bio)
bio->bi_next = cache->free_list_irq;
cache->free_list_irq = bio;
cache->nr_irq++;
kmemleak_free(bio_slab_addr(bio));
} else {
goto out_free;
}
@@ -897,10 +874,11 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
* @gfp: allocation priority
* @bs: bio_set to allocate from
*
* Allocate a new bio that is a clone of @bio_src. The caller owns the returned
* bio, but not the actual data it points to.
*
* The caller must ensure that the return bio is not freed before @bio_src.
* Allocate a new bio that is a clone of @bio_src. This reuses the bio_vecs
* pointed to by @bio_src->bi_io_vec, and clones the iterator pointing to
* the current position in it. The caller owns the returned bio, but not
* the bio_vecs, and must ensure the bio is freed before the memory
pointed to by @bio_src->bi_io_vec.
*/
struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src,
gfp_t gfp, struct bio_set *bs)
@@ -929,9 +907,7 @@ EXPORT_SYMBOL(bio_alloc_clone);
* @gfp: allocation priority
*
* Initialize a new bio in caller provided memory that is a clone of @bio_src.
* The caller owns the returned bio, but not the actual data it points to.
*
* The caller must ensure that @bio_src is not freed before @bio.
* The same bio_vecs reuse and bio lifetime rules as bio_alloc_clone() apply.
*/
int bio_init_clone(struct block_device *bdev, struct bio *bio,
struct bio *bio_src, gfp_t gfp)
@@ -1064,6 +1040,8 @@ int bio_add_page(struct bio *bio, struct page *page,
{
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return 0;
if (WARN_ON_ONCE(len == 0))
return 0;
if (bio->bi_iter.bi_size > BIO_MAX_SIZE - len)
return 0;
@@ -1484,11 +1462,41 @@ void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty)
bio_iov_iter_unbounce_read(bio, is_error, mark_dirty);
}
static void submit_bio_wait_endio(struct bio *bio)
static void bio_wait_end_io(struct bio *bio)
{
complete(bio->bi_private);
}
/**
* bio_await - call a function on a bio, and wait until it completes
* @bio: the bio which describes the I/O
* @submit: function called to submit the bio
* @priv: private data passed to @submit
*
* Wait for the bio as well as any bio chained off it after executing the
* passed in callback @submit. The wait for the bio is set up before calling
* @submit to ensure that the completion is captured. If @submit is %NULL,
* submit_bio() is used instead to submit the bio.
*
* Note: this overrides the bi_private and bi_end_io fields in the bio.
*/
void bio_await(struct bio *bio, void *priv,
void (*submit)(struct bio *bio, void *priv))
{
DECLARE_COMPLETION_ONSTACK_MAP(done,
bio->bi_bdev->bd_disk->lockdep_map);
bio->bi_private = &done;
bio->bi_end_io = bio_wait_end_io;
bio->bi_opf |= REQ_SYNC;
if (submit)
submit(bio, priv);
else
submit_bio(bio);
blk_wait_io(&done);
}
EXPORT_SYMBOL_GPL(bio_await);
/**
* submit_bio_wait - submit a bio, and wait until it completes
* @bio: The &struct bio which describes the I/O
@@ -1502,19 +1510,30 @@ static void submit_bio_wait_endio(struct bio *bio)
*/
int submit_bio_wait(struct bio *bio)
{
DECLARE_COMPLETION_ONSTACK_MAP(done,
bio->bi_bdev->bd_disk->lockdep_map);
bio->bi_private = &done;
bio->bi_end_io = submit_bio_wait_endio;
bio->bi_opf |= REQ_SYNC;
submit_bio(bio);
blk_wait_io(&done);
bio_await(bio, NULL, NULL);
return blk_status_to_errno(bio->bi_status);
}
EXPORT_SYMBOL(submit_bio_wait);
static void bio_endio_cb(struct bio *bio, void *priv)
{
bio_endio(bio);
}
/*
* Submit @bio synchronously, or call bio_endio on it if the current process
* is being killed.
*/
int bio_submit_or_kill(struct bio *bio, unsigned int flags)
{
if ((flags & BLKDEV_ZERO_KILLABLE) && fatal_signal_pending(current)) {
bio_await(bio, NULL, bio_endio_cb);
return -EINTR;
}
return submit_bio_wait(bio);
}
/**
* bdev_rw_virt - synchronously read into / write from kernel mapping
* @bdev: block device to access
@@ -1545,26 +1564,6 @@ int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data,
}
EXPORT_SYMBOL_GPL(bdev_rw_virt);
static void bio_wait_end_io(struct bio *bio)
{
complete(bio->bi_private);
bio_put(bio);
}
/*
* bio_await_chain - ends @bio and waits for every chained bio to complete
*/
void bio_await_chain(struct bio *bio)
{
DECLARE_COMPLETION_ONSTACK_MAP(done,
bio->bi_bdev->bd_disk->lockdep_map);
bio->bi_private = &done;
bio->bi_end_io = bio_wait_end_io;
bio_endio(bio);
blk_wait_io(&done);
}
void __bio_advance(struct bio *bio, unsigned bytes)
{
if (bio_integrity(bio))

View File

@@ -24,6 +24,7 @@
#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/wait_bit.h>
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/resume_user_mode.h>
@@ -611,6 +612,8 @@ restart:
q->root_blkg = NULL;
spin_unlock_irq(&q->queue_lock);
wake_up_var(&q->root_blkg);
}
static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
@@ -1498,6 +1501,18 @@ int blkcg_init_disk(struct gendisk *disk)
struct blkcg_gq *new_blkg, *blkg;
bool preloaded;
/*
* If the queue is shared across disk rebind (e.g., SCSI), the
* previous disk's blkcg state is cleaned up asynchronously via
* disk_release() -> blkcg_exit_disk(). Wait for that cleanup to
* finish (indicated by root_blkg becoming NULL) before setting up
* new blkcg state. Otherwise, we may overwrite q->root_blkg while
* the old one is still alive, and radix_tree_insert() in
* blkg_create() will fail with -EEXIST because the old entries
* still occupy the same queue id slot in blkcg->blkg_tree.
*/
wait_var_event(&q->root_blkg, !READ_ONCE(q->root_blkg));
new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL);
if (!new_blkg)
return -ENOMEM;
@@ -2022,6 +2037,7 @@ void blkcg_maybe_throttle_current(void)
return;
out:
rcu_read_unlock();
put_disk(disk);
}
/**

View File

@@ -18,7 +18,7 @@ struct blk_crypto_kobj {
struct blk_crypto_attr {
struct attribute attr;
ssize_t (*show)(struct blk_crypto_profile *profile,
struct blk_crypto_attr *attr, char *page);
const struct blk_crypto_attr *attr, char *page);
};
static struct blk_crypto_profile *kobj_to_crypto_profile(struct kobject *kobj)
@@ -26,39 +26,39 @@ static struct blk_crypto_profile *kobj_to_crypto_profile(struct kobject *kobj)
return container_of(kobj, struct blk_crypto_kobj, kobj)->profile;
}
static struct blk_crypto_attr *attr_to_crypto_attr(struct attribute *attr)
static const struct blk_crypto_attr *attr_to_crypto_attr(const struct attribute *attr)
{
return container_of(attr, struct blk_crypto_attr, attr);
return container_of_const(attr, struct blk_crypto_attr, attr);
}
static ssize_t hw_wrapped_keys_show(struct blk_crypto_profile *profile,
struct blk_crypto_attr *attr, char *page)
const struct blk_crypto_attr *attr, char *page)
{
/* Always show supported, since the file doesn't exist otherwise. */
return sysfs_emit(page, "supported\n");
}
static ssize_t max_dun_bits_show(struct blk_crypto_profile *profile,
struct blk_crypto_attr *attr, char *page)
const struct blk_crypto_attr *attr, char *page)
{
return sysfs_emit(page, "%u\n", 8 * profile->max_dun_bytes_supported);
}
static ssize_t num_keyslots_show(struct blk_crypto_profile *profile,
struct blk_crypto_attr *attr, char *page)
const struct blk_crypto_attr *attr, char *page)
{
return sysfs_emit(page, "%u\n", profile->num_slots);
}
static ssize_t raw_keys_show(struct blk_crypto_profile *profile,
struct blk_crypto_attr *attr, char *page)
const struct blk_crypto_attr *attr, char *page)
{
/* Always show supported, since the file doesn't exist otherwise. */
return sysfs_emit(page, "supported\n");
}
#define BLK_CRYPTO_RO_ATTR(_name) \
static struct blk_crypto_attr _name##_attr = __ATTR_RO(_name)
static const struct blk_crypto_attr _name##_attr = __ATTR_RO(_name)
BLK_CRYPTO_RO_ATTR(hw_wrapped_keys);
BLK_CRYPTO_RO_ATTR(max_dun_bits);
@@ -66,10 +66,10 @@ BLK_CRYPTO_RO_ATTR(num_keyslots);
BLK_CRYPTO_RO_ATTR(raw_keys);
static umode_t blk_crypto_is_visible(struct kobject *kobj,
struct attribute *attr, int n)
const struct attribute *attr, int n)
{
struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj);
struct blk_crypto_attr *a = attr_to_crypto_attr(attr);
const struct blk_crypto_attr *a = attr_to_crypto_attr(attr);
if (a == &hw_wrapped_keys_attr &&
!(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED))
@@ -81,7 +81,7 @@ static umode_t blk_crypto_is_visible(struct kobject *kobj,
return 0444;
}
static struct attribute *blk_crypto_attrs[] = {
static const struct attribute *const blk_crypto_attrs[] = {
&hw_wrapped_keys_attr.attr,
&max_dun_bits_attr.attr,
&num_keyslots_attr.attr,
@@ -90,8 +90,8 @@ static struct attribute *blk_crypto_attrs[] = {
};
static const struct attribute_group blk_crypto_attr_group = {
.attrs = blk_crypto_attrs,
.is_visible = blk_crypto_is_visible,
.attrs_const = blk_crypto_attrs,
.is_visible_const = blk_crypto_is_visible,
};
/*
@@ -99,13 +99,13 @@ static const struct attribute_group blk_crypto_attr_group = {
* modes, these are initialized at boot time by blk_crypto_sysfs_init().
*/
static struct blk_crypto_attr __blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX];
static struct attribute *blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX + 1];
static const struct attribute *blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX + 1];
static umode_t blk_crypto_mode_is_visible(struct kobject *kobj,
struct attribute *attr, int n)
const struct attribute *attr, int n)
{
struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj);
struct blk_crypto_attr *a = attr_to_crypto_attr(attr);
const struct blk_crypto_attr *a = attr_to_crypto_attr(attr);
int mode_num = a - __blk_crypto_mode_attrs;
if (profile->modes_supported[mode_num])
@@ -114,7 +114,7 @@ static umode_t blk_crypto_mode_is_visible(struct kobject *kobj,
}
static ssize_t blk_crypto_mode_show(struct blk_crypto_profile *profile,
struct blk_crypto_attr *attr, char *page)
const struct blk_crypto_attr *attr, char *page)
{
int mode_num = attr - __blk_crypto_mode_attrs;
@@ -123,8 +123,8 @@ static ssize_t blk_crypto_mode_show(struct blk_crypto_profile *profile,
static const struct attribute_group blk_crypto_modes_attr_group = {
.name = "modes",
.attrs = blk_crypto_mode_attrs,
.is_visible = blk_crypto_mode_is_visible,
.attrs_const = blk_crypto_mode_attrs,
.is_visible_const = blk_crypto_mode_is_visible,
};
static const struct attribute_group *blk_crypto_attr_groups[] = {
@@ -137,7 +137,7 @@ static ssize_t blk_crypto_attr_show(struct kobject *kobj,
struct attribute *attr, char *page)
{
struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj);
struct blk_crypto_attr *a = attr_to_crypto_attr(attr);
const struct blk_crypto_attr *a = attr_to_crypto_attr(attr);
return a->show(profile, a, page);
}

View File

@@ -30,17 +30,17 @@ struct blk_ia_range_sysfs_entry {
ssize_t (*show)(struct blk_independent_access_range *iar, char *buf);
};
static struct blk_ia_range_sysfs_entry blk_ia_range_sector_entry = {
static const struct blk_ia_range_sysfs_entry blk_ia_range_sector_entry = {
.attr = { .name = "sector", .mode = 0444 },
.show = blk_ia_range_sector_show,
};
static struct blk_ia_range_sysfs_entry blk_ia_range_nr_sectors_entry = {
static const struct blk_ia_range_sysfs_entry blk_ia_range_nr_sectors_entry = {
.attr = { .name = "nr_sectors", .mode = 0444 },
.show = blk_ia_range_nr_sectors_show,
};
static struct attribute *blk_ia_range_attrs[] = {
static const struct attribute *const blk_ia_range_attrs[] = {
&blk_ia_range_sector_entry.attr,
&blk_ia_range_nr_sectors_entry.attr,
NULL,

View File

@@ -1596,7 +1596,8 @@ static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p,
u32 *nr_done)
{
u32 nr_met[2] = { };
u32 nr_missed[2] = { };
@@ -1633,6 +1634,8 @@ static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p
*rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
ioc->period_us * NSEC_PER_USEC);
*nr_done = nr_met[READ] + nr_met[WRITE] + nr_missed[READ] + nr_missed[WRITE];
}
/* was iocg idle this period? */
@@ -2250,12 +2253,12 @@ static void ioc_timer_fn(struct timer_list *timer)
u64 usage_us_sum = 0;
u32 ppm_rthr;
u32 ppm_wthr;
u32 missed_ppm[2], rq_wait_pct;
u32 missed_ppm[2], rq_wait_pct, nr_done;
u64 period_vtime;
int prev_busy_level;
/* how were the latencies during the period? */
ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct, &nr_done);
/* take care of active iocgs */
spin_lock_irq(&ioc->lock);
@@ -2397,9 +2400,17 @@ static void ioc_timer_fn(struct timer_list *timer)
* and should increase vtime rate.
*/
prev_busy_level = ioc->busy_level;
if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
missed_ppm[READ] > ppm_rthr ||
missed_ppm[WRITE] > ppm_wthr) {
if (!nr_done && nr_lagging) {
/*
* When there are lagging IOs but no completions, we don't
* know if the IO latency will meet the QoS targets. The
* disk might be saturated or not. We should not reset
* busy_level to 0 (which would prevent vrate from scaling
* up or down), but rather to keep it unchanged.
*/
} else if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
missed_ppm[READ] > ppm_rthr ||
missed_ppm[WRITE] > ppm_wthr) {
/* clearly missing QoS targets, slow down vrate */
ioc->busy_level = max(ioc->busy_level, 0);
ioc->busy_level++;

View File

@@ -155,13 +155,7 @@ static int blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector,
__blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp, &bio,
flags, limit);
if (bio) {
if ((flags & BLKDEV_ZERO_KILLABLE) &&
fatal_signal_pending(current)) {
bio_await_chain(bio);
blk_finish_plug(&plug);
return -EINTR;
}
ret = submit_bio_wait(bio);
ret = bio_submit_or_kill(bio, flags);
bio_put(bio);
}
blk_finish_plug(&plug);
@@ -236,13 +230,7 @@ static int blkdev_issue_zero_pages(struct block_device *bdev, sector_t sector,
blk_start_plug(&plug);
__blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp, &bio, flags);
if (bio) {
if ((flags & BLKDEV_ZERO_KILLABLE) &&
fatal_signal_pending(current)) {
bio_await_chain(bio);
blk_finish_plug(&plug);
return -EINTR;
}
ret = submit_bio_wait(bio);
ret = bio_submit_or_kill(bio, flags);
bio_put(bio);
}
blk_finish_plug(&plug);

View File

@@ -97,6 +97,7 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(NO_ELV_SWITCH),
QUEUE_FLAG_NAME(QOS_ENABLED),
QUEUE_FLAG_NAME(BIO_ISSUE_TIME),
QUEUE_FLAG_NAME(ZONED_QD1_WRITES),
};
#undef QUEUE_FLAG_NAME

View File

@@ -53,7 +53,7 @@ static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj,
struct request_queue *q;
ssize_t res;
entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr);
entry = container_of_const(attr, struct blk_mq_hw_ctx_sysfs_entry, attr);
hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj);
q = hctx->queue;
@@ -101,20 +101,20 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
return pos + ret;
}
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = {
static const struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = {
.attr = {.name = "nr_tags", .mode = 0444 },
.show = blk_mq_hw_sysfs_nr_tags_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_reserved_tags = {
static const struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_reserved_tags = {
.attr = {.name = "nr_reserved_tags", .mode = 0444 },
.show = blk_mq_hw_sysfs_nr_reserved_tags_show,
};
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = {
static const struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = {
.attr = {.name = "cpu_list", .mode = 0444 },
.show = blk_mq_hw_sysfs_cpus_show,
};
static struct attribute *default_hw_ctx_attrs[] = {
static const struct attribute *const default_hw_ctx_attrs[] = {
&blk_mq_hw_sysfs_nr_tags.attr,
&blk_mq_hw_sysfs_nr_reserved_tags.attr,
&blk_mq_hw_sysfs_cpus.attr,

View File

@@ -3424,6 +3424,25 @@ EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
*/
void blk_steal_bios(struct bio_list *list, struct request *rq)
{
struct bio *bio;
for (bio = rq->bio; bio; bio = bio->bi_next) {
if (bio->bi_opf & REQ_POLLED) {
bio->bi_opf &= ~REQ_POLLED;
bio->bi_cookie = BLK_QC_T_NONE;
}
/*
* The alternate request queue that we may end up submitting
* the bio to may be frozen temporarily, in this case REQ_NOWAIT
* will fail the I/O immediately with EAGAIN to the issuer.
* We are not in the issuer context which cannot block. Clear
* the flag to avoid spurious EAGAIN I/O failures.
*/
bio->bi_opf &= ~REQ_NOWAIT;
bio_clear_flag(bio, BIO_QOS_THROTTLED);
bio_clear_flag(bio, BIO_QOS_MERGED);
}
if (rq->bio) {
if (list->tail)
list->tail->bi_next = rq->bio;

View File

@@ -189,11 +189,11 @@ static int blk_validate_integrity_limits(struct queue_limits *lim)
}
/*
* The PI generation / validation helpers do not expect intervals to
* straddle multiple bio_vecs. Enforce alignment so that those are
* Some IO controllers can not handle data intervals straddling
* multiple bio_vecs. For those, enforce alignment so that those are
* never generated, and that each buffer is aligned as expected.
*/
if (bi->csum_type) {
if (!(bi->flags & BLK_SPLIT_INTERVAL_CAPABLE) && bi->csum_type) {
lim->dma_alignment = max(lim->dma_alignment,
(1U << bi->interval_exp) - 1);
}
@@ -992,10 +992,14 @@ bool queue_limits_stack_integrity(struct queue_limits *t,
if ((ti->flags & BLK_INTEGRITY_REF_TAG) !=
(bi->flags & BLK_INTEGRITY_REF_TAG))
goto incompatible;
if ((ti->flags & BLK_SPLIT_INTERVAL_CAPABLE) &&
!(bi->flags & BLK_SPLIT_INTERVAL_CAPABLE))
ti->flags &= ~BLK_SPLIT_INTERVAL_CAPABLE;
} else {
ti->flags = BLK_INTEGRITY_STACKED;
ti->flags |= (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) |
(bi->flags & BLK_INTEGRITY_REF_TAG);
(bi->flags & BLK_INTEGRITY_REF_TAG) |
(bi->flags & BLK_SPLIT_INTERVAL_CAPABLE);
ti->csum_type = bi->csum_type;
ti->pi_tuple_size = bi->pi_tuple_size;
ti->metadata_size = bi->metadata_size;

View File

@@ -390,6 +390,36 @@ static ssize_t queue_nr_zones_show(struct gendisk *disk, char *page)
return queue_var_show(disk_nr_zones(disk), page);
}
static ssize_t queue_zoned_qd1_writes_show(struct gendisk *disk, char *page)
{
return queue_var_show(!!blk_queue_zoned_qd1_writes(disk->queue),
page);
}
static ssize_t queue_zoned_qd1_writes_store(struct gendisk *disk,
const char *page, size_t count)
{
struct request_queue *q = disk->queue;
unsigned long qd1_writes;
unsigned int memflags;
ssize_t ret;
ret = queue_var_store(&qd1_writes, page, count);
if (ret < 0)
return ret;
memflags = blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
if (qd1_writes)
blk_queue_flag_set(QUEUE_FLAG_ZONED_QD1_WRITES, q);
else
blk_queue_flag_clear(QUEUE_FLAG_ZONED_QD1_WRITES, q);
blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q, memflags);
return count;
}
static ssize_t queue_iostats_passthrough_show(struct gendisk *disk, char *page)
{
return queue_var_show(!!blk_queue_passthrough_stat(disk->queue), page);
@@ -551,27 +581,27 @@ static int queue_wc_store(struct gendisk *disk, const char *page,
return 0;
}
#define QUEUE_RO_ENTRY(_prefix, _name) \
static struct queue_sysfs_entry _prefix##_entry = { \
.attr = { .name = _name, .mode = 0444 }, \
.show = _prefix##_show, \
#define QUEUE_RO_ENTRY(_prefix, _name) \
static const struct queue_sysfs_entry _prefix##_entry = { \
.attr = { .name = _name, .mode = 0444 }, \
.show = _prefix##_show, \
};
#define QUEUE_RW_ENTRY(_prefix, _name) \
static struct queue_sysfs_entry _prefix##_entry = { \
.attr = { .name = _name, .mode = 0644 }, \
.show = _prefix##_show, \
.store = _prefix##_store, \
#define QUEUE_RW_ENTRY(_prefix, _name) \
static const struct queue_sysfs_entry _prefix##_entry = { \
.attr = { .name = _name, .mode = 0644 }, \
.show = _prefix##_show, \
.store = _prefix##_store, \
};
#define QUEUE_LIM_RO_ENTRY(_prefix, _name) \
static struct queue_sysfs_entry _prefix##_entry = { \
static const struct queue_sysfs_entry _prefix##_entry = { \
.attr = { .name = _name, .mode = 0444 }, \
.show_limit = _prefix##_show, \
}
#define QUEUE_LIM_RW_ENTRY(_prefix, _name) \
static struct queue_sysfs_entry _prefix##_entry = { \
static const struct queue_sysfs_entry _prefix##_entry = { \
.attr = { .name = _name, .mode = 0644 }, \
.show_limit = _prefix##_show, \
.store_limit = _prefix##_store, \
@@ -617,6 +647,7 @@ QUEUE_LIM_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes");
QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
QUEUE_LIM_RO_ENTRY(queue_zoned, "zoned");
QUEUE_RW_ENTRY(queue_zoned_qd1_writes, "zoned_qd1_writes");
QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones");
QUEUE_LIM_RO_ENTRY(queue_max_open_zones, "max_open_zones");
QUEUE_LIM_RO_ENTRY(queue_max_active_zones, "max_active_zones");
@@ -634,7 +665,7 @@ QUEUE_LIM_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask");
QUEUE_LIM_RO_ENTRY(queue_dma_alignment, "dma_alignment");
/* legacy alias for logical_block_size: */
static struct queue_sysfs_entry queue_hw_sector_size_entry = {
static const struct queue_sysfs_entry queue_hw_sector_size_entry = {
.attr = {.name = "hw_sector_size", .mode = 0444 },
.show_limit = queue_logical_block_size_show,
};
@@ -700,7 +731,7 @@ QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
#endif
/* Common attributes for bio-based and request-based queues. */
static struct attribute *queue_attrs[] = {
static const struct attribute *const queue_attrs[] = {
/*
* Attributes which are protected with q->limits_lock.
*/
@@ -754,12 +785,13 @@ static struct attribute *queue_attrs[] = {
&queue_nomerges_entry.attr,
&queue_poll_entry.attr,
&queue_poll_delay_entry.attr,
&queue_zoned_qd1_writes_entry.attr,
NULL,
};
/* Request-based queue attributes that are not relevant for bio-based queues. */
static struct attribute *blk_mq_queue_attrs[] = {
static const struct attribute *const blk_mq_queue_attrs[] = {
/*
* Attributes which require some form of locking other than
* q->sysfs_lock.
@@ -779,14 +811,15 @@ static struct attribute *blk_mq_queue_attrs[] = {
NULL,
};
static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr,
static umode_t queue_attr_visible(struct kobject *kobj, const struct attribute *attr,
int n)
{
struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
struct request_queue *q = disk->queue;
if ((attr == &queue_max_open_zones_entry.attr ||
attr == &queue_max_active_zones_entry.attr) &&
attr == &queue_max_active_zones_entry.attr ||
attr == &queue_zoned_qd1_writes_entry.attr) &&
!blk_queue_is_zoned(q))
return 0;
@@ -794,7 +827,7 @@ static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr,
}
static umode_t blk_mq_queue_attr_visible(struct kobject *kobj,
struct attribute *attr, int n)
const struct attribute *attr, int n)
{
struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
struct request_queue *q = disk->queue;
@@ -808,17 +841,17 @@ static umode_t blk_mq_queue_attr_visible(struct kobject *kobj,
return attr->mode;
}
static struct attribute_group queue_attr_group = {
.attrs = queue_attrs,
.is_visible = queue_attr_visible,
static const struct attribute_group queue_attr_group = {
.attrs_const = queue_attrs,
.is_visible_const = queue_attr_visible,
};
static struct attribute_group blk_mq_queue_attr_group = {
.attrs = blk_mq_queue_attrs,
.is_visible = blk_mq_queue_attr_visible,
static const struct attribute_group blk_mq_queue_attr_group = {
.attrs_const = blk_mq_queue_attrs,
.is_visible_const = blk_mq_queue_attr_visible,
};
#define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr)
#define to_queue(atr) container_of_const((atr), struct queue_sysfs_entry, attr)
static ssize_t
queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
@@ -934,6 +967,14 @@ int blk_register_queue(struct gendisk *disk)
blk_mq_debugfs_register(q);
blk_debugfs_unlock(q, memflags);
/*
* For blk-mq rotational zoned devices, default to using QD=1
* writes. For non-mq rotational zoned devices, the device driver can
* set an appropriate default.
*/
if (queue_is_mq(q) && blk_queue_rot(q) && blk_queue_is_zoned(q))
blk_queue_flag_set(QUEUE_FLAG_ZONED_QD1_WRITES, q);
ret = disk_register_independent_access_ranges(disk);
if (ret)
goto out_debugfs_remove;

View File

@@ -782,10 +782,11 @@ void wbt_init_enable_default(struct gendisk *disk)
return;
rwb = wbt_alloc();
if (WARN_ON_ONCE(!rwb))
if (!rwb)
return;
if (WARN_ON_ONCE(wbt_init(disk, rwb))) {
if (wbt_init(disk, rwb)) {
pr_warn("%s: failed to enable wbt\n", disk->disk_name);
wbt_free(rwb);
return;
}

View File

@@ -16,6 +16,8 @@
#include <linux/spinlock.h>
#include <linux/refcount.h>
#include <linux/mempool.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <trace/events/block.h>
@@ -40,6 +42,8 @@ static const char *const zone_cond_name[] = {
/*
* Per-zone write plug.
* @node: hlist_node structure for managing the plug using a hash table.
* @entry: list_head structure for listing the plug in the disk list of active
* zone write plugs.
* @bio_list: The list of BIOs that are currently plugged.
* @bio_work: Work struct to handle issuing of plugged BIOs
* @rcu_head: RCU head to free zone write plugs with an RCU grace period.
@@ -62,6 +66,7 @@ static const char *const zone_cond_name[] = {
*/
struct blk_zone_wplug {
struct hlist_node node;
struct list_head entry;
struct bio_list bio_list;
struct work_struct bio_work;
struct rcu_head rcu_head;
@@ -99,17 +104,17 @@ static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
* being executed or the zone write plug bio list is not empty.
* - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone
* write pointer offset and need to update it.
* - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed
* from the disk hash table and that the initial reference to the zone
* write plug set when the plug was first added to the hash table has been
* dropped. This flag is set when a zone is reset, finished or become full,
* to prevent new references to the zone write plug to be taken for
* newly incoming BIOs. A zone write plug flagged with this flag will be
* freed once all remaining references from BIOs or functions are dropped.
* - BLK_ZONE_WPLUG_DEAD: Indicates that the zone write plug will be
* removed from the disk hash table of zone write plugs when the last
* reference on the zone write plug is dropped. If set, this flag also
* indicates that the initial extra reference on the zone write plug was
* dropped, meaning that the reference count indicates the current number of
* active users (code context or BIOs and requests in flight). This flag is
* set when a zone is reset, finished or becomes full.
*/
#define BLK_ZONE_WPLUG_PLUGGED (1U << 0)
#define BLK_ZONE_WPLUG_NEED_WP_UPDATE (1U << 1)
#define BLK_ZONE_WPLUG_UNHASHED (1U << 2)
#define BLK_ZONE_WPLUG_DEAD (1U << 2)
/**
* blk_zone_cond_str - Return a zone condition name string
@@ -412,20 +417,32 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
return 0;
}
static int blkdev_truncate_zone_range(struct block_device *bdev,
blk_mode_t mode, const struct blk_zone_range *zrange)
static int blkdev_reset_zone(struct block_device *bdev, blk_mode_t mode,
struct blk_zone_range *zrange)
{
loff_t start, end;
int ret = -EINVAL;
inode_lock(bdev->bd_mapping->host);
filemap_invalidate_lock(bdev->bd_mapping);
if (zrange->sector + zrange->nr_sectors <= zrange->sector ||
zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk))
/* Out of range */
return -EINVAL;
goto out_unlock;
start = zrange->sector << SECTOR_SHIFT;
end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1;
return truncate_bdev_range(bdev, mode, start, end);
ret = truncate_bdev_range(bdev, mode, start, end);
if (ret)
goto out_unlock;
ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, zrange->sector,
zrange->nr_sectors);
out_unlock:
filemap_invalidate_unlock(bdev->bd_mapping);
inode_unlock(bdev->bd_mapping->host);
return ret;
}
/*
@@ -438,7 +455,6 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
void __user *argp = (void __user *)arg;
struct blk_zone_range zrange;
enum req_op op;
int ret;
if (!argp)
return -EINVAL;
@@ -454,15 +470,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
switch (cmd) {
case BLKRESETZONE:
op = REQ_OP_ZONE_RESET;
/* Invalidate the page cache, including dirty pages. */
inode_lock(bdev->bd_mapping->host);
filemap_invalidate_lock(bdev->bd_mapping);
ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
if (ret)
goto fail;
break;
return blkdev_reset_zone(bdev, mode, &zrange);
case BLKOPENZONE:
op = REQ_OP_ZONE_OPEN;
break;
@@ -476,15 +484,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
return -ENOTTY;
}
ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);
fail:
if (cmd == BLKRESETZONE) {
filemap_invalidate_unlock(bdev->bd_mapping);
inode_unlock(bdev->bd_mapping->host);
}
return ret;
return blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);
}
static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
@@ -492,18 +492,12 @@ static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
return zone->start + zone->len >= get_capacity(disk);
}
static bool disk_zone_is_full(struct gendisk *disk,
unsigned int zno, unsigned int offset_in_zone)
{
if (zno < disk->nr_zones - 1)
return offset_in_zone >= disk->zone_capacity;
return offset_in_zone >= disk->last_zone_capacity;
}
static bool disk_zone_wplug_is_full(struct gendisk *disk,
struct blk_zone_wplug *zwplug)
{
return disk_zone_is_full(disk, zwplug->zone_no, zwplug->wp_offset);
if (zwplug->zone_no < disk->nr_zones - 1)
return zwplug->wp_offset >= disk->zone_capacity;
return zwplug->wp_offset >= disk->last_zone_capacity;
}
static bool disk_insert_zone_wplug(struct gendisk *disk,
@@ -520,10 +514,11 @@ static bool disk_insert_zone_wplug(struct gendisk *disk,
* are racing with other submission context, so we may already have a
* zone write plug for the same zone.
*/
spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags);
hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) {
if (zwplg->zone_no == zwplug->zone_no) {
spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock,
flags);
return false;
}
}
@@ -535,7 +530,7 @@ static bool disk_insert_zone_wplug(struct gendisk *disk,
* necessarilly in the active condition.
*/
zones_cond = rcu_dereference_check(disk->zones_cond,
lockdep_is_held(&disk->zone_wplugs_lock));
lockdep_is_held(&disk->zone_wplugs_hash_lock));
if (zones_cond)
zwplug->cond = zones_cond[zwplug->zone_no];
else
@@ -543,7 +538,7 @@ static bool disk_insert_zone_wplug(struct gendisk *disk,
hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
atomic_inc(&disk->nr_zone_wplugs);
spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags);
return true;
}
@@ -587,105 +582,76 @@ static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
mempool_free(zwplug, zwplug->disk->zone_wplugs_pool);
}
static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
{
if (refcount_dec_and_test(&zwplug->ref)) {
WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED);
WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED));
call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
}
}
static inline bool disk_should_remove_zone_wplug(struct gendisk *disk,
struct blk_zone_wplug *zwplug)
{
lockdep_assert_held(&zwplug->lock);
/* If the zone write plug was already removed, we are done. */
if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
return false;
/* If the zone write plug is still plugged, it cannot be removed. */
if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)
return false;
/*
* Completions of BIOs with blk_zone_write_plug_bio_endio() may
* happen after handling a request completion with
* blk_zone_write_plug_finish_request() (e.g. with split BIOs
* that are chained). In such case, disk_zone_wplug_unplug_bio()
* should not attempt to remove the zone write plug until all BIO
* completions are seen. Check by looking at the zone write plug
* reference count, which is 2 when the plug is unused (one reference
* taken when the plug was allocated and another reference taken by the
* caller context).
*/
if (refcount_read(&zwplug->ref) > 2)
return false;
/* We can remove zone write plugs for zones that are empty or full. */
return !zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug);
}
static void disk_remove_zone_wplug(struct gendisk *disk,
struct blk_zone_wplug *zwplug)
static void disk_free_zone_wplug(struct blk_zone_wplug *zwplug)
{
struct gendisk *disk = zwplug->disk;
unsigned long flags;
/* If the zone write plug was already removed, we have nothing to do. */
if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
return;
WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_DEAD));
WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED);
WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
/*
* Mark the zone write plug as unhashed and drop the extra reference we
* took when the plug was inserted in the hash table. Also update the
* disk zone condition array with the current condition of the zone
* write plug.
*/
zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED;
spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags);
blk_zone_set_cond(rcu_dereference_check(disk->zones_cond,
lockdep_is_held(&disk->zone_wplugs_lock)),
lockdep_is_held(&disk->zone_wplugs_hash_lock)),
zwplug->zone_no, zwplug->cond);
hlist_del_init_rcu(&zwplug->node);
atomic_dec(&disk->nr_zone_wplugs);
spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags);
call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
}
static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
{
if (refcount_dec_and_test(&zwplug->ref))
disk_free_zone_wplug(zwplug);
}
/*
* Flag the zone write plug as dead and drop the initial reference we got when
* the zone write plug was added to the hash table. The zone write plug will be
* unhashed when its last reference is dropped.
*/
static void disk_mark_zone_wplug_dead(struct blk_zone_wplug *zwplug)
{
lockdep_assert_held(&zwplug->lock);
if (!(zwplug->flags & BLK_ZONE_WPLUG_DEAD)) {
zwplug->flags |= BLK_ZONE_WPLUG_DEAD;
disk_put_zone_wplug(zwplug);
}
}
static bool disk_zone_wplug_submit_bio(struct gendisk *disk,
struct blk_zone_wplug *zwplug);
static void blk_zone_wplug_bio_work(struct work_struct *work)
{
struct blk_zone_wplug *zwplug =
container_of(work, struct blk_zone_wplug, bio_work);
disk_zone_wplug_submit_bio(zwplug->disk, zwplug);
/* Drop the reference we took in disk_zone_wplug_schedule_work(). */
disk_put_zone_wplug(zwplug);
}
static void blk_zone_wplug_bio_work(struct work_struct *work);
/*
* Get a reference on the write plug for the zone containing @sector.
* If the plug does not exist, it is allocated and hashed.
* Return a pointer to the zone write plug with the plug spinlock held.
* Get a zone write plug for the zone containing @sector.
* If the plug does not exist, it is allocated and inserted in the disk hash
* table.
*/
static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
sector_t sector, gfp_t gfp_mask,
unsigned long *flags)
static struct blk_zone_wplug *disk_get_or_alloc_zone_wplug(struct gendisk *disk,
sector_t sector, gfp_t gfp_mask)
{
unsigned int zno = disk_zone_no(disk, sector);
struct blk_zone_wplug *zwplug;
again:
zwplug = disk_get_zone_wplug(disk, sector);
if (zwplug) {
/*
* Check that a BIO completion or a zone reset or finish
* operation has not already removed the zone write plug from
* the hash table and dropped its reference count. In such case,
* we need to get a new plug so start over from the beginning.
*/
spin_lock_irqsave(&zwplug->lock, *flags);
if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
spin_unlock_irqrestore(&zwplug->lock, *flags);
disk_put_zone_wplug(zwplug);
goto again;
}
if (zwplug)
return zwplug;
}
/*
* Allocate and initialize a zone write plug with an extra reference
@@ -704,17 +670,15 @@ again:
zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector);
bio_list_init(&zwplug->bio_list);
INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
INIT_LIST_HEAD(&zwplug->entry);
zwplug->disk = disk;
spin_lock_irqsave(&zwplug->lock, *flags);
/*
* Insert the new zone write plug in the hash table. This can fail only
* if another context already inserted a plug. Retry from the beginning
* in such case.
*/
if (!disk_insert_zone_wplug(disk, zwplug)) {
spin_unlock_irqrestore(&zwplug->lock, *flags);
mempool_free(zwplug, disk->zone_wplugs_pool);
goto again;
}
@@ -739,6 +703,7 @@ static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug,
*/
static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
{
struct gendisk *disk = zwplug->disk;
struct bio *bio;
lockdep_assert_held(&zwplug->lock);
@@ -752,6 +717,20 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
blk_zone_wplug_bio_io_error(zwplug, bio);
zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
/*
* If we are using the per disk zone write plugs worker thread, remove
* the zone write plug from the work list and drop the reference we
* took when the zone write plug was added to that list.
*/
if (blk_queue_zoned_qd1_writes(disk->queue)) {
spin_lock(&disk->zone_wplugs_list_lock);
if (!list_empty(&zwplug->entry)) {
list_del_init(&zwplug->entry);
disk_put_zone_wplug(zwplug);
}
spin_unlock(&disk->zone_wplugs_list_lock);
}
}
/*
@@ -788,14 +767,8 @@ static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
disk_zone_wplug_update_cond(disk, zwplug);
disk_zone_wplug_abort(zwplug);
/*
* The zone write plug now has no BIO plugged: remove it from the
* hash table so that it cannot be seen. The plug will be freed
* when the last reference is dropped.
*/
if (disk_should_remove_zone_wplug(disk, zwplug))
disk_remove_zone_wplug(disk, zwplug);
if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug))
disk_mark_zone_wplug_dead(zwplug);
}
static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
@@ -1192,19 +1165,24 @@ void blk_zone_mgmt_bio_endio(struct bio *bio)
}
}
static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
struct blk_zone_wplug *zwplug)
static void disk_zone_wplug_schedule_work(struct gendisk *disk,
struct blk_zone_wplug *zwplug)
{
lockdep_assert_held(&zwplug->lock);
/*
* Take a reference on the zone write plug and schedule the submission
* of the next plugged BIO. blk_zone_wplug_bio_work() will release the
* reference we take here.
* Schedule the submission of the next plugged BIO. Taking a reference
* to the zone write plug is required as the bio_work belongs to the
* plug, and thus we must ensure that the write plug does not go away
* while the work is being scheduled but has not run yet.
* blk_zone_wplug_bio_work() will release the reference we take here,
* and we also drop this reference if the work is already scheduled.
*/
WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
WARN_ON_ONCE(blk_queue_zoned_qd1_writes(disk->queue));
refcount_inc(&zwplug->ref);
queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
if (!queue_work(disk->zone_wplugs_wq, &zwplug->bio_work))
disk_put_zone_wplug(zwplug);
}
static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
@@ -1241,6 +1219,22 @@ static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
bio_list_add(&zwplug->bio_list, bio);
trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no,
bio->bi_iter.bi_sector, bio_sectors(bio));
/*
* If we are using the disk zone write plugs worker instead of the per
* zone write plug BIO work, add the zone write plug to the work list
* if it is not already there. Make sure to also get an extra reference
* on the zone write plug so that it does not go away until it is
* removed from the work list.
*/
if (blk_queue_zoned_qd1_writes(disk->queue)) {
spin_lock(&disk->zone_wplugs_list_lock);
if (list_empty(&zwplug->entry)) {
list_add_tail(&zwplug->entry, &disk->zone_wplugs_list);
refcount_inc(&zwplug->ref);
}
spin_unlock(&disk->zone_wplugs_list_lock);
}
}
/*
@@ -1438,7 +1432,7 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
if (bio->bi_opf & REQ_NOWAIT)
gfp_mask = GFP_NOWAIT;
zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags);
zwplug = disk_get_or_alloc_zone_wplug(disk, sector, gfp_mask);
if (!zwplug) {
if (bio->bi_opf & REQ_NOWAIT)
bio_wouldblock_error(bio);
@@ -1447,6 +1441,21 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
return true;
}
spin_lock_irqsave(&zwplug->lock, flags);
/*
* If we got a zone write plug marked as dead, then the user is issuing
* writes to a full zone, or without synchronizing with zone reset or
* zone finish operations. In such case, fail the BIO to signal this
* invalid usage.
*/
if (zwplug->flags & BLK_ZONE_WPLUG_DEAD) {
spin_unlock_irqrestore(&zwplug->lock, flags);
disk_put_zone_wplug(zwplug);
bio_io_error(bio);
return true;
}
/* Indicate that this BIO is being handled using zone write plugging. */
bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);
@@ -1459,6 +1468,13 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
goto queue_bio;
}
/*
* For rotational devices, we will use the gendisk zone write plugs
* work instead of the per zone write plug BIO work, so queue the BIO.
*/
if (blk_queue_zoned_qd1_writes(disk->queue))
goto queue_bio;
/* If the zone is already plugged, add the BIO to the BIO plug list. */
if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)
goto queue_bio;
@@ -1481,7 +1497,10 @@ queue_bio:
if (!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)) {
zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
disk_zone_wplug_schedule_bio_work(disk, zwplug);
if (blk_queue_zoned_qd1_writes(disk->queue))
wake_up_process(disk->zone_wplugs_worker);
else
disk_zone_wplug_schedule_work(disk, zwplug);
}
spin_unlock_irqrestore(&zwplug->lock, flags);
@@ -1527,7 +1546,7 @@ static void blk_zone_wplug_handle_native_zone_append(struct bio *bio)
disk->disk_name, zwplug->zone_no);
disk_zone_wplug_abort(zwplug);
}
disk_remove_zone_wplug(disk, zwplug);
disk_mark_zone_wplug_dead(zwplug);
spin_unlock_irqrestore(&zwplug->lock, flags);
disk_put_zone_wplug(zwplug);
@@ -1622,21 +1641,21 @@ static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
spin_lock_irqsave(&zwplug->lock, flags);
/* Schedule submission of the next plugged BIO if we have one. */
if (!bio_list_empty(&zwplug->bio_list)) {
disk_zone_wplug_schedule_bio_work(disk, zwplug);
spin_unlock_irqrestore(&zwplug->lock, flags);
return;
}
zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
/*
* If the zone is full (it was fully written or finished, or empty
* (it was reset), remove its zone write plug from the hash table.
* For rotational devices, signal the BIO completion to the zone write
* plug work. Otherwise, schedule submission of the next plugged BIO
* if we have one.
*/
if (disk_should_remove_zone_wplug(disk, zwplug))
disk_remove_zone_wplug(disk, zwplug);
if (bio_list_empty(&zwplug->bio_list))
zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
if (blk_queue_zoned_qd1_writes(disk->queue))
complete(&disk->zone_wplugs_worker_bio_done);
else if (!bio_list_empty(&zwplug->bio_list))
disk_zone_wplug_schedule_work(disk, zwplug);
if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug))
disk_mark_zone_wplug_dead(zwplug);
spin_unlock_irqrestore(&zwplug->lock, flags);
}
@@ -1727,10 +1746,9 @@ void blk_zone_write_plug_finish_request(struct request *req)
disk_put_zone_wplug(zwplug);
}
static void blk_zone_wplug_bio_work(struct work_struct *work)
static bool disk_zone_wplug_submit_bio(struct gendisk *disk,
struct blk_zone_wplug *zwplug)
{
struct blk_zone_wplug *zwplug =
container_of(work, struct blk_zone_wplug, bio_work);
struct block_device *bdev;
unsigned long flags;
struct bio *bio;
@@ -1746,7 +1764,7 @@ again:
if (!bio) {
zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
spin_unlock_irqrestore(&zwplug->lock, flags);
goto put_zwplug;
return false;
}
trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no,
@@ -1760,14 +1778,15 @@ again:
goto again;
}
bdev = bio->bi_bdev;
/*
* blk-mq devices will reuse the extra reference on the request queue
* usage counter we took when the BIO was plugged, but the submission
* path for BIO-based devices will not do that. So drop this extra
* reference here.
*/
if (blk_queue_zoned_qd1_writes(disk->queue))
reinit_completion(&disk->zone_wplugs_worker_bio_done);
bdev = bio->bi_bdev;
if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) {
bdev->bd_disk->fops->submit_bio(bio);
blk_queue_exit(bdev->bd_disk->queue);
@@ -1775,14 +1794,78 @@ again:
blk_mq_submit_bio(bio);
}
put_zwplug:
/* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */
disk_put_zone_wplug(zwplug);
return true;
}
static struct blk_zone_wplug *disk_get_zone_wplugs_work(struct gendisk *disk)
{
struct blk_zone_wplug *zwplug;
spin_lock_irq(&disk->zone_wplugs_list_lock);
zwplug = list_first_entry_or_null(&disk->zone_wplugs_list,
struct blk_zone_wplug, entry);
if (zwplug)
list_del_init(&zwplug->entry);
spin_unlock_irq(&disk->zone_wplugs_list_lock);
return zwplug;
}
static int disk_zone_wplugs_worker(void *data)
{
struct gendisk *disk = data;
struct blk_zone_wplug *zwplug;
unsigned int noio_flag;
noio_flag = memalloc_noio_save();
set_user_nice(current, MIN_NICE);
set_freezable();
for (;;) {
set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
zwplug = disk_get_zone_wplugs_work(disk);
if (zwplug) {
/*
* Process all BIOs of this zone write plug and then
* drop the reference we took when adding the zone write
* plug to the active list.
*/
set_current_state(TASK_RUNNING);
while (disk_zone_wplug_submit_bio(disk, zwplug))
blk_wait_io(&disk->zone_wplugs_worker_bio_done);
disk_put_zone_wplug(zwplug);
continue;
}
/*
* Only sleep if nothing sets the state to running. Else check
* for zone write plugs work again as a newly submitted BIO
* might have added a zone write plug to the work list.
*/
if (get_current_state() == TASK_RUNNING) {
try_to_freeze();
} else {
if (kthread_should_stop()) {
set_current_state(TASK_RUNNING);
break;
}
schedule();
}
}
WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list));
memalloc_noio_restore(noio_flag);
return 0;
}
void disk_init_zone_resources(struct gendisk *disk)
{
spin_lock_init(&disk->zone_wplugs_lock);
spin_lock_init(&disk->zone_wplugs_hash_lock);
spin_lock_init(&disk->zone_wplugs_list_lock);
INIT_LIST_HEAD(&disk->zone_wplugs_list);
init_completion(&disk->zone_wplugs_worker_bio_done);
}
/*
@@ -1798,6 +1881,7 @@ static int disk_alloc_zone_resources(struct gendisk *disk,
unsigned int pool_size)
{
unsigned int i;
int ret = -ENOMEM;
atomic_set(&disk->nr_zone_wplugs, 0);
disk->zone_wplugs_hash_bits =
@@ -1823,8 +1907,21 @@ static int disk_alloc_zone_resources(struct gendisk *disk,
if (!disk->zone_wplugs_wq)
goto destroy_pool;
disk->zone_wplugs_worker =
kthread_create(disk_zone_wplugs_worker, disk,
"%s_zwplugs_worker", disk->disk_name);
if (IS_ERR(disk->zone_wplugs_worker)) {
ret = PTR_ERR(disk->zone_wplugs_worker);
disk->zone_wplugs_worker = NULL;
goto destroy_wq;
}
wake_up_process(disk->zone_wplugs_worker);
return 0;
destroy_wq:
destroy_workqueue(disk->zone_wplugs_wq);
disk->zone_wplugs_wq = NULL;
destroy_pool:
mempool_destroy(disk->zone_wplugs_pool);
disk->zone_wplugs_pool = NULL;
@@ -1832,7 +1929,7 @@ free_hash:
kfree(disk->zone_wplugs_hash);
disk->zone_wplugs_hash = NULL;
disk->zone_wplugs_hash_bits = 0;
return -ENOMEM;
return ret;
}
static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
@@ -1848,9 +1945,9 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
while (!hlist_empty(&disk->zone_wplugs_hash[i])) {
zwplug = hlist_entry(disk->zone_wplugs_hash[i].first,
struct blk_zone_wplug, node);
refcount_inc(&zwplug->ref);
disk_remove_zone_wplug(disk, zwplug);
disk_put_zone_wplug(zwplug);
spin_lock_irq(&zwplug->lock);
disk_mark_zone_wplug_dead(zwplug);
spin_unlock_irq(&zwplug->lock);
}
}
@@ -1872,16 +1969,20 @@ static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond)
{
unsigned long flags;
spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags);
zones_cond = rcu_replace_pointer(disk->zones_cond, zones_cond,
lockdep_is_held(&disk->zone_wplugs_lock));
spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
lockdep_is_held(&disk->zone_wplugs_hash_lock));
spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags);
kfree_rcu_mightsleep(zones_cond);
}
void disk_free_zone_resources(struct gendisk *disk)
{
if (disk->zone_wplugs_worker)
kthread_stop(disk->zone_wplugs_worker);
WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list));
if (disk->zone_wplugs_wq) {
destroy_workqueue(disk->zone_wplugs_wq);
disk->zone_wplugs_wq = NULL;
@@ -1910,6 +2011,7 @@ static int disk_revalidate_zone_resources(struct gendisk *disk,
{
struct queue_limits *lim = &disk->queue->limits;
unsigned int pool_size;
int ret = 0;
args->disk = disk;
args->nr_zones =
@@ -1932,10 +2034,13 @@ static int disk_revalidate_zone_resources(struct gendisk *disk,
pool_size =
min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, args->nr_zones);
if (!disk->zone_wplugs_hash)
return disk_alloc_zone_resources(disk, pool_size);
if (!disk->zone_wplugs_hash) {
ret = disk_alloc_zone_resources(disk, pool_size);
if (ret)
kfree(args->zones_cond);
}
return 0;
return ret;
}
/*
@@ -1967,6 +2072,7 @@ static int disk_update_zone_resources(struct gendisk *disk,
disk->zone_capacity = args->zone_capacity;
disk->last_zone_capacity = args->last_zone_capacity;
disk_set_zones_cond_array(disk, args->zones_cond);
args->zones_cond = NULL;
/*
* Some devices can advertise zone resource limits that are larger than
@@ -2078,7 +2184,6 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
struct gendisk *disk = args->disk;
struct blk_zone_wplug *zwplug;
unsigned int wp_offset;
unsigned long flags;
/*
* Remember the capacity of the first sequential zone and check
@@ -2108,10 +2213,9 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
if (!wp_offset || wp_offset >= zone->capacity)
return 0;
zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags);
zwplug = disk_get_or_alloc_zone_wplug(disk, zone->wp, GFP_NOIO);
if (!zwplug)
return -ENOMEM;
spin_unlock_irqrestore(&zwplug->lock, flags);
disk_put_zone_wplug(zwplug);
return 0;
@@ -2249,21 +2353,30 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
}
memalloc_noio_restore(noio_flag);
if (ret <= 0)
goto free_resources;
/*
* If zones where reported, make sure that the entire disk capacity
* has been checked.
*/
if (ret > 0 && args.sector != capacity) {
if (args.sector != capacity) {
pr_warn("%s: Missing zones from sector %llu\n",
disk->disk_name, args.sector);
ret = -ENODEV;
goto free_resources;
}
if (ret > 0)
return disk_update_zone_resources(disk, &args);
ret = disk_update_zone_resources(disk, &args);
if (ret)
goto free_resources;
return 0;
free_resources:
pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
kfree(args.zones_cond);
memflags = blk_mq_freeze_queue(q);
disk_free_zone_resources(disk);
blk_mq_unfreeze_queue(q, memflags);

View File

@@ -55,7 +55,7 @@ bool __blk_freeze_queue_start(struct request_queue *q,
struct task_struct *owner);
int __bio_queue_enter(struct request_queue *q, struct bio *bio);
void submit_bio_noacct_nocheck(struct bio *bio, bool split);
void bio_await_chain(struct bio *bio);
int bio_submit_or_kill(struct bio *bio, unsigned int flags);
static inline bool blk_try_enter_queue(struct request_queue *q, bool pm)
{
@@ -108,11 +108,6 @@ static inline void blk_wait_io(struct completion *done)
struct block_device *blkdev_get_no_open(dev_t dev, bool autoload);
void blkdev_put_no_open(struct block_device *bdev);
#define BIO_INLINE_VECS 4
struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
gfp_t gfp_mask);
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs);
bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
struct page *page, unsigned len, unsigned offset);

View File

@@ -393,7 +393,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
bset->bd = bsg_register_queue(q, dev, name, bsg_transport_sg_io_fn);
bset->bd = bsg_register_queue(q, dev, name, bsg_transport_sg_io_fn, NULL);
if (IS_ERR(bset->bd)) {
ret = PTR_ERR(bset->bd);
goto out_cleanup_queue;

View File

@@ -12,6 +12,7 @@
#include <linux/idr.h>
#include <linux/bsg.h>
#include <linux/slab.h>
#include <linux/io_uring/cmd.h>
#include <scsi/scsi.h>
#include <scsi/scsi_ioctl.h>
@@ -28,6 +29,7 @@ struct bsg_device {
unsigned int timeout;
unsigned int reserved_size;
bsg_sg_io_fn *sg_io_fn;
bsg_uring_cmd_fn *uring_cmd_fn;
};
static inline struct bsg_device *to_bsg_device(struct inode *inode)
@@ -158,11 +160,38 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
}
}
static int bsg_check_uring_features(unsigned int issue_flags)
{
/* BSG passthrough requires big SQE/CQE support */
if ((issue_flags & (IO_URING_F_SQE128|IO_URING_F_CQE32)) !=
(IO_URING_F_SQE128|IO_URING_F_CQE32))
return -EOPNOTSUPP;
return 0;
}
static int bsg_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
{
struct bsg_device *bd = to_bsg_device(file_inode(ioucmd->file));
bool open_for_write = ioucmd->file->f_mode & FMODE_WRITE;
struct request_queue *q = bd->queue;
int ret;
ret = bsg_check_uring_features(issue_flags);
if (ret)
return ret;
if (!bd->uring_cmd_fn)
return -EOPNOTSUPP;
return bd->uring_cmd_fn(q, ioucmd, issue_flags, open_for_write);
}
static const struct file_operations bsg_fops = {
.open = bsg_open,
.release = bsg_release,
.unlocked_ioctl = bsg_ioctl,
.compat_ioctl = compat_ptr_ioctl,
.uring_cmd = bsg_uring_cmd,
.owner = THIS_MODULE,
.llseek = default_llseek,
};
@@ -187,7 +216,8 @@ void bsg_unregister_queue(struct bsg_device *bd)
EXPORT_SYMBOL_GPL(bsg_unregister_queue);
struct bsg_device *bsg_register_queue(struct request_queue *q,
struct device *parent, const char *name, bsg_sg_io_fn *sg_io_fn)
struct device *parent, const char *name, bsg_sg_io_fn *sg_io_fn,
bsg_uring_cmd_fn *uring_cmd_fn)
{
struct bsg_device *bd;
int ret;
@@ -199,6 +229,7 @@ struct bsg_device *bsg_register_queue(struct request_queue *q,
bd->reserved_size = INT_MAX;
bd->queue = q;
bd->sg_io_fn = sg_io_fn;
bd->uring_cmd_fn = uring_cmd_fn;
ret = ida_alloc_max(&bsg_minor_ida, BSG_MAX_DEVS - 1, GFP_KERNEL);
if (ret < 0) {

View File

@@ -290,13 +290,14 @@ EXPORT_SYMBOL(disk_check_media_change);
* Should be called when the media changes for @disk. Generates a uevent
* and attempts to free all dentries and inodes and invalidates all block
* device page cache entries in that case.
*
* Callers that need a partition re-scan should arrange for one explicitly.
*/
void disk_force_media_change(struct gendisk *disk)
{
disk_event_uevent(disk, DISK_EVENT_MEDIA_CHANGE);
inc_diskseq(disk);
bdev_mark_dead(disk->part0, true);
set_bit(GD_NEED_PART_SCAN, &disk->state);
}
EXPORT_SYMBOL_GPL(disk_force_media_change);

View File

@@ -153,13 +153,7 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
nr_sects = len >> SECTOR_SHIFT;
blk_start_plug(&plug);
while (1) {
if (fatal_signal_pending(current)) {
if (prev)
bio_await_chain(prev);
err = -EINTR;
goto out_unplug;
}
while (!fatal_signal_pending(current)) {
bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects,
GFP_KERNEL);
if (!bio)
@@ -167,12 +161,11 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
prev = bio_chain_and_submit(prev, bio);
}
if (prev) {
err = submit_bio_wait(prev);
err = bio_submit_or_kill(prev, BLKDEV_ZERO_KILLABLE);
if (err == -EOPNOTSUPP)
err = 0;
bio_put(prev);
}
out_unplug:
blk_finish_plug(&plug);
fail:
filemap_invalidate_unlock(bdev->bd_mapping);

View File

@@ -19,6 +19,7 @@
enum {
TCG_SECP_00 = 0,
TCG_SECP_01,
TCG_SECP_02,
};
/*
@@ -125,6 +126,7 @@ enum opal_uid {
OPAL_LOCKING_INFO_TABLE,
OPAL_ENTERPRISE_LOCKING_INFO_TABLE,
OPAL_DATASTORE,
OPAL_LOCKING_TABLE,
/* C_PIN_TABLE object ID's */
OPAL_C_PIN_MSID,
OPAL_C_PIN_SID,
@@ -154,6 +156,7 @@ enum opal_method {
OPAL_AUTHENTICATE,
OPAL_RANDOM,
OPAL_ERASE,
OPAL_REACTIVATE,
};
enum opal_token {
@@ -224,6 +227,8 @@ enum opal_lockingstate {
enum opal_parameter {
OPAL_SUM_SET_LIST = 0x060000,
OPAL_SUM_RANGE_POLICY = 0x060001,
OPAL_SUM_ADMIN1_PIN = 0x060002,
};
enum opal_revertlsp {
@@ -269,6 +274,25 @@ struct opal_header {
struct opal_data_subpacket subpkt;
};
/*
* TCG_Storage_Architecture_Core_Spec_v2.01_r1.00
* Section: 3.3.4.7.5 STACK_RESET
*/
#define OPAL_STACK_RESET 0x0002
struct opal_stack_reset {
u8 extendedComID[4];
__be32 request_code;
};
struct opal_stack_reset_response {
u8 extendedComID[4];
__be32 request_code;
u8 reserved0[2];
__be16 data_length;
__be32 response;
};
#define FC_TPER 0x0001
#define FC_LOCKING 0x0002
#define FC_GEOMETRY 0x0003

View File

@@ -40,9 +40,7 @@ adfs_partition(struct parsed_partitions *state, char *name, char *data,
(le32_to_cpu(dr->disc_size) >> 9);
if (name) {
strlcat(state->pp_buf, " [", PAGE_SIZE);
strlcat(state->pp_buf, name, PAGE_SIZE);
strlcat(state->pp_buf, "]", PAGE_SIZE);
seq_buf_printf(&state->pp_buf, " [%s]", name);
}
put_partition(state, slot, first_sector, nr_sects);
return dr;
@@ -78,14 +76,14 @@ static int riscix_partition(struct parsed_partitions *state,
if (!rr)
return -1;
strlcat(state->pp_buf, " [RISCiX]", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " [RISCiX]");
if (rr->magic == RISCIX_MAGIC) {
unsigned long size = nr_sects > 2 ? 2 : nr_sects;
int part;
strlcat(state->pp_buf, " <", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " <");
put_partition(state, slot++, first_sect, size);
for (part = 0; part < 8; part++) {
@@ -94,13 +92,11 @@ static int riscix_partition(struct parsed_partitions *state,
put_partition(state, slot++,
le32_to_cpu(rr->part[part].start),
le32_to_cpu(rr->part[part].length));
strlcat(state->pp_buf, "(", PAGE_SIZE);
strlcat(state->pp_buf, rr->part[part].name, PAGE_SIZE);
strlcat(state->pp_buf, ")", PAGE_SIZE);
seq_buf_printf(&state->pp_buf, "(%s)", rr->part[part].name);
}
}
strlcat(state->pp_buf, " >\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " >\n");
} else {
put_partition(state, slot++, first_sect, nr_sects);
}
@@ -130,7 +126,7 @@ static int linux_partition(struct parsed_partitions *state,
struct linux_part *linuxp;
unsigned long size = nr_sects > 2 ? 2 : nr_sects;
strlcat(state->pp_buf, " [Linux]", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " [Linux]");
put_partition(state, slot++, first_sect, size);
@@ -138,7 +134,7 @@ static int linux_partition(struct parsed_partitions *state,
if (!linuxp)
return -1;
strlcat(state->pp_buf, " <", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " <");
while (linuxp->magic == cpu_to_le32(LINUX_NATIVE_MAGIC) ||
linuxp->magic == cpu_to_le32(LINUX_SWAP_MAGIC)) {
if (slot == state->limit)
@@ -148,7 +144,7 @@ static int linux_partition(struct parsed_partitions *state,
le32_to_cpu(linuxp->nr_sects));
linuxp ++;
}
strlcat(state->pp_buf, " >", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " >");
put_dev_sector(sect);
return slot;
@@ -293,7 +289,7 @@ int adfspart_check_ADFS(struct parsed_partitions *state)
break;
}
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
return 1;
}
#endif
@@ -366,7 +362,7 @@ int adfspart_check_ICS(struct parsed_partitions *state)
return 0;
}
strlcat(state->pp_buf, " [ICS]", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " [ICS]");
for (slot = 1, p = (const struct ics_part *)data; p->size; p++) {
u32 start = le32_to_cpu(p->start);
@@ -400,7 +396,7 @@ int adfspart_check_ICS(struct parsed_partitions *state)
}
put_dev_sector(sect);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
return 1;
}
#endif
@@ -460,7 +456,7 @@ int adfspart_check_POWERTEC(struct parsed_partitions *state)
return 0;
}
strlcat(state->pp_buf, " [POWERTEC]", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " [POWERTEC]");
for (i = 0, p = (const struct ptec_part *)data; i < 12; i++, p++) {
u32 start = le32_to_cpu(p->start);
@@ -471,7 +467,7 @@ int adfspart_check_POWERTEC(struct parsed_partitions *state)
}
put_dev_sector(sect);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
return 1;
}
#endif
@@ -542,7 +538,7 @@ int adfspart_check_EESOX(struct parsed_partitions *state)
size = get_capacity(state->disk);
put_partition(state, slot++, start, size - start);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
}
return i ? 1 : 0;

View File

@@ -173,24 +173,22 @@ int aix_partition(struct parsed_partitions *state)
if (d) {
struct lvm_rec *p = (struct lvm_rec *)d;
u16 lvm_version = be16_to_cpu(p->version);
char tmp[64];
if (lvm_version == 1) {
int pp_size_log2 = be16_to_cpu(p->pp_size);
pp_bytes_size = 1 << pp_size_log2;
pp_blocks_size = pp_bytes_size / 512;
snprintf(tmp, sizeof(tmp),
" AIX LVM header version %u found\n",
lvm_version);
seq_buf_printf(&state->pp_buf,
" AIX LVM header version %u found\n",
lvm_version);
vgda_len = be32_to_cpu(p->vgda_len);
vgda_sector = be32_to_cpu(p->vgda_psn[0]);
} else {
snprintf(tmp, sizeof(tmp),
" unsupported AIX LVM version %d found\n",
lvm_version);
seq_buf_printf(&state->pp_buf,
" unsupported AIX LVM version %d found\n",
lvm_version);
}
strlcat(state->pp_buf, tmp, PAGE_SIZE);
put_dev_sector(sect);
}
if (vgda_sector && (d = read_part_sector(state, vgda_sector, &sect))) {
@@ -251,14 +249,11 @@ int aix_partition(struct parsed_partitions *state)
continue;
}
if (lp_ix == lvip[lv_ix].pps_per_lv) {
char tmp[70];
put_partition(state, lv_ix + 1,
(i + 1 - lp_ix) * pp_blocks_size + psn_part1,
lvip[lv_ix].pps_per_lv * pp_blocks_size);
snprintf(tmp, sizeof(tmp), " <%s>\n",
n[lv_ix].name);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf, " <%s>\n",
n[lv_ix].name);
lvip[lv_ix].lv_is_contiguous = 1;
ret = 1;
next_lp_ix = 1;

View File

@@ -81,13 +81,8 @@ int amiga_partition(struct parsed_partitions *state)
/* blksize is blocks per 512 byte standard block */
blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512;
{
char tmp[7 + 10 + 1 + 1];
/* Be more informative */
snprintf(tmp, sizeof(tmp), " RDSK (%d)", blksize * 512);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
}
/* Be more informative */
seq_buf_printf(&state->pp_buf, " RDSK (%d)", blksize * 512);
blk = be32_to_cpu(rdb->rdb_PartitionList);
put_dev_sector(sect);
for (part = 1; (s32) blk>0 && part<=16; part++, put_dev_sector(sect)) {
@@ -179,27 +174,27 @@ int amiga_partition(struct parsed_partitions *state)
{
/* Be even more informative to aid mounting */
char dostype[4];
char tmp[42];
__be32 *dt = (__be32 *)dostype;
*dt = pb->pb_Environment[16];
if (dostype[3] < ' ')
snprintf(tmp, sizeof(tmp), " (%c%c%c^%c)",
dostype[0], dostype[1],
dostype[2], dostype[3] + '@' );
seq_buf_printf(&state->pp_buf,
" (%c%c%c^%c)",
dostype[0], dostype[1],
dostype[2],
dostype[3] + '@');
else
snprintf(tmp, sizeof(tmp), " (%c%c%c%c)",
dostype[0], dostype[1],
dostype[2], dostype[3]);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
snprintf(tmp, sizeof(tmp), "(res %d spb %d)",
be32_to_cpu(pb->pb_Environment[6]),
be32_to_cpu(pb->pb_Environment[4]));
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf,
" (%c%c%c%c)",
dostype[0], dostype[1],
dostype[2], dostype[3]);
seq_buf_printf(&state->pp_buf, "(res %d spb %d)",
be32_to_cpu(pb->pb_Environment[6]),
be32_to_cpu(pb->pb_Environment[4]));
}
res = 1;
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
rdb_done:
return res;

View File

@@ -70,7 +70,7 @@ int atari_partition(struct parsed_partitions *state)
}
pi = &rs->part[0];
strlcat(state->pp_buf, " AHDI", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " AHDI");
for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) {
struct rootsector *xrs;
Sector sect2;
@@ -89,7 +89,7 @@ int atari_partition(struct parsed_partitions *state)
#ifdef ICD_PARTS
part_fmt = 1;
#endif
strlcat(state->pp_buf, " XGM<", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " XGM<");
partsect = extensect = be32_to_cpu(pi->st);
while (1) {
xrs = read_part_sector(state, partsect, &sect2);
@@ -128,14 +128,14 @@ int atari_partition(struct parsed_partitions *state)
break;
}
}
strlcat(state->pp_buf, " >", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " >");
}
#ifdef ICD_PARTS
if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */
pi = &rs->icdpart[0];
/* sanity check: no ICD format if first partition invalid */
if (OK_id(pi->id)) {
strlcat(state->pp_buf, " ICD<", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " ICD<");
for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) {
/* accept only GEM,BGM,RAW,LNX,SWP partitions */
if (!((pi->flg & 1) && OK_id(pi->id)))
@@ -144,13 +144,13 @@ int atari_partition(struct parsed_partitions *state)
be32_to_cpu(pi->st),
be32_to_cpu(pi->siz));
}
strlcat(state->pp_buf, " >", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " >");
}
}
#endif
put_dev_sector(sect);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
return 1;
}

View File

@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include <linux/seq_buf.h>
#include "../blk.h"
/*
@@ -20,7 +21,7 @@ struct parsed_partitions {
int next;
int limit;
bool access_beyond_eod;
char *pp_buf;
struct seq_buf pp_buf;
};
typedef struct {
@@ -37,12 +38,9 @@ static inline void
put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
{
if (n < p->limit) {
char tmp[1 + BDEVNAME_SIZE + 10 + 1];
p->parts[n].from = from;
p->parts[n].size = size;
snprintf(tmp, sizeof(tmp), " %s%d", p->name, n);
strlcat(p->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&p->pp_buf, " %s%d", p->name, n);
}
}

View File

@@ -229,7 +229,6 @@ static int add_part(int slot, struct cmdline_subpart *subpart,
struct parsed_partitions *state)
{
struct partition_meta_info *info;
char tmp[sizeof(info->volname) + 4];
if (slot >= state->limit)
return 1;
@@ -244,8 +243,7 @@ static int add_part(int slot, struct cmdline_subpart *subpart,
strscpy(info->volname, subpart->name, sizeof(info->volname));
snprintf(tmp, sizeof(tmp), "(%s)", info->volname);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf, "(%s)", info->volname);
state->parts[slot].has_info = true;
@@ -379,7 +377,7 @@ int cmdline_partition(struct parsed_partitions *state)
cmdline_parts_set(parts, disk_size, state);
cmdline_parts_verifier(1, state);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
return 1;
}

View File

@@ -8,6 +8,7 @@
#include <linux/major.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/sysfs.h>
#include <linux/ctype.h>
#include <linux/vmalloc.h>
#include <linux/raid/detect.h>
@@ -123,16 +124,16 @@ static struct parsed_partitions *check_partition(struct gendisk *hd)
state = allocate_partitions(hd);
if (!state)
return NULL;
state->pp_buf = (char *)__get_free_page(GFP_KERNEL);
if (!state->pp_buf) {
state->pp_buf.buffer = (char *)__get_free_page(GFP_KERNEL);
if (!state->pp_buf.buffer) {
free_partitions(state);
return NULL;
}
state->pp_buf[0] = '\0';
seq_buf_init(&state->pp_buf, state->pp_buf.buffer, PAGE_SIZE);
state->disk = hd;
strscpy(state->name, hd->disk_name);
snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name);
seq_buf_printf(&state->pp_buf, " %s:", state->name);
if (isdigit(state->name[strlen(state->name)-1]))
sprintf(state->name, "p");
@@ -151,9 +152,9 @@ static struct parsed_partitions *check_partition(struct gendisk *hd)
}
if (res > 0) {
printk(KERN_INFO "%s", state->pp_buf);
printk(KERN_INFO "%s", seq_buf_str(&state->pp_buf));
free_page((unsigned long)state->pp_buf);
free_page((unsigned long)state->pp_buf.buffer);
return state;
}
if (state->access_beyond_eod)
@@ -164,12 +165,12 @@ static struct parsed_partitions *check_partition(struct gendisk *hd)
if (err)
res = err;
if (res) {
strlcat(state->pp_buf,
" unable to read partition table\n", PAGE_SIZE);
printk(KERN_INFO "%s", state->pp_buf);
seq_buf_puts(&state->pp_buf,
" unable to read partition table\n");
printk(KERN_INFO "%s", seq_buf_str(&state->pp_buf));
}
free_page((unsigned long)state->pp_buf);
free_page((unsigned long)state->pp_buf.buffer);
free_partitions(state);
return ERR_PTR(res);
}
@@ -177,31 +178,31 @@ static struct parsed_partitions *check_partition(struct gendisk *hd)
static ssize_t part_partition_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
return sprintf(buf, "%d\n", bdev_partno(dev_to_bdev(dev)));
return sysfs_emit(buf, "%d\n", bdev_partno(dev_to_bdev(dev)));
}
static ssize_t part_start_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
return sprintf(buf, "%llu\n", dev_to_bdev(dev)->bd_start_sect);
return sysfs_emit(buf, "%llu\n", dev_to_bdev(dev)->bd_start_sect);
}
static ssize_t part_ro_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
return sprintf(buf, "%d\n", bdev_read_only(dev_to_bdev(dev)));
return sysfs_emit(buf, "%d\n", bdev_read_only(dev_to_bdev(dev)));
}
static ssize_t part_alignment_offset_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
return sprintf(buf, "%u\n", bdev_alignment_offset(dev_to_bdev(dev)));
return sysfs_emit(buf, "%u\n", bdev_alignment_offset(dev_to_bdev(dev)));
}
static ssize_t part_discard_alignment_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
return sprintf(buf, "%u\n", bdev_discard_alignment(dev_to_bdev(dev)));
return sysfs_emit(buf, "%u\n", bdev_discard_alignment(dev_to_bdev(dev)));
}
static DEVICE_ATTR(partition, 0444, part_partition_show, NULL);

View File

@@ -751,6 +751,6 @@ int efi_partition(struct parsed_partitions *state)
}
kfree(ptes);
kfree(gpt);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
return 1;
}

View File

@@ -173,15 +173,13 @@ static int find_vol1_partitions(struct parsed_partitions *state,
{
sector_t blk;
int counter;
char tmp[64];
Sector sect;
unsigned char *data;
loff_t offset, size;
struct vtoc_format1_label f1;
int secperblk;
snprintf(tmp, sizeof(tmp), "VOL1/%8s:", name);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf, "VOL1/%8s:", name);
/*
* get start of VTOC from the disk label and then search for format1
* and format8 labels
@@ -219,7 +217,7 @@ static int find_vol1_partitions(struct parsed_partitions *state,
blk++;
data = read_part_sector(state, blk * secperblk, &sect);
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
if (!data)
return -1;
@@ -237,11 +235,9 @@ static int find_lnx1_partitions(struct parsed_partitions *state,
dasd_information2_t *info)
{
loff_t offset, geo_size, size;
char tmp[64];
int secperblk;
snprintf(tmp, sizeof(tmp), "LNX1/%8s:", name);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf, "LNX1/%8s:", name);
secperblk = blocksize >> 9;
if (label->lnx.ldl_version == 0xf2) {
size = label->lnx.formatted_blocks * secperblk;
@@ -258,7 +254,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state,
size = nr_sectors;
if (size != geo_size) {
if (!info) {
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
return 1;
}
if (!strcmp(info->type, "ECKD"))
@@ -270,7 +266,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state,
/* first and only partition starts in the first block after the label */
offset = labelsect + secperblk;
put_partition(state, 1, offset, size - offset);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
return 1;
}
@@ -282,7 +278,6 @@ static int find_cms1_partitions(struct parsed_partitions *state,
sector_t labelsect)
{
loff_t offset, size;
char tmp[64];
int secperblk;
/*
@@ -291,14 +286,12 @@ static int find_cms1_partitions(struct parsed_partitions *state,
blocksize = label->cms.block_size;
secperblk = blocksize >> 9;
if (label->cms.disk_offset != 0) {
snprintf(tmp, sizeof(tmp), "CMS1/%8s(MDSK):", name);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf, "CMS1/%8s(MDSK):", name);
/* disk is reserved minidisk */
offset = label->cms.disk_offset * secperblk;
size = (label->cms.block_count - 1) * secperblk;
} else {
snprintf(tmp, sizeof(tmp), "CMS1/%8s:", name);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf, "CMS1/%8s:", name);
/*
* Special case for FBA devices:
* If an FBA device is CMS formatted with blocksize > 512 byte
@@ -314,7 +307,7 @@ static int find_cms1_partitions(struct parsed_partitions *state,
}
put_partition(state, 1, offset, size-offset);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
return 1;
}
@@ -391,11 +384,11 @@ int ibm_partition(struct parsed_partitions *state)
*/
res = 1;
if (info->format == DASD_FORMAT_LDL) {
strlcat(state->pp_buf, "(nonl)", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "(nonl)");
size = nr_sectors;
offset = (info->label_block + 1) * (blocksize >> 9);
put_partition(state, 1, offset, size-offset);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
}
} else
res = 0;

View File

@@ -53,7 +53,7 @@ int karma_partition(struct parsed_partitions *state)
}
slot++;
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
put_dev_sector(sect);
return 1;
}

View File

@@ -582,7 +582,7 @@ static bool ldm_create_data_partitions (struct parsed_partitions *pp,
return false;
}
strlcat(pp->pp_buf, " [LDM]", PAGE_SIZE);
seq_buf_puts(&pp->pp_buf, " [LDM]");
/* Create the data partitions */
list_for_each (item, &ldb->v_part) {
@@ -597,7 +597,7 @@ static bool ldm_create_data_partitions (struct parsed_partitions *pp,
part_num++;
}
strlcat(pp->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&pp->pp_buf, "\n");
return true;
}

View File

@@ -86,7 +86,7 @@ int mac_partition(struct parsed_partitions *state)
if (blocks_in_map >= state->limit)
blocks_in_map = state->limit - 1;
strlcat(state->pp_buf, " [mac]", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " [mac]");
for (slot = 1; slot <= blocks_in_map; ++slot) {
int pos = slot * secsize;
put_dev_sector(sect);
@@ -152,6 +152,6 @@ int mac_partition(struct parsed_partitions *state)
#endif
put_dev_sector(sect);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
return 1;
}

View File

@@ -263,18 +263,11 @@ static void parse_solaris_x86(struct parsed_partitions *state,
put_dev_sector(sect);
return;
}
{
char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1];
snprintf(tmp, sizeof(tmp), " %s%d: <solaris:", state->name, origin);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
}
seq_buf_printf(&state->pp_buf, " %s%d: <solaris:", state->name, origin);
if (le32_to_cpu(v->v_version) != 1) {
char tmp[64];
snprintf(tmp, sizeof(tmp), " cannot handle version %d vtoc>\n",
le32_to_cpu(v->v_version));
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf,
" cannot handle version %d vtoc>\n",
le32_to_cpu(v->v_version));
put_dev_sector(sect);
return;
}
@@ -282,12 +275,10 @@ static void parse_solaris_x86(struct parsed_partitions *state,
max_nparts = le16_to_cpu(v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8;
for (i = 0; i < max_nparts && state->next < state->limit; i++) {
struct solaris_x86_slice *s = &v->v_slice[i];
char tmp[3 + 10 + 1 + 1];
if (s->s_size == 0)
continue;
snprintf(tmp, sizeof(tmp), " [s%d]", i);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf, " [s%d]", i);
/* solaris partitions are relative to current MS-DOS
* one; must add the offset of the current partition */
put_partition(state, state->next++,
@@ -295,7 +286,7 @@ static void parse_solaris_x86(struct parsed_partitions *state,
le32_to_cpu(s->s_size));
}
put_dev_sector(sect);
strlcat(state->pp_buf, " >\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " >\n");
#endif
}
@@ -359,7 +350,6 @@ static void parse_bsd(struct parsed_partitions *state,
Sector sect;
struct bsd_disklabel *l;
struct bsd_partition *p;
char tmp[64];
l = read_part_sector(state, offset + 1, &sect);
if (!l)
@@ -369,8 +359,7 @@ static void parse_bsd(struct parsed_partitions *state,
return;
}
snprintf(tmp, sizeof(tmp), " %s%d: <%s:", state->name, origin, flavour);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf, " %s%d: <%s:", state->name, origin, flavour);
if (le16_to_cpu(l->d_npartitions) < max_partitions)
max_partitions = le16_to_cpu(l->d_npartitions);
@@ -391,18 +380,16 @@ static void parse_bsd(struct parsed_partitions *state,
/* full parent partition, we have it already */
continue;
if (offset > bsd_start || offset+size < bsd_start+bsd_size) {
strlcat(state->pp_buf, "bad subpartition - ignored\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "bad subpartition - ignored\n");
continue;
}
put_partition(state, state->next++, bsd_start, bsd_size);
}
put_dev_sector(sect);
if (le16_to_cpu(l->d_npartitions) > max_partitions) {
snprintf(tmp, sizeof(tmp), " (ignored %d more)",
le16_to_cpu(l->d_npartitions) - max_partitions);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
}
strlcat(state->pp_buf, " >\n", PAGE_SIZE);
if (le16_to_cpu(l->d_npartitions) > max_partitions)
seq_buf_printf(&state->pp_buf, " (ignored %d more)",
le16_to_cpu(l->d_npartitions) - max_partitions);
seq_buf_puts(&state->pp_buf, " >\n");
}
#endif
@@ -496,12 +483,7 @@ static void parse_unixware(struct parsed_partitions *state,
put_dev_sector(sect);
return;
}
{
char tmp[1 + BDEVNAME_SIZE + 10 + 12 + 1];
snprintf(tmp, sizeof(tmp), " %s%d: <unixware:", state->name, origin);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
}
seq_buf_printf(&state->pp_buf, " %s%d: <unixware:", state->name, origin);
p = &l->vtoc.v_slice[1];
/* I omit the 0th slice as it is the same as whole disk. */
while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) {
@@ -515,7 +497,7 @@ static void parse_unixware(struct parsed_partitions *state,
p++;
}
put_dev_sector(sect);
strlcat(state->pp_buf, " >\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " >\n");
#endif
}
@@ -546,10 +528,7 @@ static void parse_minix(struct parsed_partitions *state,
* the normal boot sector. */
if (msdos_magic_present(data + 510) &&
p->sys_ind == MINIX_PARTITION) { /* subpartition table present */
char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1];
snprintf(tmp, sizeof(tmp), " %s%d: <minix:", state->name, origin);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf, " %s%d: <minix:", state->name, origin);
for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) {
if (state->next == state->limit)
break;
@@ -558,7 +537,7 @@ static void parse_minix(struct parsed_partitions *state,
put_partition(state, state->next++,
start_sect(p), nr_sects(p));
}
strlcat(state->pp_buf, " >\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " >\n");
}
put_dev_sector(sect);
#endif /* CONFIG_MINIX_SUBPARTITION */
@@ -602,7 +581,7 @@ int msdos_partition(struct parsed_partitions *state)
#ifdef CONFIG_AIX_PARTITION
return aix_partition(state);
#else
strlcat(state->pp_buf, " [AIX]", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " [AIX]");
return 0;
#endif
}
@@ -629,7 +608,7 @@ int msdos_partition(struct parsed_partitions *state)
fb = (struct fat_boot_sector *) data;
if (slot == 1 && fb->reserved && fb->fats
&& fat_valid_media(fb->media)) {
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
put_dev_sector(sect);
return 1;
} else {
@@ -678,9 +657,9 @@ int msdos_partition(struct parsed_partitions *state)
n = min(size, max(sector_size, n));
put_partition(state, slot, start, n);
strlcat(state->pp_buf, " <", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " <");
parse_extended(state, start, size, disksig);
strlcat(state->pp_buf, " >", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, " >");
continue;
}
put_partition(state, slot, start, size);
@@ -688,12 +667,12 @@ int msdos_partition(struct parsed_partitions *state)
if (p->sys_ind == LINUX_RAID_PARTITION)
state->parts[slot].flags = ADDPART_FLAG_RAID;
if (p->sys_ind == DM6_PARTITION)
strlcat(state->pp_buf, "[DM]", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "[DM]");
if (p->sys_ind == EZD_PARTITION)
strlcat(state->pp_buf, "[EZD]", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "[EZD]");
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
/* second pass - output for each on a separate line */
p = (struct msdos_partition *) (0x1be + data);

View File

@@ -36,7 +36,6 @@ static void add_of_partition(struct parsed_partitions *state, int slot,
struct device_node *np)
{
struct partition_meta_info *info;
char tmp[sizeof(info->volname) + 4];
const char *partname;
int len;
@@ -63,8 +62,7 @@ static void add_of_partition(struct parsed_partitions *state, int slot,
partname = of_get_property(np, "name", &len);
strscpy(info->volname, partname, sizeof(info->volname));
snprintf(tmp, sizeof(tmp), "(%s)", info->volname);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf, "(%s)", info->volname);
}
int of_partition(struct parsed_partitions *state)
@@ -104,7 +102,7 @@ int of_partition(struct parsed_partitions *state)
slot++;
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
return 1;
}

View File

@@ -81,7 +81,7 @@ int osf_partition(struct parsed_partitions *state)
le32_to_cpu(partition->p_size));
slot++;
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
put_dev_sector(sect);
return 1;
}

View File

@@ -79,7 +79,7 @@ int sgi_partition(struct parsed_partitions *state)
}
slot++;
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
put_dev_sector(sect);
return 1;
}

View File

@@ -121,7 +121,7 @@ int sun_partition(struct parsed_partitions *state)
}
slot++;
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
put_dev_sector(sect);
return 1;
}

View File

@@ -54,7 +54,6 @@ int sysv68_partition(struct parsed_partitions *state)
unsigned char *data;
struct dkblk0 *b;
struct slice *slice;
char tmp[64];
data = read_part_sector(state, 0, &sect);
if (!data)
@@ -74,8 +73,7 @@ int sysv68_partition(struct parsed_partitions *state)
return -1;
slices -= 1; /* last slice is the whole disk */
snprintf(tmp, sizeof(tmp), "sysV68: %s(s%u)", state->name, slices);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf, "sysV68: %s(s%u)", state->name, slices);
slice = (struct slice *)data;
for (i = 0; i < slices; i++, slice++) {
if (slot == state->limit)
@@ -84,12 +82,11 @@ int sysv68_partition(struct parsed_partitions *state)
put_partition(state, slot,
be32_to_cpu(slice->blkoff),
be32_to_cpu(slice->nblocks));
snprintf(tmp, sizeof(tmp), "(s%u)", i);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
seq_buf_printf(&state->pp_buf, "(s%u)", i);
}
slot++;
}
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
put_dev_sector(sect);
return 1;
}

View File

@@ -39,7 +39,7 @@ int ultrix_partition(struct parsed_partitions *state)
label->pt_part[i].pi_blkoff,
label->pt_part[i].pi_nblocks);
put_dev_sector(sect);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
seq_buf_puts(&state->pp_buf, "\n");
return 1;
} else {
put_dev_sector(sect);

View File

@@ -160,6 +160,8 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = {
{ 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x00 },
[OPAL_DATASTORE] =
{ 0x00, 0x00, 0x10, 0x01, 0x00, 0x00, 0x00, 0x00 },
[OPAL_LOCKING_TABLE] =
{ 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x00 },
/* C_PIN_TABLE object ID's */
[OPAL_C_PIN_MSID] =
@@ -218,6 +220,8 @@ static const u8 opalmethod[][OPAL_METHOD_LENGTH] = {
{ 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x06, 0x01 },
[OPAL_ERASE] =
{ 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x08, 0x03 },
[OPAL_REACTIVATE] =
{ 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x08, 0x01 },
};
static int end_opal_session_error(struct opal_dev *dev);
@@ -1514,7 +1518,7 @@ static inline int enable_global_lr(struct opal_dev *dev, u8 *uid,
return err;
}
static int setup_locking_range(struct opal_dev *dev, void *data)
static int setup_enable_range(struct opal_dev *dev, void *data)
{
u8 uid[OPAL_UID_LENGTH];
struct opal_user_lr_setup *setup = data;
@@ -1528,38 +1532,47 @@ static int setup_locking_range(struct opal_dev *dev, void *data)
if (lr == 0)
err = enable_global_lr(dev, uid, setup);
else {
err = cmd_start(dev, uid, opalmethod[OPAL_SET]);
add_token_u8(&err, dev, OPAL_STARTNAME);
add_token_u8(&err, dev, OPAL_VALUES);
add_token_u8(&err, dev, OPAL_STARTLIST);
add_token_u8(&err, dev, OPAL_STARTNAME);
add_token_u8(&err, dev, OPAL_RANGESTART);
add_token_u64(&err, dev, setup->range_start);
add_token_u8(&err, dev, OPAL_ENDNAME);
add_token_u8(&err, dev, OPAL_STARTNAME);
add_token_u8(&err, dev, OPAL_RANGELENGTH);
add_token_u64(&err, dev, setup->range_length);
add_token_u8(&err, dev, OPAL_ENDNAME);
add_token_u8(&err, dev, OPAL_STARTNAME);
add_token_u8(&err, dev, OPAL_READLOCKENABLED);
add_token_u64(&err, dev, !!setup->RLE);
add_token_u8(&err, dev, OPAL_ENDNAME);
add_token_u8(&err, dev, OPAL_STARTNAME);
add_token_u8(&err, dev, OPAL_WRITELOCKENABLED);
add_token_u64(&err, dev, !!setup->WLE);
add_token_u8(&err, dev, OPAL_ENDNAME);
add_token_u8(&err, dev, OPAL_ENDLIST);
add_token_u8(&err, dev, OPAL_ENDNAME);
}
else
err = generic_lr_enable_disable(dev, uid, !!setup->RLE, !!setup->WLE, 0, 0);
if (err) {
pr_debug("Error building Setup Locking range command.\n");
pr_debug("Failed to create enable lr command.\n");
return err;
}
return finalize_and_send(dev, parse_and_check_status);
}
static int setup_locking_range_start_length(struct opal_dev *dev, void *data)
{
int err;
u8 uid[OPAL_UID_LENGTH];
struct opal_user_lr_setup *setup = data;
err = build_locking_range(uid, sizeof(uid), setup->session.opal_key.lr);
if (err)
return err;
err = cmd_start(dev, uid, opalmethod[OPAL_SET]);
add_token_u8(&err, dev, OPAL_STARTNAME);
add_token_u8(&err, dev, OPAL_VALUES);
add_token_u8(&err, dev, OPAL_STARTLIST);
add_token_u8(&err, dev, OPAL_STARTNAME);
add_token_u8(&err, dev, OPAL_RANGESTART);
add_token_u64(&err, dev, setup->range_start);
add_token_u8(&err, dev, OPAL_ENDNAME);
add_token_u8(&err, dev, OPAL_STARTNAME);
add_token_u8(&err, dev, OPAL_RANGELENGTH);
add_token_u64(&err, dev, setup->range_length);
add_token_u8(&err, dev, OPAL_ENDNAME);
add_token_u8(&err, dev, OPAL_ENDLIST);
add_token_u8(&err, dev, OPAL_ENDNAME);
if (err) {
pr_debug("Error building Setup Locking RangeStartLength command.\n");
return err;
}
@@ -1568,7 +1581,7 @@ static int setup_locking_range(struct opal_dev *dev, void *data)
static int response_get_column(const struct parsed_resp *resp,
int *iter,
u8 column,
u64 column,
u64 *value)
{
const struct opal_resp_tok *tok;
@@ -1586,7 +1599,7 @@ static int response_get_column(const struct parsed_resp *resp,
n++;
if (response_get_u64(resp, n) != column) {
pr_debug("Token %d does not match expected column %u.\n",
pr_debug("Token %d does not match expected column %llu.\n",
n, column);
return OPAL_INVAL_PARAM;
}
@@ -1744,6 +1757,12 @@ static int start_anybodyASP_opal_session(struct opal_dev *dev, void *data)
OPAL_ADMINSP_UID, NULL, 0);
}
/*
 * Open a session to the Locking SP as the Anybody authority, i.e. with no
 * host challenge/credentials (key == NULL, key_len == 0).  Used for reads
 * that the device permits without authentication.
 */
static int start_anybodyLSP_opal_session(struct opal_dev *dev, void *data)
{
	return start_generic_opal_session(dev, OPAL_ANYBODY_UID,
					  OPAL_LOCKINGSP_UID, NULL, 0);
}
static int start_SIDASP_opal_session(struct opal_dev *dev, void *data)
{
int ret;
@@ -2285,6 +2304,74 @@ static int activate_lsp(struct opal_dev *dev, void *data)
return finalize_and_send(dev, parse_and_check_status);
}
/*
 * Build and send the Reactivate method invocation on ThisSP, controlling
 * Single User Mode (SUM) membership of the locking ranges.
 *
 * The optional parameters appended below mirror the caller-supplied
 * struct opal_lr_react: which ranges enter SUM (all, a subset, or none),
 * the range policy, and an optional new Admin1 PIN.
 */
static int reactivate_lsp(struct opal_dev *dev, void *data)
{
	struct opal_lr_react *opal_react = data;
	u8 user_lr[OPAL_UID_LENGTH];
	int err, i;

	err = cmd_start(dev, opaluid[OPAL_THISSP_UID],
			opalmethod[OPAL_REACTIVATE]);
	if (err) {
		pr_debug("Error building Reactivate LockingSP command.\n");
		return err;
	}

	/*
	 * If neither 'entire_table' nor 'num_lrs' is set, the device
	 * gets reactivated with SUM disabled. Only Admin1PIN will change
	 * if set.
	 */
	if (opal_react->entire_table) {
		/* Entire Locking table (all locking ranges) will be put in SUM. */
		add_token_u8(&err, dev, OPAL_STARTNAME);
		add_token_u64(&err, dev, OPAL_SUM_SET_LIST);
		add_token_bytestring(&err, dev, opaluid[OPAL_LOCKING_TABLE], OPAL_UID_LENGTH);
		add_token_u8(&err, dev, OPAL_ENDNAME);
	} else if (opal_react->num_lrs) {
		/* Subset of Locking table (selected locking range(s)) to be put in SUM */
		err = build_locking_range(user_lr, sizeof(user_lr),
					  opal_react->lr[0]);
		if (err)
			return err;

		add_token_u8(&err, dev, OPAL_STARTNAME);
		add_token_u64(&err, dev, OPAL_SUM_SET_LIST);
		add_token_u8(&err, dev, OPAL_STARTLIST);
		add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH);
		/* Remaining range UIDs differ from the first only in the last byte. */
		for (i = 1; i < opal_react->num_lrs; i++) {
			user_lr[7] = opal_react->lr[i];
			add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH);
		}
		add_token_u8(&err, dev, OPAL_ENDLIST);
		add_token_u8(&err, dev, OPAL_ENDNAME);
	}

	/* Skipping the range policy parameter is same as setting its value to zero */
	if (opal_react->range_policy && (opal_react->num_lrs || opal_react->entire_table)) {
		add_token_u8(&err, dev, OPAL_STARTNAME);
		add_token_u64(&err, dev, OPAL_SUM_RANGE_POLICY);
		add_token_u8(&err, dev, 1);
		add_token_u8(&err, dev, OPAL_ENDNAME);
	}

	/*
	 * Optional parameter. If set, it changes the Admin1 PIN even when SUM
	 * is being disabled.
	 */
	if (opal_react->new_admin_key.key_len) {
		add_token_u8(&err, dev, OPAL_STARTNAME);
		add_token_u64(&err, dev, OPAL_SUM_ADMIN1_PIN);
		add_token_bytestring(&err, dev, opal_react->new_admin_key.key,
				     opal_react->new_admin_key.key_len);
		add_token_u8(&err, dev, OPAL_ENDNAME);
	}

	return finalize_and_send(dev, parse_and_check_status);
}
/* Determine if we're in the Manufactured Inactive or Active state */
static int get_lsp_lifecycle(struct opal_dev *dev, void *data)
{
@@ -2955,12 +3042,92 @@ static int opal_activate_lsp(struct opal_dev *dev,
return ret;
}
/*
 * IOC_OPAL_REACTIVATE_LSP handler: validate the SUM parameters, resolve the
 * Admin1 key, and run the Reactivate step under the device lock.
 *
 * Returns 0 on success or a negative errno.
 */
static int opal_reactivate_lsp(struct opal_dev *dev,
			       struct opal_lr_react *opal_lr_react)
{
	const struct opal_step active_steps[] = {
		{ start_admin1LSP_opal_session, &opal_lr_react->key },
		{ reactivate_lsp, opal_lr_react },
		/* No end_opal_session. The controller terminates the session */
	};
	int ret;

	/* use either 'entire_table' parameter or set of locking ranges */
	if (opal_lr_react->num_lrs > OPAL_MAX_LRS ||
	    (opal_lr_react->num_lrs && opal_lr_react->entire_table))
		return -EINVAL;

	ret = opal_get_key(dev, &opal_lr_react->key);
	if (ret)
		return ret;
	mutex_lock(&dev->dev_lock);
	setup_opal_dev(dev);
	ret = execute_steps(dev, active_steps, ARRAY_SIZE(active_steps));
	mutex_unlock(&dev->dev_lock);

	return ret;
}
/*
 * Configure a locking range in one authenticated session: set its start and
 * length, then its read/write lock-enable bits.  The global locking range
 * (lr == 0) has no start/length, so only the enable step runs for it.
 *
 * Fix: the lr_steps[] table contained a leftover entry referencing
 * setup_locking_range, which was renamed to setup_enable_range; the stale
 * duplicate step is removed so the table matches the renamed helpers.
 *
 * Returns 0 on success or a negative errno.
 */
static int opal_setup_locking_range(struct opal_dev *dev,
				    struct opal_user_lr_setup *opal_lrs)
{
	const struct opal_step lr_steps[] = {
		{ start_auth_opal_session, &opal_lrs->session },
		{ setup_locking_range_start_length, opal_lrs },
		{ setup_enable_range, opal_lrs },
		{ end_opal_session, }
	}, lr_global_steps[] = {
		{ start_auth_opal_session, &opal_lrs->session },
		{ setup_enable_range, opal_lrs },
		{ end_opal_session, }
	};
	int ret;

	ret = opal_get_key(dev, &opal_lrs->session.opal_key);
	if (ret)
		return ret;
	mutex_lock(&dev->dev_lock);
	setup_opal_dev(dev);
	if (opal_lrs->session.opal_key.lr == 0)
		ret = execute_steps(dev, lr_global_steps, ARRAY_SIZE(lr_global_steps));
	else
		ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps));
	mutex_unlock(&dev->dev_lock);

	return ret;
}
/*
 * IOC_OPAL_LR_SET_START_LEN handler: set only the RangeStart/RangeLength
 * columns of a non-global locking range, leaving the lock-enable bits alone.
 *
 * Returns 0 on success, -EINVAL for the global range, or a negative errno.
 */
static int opal_setup_locking_range_start_length(struct opal_dev *dev,
						 struct opal_user_lr_setup *opal_lrs)
{
	const struct opal_step lr_steps[] = {
		{ start_auth_opal_session, &opal_lrs->session },
		{ setup_locking_range_start_length, opal_lrs },
		{ end_opal_session, }
	};
	int ret;

	/* we can not set global locking range offset or length */
	if (opal_lrs->session.opal_key.lr == 0)
		return -EINVAL;

	ret = opal_get_key(dev, &opal_lrs->session.opal_key);
	if (ret)
		return ret;
	mutex_lock(&dev->dev_lock);
	setup_opal_dev(dev);
	ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps));
	mutex_unlock(&dev->dev_lock);

	return ret;
}
static int opal_enable_disable_range(struct opal_dev *dev,
struct opal_user_lr_setup *opal_lrs)
{
const struct opal_step lr_steps[] = {
{ start_auth_opal_session, &opal_lrs->session },
{ setup_enable_range, opal_lrs },
{ end_opal_session, }
};
int ret;
@@ -3228,6 +3395,200 @@ static int opal_get_geometry(struct opal_dev *dev, void __user *data)
return 0;
}
/*
 * Read the Single User Mode columns (OPAL_SUM_SET_LIST and
 * OPAL_SUM_RANGE_POLICY) from the LockingInfo table and decode them into
 * the caller-supplied struct opal_sum_ranges.
 *
 * The response is walked token by token (tok_n starts at 2, past the
 * method status header) and validated against the expected token stream.
 *
 * Returns 0 on success, OPAL_INVAL_PARAM on an unexpected response shape,
 * or a negative errno from the transport.
 */
static int get_sum_ranges(struct opal_dev *dev, void *data)
{
	const char *lr_uid;
	size_t lr_uid_len;
	u64 val;
	const struct opal_resp_tok *tok;
	int err, tok_n = 2;
	struct opal_sum_ranges *sranges = data;
	/* All range ids 0..8, used when the whole Locking table is in SUM. */
	const __u8 lr_all[OPAL_MAX_LRS] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 };

	err = generic_get_columns(dev, opaluid[OPAL_LOCKING_INFO_TABLE], OPAL_SUM_SET_LIST,
				  OPAL_SUM_RANGE_POLICY);
	if (err) {
		pr_debug("Couldn't get locking info table columns %d to %d.\n",
			 OPAL_SUM_SET_LIST, OPAL_SUM_RANGE_POLICY);
		return err;
	}

	/* Expect STARTNAME opening the OPAL_SUM_SET_LIST name/value pair. */
	tok = response_get_token(&dev->parsed, tok_n);
	if (IS_ERR(tok))
		return PTR_ERR(tok);
	if (!response_token_matches(tok, OPAL_STARTNAME)) {
		pr_debug("Unexpected response token type %d.\n", tok_n);
		return OPAL_INVAL_PARAM;
	}
	tok_n++;

	if (response_get_u64(&dev->parsed, tok_n) != OPAL_SUM_SET_LIST) {
		pr_debug("Token %d does not match expected column %u.\n",
			 tok_n, OPAL_SUM_SET_LIST);
		return OPAL_INVAL_PARAM;
	}
	tok_n++;

	tok = response_get_token(&dev->parsed, tok_n);
	if (IS_ERR(tok))
		return PTR_ERR(tok);

	/*
	 * The OPAL_SUM_SET_LIST response contains two distinct values:
	 *
	 * - the list of individual locking ranges (UIDs) put in SUM. The list
	 *   may also be empty signaling the SUM is disabled.
	 *
	 * - the Locking table UID if the entire Locking table is put in SUM.
	 */
	if (response_token_matches(tok, OPAL_STARTLIST)) {
		sranges->num_lrs = 0;
		tok_n++;
		tok = response_get_token(&dev->parsed, tok_n);
		if (IS_ERR(tok))
			return PTR_ERR(tok);
		/* Collect range UIDs until the closing ENDLIST. */
		while (!response_token_matches(tok, OPAL_ENDLIST)) {
			lr_uid_len = response_get_string(&dev->parsed, tok_n, &lr_uid);
			if (lr_uid_len != OPAL_UID_LENGTH) {
				pr_debug("Unexpected response token type %d.\n", tok_n);
				return OPAL_INVAL_PARAM;
			}
			if (memcmp(lr_uid, opaluid[OPAL_LOCKINGRANGE_GLOBAL], OPAL_UID_LENGTH)) {
				/* Non-global range: byte 7 of the UID is the range number. */
				if (lr_uid[5] != LOCKING_RANGE_NON_GLOBAL) {
					pr_debug("Unexpected byte %d at LR UUID position 5.\n",
						 lr_uid[5]);
					return OPAL_INVAL_PARAM;
				}
				sranges->lr[sranges->num_lrs++] = lr_uid[7];
			} else
				sranges->lr[sranges->num_lrs++] = 0;
			tok_n++;
			tok = response_get_token(&dev->parsed, tok_n);
			if (IS_ERR(tok))
				return PTR_ERR(tok);
		}
	} else {
		/* Only OPAL_LOCKING_TABLE UID is an alternative to OPAL_STARTLIST here. */
		lr_uid_len = response_get_string(&dev->parsed, tok_n, &lr_uid);
		if (lr_uid_len != OPAL_UID_LENGTH) {
			pr_debug("Unexpected response token type %d.\n", tok_n);
			return OPAL_INVAL_PARAM;
		}
		if (memcmp(lr_uid, opaluid[OPAL_LOCKING_TABLE], OPAL_UID_LENGTH)) {
			pr_debug("Unexpected response UID.\n");
			return OPAL_INVAL_PARAM;
		}
		/* sed-opal kernel API already enforces this limit in the Activate command */
		sranges->num_lrs = OPAL_MAX_LRS;
		memcpy(sranges->lr, lr_all, OPAL_MAX_LRS);
	}
	tok_n++;

	/* ENDNAME closing the OPAL_SUM_SET_LIST pair. */
	tok = response_get_token(&dev->parsed, tok_n);
	if (IS_ERR(tok))
		return PTR_ERR(tok);
	if (!response_token_matches(tok, OPAL_ENDNAME)) {
		pr_debug("Unexpected response token type %d.\n", tok_n);
		return OPAL_INVAL_PARAM;
	}
	tok_n++;

	/* Second column: the range policy, normalized to 0/1. */
	err = response_get_column(&dev->parsed, &tok_n, OPAL_SUM_RANGE_POLICY, &val);
	if (err)
		return err;
	sranges->range_policy = val ? 1 : 0;

	return 0;
}
/*
 * IOC_OPAL_GET_SUM_STATUS handler: read the SUM range list and policy,
 * then copy the result (minus the session/key portion) back to userspace.
 *
 * If a key was supplied, use an authenticated Admin1 session; otherwise
 * fall back to an unauthenticated Anybody session.
 *
 * Returns 0 on success, -EFAULT on copyout failure, or a negative errno.
 */
static int opal_get_sum_ranges(struct opal_dev *dev, struct opal_sum_ranges *opal_sum_rngs,
			       void __user *data)
{
	const struct opal_step admin_steps[] = {
		{ start_admin1LSP_opal_session, &opal_sum_rngs->key },
		{ get_sum_ranges, opal_sum_rngs },
		{ end_opal_session, }
	}, anybody_steps[] = {
		{ start_anybodyLSP_opal_session, NULL },
		{ get_sum_ranges, opal_sum_rngs },
		{ end_opal_session, }
	};
	int ret;

	mutex_lock(&dev->dev_lock);
	setup_opal_dev(dev);
	if (opal_sum_rngs->key.key_len)
		/* Use Admin1 session (authenticated by PIN) to retrieve LockingInfo columns */
		ret = execute_steps(dev, admin_steps, ARRAY_SIZE(admin_steps));
	else
		/* Use Anybody session (no key) to retrieve LockingInfo columns */
		ret = execute_steps(dev, anybody_steps, ARRAY_SIZE(anybody_steps));
	mutex_unlock(&dev->dev_lock);

	/* skip session info when copying back to uspace */
	if (!ret && copy_to_user(data + offsetof(struct opal_sum_ranges, num_lrs),
				 (void *)opal_sum_rngs + offsetof(struct opal_sum_ranges, num_lrs),
				 sizeof(*opal_sum_rngs) - offsetof(struct opal_sum_ranges, num_lrs))) {
		pr_debug("Error copying SUM ranges info to userspace\n");
		return -EFAULT;
	}

	return ret;
}
/*
 * IOC_OPAL_STACK_RESET handler: issue a STACK_RESET ComID management
 * command over security protocol 2 (TCG_SECP_02) and check the response.
 *
 * Returns 0 on success, -EBUSY when the device reports the reset as still
 * pending (response payload not yet 4 bytes), -EIO when the device reports
 * a non-zero response code, or the transport's negative errno.
 */
static int opal_stack_reset(struct opal_dev *dev)
{
	struct opal_stack_reset *req;
	struct opal_stack_reset_response *resp;
	int ret;

	mutex_lock(&dev->dev_lock);

	/* Build the request in place over the shared command buffer. */
	memset(dev->cmd, 0, IO_BUFFER_LENGTH);
	req = (struct opal_stack_reset *)dev->cmd;
	/* Extended ComID is the 16-bit ComID in big-endian byte order. */
	req->extendedComID[0] = dev->comid >> 8;
	req->extendedComID[1] = dev->comid & 0xFF;
	req->request_code = cpu_to_be32(OPAL_STACK_RESET);
	ret = dev->send_recv(dev->data, dev->comid, TCG_SECP_02,
			     dev->cmd, IO_BUFFER_LENGTH, true);
	if (ret) {
		pr_debug("Error sending stack reset: %d\n", ret);
		goto out;
	}

	memset(dev->resp, 0, IO_BUFFER_LENGTH);
	ret = dev->send_recv(dev->data, dev->comid, TCG_SECP_02,
			     dev->resp, IO_BUFFER_LENGTH, false);
	if (ret) {
		pr_debug("Error receiving stack reset response: %d\n", ret);
		goto out;
	}

	resp = (struct opal_stack_reset_response *)dev->resp;
	if (be16_to_cpu(resp->data_length) != 4) {
		pr_debug("Stack reset pending\n");
		ret = -EBUSY;
		goto out;
	}
	if (be32_to_cpu(resp->response) != 0) {
		pr_debug("Stack reset failed: %u\n", be32_to_cpu(resp->response));
		ret = -EIO;
	}

out:
	mutex_unlock(&dev->dev_lock);
	return ret;
}
int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
{
void *p;
@@ -3313,6 +3674,21 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
case IOC_OPAL_SET_SID_PW:
ret = opal_set_new_sid_pw(dev, p);
break;
case IOC_OPAL_REACTIVATE_LSP:
ret = opal_reactivate_lsp(dev, p);
break;
case IOC_OPAL_LR_SET_START_LEN:
ret = opal_setup_locking_range_start_length(dev, p);
break;
case IOC_OPAL_ENABLE_DISABLE_LR:
ret = opal_enable_disable_range(dev, p);
break;
case IOC_OPAL_GET_SUM_STATUS:
ret = opal_get_sum_ranges(dev, p, arg);
break;
case IOC_OPAL_STACK_RESET:
ret = opal_stack_reset(dev);
break;
default:
break;

View File

@@ -12,230 +12,115 @@
#include <linux/unaligned.h>
#include "blk.h"
struct blk_integrity_iter {
void *prot_buf;
void *data_buf;
sector_t seed;
unsigned int data_size;
unsigned short interval;
const char *disk_name;
#define APP_TAG_ESCAPE 0xffff
#define REF_TAG_ESCAPE 0xffffffff
/*
 * This union is used for on-stack allocations when the PI field is split
 * across segments.  blk_validate_integrity_limits() guarantees that
 * pi_tuple_size matches the sizeof() of one of these two types.
 */
union pi_tuple {
	struct crc64_pi_tuple crc64_pi;	/* 64-bit guard (NVMe ext. PI) */
	struct t10_pi_tuple t10_pi;	/* 16-bit guard (classic T10 PI) */
};
static __be16 t10_pi_csum(__be16 csum, void *data, unsigned int len,
unsigned char csum_type)
/* Iteration state for generating/verifying PI across bio and bip bvecs. */
struct blk_integrity_iter {
	struct bio *bio;			/* data bio being processed */
	struct bio_integrity_payload *bip;	/* its integrity payload */
	struct blk_integrity *bi;		/* queue integrity profile */
	struct bvec_iter data_iter;		/* position in data bvecs */
	struct bvec_iter prot_iter;		/* position in protection bvecs */
	unsigned int interval_remaining;	/* bytes left in current interval */
	u64 seed;				/* current reference tag value */
	u64 csum;				/* running guard checksum */
};
static void blk_calculate_guard(struct blk_integrity_iter *iter, void *data,
unsigned int len)
{
if (csum_type == BLK_INTEGRITY_CSUM_IP)
return (__force __be16)ip_compute_csum(data, len);
return cpu_to_be16(crc_t10dif_update(be16_to_cpu(csum), data, len));
switch (iter->bi->csum_type) {
case BLK_INTEGRITY_CSUM_CRC64:
iter->csum = crc64_nvme(iter->csum, data, len);
break;
case BLK_INTEGRITY_CSUM_CRC:
iter->csum = crc_t10dif_update(iter->csum, data, len);
break;
case BLK_INTEGRITY_CSUM_IP:
iter->csum = (__force u32)csum_partial(data, len,
(__force __wsum)iter->csum);
break;
default:
WARN_ON_ONCE(1);
iter->csum = U64_MAX;
break;
}
}
/*
 * Finalize the running guard checksum after all bytes of an interval have
 * been folded in.  Only the IP checksum needs a finishing step (folding the
 * 32-bit partial sum down to 16 bits); CRC variants are already final.
 */
static void blk_integrity_csum_finish(struct blk_integrity_iter *iter)
{
	switch (iter->bi->csum_type) {
	case BLK_INTEGRITY_CSUM_IP:
		iter->csum = (__force u16)csum_fold((__force __wsum)iter->csum);
		break;
	default:
		break;
	}
}
/*
* Type 1 and Type 2 protection use the same format: 16 bit guard tag,
* 16 bit app tag, 32 bit reference tag. Type 3 does not define the ref
* tag.
* Update the csum for formats that have metadata padding in front of the data
* integrity field
*/
static void t10_pi_generate(struct blk_integrity_iter *iter,
struct blk_integrity *bi)
static void blk_integrity_csum_offset(struct blk_integrity_iter *iter)
{
u8 offset = bi->pi_offset;
unsigned int i;
unsigned int offset = iter->bi->pi_offset;
struct bio_vec *bvec = iter->bip->bip_vec;
for (i = 0 ; i < iter->data_size ; i += iter->interval) {
struct t10_pi_tuple *pi = iter->prot_buf + offset;
while (offset > 0) {
struct bio_vec pbv = bvec_iter_bvec(bvec, iter->prot_iter);
unsigned int len = min(pbv.bv_len, offset);
void *prot_buf = bvec_kmap_local(&pbv);
pi->guard_tag = t10_pi_csum(0, iter->data_buf, iter->interval,
bi->csum_type);
if (offset)
pi->guard_tag = t10_pi_csum(pi->guard_tag,
iter->prot_buf, offset, bi->csum_type);
pi->app_tag = 0;
blk_calculate_guard(iter, prot_buf, len);
kunmap_local(prot_buf);
offset -= len;
bvec_iter_advance_single(bvec, &iter->prot_iter, len);
}
blk_integrity_csum_finish(iter);
}
if (bi->flags & BLK_INTEGRITY_REF_TAG)
pi->ref_tag = cpu_to_be32(lower_32_bits(iter->seed));
else
pi->ref_tag = 0;
static void blk_integrity_copy_from_tuple(struct bio_integrity_payload *bip,
struct bvec_iter *iter, void *tuple,
unsigned int tuple_size)
{
while (tuple_size) {
struct bio_vec pbv = bvec_iter_bvec(bip->bip_vec, *iter);
unsigned int len = min(tuple_size, pbv.bv_len);
void *prot_buf = bvec_kmap_local(&pbv);
iter->data_buf += iter->interval;
iter->prot_buf += bi->metadata_size;
iter->seed++;
memcpy(prot_buf, tuple, len);
kunmap_local(prot_buf);
bvec_iter_advance_single(bip->bip_vec, iter, len);
tuple_size -= len;
tuple += len;
}
}
static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter,
struct blk_integrity *bi)
static void blk_integrity_copy_to_tuple(struct bio_integrity_payload *bip,
struct bvec_iter *iter, void *tuple,
unsigned int tuple_size)
{
u8 offset = bi->pi_offset;
unsigned int i;
while (tuple_size) {
struct bio_vec pbv = bvec_iter_bvec(bip->bip_vec, *iter);
unsigned int len = min(tuple_size, pbv.bv_len);
void *prot_buf = bvec_kmap_local(&pbv);
for (i = 0 ; i < iter->data_size ; i += iter->interval) {
struct t10_pi_tuple *pi = iter->prot_buf + offset;
__be16 csum;
if (bi->flags & BLK_INTEGRITY_REF_TAG) {
if (pi->app_tag == T10_PI_APP_ESCAPE)
goto next;
if (be32_to_cpu(pi->ref_tag) !=
lower_32_bits(iter->seed)) {
pr_err("%s: ref tag error at location %llu " \
"(rcvd %u)\n", iter->disk_name,
(unsigned long long)
iter->seed, be32_to_cpu(pi->ref_tag));
return BLK_STS_PROTECTION;
}
} else {
if (pi->app_tag == T10_PI_APP_ESCAPE &&
pi->ref_tag == T10_PI_REF_ESCAPE)
goto next;
}
csum = t10_pi_csum(0, iter->data_buf, iter->interval,
bi->csum_type);
if (offset)
csum = t10_pi_csum(csum, iter->prot_buf, offset,
bi->csum_type);
if (pi->guard_tag != csum) {
pr_err("%s: guard tag error at sector %llu " \
"(rcvd %04x, want %04x)\n", iter->disk_name,
(unsigned long long)iter->seed,
be16_to_cpu(pi->guard_tag), be16_to_cpu(csum));
return BLK_STS_PROTECTION;
}
next:
iter->data_buf += iter->interval;
iter->prot_buf += bi->metadata_size;
iter->seed++;
}
return BLK_STS_OK;
}
/**
* t10_pi_type1_prepare - prepare PI prior submitting request to device
* @rq: request with PI that should be prepared
*
* For Type 1/Type 2, the virtual start sector is the one that was
* originally submitted by the block layer for the ref_tag usage. Due to
* partitioning, MD/DM cloning, etc. the actual physical start sector is
* likely to be different. Remap protection information to match the
* physical LBA.
*/
static void t10_pi_type1_prepare(struct request *rq)
{
	struct blk_integrity *bi = &rq->q->limits.integrity;
	const int tuple_sz = bi->metadata_size;
	u32 ref_tag = t10_pi_ref_tag(rq);	/* physical ref tag to write */
	u8 offset = bi->pi_offset;		/* PI offset within each tuple */
	struct bio *bio;

	__rq_for_each_bio(bio, rq) {
		struct bio_integrity_payload *bip = bio_integrity(bio);
		/* virtual (as-submitted) ref tag this bio's PI was generated with */
		u32 virt = bip_get_seed(bip) & 0xffffffff;
		struct bio_vec iv;
		struct bvec_iter iter;

		/* Already remapped? */
		if (bip->bip_flags & BIP_MAPPED_INTEGRITY)
			break;

		bip_for_each_vec(iv, bip, iter) {
			unsigned int j;
			void *p;

			p = bvec_kmap_local(&iv);
			/* Walk one PI tuple per interval within this bvec. */
			for (j = 0; j < iv.bv_len; j += tuple_sz) {
				struct t10_pi_tuple *pi = p + offset;

				/* Only remap tuples that still carry the virtual tag. */
				if (be32_to_cpu(pi->ref_tag) == virt)
					pi->ref_tag = cpu_to_be32(ref_tag);
				virt++;
				ref_tag++;
				p += tuple_sz;
			}
			kunmap_local(p);
		}

		/* Mark so a requeue/resubmit does not remap twice. */
		bip->bip_flags |= BIP_MAPPED_INTEGRITY;
	}
}
/**
* t10_pi_type1_complete - prepare PI prior returning request to the blk layer
* @rq: request with PI that should be prepared
* @nr_bytes: total bytes to prepare
*
* For Type 1/Type 2, the virtual start sector is the one that was
* originally submitted by the block layer for the ref_tag usage. Due to
* partitioning, MD/DM cloning, etc. the actual physical start sector is
* likely to be different. Since the physical start sector was submitted
* to the device, we should remap it back to virtual values expected by the
* block layer.
*/
static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
{
	struct blk_integrity *bi = &rq->q->limits.integrity;
	/* Only remap as many intervals as were actually completed. */
	unsigned intervals = nr_bytes >> bi->interval_exp;
	const int tuple_sz = bi->metadata_size;
	u32 ref_tag = t10_pi_ref_tag(rq);	/* physical ref tag to undo */
	u8 offset = bi->pi_offset;		/* PI offset within each tuple */
	struct bio *bio;

	__rq_for_each_bio(bio, rq) {
		struct bio_integrity_payload *bip = bio_integrity(bio);
		/* virtual (as-submitted) ref tag expected by the block layer */
		u32 virt = bip_get_seed(bip) & 0xffffffff;
		struct bio_vec iv;
		struct bvec_iter iter;

		bip_for_each_vec(iv, bip, iter) {
			unsigned int j;
			void *p;

			p = bvec_kmap_local(&iv);
			for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) {
				struct t10_pi_tuple *pi = p + offset;

				/* Only remap tuples that carry the physical tag. */
				if (be32_to_cpu(pi->ref_tag) == ref_tag)
					pi->ref_tag = cpu_to_be32(virt);
				virt++;
				ref_tag++;
				intervals--;
				p += tuple_sz;
			}
			kunmap_local(p);
		}
	}
}
/*
 * Extend the NVMe CRC64 @crc over @len bytes of @data and return the
 * result in big-endian wire format.
 */
static __be64 ext_pi_crc64(u64 crc, void *data, unsigned int len)
{
	u64 csum = crc64_nvme(crc, data, len);

	return cpu_to_be64(csum);
}
static void ext_pi_crc64_generate(struct blk_integrity_iter *iter,
struct blk_integrity *bi)
{
u8 offset = bi->pi_offset;
unsigned int i;
for (i = 0 ; i < iter->data_size ; i += iter->interval) {
struct crc64_pi_tuple *pi = iter->prot_buf + offset;
pi->guard_tag = ext_pi_crc64(0, iter->data_buf, iter->interval);
if (offset)
pi->guard_tag = ext_pi_crc64(be64_to_cpu(pi->guard_tag),
iter->prot_buf, offset);
pi->app_tag = 0;
if (bi->flags & BLK_INTEGRITY_REF_TAG)
put_unaligned_be48(iter->seed, pi->ref_tag);
else
put_unaligned_be48(0ULL, pi->ref_tag);
iter->data_buf += iter->interval;
iter->prot_buf += bi->metadata_size;
iter->seed++;
memcpy(tuple, prot_buf, len);
kunmap_local(prot_buf);
bvec_iter_advance_single(bip->bip_vec, iter, len);
tuple_size -= len;
tuple += len;
}
}
@@ -246,228 +131,437 @@ static bool ext_pi_ref_escape(const u8 ref_tag[6])
return memcmp(ref_tag, ref_escape, sizeof(ref_escape)) == 0;
}
static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter,
struct blk_integrity *bi)
static blk_status_t blk_verify_ext_pi(struct blk_integrity_iter *iter,
struct crc64_pi_tuple *pi)
{
u8 offset = bi->pi_offset;
unsigned int i;
u64 seed = lower_48_bits(iter->seed);
u64 guard = get_unaligned_be64(&pi->guard_tag);
u64 ref = get_unaligned_be48(pi->ref_tag);
u16 app = get_unaligned_be16(&pi->app_tag);
for (i = 0; i < iter->data_size; i += iter->interval) {
struct crc64_pi_tuple *pi = iter->prot_buf + offset;
u64 ref, seed;
__be64 csum;
if (bi->flags & BLK_INTEGRITY_REF_TAG) {
if (pi->app_tag == T10_PI_APP_ESCAPE)
goto next;
ref = get_unaligned_be48(pi->ref_tag);
seed = lower_48_bits(iter->seed);
if (ref != seed) {
pr_err("%s: ref tag error at location %llu (rcvd %llu)\n",
iter->disk_name, seed, ref);
return BLK_STS_PROTECTION;
}
} else {
if (pi->app_tag == T10_PI_APP_ESCAPE &&
ext_pi_ref_escape(pi->ref_tag))
goto next;
}
csum = ext_pi_crc64(0, iter->data_buf, iter->interval);
if (offset)
csum = ext_pi_crc64(be64_to_cpu(csum), iter->prot_buf,
offset);
if (pi->guard_tag != csum) {
pr_err("%s: guard tag error at sector %llu " \
"(rcvd %016llx, want %016llx)\n",
iter->disk_name, (unsigned long long)iter->seed,
be64_to_cpu(pi->guard_tag), be64_to_cpu(csum));
if (iter->bi->flags & BLK_INTEGRITY_REF_TAG) {
if (app == APP_TAG_ESCAPE)
return BLK_STS_OK;
if (ref != seed) {
pr_err("%s: ref tag error at location %llu (rcvd %llu)\n",
iter->bio->bi_bdev->bd_disk->disk_name, seed,
ref);
return BLK_STS_PROTECTION;
}
} else if (app == APP_TAG_ESCAPE && ext_pi_ref_escape(pi->ref_tag)) {
return BLK_STS_OK;
}
next:
iter->data_buf += iter->interval;
iter->prot_buf += bi->metadata_size;
iter->seed++;
if (guard != iter->csum) {
pr_err("%s: guard tag error at sector %llu (rcvd %016llx, want %016llx)\n",
iter->bio->bi_bdev->bd_disk->disk_name, iter->seed,
guard, iter->csum);
return BLK_STS_PROTECTION;
}
return BLK_STS_OK;
}
static void ext_pi_type1_prepare(struct request *rq)
static blk_status_t blk_verify_pi(struct blk_integrity_iter *iter,
struct t10_pi_tuple *pi, u16 guard)
{
struct blk_integrity *bi = &rq->q->limits.integrity;
const int tuple_sz = bi->metadata_size;
u64 ref_tag = ext_pi_ref_tag(rq);
u8 offset = bi->pi_offset;
struct bio *bio;
u32 seed = lower_32_bits(iter->seed);
u32 ref = get_unaligned_be32(&pi->ref_tag);
u16 app = get_unaligned_be16(&pi->app_tag);
__rq_for_each_bio(bio, rq) {
struct bio_integrity_payload *bip = bio_integrity(bio);
u64 virt = lower_48_bits(bip_get_seed(bip));
struct bio_vec iv;
struct bvec_iter iter;
/* Already remapped? */
if (bip->bip_flags & BIP_MAPPED_INTEGRITY)
break;
bip_for_each_vec(iv, bip, iter) {
unsigned int j;
void *p;
p = bvec_kmap_local(&iv);
for (j = 0; j < iv.bv_len; j += tuple_sz) {
struct crc64_pi_tuple *pi = p + offset;
u64 ref = get_unaligned_be48(pi->ref_tag);
if (ref == virt)
put_unaligned_be48(ref_tag, pi->ref_tag);
virt++;
ref_tag++;
p += tuple_sz;
}
kunmap_local(p);
if (iter->bi->flags & BLK_INTEGRITY_REF_TAG) {
if (app == APP_TAG_ESCAPE)
return BLK_STS_OK;
if (ref != seed) {
pr_err("%s: ref tag error at location %u (rcvd %u)\n",
iter->bio->bi_bdev->bd_disk->disk_name, seed,
ref);
return BLK_STS_PROTECTION;
}
} else if (app == APP_TAG_ESCAPE && ref == REF_TAG_ESCAPE) {
return BLK_STS_OK;
}
bip->bip_flags |= BIP_MAPPED_INTEGRITY;
if (guard != (u16)iter->csum) {
pr_err("%s: guard tag error at sector %llu (rcvd %04x, want %04x)\n",
iter->bio->bi_bdev->bd_disk->disk_name, iter->seed,
guard, (u16)iter->csum);
return BLK_STS_PROTECTION;
}
return BLK_STS_OK;
}
/* Verify a T10-DIF tuple whose guard is stored as a big-endian CRC16. */
static blk_status_t blk_verify_t10_pi(struct blk_integrity_iter *iter,
				      struct t10_pi_tuple *pi)
{
	return blk_verify_pi(iter, pi, get_unaligned_be16(&pi->guard_tag));
}
/*
 * Verify a tuple whose guard is an IP checksum stored in native byte
 * order (no endianness conversion on read).
 */
static blk_status_t blk_verify_ip_pi(struct blk_integrity_iter *iter,
				     struct t10_pi_tuple *pi)
{
	u16 rcvd = get_unaligned((u16 *)&pi->guard_tag);

	return blk_verify_pi(iter, pi, rcvd);
}
/*
 * Dispatch verification of one protection tuple according to the
 * checksum type of the integrity profile. Profiles without a supported
 * checksum verify as OK.
 */
static blk_status_t blk_integrity_verify(struct blk_integrity_iter *iter,
					 union pi_tuple *tuple)
{
	if (iter->bi->csum_type == BLK_INTEGRITY_CSUM_CRC64)
		return blk_verify_ext_pi(iter, &tuple->crc64_pi);
	if (iter->bi->csum_type == BLK_INTEGRITY_CSUM_CRC)
		return blk_verify_t10_pi(iter, &tuple->t10_pi);
	if (iter->bi->csum_type == BLK_INTEGRITY_CSUM_IP)
		return blk_verify_ip_pi(iter, &tuple->t10_pi);
	return BLK_STS_OK;
}
static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
static void blk_set_ext_pi(struct blk_integrity_iter *iter,
struct crc64_pi_tuple *pi)
{
struct blk_integrity *bi = &rq->q->limits.integrity;
unsigned intervals = nr_bytes >> bi->interval_exp;
const int tuple_sz = bi->metadata_size;
u64 ref_tag = ext_pi_ref_tag(rq);
u8 offset = bi->pi_offset;
struct bio *bio;
put_unaligned_be64(iter->csum, &pi->guard_tag);
put_unaligned_be16(0, &pi->app_tag);
put_unaligned_be48(iter->seed, &pi->ref_tag);
}
__rq_for_each_bio(bio, rq) {
struct bio_integrity_payload *bip = bio_integrity(bio);
u64 virt = lower_48_bits(bip_get_seed(bip));
struct bio_vec iv;
struct bvec_iter iter;
/*
 * Fill one T10 PI tuple: the pre-computed guard @csum, a zero app tag,
 * and the low 32 bits of the current seed as the ref tag.
 */
static void blk_set_pi(struct blk_integrity_iter *iter,
		struct t10_pi_tuple *pi, __be16 csum)
{
	put_unaligned(csum, &pi->guard_tag);
	put_unaligned_be16(0, &pi->app_tag);
	put_unaligned_be32(iter->seed, &pi->ref_tag);
}
bip_for_each_vec(iv, bip, iter) {
unsigned int j;
void *p;
static void blk_set_t10_pi(struct blk_integrity_iter *iter,
struct t10_pi_tuple *pi)
{
blk_set_pi(iter, pi, cpu_to_be16((u16)iter->csum));
}
p = bvec_kmap_local(&iv);
for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) {
struct crc64_pi_tuple *pi = p + offset;
u64 ref = get_unaligned_be48(pi->ref_tag);
/*
 * Generate a tuple for the IP-checksum format. The guard is kept in
 * native byte order, hence the __force cast instead of a byte swap.
 */
static void blk_set_ip_pi(struct blk_integrity_iter *iter,
		struct t10_pi_tuple *pi)
{
	blk_set_pi(iter, pi, (__force __be16)(u16)iter->csum);
}
if (ref == ref_tag)
put_unaligned_be48(virt, pi->ref_tag);
virt++;
ref_tag++;
intervals--;
p += tuple_sz;
}
kunmap_local(p);
}
/*
 * Dispatch generation of one protection tuple according to the checksum
 * type of the integrity profile.
 */
static void blk_integrity_set(struct blk_integrity_iter *iter,
		union pi_tuple *tuple)
{
	switch (iter->bi->csum_type) {
	case BLK_INTEGRITY_CSUM_CRC64:
		return blk_set_ext_pi(iter, &tuple->crc64_pi);
	case BLK_INTEGRITY_CSUM_CRC:
		return blk_set_t10_pi(iter, &tuple->t10_pi);
	case BLK_INTEGRITY_CSUM_IP:
		return blk_set_ip_pi(iter, &tuple->t10_pi);
	default:
		/* Generation is only called for checksummed profiles. */
		WARN_ON_ONCE(1);
		return;
	}
}
/*
 * Finish one protection interval: locate its PI tuple in the bip vector
 * and either verify it (@verify true) or generate it in place.
 *
 * Fast path: when the tuple is contiguous within one bio_vec it is
 * kmapped and operated on directly. Otherwise a stack copy (@tuple) is
 * used: for verify the bytes are copied in first; for generate they are
 * written to the stack tuple and copied back out afterwards.
 */
static blk_status_t blk_integrity_interval(struct blk_integrity_iter *iter,
		bool verify)
{
	blk_status_t ret = BLK_STS_OK;
	union pi_tuple tuple;
	void *ptuple = &tuple;
	struct bio_vec pbv;

	/* Fold any pi_offset padding bytes into the running checksum/iter. */
	blk_integrity_csum_offset(iter);
	pbv = bvec_iter_bvec(iter->bip->bip_vec, iter->prot_iter);
	if (pbv.bv_len >= iter->bi->pi_tuple_size) {
		/* Tuple is contiguous: map it and step past this element. */
		ptuple = bvec_kmap_local(&pbv);
		bvec_iter_advance_single(iter->bip->bip_vec, &iter->prot_iter,
			iter->bi->metadata_size - iter->bi->pi_offset);
	} else if (verify) {
		/* Tuple straddles vectors: assemble a stack copy to check. */
		blk_integrity_copy_to_tuple(iter->bip, &iter->prot_iter,
				ptuple, iter->bi->pi_tuple_size);
	}

	if (verify)
		ret = blk_integrity_verify(iter, ptuple);
	else
		blk_integrity_set(iter, ptuple);

	if (likely(ptuple != &tuple)) {
		kunmap_local(ptuple);
	} else if (!verify) {
		/* Generated into the stack copy: write it back out. */
		blk_integrity_copy_from_tuple(iter->bip, &iter->prot_iter,
				ptuple, iter->bi->pi_tuple_size);
	}

	/* Reset per-interval state for the next interval. */
	iter->interval_remaining = 1 << iter->bi->interval_exp;
	iter->csum = 0;
	iter->seed++;
	return ret;
}
/*
 * Walk the bio's data described by @data_iter, accumulating the guard
 * checksum across arbitrarily aligned data segments, and handle each
 * completed protection interval via blk_integrity_interval().
 *
 * @verify selects verification (read completion) vs. generation
 * (submission). Returns BLK_STS_OK or the first verification error.
 */
static blk_status_t blk_integrity_iterate(struct bio *bio,
					  struct bvec_iter *data_iter,
					  bool verify)
{
	struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
	struct bio_integrity_payload *bip = bio_integrity(bio);
	struct blk_integrity_iter iter = {
		.bio = bio,
		.bip = bip,
		.bi = bi,
		.data_iter = *data_iter,
		.prot_iter = bip->bip_iter,
		.interval_remaining = 1 << bi->interval_exp,
		.seed = data_iter->bi_sector,
		.csum = 0,
	};
	blk_status_t ret = BLK_STS_OK;

	while (iter.data_iter.bi_size && ret == BLK_STS_OK) {
		struct bio_vec bv = bvec_iter_bvec(iter.bio->bi_io_vec,
						   iter.data_iter);
		void *kaddr = bvec_kmap_local(&bv);
		void *data = kaddr;
		unsigned int len;

		bvec_iter_advance_single(iter.bio->bi_io_vec, &iter.data_iter,
					 bv.bv_len);
		/*
		 * A bio_vec may span several intervals, or an interval may
		 * span several bio_vecs; process the overlap piecewise.
		 */
		while (bv.bv_len && ret == BLK_STS_OK) {
			len = min(iter.interval_remaining, bv.bv_len);
			blk_calculate_guard(&iter, data, len);
			bv.bv_len -= len;
			data += len;
			iter.interval_remaining -= len;
			/* Interval complete: verify or emit its tuple. */
			if (!iter.interval_remaining)
				ret = blk_integrity_interval(&iter, verify);
		}
		kunmap_local(kaddr);
	}
	return ret;
}
void bio_integrity_generate(struct bio *bio)
{
struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
struct bio_integrity_payload *bip = bio_integrity(bio);
struct blk_integrity_iter iter;
struct bvec_iter bviter;
struct bio_vec bv;
iter.disk_name = bio->bi_bdev->bd_disk->disk_name;
iter.interval = 1 << bi->interval_exp;
iter.seed = bio->bi_iter.bi_sector;
iter.prot_buf = bvec_virt(bip->bip_vec);
bio_for_each_segment(bv, bio, bviter) {
void *kaddr = bvec_kmap_local(&bv);
iter.data_buf = kaddr;
iter.data_size = bv.bv_len;
switch (bi->csum_type) {
case BLK_INTEGRITY_CSUM_CRC64:
ext_pi_crc64_generate(&iter, bi);
break;
case BLK_INTEGRITY_CSUM_CRC:
case BLK_INTEGRITY_CSUM_IP:
t10_pi_generate(&iter, bi);
break;
default:
break;
}
kunmap_local(kaddr);
switch (bi->csum_type) {
case BLK_INTEGRITY_CSUM_CRC64:
case BLK_INTEGRITY_CSUM_CRC:
case BLK_INTEGRITY_CSUM_IP:
blk_integrity_iterate(bio, &bio->bi_iter, false);
break;
default:
break;
}
}
blk_status_t bio_integrity_verify(struct bio *bio, struct bvec_iter *saved_iter)
{
struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
struct bio_integrity_payload *bip = bio_integrity(bio);
struct blk_integrity_iter iter;
struct bvec_iter bviter;
struct bio_vec bv;
/*
* At the moment verify is called bi_iter has been advanced during split
* and completion, so use the copy created during submission here.
*/
iter.disk_name = bio->bi_bdev->bd_disk->disk_name;
iter.interval = 1 << bi->interval_exp;
iter.seed = saved_iter->bi_sector;
iter.prot_buf = bvec_virt(bip->bip_vec);
__bio_for_each_segment(bv, bio, bviter, *saved_iter) {
void *kaddr = bvec_kmap_local(&bv);
blk_status_t ret = BLK_STS_OK;
iter.data_buf = kaddr;
iter.data_size = bv.bv_len;
switch (bi->csum_type) {
case BLK_INTEGRITY_CSUM_CRC64:
ret = ext_pi_crc64_verify(&iter, bi);
break;
case BLK_INTEGRITY_CSUM_CRC:
case BLK_INTEGRITY_CSUM_IP:
ret = t10_pi_verify(&iter, bi);
break;
default:
break;
}
kunmap_local(kaddr);
if (ret)
return ret;
switch (bi->csum_type) {
case BLK_INTEGRITY_CSUM_CRC64:
case BLK_INTEGRITY_CSUM_CRC:
case BLK_INTEGRITY_CSUM_IP:
return blk_integrity_iterate(bio, saved_iter, true);
default:
break;
}
return BLK_STS_OK;
}
void blk_integrity_prepare(struct request *rq)
/*
* Advance @iter past the protection offset for protection formats that
* contain front padding on the metadata region.
*/
/*
 * Advance @iter past the protection information offset for protection
 * formats that carry front padding in the metadata region.
 */
static void blk_pi_advance_offset(struct blk_integrity *bi,
				  struct bio_integrity_payload *bip,
				  struct bvec_iter *iter)
{
	unsigned int remaining = bi->pi_offset;

	/* The padding may straddle bvec boundaries; consume it piecewise. */
	while (remaining) {
		struct bio_vec bv = mp_bvec_iter_bvec(bip->bip_vec, *iter);
		unsigned int step = min(bv.bv_len, remaining);

		bvec_iter_advance_single(bip->bip_vec, iter, step);
		remaining -= step;
	}
}
/*
 * Return a kernel address for the PI tuple at @iter's current position.
 *
 * If the tuple is contiguous within one bio_vec, it is kmapped directly
 * (caller unmaps via blk_tuple_remap_end()). Otherwise the tuple bytes
 * are gathered into the caller-provided stack copy @tuple and its
 * address is returned; @iter itself is left pointing at the tuple start
 * so blk_tuple_remap_end() can scatter any modification back.
 */
static void *blk_tuple_remap_begin(union pi_tuple *tuple,
				struct blk_integrity *bi,
				struct bio_integrity_payload *bip,
				struct bvec_iter *iter)
{
	struct bvec_iter titer;
	struct bio_vec pbv;

	blk_pi_advance_offset(bi, bip, iter);
	pbv = bvec_iter_bvec(bip->bip_vec, *iter);
	if (likely(pbv.bv_len >= bi->pi_tuple_size))
		return bvec_kmap_local(&pbv);

	/*
	 * We need to preserve the state of the original iter for the
	 * copy_from_tuple at the end, so make a temp iter for here.
	 */
	titer = *iter;
	blk_integrity_copy_to_tuple(bip, &titer, tuple, bi->pi_tuple_size);
	return tuple;
}
/*
 * Counterpart to blk_tuple_remap_begin(): unmap a directly mapped tuple,
 * or scatter a stack-copied tuple back into the bip vector, then advance
 * @iter to the end of this metadata element.
 */
static void blk_tuple_remap_end(union pi_tuple *tuple, void *ptuple,
				struct blk_integrity *bi,
				struct bio_integrity_payload *bip,
				struct bvec_iter *iter)
{
	/* Bytes of this metadata element not yet consumed by @iter. */
	unsigned int len = bi->metadata_size - bi->pi_offset;

	if (likely(ptuple != tuple)) {
		kunmap_local(ptuple);
	} else {
		/* copy_from_tuple advances @iter by pi_tuple_size itself. */
		blk_integrity_copy_from_tuple(bip, iter, ptuple,
					      bi->pi_tuple_size);
		len -= bi->pi_tuple_size;
	}
	bvec_iter_advance(bip->bip_vec, iter, len);
}
/*
 * Completion-side remap for the 64-bit ext PI format: if the tuple still
 * carries the physical @ref_tag, restore the virtual value @virt.
 *
 * Fix: pass the u8[6] ref_tag array itself to get_unaligned_be48()
 * rather than its address (&pi->ref_tag has type u8 (*)[6], not the
 * expected const u8 *), matching the put_unaligned_be48() call below and
 * every other get_unaligned_be48() user in this file.
 */
static void blk_set_ext_unmap_ref(struct crc64_pi_tuple *pi, u64 virt,
				  u64 ref_tag)
{
	u64 ref = get_unaligned_be48(pi->ref_tag);

	/* Ref tags are 48 bits on the wire, so compare the low 48 bits. */
	if (ref == lower_48_bits(ref_tag) && ref != lower_48_bits(virt))
		put_unaligned_be48(virt, pi->ref_tag);
}
/*
 * Completion-side remap for the 32-bit T10 PI format: if the tuple still
 * carries the physical @ref_tag, restore the virtual value @virt.
 */
static void blk_set_t10_unmap_ref(struct t10_pi_tuple *pi, u32 virt,
				  u32 ref_tag)
{
	u32 ref = get_unaligned_be32(&pi->ref_tag);

	if (ref != ref_tag || ref == virt)
		return;
	put_unaligned_be32(virt, &pi->ref_tag);
}
/*
 * On request completion, remap one tuple's ref tag from the physical
 * value @ref back to the virtual value @virt, dispatched on the
 * profile's checksum type.
 */
static void blk_reftag_remap_complete(struct blk_integrity *bi,
		union pi_tuple *tuple, u64 virt, u64 ref)
{
	switch (bi->csum_type) {
	case BLK_INTEGRITY_CSUM_CRC64:
		blk_set_ext_unmap_ref(&tuple->crc64_pi, virt, ref);
		break;
	case BLK_INTEGRITY_CSUM_CRC:
	case BLK_INTEGRITY_CSUM_IP:
		blk_set_t10_unmap_ref(&tuple->t10_pi, virt, ref);
		break;
	default:
		/* Only reached for profiles with BLK_INTEGRITY_REF_TAG. */
		WARN_ON_ONCE(1);
		break;
	}
}
/*
 * Submission-side remap for the 64-bit ext PI format: if the tuple still
 * carries the virtual value @virt, install the physical @ref_tag.
 *
 * Fix: pass the u8[6] ref_tag array itself to get_unaligned_be48()
 * rather than its address (&pi->ref_tag has type u8 (*)[6], not the
 * expected const u8 *), matching the put_unaligned_be48() call below and
 * every other get_unaligned_be48() user in this file.
 */
static void blk_set_ext_map_ref(struct crc64_pi_tuple *pi, u64 virt,
				u64 ref_tag)
{
	u64 ref = get_unaligned_be48(pi->ref_tag);

	/* Ref tags are 48 bits on the wire, so compare the low 48 bits. */
	if (ref == lower_48_bits(virt) && ref != ref_tag)
		put_unaligned_be48(ref_tag, pi->ref_tag);
}
/*
 * Submission-side remap for the 32-bit T10 PI format: if the tuple still
 * carries the virtual value @virt, install the physical @ref_tag.
 */
static void blk_set_t10_map_ref(struct t10_pi_tuple *pi, u32 virt, u32 ref_tag)
{
	u32 ref = get_unaligned_be32(&pi->ref_tag);

	if (ref != virt || ref == ref_tag)
		return;
	put_unaligned_be32(ref_tag, &pi->ref_tag);
}
/*
 * On request submission, remap one tuple's ref tag from the virtual
 * value @virt to the physical value @ref, dispatched on the profile's
 * checksum type.
 */
static void blk_reftag_remap_prepare(struct blk_integrity *bi,
				     union pi_tuple *tuple,
				     u64 virt, u64 ref)
{
	switch (bi->csum_type) {
	case BLK_INTEGRITY_CSUM_CRC64:
		blk_set_ext_map_ref(&tuple->crc64_pi, virt, ref);
		break;
	case BLK_INTEGRITY_CSUM_CRC:
	case BLK_INTEGRITY_CSUM_IP:
		blk_set_t10_map_ref(&tuple->t10_pi, virt, ref);
		break;
	default:
		/* Only reached for profiles with BLK_INTEGRITY_REF_TAG. */
		WARN_ON_ONCE(1);
		break;
	}
}
/*
 * Remap the ref tags of every tuple of @bio, walking at most *intervals
 * tuples and advancing *ref (the physical ref tag) as it goes.
 *
 * @prep selects virtual->physical (submission) vs. physical->virtual
 * (completion) direction. On the prepare side, a bio that was already
 * remapped (BIP_MAPPED_INTEGRITY, e.g. after a requeue) is skipped, but
 * *ref is still advanced past its intervals to keep callers in sync.
 */
static void __blk_reftag_remap(struct bio *bio, struct blk_integrity *bi,
			       unsigned *intervals, u64 *ref, bool prep)
{
	struct bio_integrity_payload *bip = bio_integrity(bio);
	struct bvec_iter iter = bip->bip_iter;
	/* Virtual start value the block layer stored at submission time. */
	u64 virt = bip_get_seed(bip);
	union pi_tuple *ptuple;
	union pi_tuple tuple;

	if (prep && bip->bip_flags & BIP_MAPPED_INTEGRITY) {
		*ref += bio->bi_iter.bi_size >> bi->interval_exp;
		return;
	}

	while (iter.bi_size && *intervals) {
		ptuple = blk_tuple_remap_begin(&tuple, bi, bip, &iter);
		if (prep)
			blk_reftag_remap_prepare(bi, ptuple, virt, *ref);
		else
			blk_reftag_remap_complete(bi, ptuple, virt, *ref);
		blk_tuple_remap_end(&tuple, ptuple, bi, bip, &iter);
		(*intervals)--;
		(*ref)++;
		virt++;
	}

	if (prep)
		bip->bip_flags |= BIP_MAPPED_INTEGRITY;
}
static void blk_integrity_remap(struct request *rq, unsigned int nr_bytes,
bool prep)
{
struct blk_integrity *bi = &rq->q->limits.integrity;
u64 ref = blk_rq_pos(rq) >> (bi->interval_exp - SECTOR_SHIFT);
unsigned intervals = nr_bytes >> bi->interval_exp;
struct bio *bio;
if (!(bi->flags & BLK_INTEGRITY_REF_TAG))
return;
if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC64)
ext_pi_type1_prepare(rq);
else
t10_pi_type1_prepare(rq);
__rq_for_each_bio(bio, rq) {
__blk_reftag_remap(bio, bi, &intervals, &ref, prep);
if (!intervals)
break;
}
}
/*
 * Remap virtual ref tags to physical values for the whole request before
 * it is handed to the low-level driver.
 */
void blk_integrity_prepare(struct request *rq)
{
	blk_integrity_remap(rq, blk_rq_bytes(rq), true);
}
void blk_integrity_complete(struct request *rq, unsigned int nr_bytes)
{
struct blk_integrity *bi = &rq->q->limits.integrity;
if (!(bi->flags & BLK_INTEGRITY_REF_TAG))
return;
if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC64)
ext_pi_type1_complete(rq, nr_bytes);
else
t10_pi_type1_complete(rq, nr_bytes);
blk_integrity_remap(rq, nr_bytes, false);
}

View File

@@ -141,12 +141,6 @@ config CRYPTO_ACOMP
select CRYPTO_ALGAPI
select CRYPTO_ACOMP2
config CRYPTO_HKDF
tristate
select CRYPTO_SHA256 if CRYPTO_SELFTESTS
select CRYPTO_SHA512 if CRYPTO_SELFTESTS
select CRYPTO_HASH2
config CRYPTO_MANAGER
tristate
default CRYPTO_ALGAPI if CRYPTO_SELFTESTS

View File

@@ -36,7 +36,6 @@ obj-$(CONFIG_CRYPTO_HASH2) += crypto_hash.o
obj-$(CONFIG_CRYPTO_AKCIPHER2) += akcipher.o
obj-$(CONFIG_CRYPTO_SIG2) += sig.o
obj-$(CONFIG_CRYPTO_KPP2) += kpp.o
obj-$(CONFIG_CRYPTO_HKDF) += hkdf.o
dh_generic-y := dh.o
dh_generic-y += dh_helper.o

View File

@@ -1,573 +0,0 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation
* Function"), aka RFC 5869. See also the original paper (Krawczyk 2010):
* "Cryptographic Extraction and Key Derivation: The HKDF Scheme".
*
* Copyright 2019 Google LLC
*/
#include <crypto/internal/hash.h>
#include <crypto/sha2.h>
#include <crypto/hkdf.h>
#include <linux/module.h>
/*
* HKDF consists of two steps:
*
* 1. HKDF-Extract: extract a pseudorandom key from the input keying material
* and optional salt.
* 2. HKDF-Expand: expand the pseudorandom key into output keying material of
* any length, parameterized by an application-specific info string.
*
*/
/**
* hkdf_extract - HKDF-Extract (RFC 5869 section 2.2)
* @hmac_tfm: an HMAC transform using the hash function desired for HKDF. The
* caller is responsible for setting the @prk afterwards.
* @ikm: input keying material
* @ikmlen: length of @ikm
* @salt: input salt value
* @saltlen: length of @salt
* @prk: resulting pseudorandom key
*
* Extracts a pseudorandom key @prk from the input keying material
* @ikm with length @ikmlen and salt @salt with length @saltlen.
* The length of @prk is given by the digest size of @hmac_tfm.
* For an 'unsalted' version of HKDF-Extract @salt must be set
* to all zeroes and @saltlen must be set to the length of @prk.
*
* Returns 0 on success with the pseudorandom key stored in @prk,
* or a negative errno value otherwise.
*/
int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm,
		 unsigned int ikmlen, const u8 *salt, unsigned int saltlen,
		 u8 *prk)
{
	int err;

	/* HKDF-Extract is HMAC(salt, IKM): key with the salt, digest IKM. */
	err = crypto_shash_setkey(hmac_tfm, salt, saltlen);
	if (err)
		return err;

	return crypto_shash_tfm_digest(hmac_tfm, ikm, ikmlen, prk);
}
EXPORT_SYMBOL_GPL(hkdf_extract);
/**
* hkdf_expand - HKDF-Expand (RFC 5869 section 2.3)
* @hmac_tfm: hash context keyed with pseudorandom key
* @info: application-specific information
* @infolen: length of @info
* @okm: output keying material
* @okmlen: length of @okm
*
* This expands the pseudorandom key, which was already keyed into @hmac_tfm,
* into @okmlen bytes of output keying material parameterized by the
* application-specific @info of length @infolen bytes.
* This is thread-safe and may be called by multiple threads in parallel.
*
* Returns 0 on success with output keying material stored in @okm,
* or a negative errno value otherwise.
*/
int hkdf_expand(struct crypto_shash *hmac_tfm,
		const u8 *info, unsigned int infolen,
		u8 *okm, unsigned int okmlen)
{
	SHASH_DESC_ON_STACK(desc, hmac_tfm);
	unsigned int i, hashlen = crypto_shash_digestsize(hmac_tfm);
	int err;
	/* T(i-1) from the previous round; NULL on the first round. */
	const u8 *prev = NULL;
	u8 counter = 1;
	/* Scratch for the final, possibly partial, block. */
	u8 tmp[HASH_MAX_DIGESTSIZE] = {};

	/* RFC 5869: output length is capped at 255 * HashLen. */
	if (WARN_ON(okmlen > 255 * hashlen))
		return -EINVAL;

	desc->tfm = hmac_tfm;

	/* Each round emits T(i) = HMAC-Hash(PRK, T(i-1) | info | counter). */
	for (i = 0; i < okmlen; i += hashlen) {
		err = crypto_shash_init(desc);
		if (err)
			goto out;

		if (prev) {
			err = crypto_shash_update(desc, prev, hashlen);
			if (err)
				goto out;
		}

		if (infolen) {
			err = crypto_shash_update(desc, info, infolen);
			if (err)
				goto out;
		}

		BUILD_BUG_ON(sizeof(counter) != 1);
		if (okmlen - i < hashlen) {
			/* Last round: digest to scratch, copy the tail. */
			err = crypto_shash_finup(desc, &counter, 1, tmp);
			if (err)
				goto out;
			memcpy(&okm[i], tmp, okmlen - i);
			memzero_explicit(tmp, sizeof(tmp));
		} else {
			err = crypto_shash_finup(desc, &counter, 1, &okm[i]);
			if (err)
				goto out;
		}
		counter++;
		prev = &okm[i];
	}
	err = 0;
out:
	if (unlikely(err))
		memzero_explicit(okm, okmlen); /* so caller doesn't need to */
	/* Zeroize key-derived state before returning. */
	shash_desc_zero(desc);
	memzero_explicit(tmp, HASH_MAX_DIGESTSIZE);
	return err;
}
EXPORT_SYMBOL_GPL(hkdf_expand);
struct hkdf_testvec {
const char *test;
const u8 *ikm;
const u8 *salt;
const u8 *info;
const u8 *prk;
const u8 *okm;
u16 ikm_size;
u16 salt_size;
u16 info_size;
u16 prk_size;
u16 okm_size;
};
/*
* HKDF test vectors from RFC5869
*
* Additional HKDF test vectors from
* https://github.com/brycx/Test-Vector-Generation/blob/master/HKDF/hkdf-hmac-sha2-test-vectors.md
*/
static const struct hkdf_testvec hkdf_sha256_tv[] = {
{
.test = "basic hdkf test",
.ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
"\x0b\x0b\x0b\x0b\x0b\x0b",
.ikm_size = 22,
.salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c",
.salt_size = 13,
.info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
.info_size = 10,
.prk = "\x07\x77\x09\x36\x2c\x2e\x32\xdf\x0d\xdc\x3f\x0d\xc4\x7b\xba\x63"
"\x90\xb6\xc7\x3b\xb5\x0f\x9c\x31\x22\xec\x84\x4a\xd7\xc2\xb3\xe5",
.prk_size = 32,
.okm = "\x3c\xb2\x5f\x25\xfa\xac\xd5\x7a\x90\x43\x4f\x64\xd0\x36\x2f\x2a"
"\x2d\x2d\x0a\x90\xcf\x1a\x5a\x4c\x5d\xb0\x2d\x56\xec\xc4\xc5\xbf"
"\x34\x00\x72\x08\xd5\xb8\x87\x18\x58\x65",
.okm_size = 42,
}, {
.test = "hkdf test with long input",
.ikm = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
"\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f"
"\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f"
"\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f",
.ikm_size = 80,
.salt = "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f"
"\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f"
"\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
"\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
"\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf",
.salt_size = 80,
.info = "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"
"\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
"\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"
"\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
"\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff",
.info_size = 80,
.prk = "\x06\xa6\xb8\x8c\x58\x53\x36\x1a\x06\x10\x4c\x9c\xeb\x35\xb4\x5c"
"\xef\x76\x00\x14\x90\x46\x71\x01\x4a\x19\x3f\x40\xc1\x5f\xc2\x44",
.prk_size = 32,
.okm = "\xb1\x1e\x39\x8d\xc8\x03\x27\xa1\xc8\xe7\xf7\x8c\x59\x6a\x49\x34"
"\x4f\x01\x2e\xda\x2d\x4e\xfa\xd8\xa0\x50\xcc\x4c\x19\xaf\xa9\x7c"
"\x59\x04\x5a\x99\xca\xc7\x82\x72\x71\xcb\x41\xc6\x5e\x59\x0e\x09"
"\xda\x32\x75\x60\x0c\x2f\x09\xb8\x36\x77\x93\xa9\xac\xa3\xdb\x71"
"\xcc\x30\xc5\x81\x79\xec\x3e\x87\xc1\x4c\x01\xd5\xc1\xf3\x43\x4f"
"\x1d\x87",
.okm_size = 82,
}, {
.test = "hkdf test with zero salt and info",
.ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
"\x0b\x0b\x0b\x0b\x0b\x0b",
.ikm_size = 22,
.salt = NULL,
.salt_size = 0,
.info = NULL,
.info_size = 0,
.prk = "\x19\xef\x24\xa3\x2c\x71\x7b\x16\x7f\x33\xa9\x1d\x6f\x64\x8b\xdf"
"\x96\x59\x67\x76\xaf\xdb\x63\x77\xac\x43\x4c\x1c\x29\x3c\xcb\x04",
.prk_size = 32,
.okm = "\x8d\xa4\xe7\x75\xa5\x63\xc1\x8f\x71\x5f\x80\x2a\x06\x3c\x5a\x31"
"\xb8\xa1\x1f\x5c\x5e\xe1\x87\x9e\xc3\x45\x4e\x5f\x3c\x73\x8d\x2d"
"\x9d\x20\x13\x95\xfa\xa4\xb6\x1a\x96\xc8",
.okm_size = 42,
}, {
.test = "hkdf test with short input",
.ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b",
.ikm_size = 11,
.salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c",
.salt_size = 13,
.info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
.info_size = 10,
.prk = "\x82\x65\xf6\x9d\x7f\xf7\xe5\x01\x37\x93\x01\x5c\xa0\xef\x92\x0c"
"\xb1\x68\x21\x99\xc8\xbc\x3a\x00\xda\x0c\xab\x47\xb7\xb0\x0f\xdf",
.prk_size = 32,
.okm = "\x58\xdc\xe1\x0d\x58\x01\xcd\xfd\xa8\x31\x72\x6b\xfe\xbc\xb7\x43"
"\xd1\x4a\x7e\xe8\x3a\xa0\x57\xa9\x3d\x59\xb0\xa1\x31\x7f\xf0\x9d"
"\x10\x5c\xce\xcf\x53\x56\x92\xb1\x4d\xd5",
.okm_size = 42,
}, {
.test = "unsalted hkdf test with zero info",
.ikm = "\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c"
"\x0c\x0c\x0c\x0c\x0c\x0c",
.ikm_size = 22,
.salt = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
.salt_size = 32,
.info = NULL,
.info_size = 0,
.prk = "\xaa\x84\x1e\x1f\x35\x74\xf3\x2d\x13\xfb\xa8\x00\x5f\xcd\x9b\x8d"
"\x77\x67\x82\xa5\xdf\xa1\x92\x38\x92\xfd\x8b\x63\x5d\x3a\x89\xdf",
.prk_size = 32,
.okm = "\x59\x68\x99\x17\x9a\xb1\xbc\x00\xa7\xc0\x37\x86\xff\x43\xee\x53"
"\x50\x04\xbe\x2b\xb9\xbe\x68\xbc\x14\x06\x63\x6f\x54\xbd\x33\x8a"
"\x66\xa2\x37\xba\x2a\xcb\xce\xe3\xc9\xa7",
.okm_size = 42,
}
};
static const struct hkdf_testvec hkdf_sha384_tv[] = {
{
.test = "basic hkdf test",
.ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
"\x0b\x0b\x0b\x0b\x0b\x0b",
.ikm_size = 22,
.salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c",
.salt_size = 13,
.info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
.info_size = 10,
.prk = "\x70\x4b\x39\x99\x07\x79\xce\x1d\xc5\x48\x05\x2c\x7d\xc3\x9f\x30"
"\x35\x70\xdd\x13\xfb\x39\xf7\xac\xc5\x64\x68\x0b\xef\x80\xe8\xde"
"\xc7\x0e\xe9\xa7\xe1\xf3\xe2\x93\xef\x68\xec\xeb\x07\x2a\x5a\xde",
.prk_size = 48,
.okm = "\x9b\x50\x97\xa8\x60\x38\xb8\x05\x30\x90\x76\xa4\x4b\x3a\x9f\x38"
"\x06\x3e\x25\xb5\x16\xdc\xbf\x36\x9f\x39\x4c\xfa\xb4\x36\x85\xf7"
"\x48\xb6\x45\x77\x63\xe4\xf0\x20\x4f\xc5",
.okm_size = 42,
}, {
.test = "hkdf test with long input",
.ikm = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
"\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f"
"\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f"
"\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f",
.ikm_size = 80,
.salt = "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f"
"\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f"
"\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
"\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
"\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf",
.salt_size = 80,
.info = "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"
"\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
"\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"
"\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
"\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff",
.info_size = 80,
.prk = "\xb3\x19\xf6\x83\x1d\xff\x93\x14\xef\xb6\x43\xba\xa2\x92\x63\xb3"
"\x0e\x4a\x8d\x77\x9f\xe3\x1e\x9c\x90\x1e\xfd\x7d\xe7\x37\xc8\x5b"
"\x62\xe6\x76\xd4\xdc\x87\xb0\x89\x5c\x6a\x7d\xc9\x7b\x52\xce\xbb",
.prk_size = 48,
.okm = "\x48\x4c\xa0\x52\xb8\xcc\x72\x4f\xd1\xc4\xec\x64\xd5\x7b\x4e\x81"
"\x8c\x7e\x25\xa8\xe0\xf4\x56\x9e\xd7\x2a\x6a\x05\xfe\x06\x49\xee"
"\xbf\x69\xf8\xd5\xc8\x32\x85\x6b\xf4\xe4\xfb\xc1\x79\x67\xd5\x49"
"\x75\x32\x4a\x94\x98\x7f\x7f\x41\x83\x58\x17\xd8\x99\x4f\xdb\xd6"
"\xf4\xc0\x9c\x55\x00\xdc\xa2\x4a\x56\x22\x2f\xea\x53\xd8\x96\x7a"
"\x8b\x2e",
.okm_size = 82,
}, {
.test = "hkdf test with zero salt and info",
.ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
"\x0b\x0b\x0b\x0b\x0b\x0b",
.ikm_size = 22,
.salt = NULL,
.salt_size = 0,
.info = NULL,
.info_size = 0,
.prk = "\x10\xe4\x0c\xf0\x72\xa4\xc5\x62\x6e\x43\xdd\x22\xc1\xcf\x72\x7d"
"\x4b\xb1\x40\x97\x5c\x9a\xd0\xcb\xc8\xe4\x5b\x40\x06\x8f\x8f\x0b"
"\xa5\x7c\xdb\x59\x8a\xf9\xdf\xa6\x96\x3a\x96\x89\x9a\xf0\x47\xe5",
.prk_size = 48,
.okm = "\xc8\xc9\x6e\x71\x0f\x89\xb0\xd7\x99\x0b\xca\x68\xbc\xde\xc8\xcf"
"\x85\x40\x62\xe5\x4c\x73\xa7\xab\xc7\x43\xfa\xde\x9b\x24\x2d\xaa"
"\xcc\x1c\xea\x56\x70\x41\x5b\x52\x84\x9c",
.okm_size = 42,
}, {
.test = "hkdf test with short input",
.ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b",
.ikm_size = 11,
.salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c",
.salt_size = 13,
.info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
.info_size = 10,
.prk = "\x6d\x31\x69\x98\x28\x79\x80\x88\xb3\x59\xda\xd5\x0b\x8f\x01\xb0"
"\x15\xf1\x7a\xa3\xbd\x4e\x27\xa6\xe9\xf8\x73\xb7\x15\x85\xca\x6a"
"\x00\xd1\xf0\x82\x12\x8a\xdb\x3c\xf0\x53\x0b\x57\xc0\xf9\xac\x72",
.prk_size = 48,
.okm = "\xfb\x7e\x67\x43\xeb\x42\xcd\xe9\x6f\x1b\x70\x77\x89\x52\xab\x75"
"\x48\xca\xfe\x53\x24\x9f\x7f\xfe\x14\x97\xa1\x63\x5b\x20\x1f\xf1"
"\x85\xb9\x3e\x95\x19\x92\xd8\x58\xf1\x1a",
.okm_size = 42,
}, {
.test = "unsalted hkdf test with zero info",
.ikm = "\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c"
"\x0c\x0c\x0c\x0c\x0c\x0c",
.ikm_size = 22,
.salt = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
.salt_size = 48,
.info = NULL,
.info_size = 0,
.prk = "\x9d\x2d\xa5\x06\x6f\x05\xd1\x6c\x59\xfe\xdf\x6c\x5f\x32\xc7\x5e"
"\xda\x9a\x47\xa7\x9c\x93\x6a\xa4\x4c\xb7\x63\xa8\xe2\x2f\xfb\xfc"
"\xd8\xfe\x55\x43\x58\x53\x47\x21\x90\x39\xd1\x68\x28\x36\x33\xf5",
.prk_size = 48,
.okm = "\x6a\xd7\xc7\x26\xc8\x40\x09\x54\x6a\x76\xe0\x54\x5d\xf2\x66\x78"
"\x7e\x2b\x2c\xd6\xca\x43\x73\xa1\xf3\x14\x50\xa7\xbd\xf9\x48\x2b"
"\xfa\xb8\x11\xf5\x54\x20\x0e\xad\x8f\x53",
.okm_size = 42,
}
};
static const struct hkdf_testvec hkdf_sha512_tv[] = {
{
.test = "basic hkdf test",
.ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
"\x0b\x0b\x0b\x0b\x0b\x0b",
.ikm_size = 22,
.salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c",
.salt_size = 13,
.info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
.info_size = 10,
.prk = "\x66\x57\x99\x82\x37\x37\xde\xd0\x4a\x88\xe4\x7e\x54\xa5\x89\x0b"
"\xb2\xc3\xd2\x47\xc7\xa4\x25\x4a\x8e\x61\x35\x07\x23\x59\x0a\x26"
"\xc3\x62\x38\x12\x7d\x86\x61\xb8\x8c\xf8\x0e\xf8\x02\xd5\x7e\x2f"
"\x7c\xeb\xcf\x1e\x00\xe0\x83\x84\x8b\xe1\x99\x29\xc6\x1b\x42\x37",
.prk_size = 64,
.okm = "\x83\x23\x90\x08\x6c\xda\x71\xfb\x47\x62\x5b\xb5\xce\xb1\x68\xe4"
"\xc8\xe2\x6a\x1a\x16\xed\x34\xd9\xfc\x7f\xe9\x2c\x14\x81\x57\x93"
"\x38\xda\x36\x2c\xb8\xd9\xf9\x25\xd7\xcb",
.okm_size = 42,
}, {
.test = "hkdf test with long input",
.ikm = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
"\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f"
"\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f"
"\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f",
.ikm_size = 80,
.salt = "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f"
"\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f"
"\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f"
"\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f"
"\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf",
.salt_size = 80,
.info = "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf"
"\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf"
"\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf"
"\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef"
"\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff",
.info_size = 80,
.prk = "\x35\x67\x25\x42\x90\x7d\x4e\x14\x2c\x00\xe8\x44\x99\xe7\x4e\x1d"
"\xe0\x8b\xe8\x65\x35\xf9\x24\xe0\x22\x80\x4a\xd7\x75\xdd\xe2\x7e"
"\xc8\x6c\xd1\xe5\xb7\xd1\x78\xc7\x44\x89\xbd\xbe\xb3\x07\x12\xbe"
"\xb8\x2d\x4f\x97\x41\x6c\x5a\x94\xea\x81\xeb\xdf\x3e\x62\x9e\x4a",
.prk_size = 64,
.okm = "\xce\x6c\x97\x19\x28\x05\xb3\x46\xe6\x16\x1e\x82\x1e\xd1\x65\x67"
"\x3b\x84\xf4\x00\xa2\xb5\x14\xb2\xfe\x23\xd8\x4c\xd1\x89\xdd\xf1"
"\xb6\x95\xb4\x8c\xbd\x1c\x83\x88\x44\x11\x37\xb3\xce\x28\xf1\x6a"
"\xa6\x4b\xa3\x3b\xa4\x66\xb2\x4d\xf6\xcf\xcb\x02\x1e\xcf\xf2\x35"
"\xf6\xa2\x05\x6c\xe3\xaf\x1d\xe4\x4d\x57\x20\x97\xa8\x50\x5d\x9e"
"\x7a\x93",
.okm_size = 82,
}, {
.test = "hkdf test with zero salt and info",
.ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b"
"\x0b\x0b\x0b\x0b\x0b\x0b",
.ikm_size = 22,
.salt = NULL,
.salt_size = 0,
.info = NULL,
.info_size = 0,
.prk = "\xfd\x20\x0c\x49\x87\xac\x49\x13\x13\xbd\x4a\x2a\x13\x28\x71\x21"
"\x24\x72\x39\xe1\x1c\x9e\xf8\x28\x02\x04\x4b\x66\xef\x35\x7e\x5b"
"\x19\x44\x98\xd0\x68\x26\x11\x38\x23\x48\x57\x2a\x7b\x16\x11\xde"
"\x54\x76\x40\x94\x28\x63\x20\x57\x8a\x86\x3f\x36\x56\x2b\x0d\xf6",
.prk_size = 64,
.okm = "\xf5\xfa\x02\xb1\x82\x98\xa7\x2a\x8c\x23\x89\x8a\x87\x03\x47\x2c"
"\x6e\xb1\x79\xdc\x20\x4c\x03\x42\x5c\x97\x0e\x3b\x16\x4b\xf9\x0f"
"\xff\x22\xd0\x48\x36\xd0\xe2\x34\x3b\xac",
.okm_size = 42,
}, {
.test = "hkdf test with short input",
.ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b",
.ikm_size = 11,
.salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c",
.salt_size = 13,
.info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
.info_size = 10,
.prk = "\x67\x40\x9c\x9c\xac\x28\xb5\x2e\xe9\xfa\xd9\x1c\x2f\xda\x99\x9f"
"\x7c\xa2\x2e\x34\x34\xf0\xae\x77\x28\x63\x83\x65\x68\xad\x6a\x7f"
"\x10\xcf\x11\x3b\xfd\xdd\x56\x01\x29\xa5\x94\xa8\xf5\x23\x85\xc2"
"\xd6\x61\xd7\x85\xd2\x9c\xe9\x3a\x11\x40\x0c\x92\x06\x83\x18\x1d",
.prk_size = 64,
.okm = "\x74\x13\xe8\x99\x7e\x02\x06\x10\xfb\xf6\x82\x3f\x2c\xe1\x4b\xff"
"\x01\x87\x5d\xb1\xca\x55\xf6\x8c\xfc\xf3\x95\x4d\xc8\xaf\xf5\x35"
"\x59\xbd\x5e\x30\x28\xb0\x80\xf7\xc0\x68",
.okm_size = 42,
}, {
.test = "unsalted hkdf test with zero info",
.ikm = "\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c"
"\x0c\x0c\x0c\x0c\x0c\x0c",
.ikm_size = 22,
.salt = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
.salt_size = 64,
.info = NULL,
.info_size = 0,
.prk = "\x53\x46\xb3\x76\xbf\x3a\xa9\xf8\x4f\x8f\x6e\xd5\xb1\xc4\xf4\x89"
"\x17\x2e\x24\x4d\xac\x30\x3d\x12\xf6\x8e\xcc\x76\x6e\xa6\x00\xaa"
"\x88\x49\x5e\x7f\xb6\x05\x80\x31\x22\xfa\x13\x69\x24\xa8\x40\xb1"
"\xf0\x71\x9d\x2d\x5f\x68\xe2\x9b\x24\x22\x99\xd7\x58\xed\x68\x0c",
.prk_size = 64,
.okm = "\x14\x07\xd4\x60\x13\xd9\x8b\xc6\xde\xce\xfc\xfe\xe5\x5f\x0f\x90"
"\xb0\xc7\xf6\x3d\x68\xeb\x1a\x80\xea\xf0\x7e\x95\x3c\xfc\x0a\x3a"
"\x52\x40\xa1\x55\xd6\xe4\xda\xa9\x65\xbb",
.okm_size = 42,
}
};
/*
 * Run one HKDF (RFC 5869) test vector against the given HMAC transform.
 *
 * @shash: HMAC algorithm name, e.g. "hmac(sha256)"
 * @tv:    test vector holding IKM/salt/info inputs and expected PRK/OKM
 *
 * Verifies both phases: hkdf_extract() must produce tv->prk, and
 * hkdf_expand() (keyed with tv->prk) must produce tv->okm.
 *
 * Returns 0 on success, or a negative errno on allocation failure,
 * crypto API error, or output mismatch.
 */
static int hkdf_test(const char *shash, const struct hkdf_testvec *tv)
{
	struct crypto_shash *tfm = NULL;
	u8 *prk = NULL, *okm = NULL;
	unsigned int prk_size;
	const char *driver;
	int err;

	tfm = crypto_alloc_shash(shash, 0, 0);
	if (IS_ERR(tfm)) {
		pr_err("%s(%s): failed to allocate transform: %ld\n",
		       tv->test, shash, PTR_ERR(tfm));
		return PTR_ERR(tfm);
	}
	driver = crypto_shash_driver_name(tfm);

	/*
	 * The PRK of HKDF is always exactly one digest long; reject a
	 * mismatched vector before bothering to allocate anything.
	 */
	prk_size = crypto_shash_digestsize(tfm);
	if (tv->prk_size != prk_size) {
		pr_err("%s(%s): prk size mismatch (vec %u, digest %u)\n",
		       tv->test, driver, tv->prk_size, prk_size);
		err = -EINVAL;
		goto out_free;
	}

	prk = kzalloc(prk_size, GFP_KERNEL);
	if (!prk) {
		err = -ENOMEM;
		goto out_free;
	}

	err = hkdf_extract(tfm, tv->ikm, tv->ikm_size,
			   tv->salt, tv->salt_size, prk);
	if (err) {
		pr_err("%s(%s): hkdf_extract failed with %d\n",
		       tv->test, driver, err);
		goto out_free;
	}

	if (memcmp(prk, tv->prk, tv->prk_size)) {
		pr_err("%s(%s): hkdf_extract prk mismatch\n",
		       tv->test, driver);
		print_hex_dump(KERN_ERR, "prk: ", DUMP_PREFIX_NONE,
			       16, 1, prk, tv->prk_size, false);
		err = -EINVAL;
		goto out_free;
	}

	okm = kzalloc(tv->okm_size, GFP_KERNEL);
	if (!okm) {
		err = -ENOMEM;
		goto out_free;
	}

	/* Expand is keyed with the expected PRK, not the computed one. */
	err = crypto_shash_setkey(tfm, tv->prk, tv->prk_size);
	if (err) {
		pr_err("%s(%s): failed to set prk, error %d\n",
		       tv->test, driver, err);
		goto out_free;
	}

	err = hkdf_expand(tfm, tv->info, tv->info_size,
			  okm, tv->okm_size);
	if (err) {
		pr_err("%s(%s): hkdf_expand() failed with %d\n",
		       tv->test, driver, err);
	} else if (memcmp(okm, tv->okm, tv->okm_size)) {
		pr_err("%s(%s): hkdf_expand() okm mismatch\n",
		       tv->test, driver);
		print_hex_dump(KERN_ERR, "okm: ", DUMP_PREFIX_NONE,
			       16, 1, okm, tv->okm_size, false);
		err = -EINVAL;
	}

out_free:
	kfree(okm);
	kfree(prk);
	crypto_free_shash(tfm);
	return err;
}
/*
 * Run the compiled-in HKDF self-test vectors at init time.
 * Returns 0 if every vector passes (or self-tests are disabled),
 * otherwise the first failing vector's error code.
 */
static int __init crypto_hkdf_module_init(void)
{
	/* One entry per supported HMAC digest, paired with its vectors. */
	static const struct {
		const char *alg;
		const struct hkdf_testvec *vecs;
		size_t count;
	} suites[] = {
		{ "hmac(sha256)", hkdf_sha256_tv, ARRAY_SIZE(hkdf_sha256_tv) },
		{ "hmac(sha384)", hkdf_sha384_tv, ARRAY_SIZE(hkdf_sha384_tv) },
		{ "hmac(sha512)", hkdf_sha512_tv, ARRAY_SIZE(hkdf_sha512_tv) },
	};
	size_t s, v;
	int ret;

	if (!IS_ENABLED(CONFIG_CRYPTO_SELFTESTS))
		return 0;

	for (s = 0; s < ARRAY_SIZE(suites); s++) {
		for (v = 0; v < suites[s].count; v++) {
			ret = hkdf_test(suites[s].alg, &suites[s].vecs[v]);
			if (ret)
				return ret;
		}
	}

	return 0;
}
/* Nothing to tear down: the self-tests run once at init and hold no state. */
static void __exit crypto_hkdf_module_exit(void) {}

/* late_initcall so the crypto core and HMAC drivers are registered first. */
late_initcall(crypto_hkdf_module_init);
module_exit(crypto_hkdf_module_exit);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("HMAC-based Key Derivation Function (HKDF)");

View File

@@ -3,7 +3,6 @@ drbd-y := drbd_buildtag.o drbd_bitmap.o drbd_proc.o
drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
drbd-y += drbd_interval.o drbd_state.o
drbd-y += drbd_nla.o
drbd-$(CONFIG_DEBUG_FS) += drbd_debugfs.o
obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o

View File

@@ -874,7 +874,7 @@ void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device)
if (uuid && uuid != UUID_JUST_CREATED)
uuid = uuid + UUID_NEW_BM_OFFSET;
else
get_random_bytes(&uuid, sizeof(u64));
uuid = get_random_u64();
drbd_uuid_set(device, UI_BITMAP, uuid);
drbd_print_uuids(device, "updated sync UUID");
drbd_md_sync(device);
@@ -3337,7 +3337,7 @@ void drbd_uuid_new_current(struct drbd_device *device) __must_hold(local)
u64 val;
unsigned long long bm_uuid;
get_random_bytes(&val, sizeof(u64));
val = get_random_u64();
spin_lock_irq(&device->ldev->md.uuid_lock);
bm_uuid = device->ldev->md.uuid[UI_BITMAP];

File diff suppressed because it is too large Load Diff

View File

@@ -1,56 +0,0 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <net/netlink.h>
#include <linux/drbd_genl_api.h>
#include "drbd_nla.h"
static int drbd_nla_check_mandatory(int maxtype, struct nlattr *nla)
{
struct nlattr *head = nla_data(nla);
int len = nla_len(nla);
int rem;
/*
* validate_nla (called from nla_parse_nested) ignores attributes
* beyond maxtype, and does not understand the DRBD_GENLA_F_MANDATORY flag.
* In order to have it validate attributes with the DRBD_GENLA_F_MANDATORY
* flag set also, check and remove that flag before calling
* nla_parse_nested.
*/
nla_for_each_attr(nla, head, len, rem) {
if (nla->nla_type & DRBD_GENLA_F_MANDATORY) {
nla->nla_type &= ~DRBD_GENLA_F_MANDATORY;
if (nla_type(nla) > maxtype)
return -EOPNOTSUPP;
}
}
return 0;
}
int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
const struct nla_policy *policy)
{
int err;
err = drbd_nla_check_mandatory(maxtype, nla);
if (!err)
err = nla_parse_nested_deprecated(tb, maxtype, nla, policy,
NULL);
return err;
}
struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype)
{
int err;
/*
* If any nested attribute has the DRBD_GENLA_F_MANDATORY flag set and
* we don't know about that attribute, reject all the nested
* attributes.
*/
err = drbd_nla_check_mandatory(maxtype, nla);
if (err)
return ERR_PTR(err);
return nla_find_nested(nla, attrtype);
}

View File

@@ -1,9 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef __DRBD_NLA_H
#define __DRBD_NLA_H
extern int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
const struct nla_policy *policy);
extern struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype);
#endif /* __DRBD_NLA_H */

View File

@@ -46,6 +46,8 @@
#include <linux/kref.h>
#include <linux/kfifo.h>
#include <linux/blk-integrity.h>
#include <linux/maple_tree.h>
#include <linux/xarray.h>
#include <uapi/linux/fs.h>
#include <uapi/linux/ublk_cmd.h>
@@ -58,6 +60,11 @@
#define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE)
#define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV)
#define UBLK_CMD_TRY_STOP_DEV _IOC_NR(UBLK_U_CMD_TRY_STOP_DEV)
#define UBLK_CMD_REG_BUF _IOC_NR(UBLK_U_CMD_REG_BUF)
#define UBLK_CMD_UNREG_BUF _IOC_NR(UBLK_U_CMD_UNREG_BUF)
/* Default max shmem buffer size: 4GB (may be increased in future) */
#define UBLK_SHMEM_BUF_SIZE_MAX (1ULL << 32)
#define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF)
#define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF)
@@ -81,7 +88,8 @@
| (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) \
| UBLK_F_SAFE_STOP_DEV \
| UBLK_F_BATCH_IO \
| UBLK_F_NO_AUTO_PART_SCAN)
| UBLK_F_NO_AUTO_PART_SCAN \
| UBLK_F_SHMEM_ZC)
#define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \
| UBLK_F_USER_RECOVERY_REISSUE \
@@ -289,6 +297,13 @@ struct ublk_queue {
struct ublk_io ios[] __counted_by(q_depth);
};
/* Maple tree value: maps a PFN range to buffer location */
struct ublk_buf_range {
unsigned short buf_index;
unsigned short flags;
unsigned int base_offset; /* byte offset within buffer */
};
struct ublk_device {
struct gendisk *ub_disk;
@@ -323,6 +338,10 @@ struct ublk_device {
bool block_open; /* protected by open_mutex */
/* shared memory zero copy */
struct maple_tree buf_tree;
struct ida buf_ida;
struct ublk_queue *queues[];
};
@@ -334,6 +353,9 @@ struct ublk_params_header {
static void ublk_io_release(void *priv);
static void ublk_stop_dev_unlocked(struct ublk_device *ub);
static bool ublk_try_buf_match(struct ublk_device *ub, struct request *rq,
u32 *buf_idx, u32 *buf_off);
static void ublk_buf_cleanup(struct ublk_device *ub);
static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq);
static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub,
u16 q_id, u16 tag, struct ublk_io *io);
@@ -398,6 +420,22 @@ static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub)
return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY;
}
static inline bool ublk_support_shmem_zc(const struct ublk_queue *ubq)
{
return ubq->flags & UBLK_F_SHMEM_ZC;
}
static inline bool ublk_iod_is_shmem_zc(const struct ublk_queue *ubq,
unsigned int tag)
{
return ublk_get_iod(ubq, tag)->op_flags & UBLK_IO_F_SHMEM_ZC;
}
static inline bool ublk_dev_support_shmem_zc(const struct ublk_device *ub)
{
return ub->dev_info.flags & UBLK_F_SHMEM_ZC;
}
static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq)
{
return ubq->flags & UBLK_F_AUTO_BUF_REG;
@@ -808,7 +846,7 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub)
static int ublk_integrity_flags(u32 flags)
{
int ret_flags = 0;
int ret_flags = BLK_SPLIT_INTERVAL_CAPABLE;
if (flags & LBMD_PI_CAP_INTEGRITY) {
flags &= ~LBMD_PI_CAP_INTEGRITY;
@@ -1460,6 +1498,19 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
iod->op_flags = ublk_op | ublk_req_build_flags(req);
iod->nr_sectors = blk_rq_sectors(req);
iod->start_sector = blk_rq_pos(req);
/* Try shmem zero-copy match before setting addr */
if (ublk_support_shmem_zc(ubq) && ublk_rq_has_data(req)) {
u32 buf_idx, buf_off;
if (ublk_try_buf_match(ubq->dev, req,
&buf_idx, &buf_off)) {
iod->op_flags |= UBLK_IO_F_SHMEM_ZC;
iod->addr = ublk_shmem_zc_addr(buf_idx, buf_off);
return BLK_STS_OK;
}
}
iod->addr = io->buf.addr;
return BLK_STS_OK;
@@ -1505,6 +1556,10 @@ static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io,
req_op(req) != REQ_OP_DRV_IN)
goto exit;
/* shmem zero copy: no data to unmap, pages already shared */
if (ublk_iod_is_shmem_zc(req->mq_hctx->driver_data, req->tag))
goto exit;
/* for READ request, writing data in iod->addr to rq buffers */
unmapped_bytes = ublk_unmap_io(need_map, req, io);
@@ -1663,7 +1718,13 @@ static void ublk_auto_buf_dispatch(const struct ublk_queue *ubq,
static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req,
struct ublk_io *io)
{
unsigned mapped_bytes = ublk_map_io(ubq, req, io);
unsigned mapped_bytes;
/* shmem zero copy: skip data copy, pages already shared */
if (ublk_iod_is_shmem_zc(ubq, req->tag))
return true;
mapped_bytes = ublk_map_io(ubq, req, io);
/* partially mapped, update io descriptor */
if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
@@ -1789,7 +1850,7 @@ static bool ublk_batch_prep_dispatch(struct ublk_queue *ubq,
* Filter out UBLK_BATCH_IO_UNUSED_TAG entries from tag_buf.
* Returns the new length after filtering.
*/
static unsigned int ublk_filter_unused_tags(unsigned short *tag_buf,
static noinline unsigned int ublk_filter_unused_tags(unsigned short *tag_buf,
unsigned int len)
{
unsigned int i, j;
@@ -1805,6 +1866,41 @@ static unsigned int ublk_filter_unused_tags(unsigned short *tag_buf,
return j;
}
static noinline void ublk_batch_dispatch_fail(struct ublk_queue *ubq,
const struct ublk_batch_io_data *data,
unsigned short *tag_buf, size_t len, int ret)
{
int i, res;
/*
* Undo prep state for all IOs since userspace never received them.
* This restores IOs to pre-prepared state so they can be cleanly
* re-prepared when tags are pulled from FIFO again.
*/
for (i = 0; i < len; i++) {
struct ublk_io *io = &ubq->ios[tag_buf[i]];
int index = -1;
ublk_io_lock(io);
if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG)
index = io->buf.auto_reg.index;
io->flags &= ~(UBLK_IO_FLAG_OWNED_BY_SRV | UBLK_IO_FLAG_AUTO_BUF_REG);
io->flags |= UBLK_IO_FLAG_ACTIVE;
ublk_io_unlock(io);
if (index != -1)
io_buffer_unregister_bvec(data->cmd, index,
data->issue_flags);
}
res = kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo,
tag_buf, len, &ubq->evts_lock);
pr_warn_ratelimited("%s: copy tags or post CQE failure, move back "
"tags(%d %zu) ret %d\n", __func__, res, len,
ret);
}
#define MAX_NR_TAG 128
static int __ublk_batch_dispatch(struct ublk_queue *ubq,
const struct ublk_batch_io_data *data,
@@ -1848,37 +1944,8 @@ static int __ublk_batch_dispatch(struct ublk_queue *ubq,
sel.val = ublk_batch_copy_io_tags(fcmd, sel.addr, tag_buf, len * tag_sz);
ret = ublk_batch_fetch_post_cqe(fcmd, &sel, data->issue_flags);
if (unlikely(ret < 0)) {
int i, res;
/*
* Undo prep state for all IOs since userspace never received them.
* This restores IOs to pre-prepared state so they can be cleanly
* re-prepared when tags are pulled from FIFO again.
*/
for (i = 0; i < len; i++) {
struct ublk_io *io = &ubq->ios[tag_buf[i]];
int index = -1;
ublk_io_lock(io);
if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG)
index = io->buf.auto_reg.index;
io->flags &= ~(UBLK_IO_FLAG_OWNED_BY_SRV | UBLK_IO_FLAG_AUTO_BUF_REG);
io->flags |= UBLK_IO_FLAG_ACTIVE;
ublk_io_unlock(io);
if (index != -1)
io_buffer_unregister_bvec(data->cmd, index,
data->issue_flags);
}
res = kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo,
tag_buf, len, &ubq->evts_lock);
pr_warn_ratelimited("%s: copy tags or post CQE failure, move back "
"tags(%d %zu) ret %d\n", __func__, res, len,
ret);
}
if (unlikely(ret < 0))
ublk_batch_dispatch_fail(ubq, data, tag_buf, len, ret);
return ret;
}
@@ -2910,22 +2977,26 @@ static void ublk_stop_dev(struct ublk_device *ub)
ublk_cancel_dev(ub);
}
static void ublk_reset_io_flags(struct ublk_queue *ubq, struct ublk_io *io)
{
/* UBLK_IO_FLAG_CANCELED can be cleared now */
spin_lock(&ubq->cancel_lock);
io->flags &= ~UBLK_IO_FLAG_CANCELED;
spin_unlock(&ubq->cancel_lock);
}
/* reset per-queue io flags */
static void ublk_queue_reset_io_flags(struct ublk_queue *ubq)
{
int j;
/* UBLK_IO_FLAG_CANCELED can be cleared now */
spin_lock(&ubq->cancel_lock);
for (j = 0; j < ubq->q_depth; j++)
ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED;
ubq->canceling = false;
spin_unlock(&ubq->cancel_lock);
ubq->fail_io = false;
}
/* device can only be started after all IOs are ready */
static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id)
static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id,
struct ublk_io *io)
__must_hold(&ub->mutex)
{
struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
@@ -2934,6 +3005,7 @@ static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id)
ub->unprivileged_daemons = true;
ubq->nr_io_ready++;
ublk_reset_io_flags(ubq, io);
/* Check if this specific queue is now fully ready */
if (ublk_queue_ready(ubq)) {
@@ -3196,7 +3268,7 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub,
if (!ret)
ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL);
if (!ret)
ublk_mark_io_ready(ub, q_id);
ublk_mark_io_ready(ub, q_id, io);
mutex_unlock(&ub->mutex);
return ret;
}
@@ -3604,7 +3676,7 @@ static int ublk_batch_prep_io(struct ublk_queue *ubq,
ublk_io_unlock(io);
if (!ret)
ublk_mark_io_ready(data->ub, ubq->q_id);
ublk_mark_io_ready(data->ub, ubq->q_id, io);
return ret;
}
@@ -4200,6 +4272,7 @@ static void ublk_cdev_rel(struct device *dev)
{
struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);
ublk_buf_cleanup(ub);
blk_mq_free_tag_set(&ub->tag_set);
ublk_deinit_queues(ub);
ublk_free_dev_number(ub);
@@ -4621,6 +4694,8 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header)
mutex_init(&ub->mutex);
spin_lock_init(&ub->lock);
mutex_init(&ub->cancel_mutex);
mt_init(&ub->buf_tree);
ida_init(&ub->buf_ida);
INIT_WORK(&ub->partition_scan_work, ublk_partition_scan_work);
ret = ublk_alloc_dev_number(ub, header->dev_id);
@@ -5171,6 +5246,314 @@ exit:
return err;
}
/*
* Lock for maple tree modification: acquire ub->mutex, then freeze queue
* if device is started. If device is not yet started, only mutex is
* needed since no I/O path can access the tree.
*
* This ordering (mutex -> freeze) is safe because ublk_stop_dev_unlocked()
* already holds ub->mutex when calling del_gendisk() which freezes the queue.
*/
static unsigned int ublk_lock_buf_tree(struct ublk_device *ub)
{
unsigned int memflags = 0;
mutex_lock(&ub->mutex);
if (ub->ub_disk)
memflags = blk_mq_freeze_queue(ub->ub_disk->queue);
return memflags;
}
static void ublk_unlock_buf_tree(struct ublk_device *ub, unsigned int memflags)
{
if (ub->ub_disk)
blk_mq_unfreeze_queue(ub->ub_disk->queue, memflags);
mutex_unlock(&ub->mutex);
}
/* Erase coalesced PFN ranges from the maple tree matching buf_index */
static void ublk_buf_erase_ranges(struct ublk_device *ub, int buf_index)
{
MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
struct ublk_buf_range *range;
mas_lock(&mas);
mas_for_each(&mas, range, ULONG_MAX) {
if (range->buf_index == buf_index) {
mas_erase(&mas);
kfree(range);
}
}
mas_unlock(&mas);
}
static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
struct page **pages, unsigned long nr_pages,
int index, unsigned short flags)
{
unsigned long i;
int ret;
for (i = 0; i < nr_pages; i++) {
unsigned long pfn = page_to_pfn(pages[i]);
unsigned long start = i;
struct ublk_buf_range *range;
/* Find run of consecutive PFNs */
while (i + 1 < nr_pages &&
page_to_pfn(pages[i + 1]) == pfn + (i - start) + 1)
i++;
range = kzalloc(sizeof(*range), GFP_KERNEL);
if (!range) {
ret = -ENOMEM;
goto unwind;
}
range->buf_index = index;
range->flags = flags;
range->base_offset = start << PAGE_SHIFT;
ret = mtree_insert_range(&ub->buf_tree, pfn,
pfn + (i - start),
range, GFP_KERNEL);
if (ret) {
kfree(range);
goto unwind;
}
}
return 0;
unwind:
ublk_buf_erase_ranges(ub, index);
return ret;
}
/*
* Register a shared memory buffer for zero-copy I/O.
* Pins pages, builds PFN maple tree, freezes/unfreezes the queue
* internally. Returns buffer index (>= 0) on success.
*/
static int ublk_ctrl_reg_buf(struct ublk_device *ub,
struct ublksrv_ctrl_cmd *header)
{
void __user *argp = (void __user *)(unsigned long)header->addr;
struct ublk_shmem_buf_reg buf_reg;
unsigned long nr_pages;
struct page **pages = NULL;
unsigned int gup_flags;
unsigned int memflags;
long pinned;
int index;
int ret;
if (!ublk_dev_support_shmem_zc(ub))
return -EOPNOTSUPP;
memset(&buf_reg, 0, sizeof(buf_reg));
if (copy_from_user(&buf_reg, argp,
min_t(size_t, header->len, sizeof(buf_reg))))
return -EFAULT;
if (buf_reg.flags & ~UBLK_SHMEM_BUF_READ_ONLY)
return -EINVAL;
if (buf_reg.reserved)
return -EINVAL;
if (!buf_reg.len || buf_reg.len > UBLK_SHMEM_BUF_SIZE_MAX ||
!PAGE_ALIGNED(buf_reg.len) || !PAGE_ALIGNED(buf_reg.addr))
return -EINVAL;
nr_pages = buf_reg.len >> PAGE_SHIFT;
/* Pin pages before any locks (may sleep) */
pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
if (!pages)
return -ENOMEM;
gup_flags = FOLL_LONGTERM;
if (!(buf_reg.flags & UBLK_SHMEM_BUF_READ_ONLY))
gup_flags |= FOLL_WRITE;
pinned = pin_user_pages_fast(buf_reg.addr, nr_pages, gup_flags, pages);
if (pinned < 0) {
ret = pinned;
goto err_free_pages;
}
if (pinned != nr_pages) {
ret = -EFAULT;
goto err_unpin;
}
memflags = ublk_lock_buf_tree(ub);
index = ida_alloc_max(&ub->buf_ida, USHRT_MAX, GFP_KERNEL);
if (index < 0) {
ret = index;
goto err_unlock;
}
ret = __ublk_ctrl_reg_buf(ub, pages, nr_pages, index, buf_reg.flags);
if (ret) {
ida_free(&ub->buf_ida, index);
goto err_unlock;
}
ublk_unlock_buf_tree(ub, memflags);
kvfree(pages);
return index;
err_unlock:
ublk_unlock_buf_tree(ub, memflags);
err_unpin:
unpin_user_pages(pages, pinned);
err_free_pages:
kvfree(pages);
return ret;
}
static int __ublk_ctrl_unreg_buf(struct ublk_device *ub, int buf_index)
{
MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
struct ublk_buf_range *range;
struct page *pages[32];
int ret = -ENOENT;
mas_lock(&mas);
mas_for_each(&mas, range, ULONG_MAX) {
unsigned long base, nr, off;
if (range->buf_index != buf_index)
continue;
ret = 0;
base = mas.index;
nr = mas.last - base + 1;
mas_erase(&mas);
for (off = 0; off < nr; ) {
unsigned int batch = min_t(unsigned long,
nr - off, 32);
unsigned int j;
for (j = 0; j < batch; j++)
pages[j] = pfn_to_page(base + off + j);
unpin_user_pages(pages, batch);
off += batch;
}
kfree(range);
}
mas_unlock(&mas);
return ret;
}
static int ublk_ctrl_unreg_buf(struct ublk_device *ub,
struct ublksrv_ctrl_cmd *header)
{
int index = (int)header->data[0];
unsigned int memflags;
int ret;
if (!ublk_dev_support_shmem_zc(ub))
return -EOPNOTSUPP;
if (index < 0 || index > USHRT_MAX)
return -EINVAL;
memflags = ublk_lock_buf_tree(ub);
ret = __ublk_ctrl_unreg_buf(ub, index);
if (!ret)
ida_free(&ub->buf_ida, index);
ublk_unlock_buf_tree(ub, memflags);
return ret;
}
static void ublk_buf_cleanup(struct ublk_device *ub)
{
MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
struct ublk_buf_range *range;
struct page *pages[32];
mas_for_each(&mas, range, ULONG_MAX) {
unsigned long base = mas.index;
unsigned long nr = mas.last - base + 1;
unsigned long off;
for (off = 0; off < nr; ) {
unsigned int batch = min_t(unsigned long,
nr - off, 32);
unsigned int j;
for (j = 0; j < batch; j++)
pages[j] = pfn_to_page(base + off + j);
unpin_user_pages(pages, batch);
off += batch;
}
kfree(range);
}
mtree_destroy(&ub->buf_tree);
ida_destroy(&ub->buf_ida);
}
/* Check if request pages match a registered shared memory buffer */
static bool ublk_try_buf_match(struct ublk_device *ub,
struct request *rq,
u32 *buf_idx, u32 *buf_off)
{
struct req_iterator iter;
struct bio_vec bv;
int index = -1;
unsigned long expected_offset = 0;
bool first = true;
rq_for_each_bvec(bv, rq, iter) {
unsigned long pfn = page_to_pfn(bv.bv_page);
unsigned long end_pfn = pfn +
((bv.bv_offset + bv.bv_len - 1) >> PAGE_SHIFT);
struct ublk_buf_range *range;
unsigned long off;
MA_STATE(mas, &ub->buf_tree, pfn, pfn);
range = mas_walk(&mas);
if (!range)
return false;
/* verify all pages in this bvec fall within the range */
if (end_pfn > mas.last)
return false;
off = range->base_offset +
(pfn - mas.index) * PAGE_SIZE + bv.bv_offset;
if (first) {
/* Read-only buffer can't serve READ (kernel writes) */
if ((range->flags & UBLK_SHMEM_BUF_READ_ONLY) &&
req_op(rq) != REQ_OP_WRITE)
return false;
index = range->buf_index;
expected_offset = off;
*buf_off = off;
first = false;
} else {
if (range->buf_index != index)
return false;
if (off != expected_offset)
return false;
}
expected_offset += bv.bv_len;
}
if (first)
return false;
*buf_idx = index;
return true;
}
static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
u32 cmd_op, struct ublksrv_ctrl_cmd *header)
{
@@ -5228,6 +5611,8 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub,
case UBLK_CMD_UPDATE_SIZE:
case UBLK_CMD_QUIESCE_DEV:
case UBLK_CMD_TRY_STOP_DEV:
case UBLK_CMD_REG_BUF:
case UBLK_CMD_UNREG_BUF:
mask = MAY_READ | MAY_WRITE;
break;
default:
@@ -5352,6 +5737,12 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
case UBLK_CMD_TRY_STOP_DEV:
ret = ublk_ctrl_try_stop_dev(ub);
break;
case UBLK_CMD_REG_BUF:
ret = ublk_ctrl_reg_buf(ub, &header);
break;
case UBLK_CMD_UNREG_BUF:
ret = ublk_ctrl_unreg_buf(ub, &header);
break;
default:
ret = -EOPNOTSUPP;
break;

View File

@@ -17,6 +17,7 @@
#include <linux/mutex.h>
#include <linux/parser.h>
#include <linux/seq_file.h>
#include <linux/xattr.h>
/*
* Options for adding (and removing) a device.
@@ -34,6 +35,8 @@ enum {
ZLOOP_OPT_BUFFERED_IO = (1 << 8),
ZLOOP_OPT_ZONE_APPEND = (1 << 9),
ZLOOP_OPT_ORDERED_ZONE_APPEND = (1 << 10),
ZLOOP_OPT_DISCARD_WRITE_CACHE = (1 << 11),
ZLOOP_OPT_MAX_OPEN_ZONES = (1 << 12),
};
static const match_table_t zloop_opt_tokens = {
@@ -48,6 +51,8 @@ static const match_table_t zloop_opt_tokens = {
{ ZLOOP_OPT_BUFFERED_IO, "buffered_io" },
{ ZLOOP_OPT_ZONE_APPEND, "zone_append=%u" },
{ ZLOOP_OPT_ORDERED_ZONE_APPEND, "ordered_zone_append" },
{ ZLOOP_OPT_DISCARD_WRITE_CACHE, "discard_write_cache" },
{ ZLOOP_OPT_MAX_OPEN_ZONES, "max_open_zones=%u" },
{ ZLOOP_OPT_ERR, NULL }
};
@@ -56,6 +61,7 @@ static const match_table_t zloop_opt_tokens = {
#define ZLOOP_DEF_ZONE_SIZE ((256ULL * SZ_1M) >> SECTOR_SHIFT)
#define ZLOOP_DEF_NR_ZONES 64
#define ZLOOP_DEF_NR_CONV_ZONES 8
#define ZLOOP_DEF_MAX_OPEN_ZONES 0
#define ZLOOP_DEF_BASE_DIR "/var/local/zloop"
#define ZLOOP_DEF_NR_QUEUES 1
#define ZLOOP_DEF_QUEUE_DEPTH 128
@@ -73,12 +79,14 @@ struct zloop_options {
sector_t zone_size;
sector_t zone_capacity;
unsigned int nr_conv_zones;
unsigned int max_open_zones;
char *base_dir;
unsigned int nr_queues;
unsigned int queue_depth;
bool buffered_io;
bool zone_append;
bool ordered_zone_append;
bool discard_write_cache;
};
/*
@@ -95,7 +103,12 @@ enum zloop_zone_flags {
ZLOOP_ZONE_SEQ_ERROR,
};
/*
* Zone descriptor.
* Locking order: z.lock -> z.wp_lock -> zlo.open_zones_lock
*/
struct zloop_zone {
struct list_head open_zone_entry;
struct file *file;
unsigned long flags;
@@ -119,6 +132,7 @@ struct zloop_device {
bool buffered_io;
bool zone_append;
bool ordered_zone_append;
bool discard_write_cache;
const char *base_dir;
struct file *data_dir;
@@ -128,8 +142,13 @@ struct zloop_device {
sector_t zone_capacity;
unsigned int nr_zones;
unsigned int nr_conv_zones;
unsigned int max_open_zones;
unsigned int block_size;
spinlock_t open_zones_lock;
struct list_head open_zones_lru_list;
unsigned int nr_open_zones;
struct zloop_zone zones[] __counted_by(nr_zones);
};
@@ -153,6 +172,122 @@ static unsigned int rq_zone_no(struct request *rq)
return blk_rq_pos(rq) >> zlo->zone_shift;
}
/*
* Open an already open zone. This is mostly a no-op, except for the imp open ->
* exp open condition change that may happen. We also move a zone at the tail of
* the list of open zones so that if we need to
* implicitly close one open zone, we can do so in LRU order.
*/
static inline void zloop_lru_rotate_open_zone(struct zloop_device *zlo,
struct zloop_zone *zone)
{
if (zlo->max_open_zones) {
spin_lock(&zlo->open_zones_lock);
list_move_tail(&zone->open_zone_entry,
&zlo->open_zones_lru_list);
spin_unlock(&zlo->open_zones_lock);
}
}
static inline void zloop_lru_remove_open_zone(struct zloop_device *zlo,
struct zloop_zone *zone)
{
if (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
zone->cond == BLK_ZONE_COND_EXP_OPEN) {
spin_lock(&zlo->open_zones_lock);
list_del_init(&zone->open_zone_entry);
zlo->nr_open_zones--;
spin_unlock(&zlo->open_zones_lock);
}
}
static inline bool zloop_can_open_zone(struct zloop_device *zlo)
{
return !zlo->max_open_zones || zlo->nr_open_zones < zlo->max_open_zones;
}
/*
* If we have reached the maximum open zones limit, attempt to close an
* implicitly open zone (if we have any) so that we can implicitly open another
* zone without exceeding the maximum number of open zones.
*/
static bool zloop_close_imp_open_zone(struct zloop_device *zlo)
{
struct zloop_zone *zone;
lockdep_assert_held(&zlo->open_zones_lock);
if (zloop_can_open_zone(zlo))
return true;
list_for_each_entry(zone, &zlo->open_zones_lru_list, open_zone_entry) {
if (zone->cond == BLK_ZONE_COND_IMP_OPEN) {
zone->cond = BLK_ZONE_COND_CLOSED;
list_del_init(&zone->open_zone_entry);
zlo->nr_open_zones--;
return true;
}
}
return false;
}
static bool zloop_open_closed_or_empty_zone(struct zloop_device *zlo,
struct zloop_zone *zone,
bool explicit)
{
spin_lock(&zlo->open_zones_lock);
if (explicit) {
/*
* Explicit open: we cannot allow this if we have reached the
* maximum open zones limit.
*/
if (!zloop_can_open_zone(zlo))
goto fail;
zone->cond = BLK_ZONE_COND_EXP_OPEN;
} else {
/*
* Implicit open case: if we have reached the maximum open zones
* limit, try to close an implicitly open zone first.
*/
if (!zloop_close_imp_open_zone(zlo))
goto fail;
zone->cond = BLK_ZONE_COND_IMP_OPEN;
}
zlo->nr_open_zones++;
list_add_tail(&zone->open_zone_entry,
&zlo->open_zones_lru_list);
spin_unlock(&zlo->open_zones_lock);
return true;
fail:
spin_unlock(&zlo->open_zones_lock);
return false;
}
static bool zloop_do_open_zone(struct zloop_device *zlo,
struct zloop_zone *zone, bool explicit)
{
switch (zone->cond) {
case BLK_ZONE_COND_IMP_OPEN:
case BLK_ZONE_COND_EXP_OPEN:
if (explicit)
zone->cond = BLK_ZONE_COND_EXP_OPEN;
zloop_lru_rotate_open_zone(zlo, zone);
return true;
case BLK_ZONE_COND_EMPTY:
case BLK_ZONE_COND_CLOSED:
return zloop_open_closed_or_empty_zone(zlo, zone, explicit);
default:
return false;
}
}
static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no)
{
struct zloop_zone *zone = &zlo->zones[zone_no];
@@ -186,13 +321,17 @@ static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no)
spin_lock_irqsave(&zone->wp_lock, flags);
if (!file_sectors) {
zloop_lru_remove_open_zone(zlo, zone);
zone->cond = BLK_ZONE_COND_EMPTY;
zone->wp = zone->start;
} else if (file_sectors == zlo->zone_capacity) {
zloop_lru_remove_open_zone(zlo, zone);
zone->cond = BLK_ZONE_COND_FULL;
zone->wp = ULLONG_MAX;
} else {
zone->cond = BLK_ZONE_COND_CLOSED;
if (zone->cond != BLK_ZONE_COND_IMP_OPEN &&
zone->cond != BLK_ZONE_COND_EXP_OPEN)
zone->cond = BLK_ZONE_COND_CLOSED;
zone->wp = zone->start + file_sectors;
}
spin_unlock_irqrestore(&zone->wp_lock, flags);
@@ -216,19 +355,8 @@ static int zloop_open_zone(struct zloop_device *zlo, unsigned int zone_no)
goto unlock;
}
switch (zone->cond) {
case BLK_ZONE_COND_EXP_OPEN:
break;
case BLK_ZONE_COND_EMPTY:
case BLK_ZONE_COND_CLOSED:
case BLK_ZONE_COND_IMP_OPEN:
zone->cond = BLK_ZONE_COND_EXP_OPEN;
break;
case BLK_ZONE_COND_FULL:
default:
if (!zloop_do_open_zone(zlo, zone, true))
ret = -EIO;
break;
}
unlock:
mutex_unlock(&zone->lock);
@@ -259,6 +387,7 @@ static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no)
case BLK_ZONE_COND_IMP_OPEN:
case BLK_ZONE_COND_EXP_OPEN:
spin_lock_irqsave(&zone->wp_lock, flags);
zloop_lru_remove_open_zone(zlo, zone);
if (zone->wp == zone->start)
zone->cond = BLK_ZONE_COND_EMPTY;
else
@@ -300,6 +429,7 @@ static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no)
}
spin_lock_irqsave(&zone->wp_lock, flags);
zloop_lru_remove_open_zone(zlo, zone);
zone->cond = BLK_ZONE_COND_EMPTY;
zone->wp = zone->start;
clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
@@ -347,6 +477,7 @@ static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no)
}
spin_lock_irqsave(&zone->wp_lock, flags);
zloop_lru_remove_open_zone(zlo, zone);
zone->cond = BLK_ZONE_COND_FULL;
zone->wp = ULLONG_MAX;
clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
@@ -378,125 +509,22 @@ static void zloop_rw_complete(struct kiocb *iocb, long ret)
zloop_put_cmd(cmd);
}
static void zloop_rw(struct zloop_cmd *cmd)
static int zloop_do_rw(struct zloop_cmd *cmd)
{
struct request *rq = blk_mq_rq_from_pdu(cmd);
int rw = req_op(rq) == REQ_OP_READ ? ITER_DEST : ITER_SOURCE;
unsigned int nr_bvec = blk_rq_nr_bvec(rq);
struct zloop_device *zlo = rq->q->queuedata;
unsigned int zone_no = rq_zone_no(rq);
sector_t sector = blk_rq_pos(rq);
sector_t nr_sectors = blk_rq_sectors(rq);
bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND;
bool is_write = req_op(rq) == REQ_OP_WRITE || is_append;
int rw = is_write ? ITER_SOURCE : ITER_DEST;
struct zloop_zone *zone = &zlo->zones[rq_zone_no(rq)];
struct req_iterator rq_iter;
struct zloop_zone *zone;
struct iov_iter iter;
struct bio_vec tmp;
unsigned long flags;
sector_t zone_end;
unsigned int nr_bvec;
int ret;
atomic_set(&cmd->ref, 2);
cmd->sector = sector;
cmd->nr_sectors = nr_sectors;
cmd->ret = 0;
if (WARN_ON_ONCE(is_append && !zlo->zone_append)) {
ret = -EIO;
goto out;
}
/* We should never get an I/O beyond the device capacity. */
if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) {
ret = -EIO;
goto out;
}
zone = &zlo->zones[zone_no];
zone_end = zone->start + zlo->zone_capacity;
/*
* The block layer should never send requests that are not fully
* contained within the zone.
*/
if (WARN_ON_ONCE(sector + nr_sectors > zone->start + zlo->zone_size)) {
ret = -EIO;
goto out;
}
if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
mutex_lock(&zone->lock);
ret = zloop_update_seq_zone(zlo, zone_no);
mutex_unlock(&zone->lock);
if (ret)
goto out;
}
if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) {
mutex_lock(&zone->lock);
spin_lock_irqsave(&zone->wp_lock, flags);
/*
* Zone append operations always go at the current write
* pointer, but regular write operations must already be
* aligned to the write pointer when submitted.
*/
if (is_append) {
/*
* If ordered zone append is in use, we already checked
* and set the target sector in zloop_queue_rq().
*/
if (!zlo->ordered_zone_append) {
if (zone->cond == BLK_ZONE_COND_FULL ||
zone->wp + nr_sectors > zone_end) {
spin_unlock_irqrestore(&zone->wp_lock,
flags);
ret = -EIO;
goto unlock;
}
sector = zone->wp;
}
cmd->sector = sector;
} else if (sector != zone->wp) {
spin_unlock_irqrestore(&zone->wp_lock, flags);
pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n",
zone_no, sector, zone->wp);
ret = -EIO;
goto unlock;
}
/* Implicitly open the target zone. */
if (zone->cond == BLK_ZONE_COND_CLOSED ||
zone->cond == BLK_ZONE_COND_EMPTY)
zone->cond = BLK_ZONE_COND_IMP_OPEN;
/*
* Advance the write pointer, unless ordered zone append is in
* use. If the write fails, the write pointer position will be
* corrected when the next I/O starts execution.
*/
if (!is_append || !zlo->ordered_zone_append) {
zone->wp += nr_sectors;
if (zone->wp == zone_end) {
zone->cond = BLK_ZONE_COND_FULL;
zone->wp = ULLONG_MAX;
}
}
spin_unlock_irqrestore(&zone->wp_lock, flags);
}
nr_bvec = blk_rq_nr_bvec(rq);
if (rq->bio != rq->biotail) {
struct bio_vec *bvec;
struct bio_vec tmp, *bvec;
cmd->bvec = kmalloc_objs(*cmd->bvec, nr_bvec, GFP_NOIO);
if (!cmd->bvec) {
ret = -EIO;
goto unlock;
}
if (!cmd->bvec)
return -EIO;
/*
* The bios of the request may be started from the middle of
@@ -522,7 +550,7 @@ static void zloop_rw(struct zloop_cmd *cmd)
iter.iov_offset = rq->bio->bi_iter.bi_bvec_done;
}
cmd->iocb.ki_pos = (sector - zone->start) << SECTOR_SHIFT;
cmd->iocb.ki_pos = (cmd->sector - zone->start) << SECTOR_SHIFT;
cmd->iocb.ki_filp = zone->file;
cmd->iocb.ki_complete = zloop_rw_complete;
if (!zlo->buffered_io)
@@ -530,18 +558,166 @@ static void zloop_rw(struct zloop_cmd *cmd)
cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
if (rw == ITER_SOURCE)
ret = zone->file->f_op->write_iter(&cmd->iocb, &iter);
else
ret = zone->file->f_op->read_iter(&cmd->iocb, &iter);
unlock:
if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write)
return zone->file->f_op->write_iter(&cmd->iocb, &iter);
return zone->file->f_op->read_iter(&cmd->iocb, &iter);
}
/*
 * Prepare a write or zone append command targeting a sequential zone:
 * validate the target sector against the zone write pointer, implicitly
 * open the zone, and advance the write pointer.
 *
 * Called with the zone mutex held (see zloop_rw()); takes zone->wp_lock
 * internally to serialize write pointer updates.
 * Returns 0 on success or -EIO for an invalid write.
 */
static int zloop_seq_write_prep(struct zloop_cmd *cmd)
{
	struct request *rq = blk_mq_rq_from_pdu(cmd);
	struct zloop_device *zlo = rq->q->queuedata;
	unsigned int zone_no = rq_zone_no(rq);
	sector_t nr_sectors = blk_rq_sectors(rq);
	bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND;
	struct zloop_zone *zone = &zlo->zones[zone_no];
	sector_t zone_end = zone->start + zlo->zone_capacity;
	unsigned long flags;
	int ret = 0;

	spin_lock_irqsave(&zone->wp_lock, flags);

	/*
	 * Zone append operations always go at the current write pointer, but
	 * regular write operations must already be aligned to the write pointer
	 * when submitted.
	 */
	if (is_append) {
		/*
		 * If ordered zone append is in use, we already checked and set
		 * the target sector in zloop_queue_rq().
		 */
		if (!zlo->ordered_zone_append) {
			/* Reject appends to a full zone or past its capacity. */
			if (zone->cond == BLK_ZONE_COND_FULL ||
			    zone->wp + nr_sectors > zone_end) {
				ret = -EIO;
				goto out_unlock;
			}
			cmd->sector = zone->wp;
		}
	} else {
		if (cmd->sector != zone->wp) {
			pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n",
			       zone_no, cmd->sector, zone->wp);
			ret = -EIO;
			goto out_unlock;
		}
	}

	/* Implicitly open the target zone. */
	if (!zloop_do_open_zone(zlo, zone, false)) {
		ret = -EIO;
		goto out_unlock;
	}

	/*
	 * Advance the write pointer, unless ordered zone append is in use. If
	 * the write fails, the write pointer position will be corrected when
	 * the next I/O starts execution.
	 */
	if (!is_append || !zlo->ordered_zone_append) {
		zone->wp += nr_sectors;
		if (zone->wp == zone_end) {
			/* Zone is now full: drop it from the open-zone LRU. */
			zloop_lru_remove_open_zone(zlo, zone);
			zone->cond = BLK_ZONE_COND_FULL;
			zone->wp = ULLONG_MAX;
		}
	}

out_unlock:
	spin_unlock_irqrestore(&zone->wp_lock, flags);

	return ret;
}
/*
 * Execute a read, write or zone append request against the backing zone file.
 *
 * The command reference count is set to 2: one reference for this submission
 * path (dropped by zloop_put_cmd() below) and one for the AIO completion
 * (dropped through zloop_rw_complete()).
 */
static void zloop_rw(struct zloop_cmd *cmd)
{
	struct request *rq = blk_mq_rq_from_pdu(cmd);
	struct zloop_device *zlo = rq->q->queuedata;
	unsigned int zone_no = rq_zone_no(rq);
	sector_t nr_sectors = blk_rq_sectors(rq);
	bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND;
	bool is_write = req_op(rq) == REQ_OP_WRITE || is_append;
	struct zloop_zone *zone;
	int ret = -EIO;

	atomic_set(&cmd->ref, 2);
	cmd->sector = blk_rq_pos(rq);
	cmd->nr_sectors = nr_sectors;
	cmd->ret = 0;

	/* Zone append must not be seen if the device did not advertise it. */
	if (WARN_ON_ONCE(is_append && !zlo->zone_append))
		goto out;

	/* We should never get an I/O beyond the device capacity. */
	if (WARN_ON_ONCE(zone_no >= zlo->nr_zones))
		goto out;

	zone = &zlo->zones[zone_no];

	/*
	 * The block layer should never send requests that are not fully
	 * contained within the zone.
	 */
	if (WARN_ON_ONCE(cmd->sector + nr_sectors >
			 zone->start + zlo->zone_size))
		goto out;

	/* After a write error, re-sync zone state from the backing file. */
	if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
		mutex_lock(&zone->lock);
		ret = zloop_update_seq_zone(zlo, zone_no);
		mutex_unlock(&zone->lock);
		if (ret)
			goto out;
	}

	/*
	 * Writes to sequential zones must be prepared (write pointer check and
	 * advance) and issued under the zone mutex to serialize them.
	 */
	if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) {
		mutex_lock(&zone->lock);
		ret = zloop_seq_write_prep(cmd);
		if (!ret)
			ret = zloop_do_rw(cmd);
		mutex_unlock(&zone->lock);
	} else {
		ret = zloop_do_rw(cmd);
	}

out:
	/* -EIOCBQUEUED means the AIO is in flight; completion comes later. */
	if (ret != -EIOCBQUEUED)
		zloop_rw_complete(&cmd->iocb, ret);
	zloop_put_cmd(cmd);
}
/*
 * A zone is active when it is implicitly open, explicitly open or closed,
 * i.e. when it has a valid write pointer that must be preserved.
 */
static inline bool zloop_zone_is_active(struct zloop_zone *zone)
{
	return zone->cond == BLK_ZONE_COND_EXP_OPEN ||
	       zone->cond == BLK_ZONE_COND_IMP_OPEN ||
	       zone->cond == BLK_ZONE_COND_CLOSED;
}
/*
 * Persist the current write pointer of every active zone as an extended
 * attribute on the zone file, so that it can be recovered after the
 * volatile write cache is discarded.
 * Returns 0 on success or the first vfs_setxattr() error.
 */
static int zloop_record_safe_wps(struct zloop_device *zlo)
{
	unsigned int zone_no;

	for (zone_no = 0; zone_no < zlo->nr_zones; zone_no++) {
		struct zloop_zone *zone = &zlo->zones[zone_no];
		struct file *zfile = zone->file;
		int err;

		/* Only active zones have a write pointer worth saving. */
		if (!zloop_zone_is_active(zone))
			continue;

		err = vfs_setxattr(file_mnt_idmap(zfile), file_dentry(zfile),
				   "user.zloop.wp", &zone->wp,
				   sizeof(zone->wp), 0);
		if (err) {
			pr_err("%pg: failed to record write pointer (%d)\n",
			       zlo->disk->part0, err);
			return err;
		}
	}

	return 0;
}
/*
* Sync the entire FS containing the zone files instead of walking all files.
*/
@@ -550,6 +726,12 @@ static int zloop_flush(struct zloop_device *zlo)
struct super_block *sb = file_inode(zlo->data_dir)->i_sb;
int ret;
if (zlo->discard_write_cache) {
ret = zloop_record_safe_wps(zlo);
if (ret)
return ret;
}
down_read(&sb->s_umount);
ret = sync_filesystem(sb);
up_read(&sb->s_umount);
@@ -692,6 +874,7 @@ static bool zloop_set_zone_append_sector(struct request *rq)
rq->__sector = zone->wp;
zone->wp += blk_rq_sectors(rq);
if (zone->wp >= zone_end) {
zloop_lru_remove_open_zone(zlo, zone);
zone->cond = BLK_ZONE_COND_FULL;
zone->wp = ULLONG_MAX;
}
@@ -889,6 +1072,7 @@ static int zloop_init_zone(struct zloop_device *zlo, struct zloop_options *opts,
int ret;
mutex_init(&zone->lock);
INIT_LIST_HEAD(&zone->open_zone_entry);
spin_lock_init(&zone->wp_lock);
zone->start = (sector_t)zone_no << zlo->zone_shift;
@@ -1009,12 +1193,20 @@ static int zloop_ctl_add(struct zloop_options *opts)
goto out;
}
if (opts->max_open_zones > nr_zones - opts->nr_conv_zones) {
pr_err("Invalid maximum number of open zones %u\n",
opts->max_open_zones);
goto out;
}
zlo = kvzalloc_flex(*zlo, zones, nr_zones);
if (!zlo) {
ret = -ENOMEM;
goto out;
}
WRITE_ONCE(zlo->state, Zlo_creating);
spin_lock_init(&zlo->open_zones_lock);
INIT_LIST_HEAD(&zlo->open_zones_lru_list);
ret = mutex_lock_killable(&zloop_ctl_mutex);
if (ret)
@@ -1042,10 +1234,12 @@ static int zloop_ctl_add(struct zloop_options *opts)
zlo->zone_capacity = zlo->zone_size;
zlo->nr_zones = nr_zones;
zlo->nr_conv_zones = opts->nr_conv_zones;
zlo->max_open_zones = opts->max_open_zones;
zlo->buffered_io = opts->buffered_io;
zlo->zone_append = opts->zone_append;
if (zlo->zone_append)
zlo->ordered_zone_append = opts->ordered_zone_append;
zlo->discard_write_cache = opts->discard_write_cache;
zlo->workqueue = alloc_workqueue("zloop%d", WQ_UNBOUND | WQ_FREEZABLE,
opts->nr_queues * opts->queue_depth, zlo->id);
@@ -1088,6 +1282,7 @@ static int zloop_ctl_add(struct zloop_options *opts)
lim.logical_block_size = zlo->block_size;
if (zlo->zone_append)
lim.max_hw_zone_append_sectors = lim.max_hw_sectors;
lim.max_open_zones = zlo->max_open_zones;
zlo->tag_set.ops = &zloop_mq_ops;
zlo->tag_set.nr_hw_queues = opts->nr_queues;
@@ -1168,6 +1363,49 @@ out:
return ret;
}
/*
 * Truncate a zone backing file to @pos bytes.
 *
 * Best effort: a failure cannot be propagated to the caller (the device is
 * being torn down), but it must not be silently ignored either, so log it.
 * Fix: the original discarded the notify_change() return value.
 */
static void zloop_truncate(struct file *file, loff_t pos)
{
	struct mnt_idmap *idmap = file_mnt_idmap(file);
	struct dentry *dentry = file_dentry(file);
	struct iattr newattrs;
	int ret;

	newattrs.ia_size = pos;
	newattrs.ia_valid = ATTR_SIZE;

	inode_lock(dentry->d_inode);
	ret = notify_change(idmap, dentry, &newattrs, NULL);
	inode_unlock(dentry->d_inode);

	if (ret)
		pr_warn("zloop: failed to truncate zone file to %lld (%d)\n",
			(long long)pos, ret);
}
static void zloop_forget_cache(struct zloop_device *zlo)
{
unsigned int i;
int ret;
pr_info("%pg: discarding volatile write cache\n", zlo->disk->part0);
for (i = 0; i < zlo->nr_zones; i++) {
struct zloop_zone *zone = &zlo->zones[i];
struct file *file = zone->file;
sector_t old_wp;
if (!zloop_zone_is_active(zone))
continue;
ret = vfs_getxattr(file_mnt_idmap(file), file_dentry(file),
"user.zloop.wp", &old_wp, sizeof(old_wp));
if (ret == -ENODATA) {
old_wp = 0;
} else if (ret != sizeof(old_wp)) {
pr_err("%pg: failed to retrieve write pointer (%d)\n",
zlo->disk->part0, ret);
continue;
}
if (old_wp < zone->wp)
zloop_truncate(file, old_wp);
}
}
static int zloop_ctl_remove(struct zloop_options *opts)
{
struct zloop_device *zlo;
@@ -1202,6 +1440,10 @@ static int zloop_ctl_remove(struct zloop_options *opts)
return ret;
del_gendisk(zlo->disk);
if (zlo->discard_write_cache)
zloop_forget_cache(zlo);
put_disk(zlo->disk);
pr_info("Removed device %d\n", opts->id);
@@ -1224,6 +1466,7 @@ static int zloop_parse_options(struct zloop_options *opts, const char *buf)
opts->capacity = ZLOOP_DEF_ZONE_SIZE * ZLOOP_DEF_NR_ZONES;
opts->zone_size = ZLOOP_DEF_ZONE_SIZE;
opts->nr_conv_zones = ZLOOP_DEF_NR_CONV_ZONES;
opts->max_open_zones = ZLOOP_DEF_MAX_OPEN_ZONES;
opts->nr_queues = ZLOOP_DEF_NR_QUEUES;
opts->queue_depth = ZLOOP_DEF_QUEUE_DEPTH;
opts->buffered_io = ZLOOP_DEF_BUFFERED_IO;
@@ -1302,6 +1545,13 @@ static int zloop_parse_options(struct zloop_options *opts, const char *buf)
}
opts->nr_conv_zones = token;
break;
case ZLOOP_OPT_MAX_OPEN_ZONES:
if (match_uint(args, &token)) {
ret = -EINVAL;
goto out;
}
opts->max_open_zones = token;
break;
case ZLOOP_OPT_BASE_DIR:
p = match_strdup(args);
if (!p) {
@@ -1353,6 +1603,9 @@ static int zloop_parse_options(struct zloop_options *opts, const char *buf)
case ZLOOP_OPT_ORDERED_ZONE_APPEND:
opts->ordered_zone_append = true;
break;
case ZLOOP_OPT_DISCARD_WRITE_CACHE:
opts->discard_write_cache = true;
break;
case ZLOOP_OPT_ERR:
default:
pr_warn("unknown parameter or missing value '%s'\n", p);

View File

@@ -1373,6 +1373,14 @@ static CLOSURE_CALLBACK(cached_dev_free)
mutex_unlock(&bch_register_lock);
/*
* Wait for any pending sb_write to complete before free.
* The sb_bio is embedded in struct cached_dev, so we must
* ensure no I/O is in progress.
*/
down(&dc->sb_write_mutex);
up(&dc->sb_write_mutex);
if (dc->sb_disk)
folio_put(virt_to_folio(dc->sb_disk));

View File

@@ -208,6 +208,20 @@ enum llbitmap_state {
BitNeedSync,
/* data is synchronizing */
BitSyncing,
/*
* Proactive sync requested for unwritten region (raid456 only).
* Triggered via sysfs when user wants to pre-build XOR parity
* for regions that have never been written.
*/
BitNeedSyncUnwritten,
/* Proactive sync in progress for unwritten region */
BitSyncingUnwritten,
/*
* XOR parity has been pre-built for a region that has never had
* user data written. When user writes to this region, it transitions
* to BitDirty.
*/
BitCleanUnwritten,
BitStateCount,
BitNone = 0xff,
};
@@ -232,6 +246,12 @@ enum llbitmap_action {
* BitNeedSync.
*/
BitmapActionStale,
/*
* Proactive sync trigger for raid456 - builds XOR parity for
* Unwritten regions without requiring user data write first.
*/
BitmapActionProactiveSync,
BitmapActionClearUnwritten,
BitmapActionCount,
/* Init state is BitUnwritten */
BitmapActionInit,
@@ -304,6 +324,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
[BitmapActionDaemon] = BitNone,
[BitmapActionDiscard] = BitNone,
[BitmapActionStale] = BitNone,
[BitmapActionProactiveSync] = BitNeedSyncUnwritten,
[BitmapActionClearUnwritten] = BitNone,
},
[BitClean] = {
[BitmapActionStartwrite] = BitDirty,
@@ -314,6 +336,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
[BitmapActionDaemon] = BitNone,
[BitmapActionDiscard] = BitUnwritten,
[BitmapActionStale] = BitNeedSync,
[BitmapActionProactiveSync] = BitNone,
[BitmapActionClearUnwritten] = BitNone,
},
[BitDirty] = {
[BitmapActionStartwrite] = BitNone,
@@ -324,6 +348,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
[BitmapActionDaemon] = BitClean,
[BitmapActionDiscard] = BitUnwritten,
[BitmapActionStale] = BitNeedSync,
[BitmapActionProactiveSync] = BitNone,
[BitmapActionClearUnwritten] = BitNone,
},
[BitNeedSync] = {
[BitmapActionStartwrite] = BitNone,
@@ -334,6 +360,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
[BitmapActionDaemon] = BitNone,
[BitmapActionDiscard] = BitUnwritten,
[BitmapActionStale] = BitNone,
[BitmapActionProactiveSync] = BitNone,
[BitmapActionClearUnwritten] = BitNone,
},
[BitSyncing] = {
[BitmapActionStartwrite] = BitNone,
@@ -344,6 +372,44 @@ static char state_machine[BitStateCount][BitmapActionCount] = {
[BitmapActionDaemon] = BitNone,
[BitmapActionDiscard] = BitUnwritten,
[BitmapActionStale] = BitNeedSync,
[BitmapActionProactiveSync] = BitNone,
[BitmapActionClearUnwritten] = BitNone,
},
[BitNeedSyncUnwritten] = {
[BitmapActionStartwrite] = BitNeedSync,
[BitmapActionStartsync] = BitSyncingUnwritten,
[BitmapActionEndsync] = BitNone,
[BitmapActionAbortsync] = BitUnwritten,
[BitmapActionReload] = BitUnwritten,
[BitmapActionDaemon] = BitNone,
[BitmapActionDiscard] = BitUnwritten,
[BitmapActionStale] = BitUnwritten,
[BitmapActionProactiveSync] = BitNone,
[BitmapActionClearUnwritten] = BitUnwritten,
},
[BitSyncingUnwritten] = {
[BitmapActionStartwrite] = BitSyncing,
[BitmapActionStartsync] = BitSyncingUnwritten,
[BitmapActionEndsync] = BitCleanUnwritten,
[BitmapActionAbortsync] = BitUnwritten,
[BitmapActionReload] = BitUnwritten,
[BitmapActionDaemon] = BitNone,
[BitmapActionDiscard] = BitUnwritten,
[BitmapActionStale] = BitUnwritten,
[BitmapActionProactiveSync] = BitNone,
[BitmapActionClearUnwritten] = BitUnwritten,
},
[BitCleanUnwritten] = {
[BitmapActionStartwrite] = BitDirty,
[BitmapActionStartsync] = BitNone,
[BitmapActionEndsync] = BitNone,
[BitmapActionAbortsync] = BitNone,
[BitmapActionReload] = BitNone,
[BitmapActionDaemon] = BitNone,
[BitmapActionDiscard] = BitUnwritten,
[BitmapActionStale] = BitUnwritten,
[BitmapActionProactiveSync] = BitNone,
[BitmapActionClearUnwritten] = BitUnwritten,
},
};
@@ -376,6 +442,7 @@ static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap,
pctl->state[pos] = level_456 ? BitNeedSync : BitDirty;
break;
case BitClean:
case BitCleanUnwritten:
pctl->state[pos] = BitDirty;
break;
}
@@ -383,7 +450,7 @@ static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap,
}
static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx,
int offset)
int offset, bool infect)
{
struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx];
unsigned int io_size = llbitmap->io_size;
@@ -398,7 +465,7 @@ static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx,
* resync all the dirty bits, hence skip infect new dirty bits to
* prevent resync unnecessary data.
*/
if (llbitmap->mddev->degraded) {
if (llbitmap->mddev->degraded || !infect) {
set_bit(block, pctl->dirty);
return;
}
@@ -438,7 +505,9 @@ static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state,
llbitmap->pctl[idx]->state[bit] = state;
if (state == BitDirty || state == BitNeedSync)
llbitmap_set_page_dirty(llbitmap, idx, bit);
llbitmap_set_page_dirty(llbitmap, idx, bit, true);
else if (state == BitNeedSyncUnwritten)
llbitmap_set_page_dirty(llbitmap, idx, bit, false);
}
static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx)
@@ -459,7 +528,8 @@ static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx)
rdev_for_each(rdev, mddev) {
sector_t sector;
if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags) ||
!test_bit(In_sync, &rdev->flags))
continue;
sector = mddev->bitmap_info.offset +
@@ -584,13 +654,73 @@ static int llbitmap_cache_pages(struct llbitmap *llbitmap)
return 0;
}
/*
* Check if all underlying disks support write_zeroes with unmap.
*/
/*
 * Check if all underlying disks support write_zeroes with unmap.
 * Faulty members and spares (raid_disk < 0) are ignored.
 */
static bool llbitmap_all_disks_support_wzeroes_unmap(struct llbitmap *llbitmap)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, llbitmap->mddev) {
		if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
			continue;

		if (!bdev_write_zeroes_unmap_sectors(rdev->bdev))
			return false;
	}

	return true;
}
/*
* Issue write_zeroes to all underlying disks to zero their data regions.
* This ensures parity consistency for RAID-456 (0 XOR 0 = 0).
* Returns true if all disks were successfully zeroed.
*/
/*
 * Issue write_zeroes to all underlying disks to zero their data regions.
 * This ensures parity consistency for RAID-456 (0 XOR 0 = 0).
 * Returns true if all disks were successfully zeroed.
 */
static bool llbitmap_zero_all_disks(struct llbitmap *llbitmap)
{
	sector_t nr_sectors = llbitmap->mddev->dev_sectors;
	struct md_rdev *rdev;

	rdev_for_each(rdev, llbitmap->mddev) {
		int err;

		if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags))
			continue;

		err = blkdev_issue_zeroout(rdev->bdev, rdev->data_offset,
					   nr_sectors, GFP_KERNEL, 0);
		if (err) {
			pr_warn("md/llbitmap: failed to zero disk %pg: %d\n",
				rdev->bdev, err);
			return false;
		}
	}

	return true;
}
static void llbitmap_init_state(struct llbitmap *llbitmap)
{
struct mddev *mddev = llbitmap->mddev;
enum llbitmap_state state = BitUnwritten;
unsigned long i;
if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags))
if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags)) {
state = BitClean;
} else if (raid_is_456(mddev) &&
llbitmap_all_disks_support_wzeroes_unmap(llbitmap)) {
/*
* All disks support write_zeroes with unmap. Zero all disks
* to ensure parity consistency, then set BitCleanUnwritten
* to skip initial sync.
*/
if (llbitmap_zero_all_disks(llbitmap))
state = BitCleanUnwritten;
}
for (i = 0; i < llbitmap->chunks; i++)
llbitmap_write(llbitmap, state, i);
@@ -626,11 +756,10 @@ static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap,
goto write_bitmap;
}
if (c == BitNeedSync)
if (c == BitNeedSync || c == BitNeedSyncUnwritten)
need_resync = !mddev->degraded;
state = state_machine[c][action];
write_bitmap:
if (unlikely(mddev->degraded)) {
/* For degraded array, mark new data as need sync. */
@@ -657,8 +786,7 @@ write_bitmap:
}
llbitmap_write(llbitmap, state, start);
if (state == BitNeedSync)
if (state == BitNeedSync || state == BitNeedSyncUnwritten)
need_resync = !mddev->degraded;
else if (state == BitDirty &&
!timer_pending(&llbitmap->pending_timer))
@@ -1069,12 +1197,12 @@ static void llbitmap_start_write(struct mddev *mddev, sector_t offset,
int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite);
while (page_start <= page_end) {
llbitmap_raise_barrier(llbitmap, page_start);
page_start++;
}
llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite);
}
static void llbitmap_end_write(struct mddev *mddev, sector_t offset,
@@ -1101,12 +1229,12 @@ static void llbitmap_start_discard(struct mddev *mddev, sector_t offset,
int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT;
llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard);
while (page_start <= page_end) {
llbitmap_raise_barrier(llbitmap, page_start);
page_start++;
}
llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard);
}
static void llbitmap_end_discard(struct mddev *mddev, sector_t offset,
@@ -1228,7 +1356,7 @@ static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset)
unsigned long p = offset >> llbitmap->chunkshift;
enum llbitmap_state c = llbitmap_read(llbitmap, p);
return c == BitClean || c == BitDirty;
return c == BitClean || c == BitDirty || c == BitCleanUnwritten;
}
static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset)
@@ -1242,6 +1370,10 @@ static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset)
if (c == BitUnwritten)
return blocks;
/* Skip CleanUnwritten - no user data, will be reset after recovery */
if (c == BitCleanUnwritten)
return blocks;
/* For degraded array, don't skip */
if (mddev->degraded)
return 0;
@@ -1260,14 +1392,25 @@ static bool llbitmap_start_sync(struct mddev *mddev, sector_t offset,
{
struct llbitmap *llbitmap = mddev->bitmap;
unsigned long p = offset >> llbitmap->chunkshift;
enum llbitmap_state state;
/*
* Before recovery starts, convert CleanUnwritten to Unwritten.
* This ensures the new disk won't have stale parity data.
*/
if (offset == 0 && test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
!test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery))
llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
BitmapActionClearUnwritten);
/*
* Handle one bit at a time, this is much simpler. And it doesn't matter
* if md_do_sync() loop more times.
*/
*blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1));
return llbitmap_state_machine(llbitmap, p, p,
BitmapActionStartsync) == BitSyncing;
state = llbitmap_state_machine(llbitmap, p, p, BitmapActionStartsync);
return state == BitSyncing || state == BitSyncingUnwritten;
}
/* Something is wrong, sync_thread stop at @offset */
@@ -1473,9 +1616,15 @@ static ssize_t bits_show(struct mddev *mddev, char *page)
}
mutex_unlock(&mddev->bitmap_info.mutex);
return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n",
return sprintf(page,
"unwritten %d\nclean %d\ndirty %d\n"
"need sync %d\nsyncing %d\n"
"need sync unwritten %d\nsyncing unwritten %d\n"
"clean unwritten %d\n",
bits[BitUnwritten], bits[BitClean], bits[BitDirty],
bits[BitNeedSync], bits[BitSyncing]);
bits[BitNeedSync], bits[BitSyncing],
bits[BitNeedSyncUnwritten], bits[BitSyncingUnwritten],
bits[BitCleanUnwritten]);
}
static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits);
@@ -1548,11 +1697,39 @@ barrier_idle_store(struct mddev *mddev, const char *buf, size_t len)
static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle);
/*
 * Sysfs write handler: request proactive parity building for all Unwritten
 * regions of a RAID-456 array. Any write to the attribute triggers it.
 */
static ssize_t
proactive_sync_store(struct mddev *mddev, const char *buf, size_t len)
{
	struct llbitmap *llbitmap;
	ssize_t ret = len;

	/* Only for RAID-456 */
	if (!raid_is_456(mddev))
		return -EINVAL;

	mutex_lock(&mddev->bitmap_info.mutex);

	llbitmap = mddev->bitmap;
	if (!llbitmap || !llbitmap->pctl) {
		ret = -ENODEV;
	} else {
		/* Trigger proactive sync on all Unwritten regions */
		llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1,
				       BitmapActionProactiveSync);
	}

	mutex_unlock(&mddev->bitmap_info.mutex);

	return ret;
}
static struct md_sysfs_entry llbitmap_proactive_sync =
__ATTR(proactive_sync, 0200, NULL, proactive_sync_store);
static struct attribute *md_llbitmap_attrs[] = {
&llbitmap_bits.attr,
&llbitmap_metadata.attr,
&llbitmap_daemon_sleep.attr,
&llbitmap_barrier_idle.attr,
&llbitmap_proactive_sync.attr,
NULL
};

View File

@@ -84,7 +84,6 @@ static DEFINE_XARRAY(md_submodule);
static const struct kobj_type md_ktype;
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
/*
* This workqueue is used for sync_work to register new sync_thread, and for
@@ -98,7 +97,7 @@ static struct workqueue_struct *md_misc_wq;
static int remove_and_add_spares(struct mddev *mddev,
struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);
static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
static void export_rdev(struct md_rdev *rdev);
static void md_wakeup_thread_directly(struct md_thread __rcu **thread);
/*
@@ -188,7 +187,6 @@ static int rdev_init_serial(struct md_rdev *rdev)
spin_lock_init(&serial_tmp->serial_lock);
serial_tmp->serial_rb = RB_ROOT_CACHED;
init_waitqueue_head(&serial_tmp->serial_io_wait);
}
rdev->serial = serial;
@@ -489,6 +487,17 @@ int mddev_suspend(struct mddev *mddev, bool interruptible)
}
percpu_ref_kill(&mddev->active_io);
/*
* RAID456 IO can sleep in wait_for_reshape while still holding an
* active_io reference. If reshape is already interrupted or frozen,
* wake those waiters so they can abort and drop the reference instead
* of deadlocking suspend.
*/
if (mddev->pers && mddev->pers->prepare_suspend &&
reshape_interrupted(mddev))
mddev->pers->prepare_suspend(mddev);
if (interruptible)
err = wait_event_interruptible(mddev->sb_wait,
percpu_ref_is_zero(&mddev->active_io));
@@ -959,7 +968,7 @@ void mddev_unlock(struct mddev *mddev)
list_for_each_entry_safe(rdev, tmp, &delete, same_set) {
list_del_init(&rdev->same_set);
kobject_del(&rdev->kobj);
export_rdev(rdev, mddev);
export_rdev(rdev);
}
if (!legacy_async_del_gendisk) {
@@ -2632,7 +2641,7 @@ void md_autodetect_dev(dev_t dev);
/* just for claiming the bdev */
static struct md_rdev claim_rdev;
static void export_rdev(struct md_rdev *rdev, struct mddev *mddev)
static void export_rdev(struct md_rdev *rdev)
{
pr_debug("md: export_rdev(%pg)\n", rdev->bdev);
md_rdev_clear(rdev);
@@ -2788,7 +2797,9 @@ void md_update_sb(struct mddev *mddev, int force_change)
if (!md_is_rdwr(mddev)) {
if (force_change)
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
pr_err("%s: can't update sb for read-only array %s\n", __func__, mdname(mddev));
if (!mddev_is_dm(mddev))
pr_err_ratelimited("%s: can't update sb for read-only array %s\n",
__func__, mdname(mddev));
return;
}
@@ -4848,7 +4859,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
err = bind_rdev_to_array(rdev, mddev);
out:
if (err)
export_rdev(rdev, mddev);
export_rdev(rdev);
mddev_unlock_and_resume(mddev);
if (!err)
md_new_event();
@@ -6128,10 +6139,16 @@ md_attr_store(struct kobject *kobj, struct attribute *attr,
}
spin_unlock(&all_mddevs_lock);
rv = entry->store(mddev, page, length);
mddev_put(mddev);
/*
* For "array_state=clear", dropping the extra kobject reference from
* sysfs_break_active_protection() can trigger md kobject deletion.
* Restore active protection before mddev_put() so deletion happens
* after the sysfs write path fully unwinds.
*/
if (kn)
sysfs_unbreak_active_protection(kn);
mddev_put(mddev);
return rv;
}
@@ -6447,15 +6464,124 @@ static void md_safemode_timeout(struct timer_list *t)
static int start_dirty_degraded;
/*
* Read bitmap superblock and return the bitmap_id based on disk version.
* This is used as fallback when default bitmap version and on-disk version
* doesn't match, and mdadm is not the latest version to set bitmap_type.
*/
static enum md_submodule_id md_bitmap_get_id_from_sb(struct mddev *mddev)
{
	struct md_rdev *rdev;
	struct page *sb_page;
	bitmap_super_t *sb;
	enum md_submodule_id id = ID_BITMAP_NONE;
	sector_t sector;
	u32 version;

	/* No internal bitmap superblock to read from. */
	if (!mddev->bitmap_info.offset)
		return ID_BITMAP_NONE;

	sb_page = alloc_page(GFP_KERNEL);
	if (!sb_page) {
		pr_warn("md: %s: failed to allocate memory for bitmap\n",
			mdname(mddev));
		return ID_BITMAP_NONE;
	}

	sector = mddev->bitmap_info.offset;
	/* Try each usable in-sync member until one read succeeds. */
	rdev_for_each(rdev, mddev) {
		u32 iosize;

		if (!test_bit(In_sync, &rdev->flags) ||
		    test_bit(Faulty, &rdev->flags) ||
		    test_bit(Bitmap_sync, &rdev->flags))
			continue;

		/* Round the read up to the device logical block size. */
		iosize = roundup(sizeof(bitmap_super_t),
				 bdev_logical_block_size(rdev->bdev));
		if (sync_page_io(rdev, sector, iosize, sb_page, REQ_OP_READ,
				 true))
			goto read_ok;
	}
	pr_warn("md: %s: failed to read bitmap from any device\n",
		mdname(mddev));
	goto out;

read_ok:
	sb = kmap_local_page(sb_page);
	if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) {
		pr_warn("md: %s: invalid bitmap magic 0x%x\n",
			mdname(mddev), le32_to_cpu(sb->magic));
		goto out_unmap;
	}

	/* Map the on-disk superblock version to a bitmap submodule id. */
	version = le32_to_cpu(sb->version);
	switch (version) {
	case BITMAP_MAJOR_LO:
	case BITMAP_MAJOR_HI:
	case BITMAP_MAJOR_CLUSTERED:
		id = ID_BITMAP;
		break;
	case BITMAP_MAJOR_LOCKLESS:
		id = ID_LLBITMAP;
		break;
	default:
		pr_warn("md: %s: unknown bitmap version %u\n",
			mdname(mddev), version);
		break;
	}

out_unmap:
	kunmap_local(sb);
out:
	__free_page(sb_page);
	return id;
}
/*
 * Create the bitmap for @mddev using the configured bitmap_id.
 *
 * If creation with the default bitmap version fails, fall back to the
 * version found in the on-disk bitmap superblock (covers the case where
 * mdadm is too old to set bitmap_type).
 *
 * Fix: the original contained a stray early
 * "return mddev->bitmap_ops->create(mddev);" before the error check,
 * which made the entire version-mismatch fallback unreachable.
 *
 * Returns 0 on success or a negative errno.
 */
static int md_bitmap_create(struct mddev *mddev)
{
	enum md_submodule_id orig_id = mddev->bitmap_id;
	enum md_submodule_id sb_id;
	int err;

	if (mddev->bitmap_id == ID_BITMAP_NONE)
		return -EINVAL;

	if (!mddev_set_bitmap_ops(mddev))
		return -ENOENT;

	err = mddev->bitmap_ops->create(mddev);
	if (!err)
		return 0;

	/*
	 * Create failed, if default bitmap version and on-disk version
	 * doesn't match, and mdadm is not the latest version to set
	 * bitmap_type, set bitmap_ops based on the disk version.
	 */
	mddev_clear_bitmap_ops(mddev);
	sb_id = md_bitmap_get_id_from_sb(mddev);
	if (sb_id == ID_BITMAP_NONE || sb_id == orig_id)
		return err;

	pr_info("md: %s: bitmap version mismatch, switching from %d to %d\n",
		mdname(mddev), orig_id, sb_id);

	mddev->bitmap_id = sb_id;
	if (!mddev_set_bitmap_ops(mddev)) {
		/* Restore the original id so later retries are consistent. */
		mddev->bitmap_id = orig_id;
		return -ENOENT;
	}

	err = mddev->bitmap_ops->create(mddev);
	if (err) {
		mddev_clear_bitmap_ops(mddev);
		mddev->bitmap_id = orig_id;
	}

	return err;
}
static void md_bitmap_destroy(struct mddev *mddev)
@@ -7140,7 +7266,7 @@ static void autorun_devices(int part)
rdev_for_each_list(rdev, tmp, &candidates) {
list_del_init(&rdev->same_set);
if (bind_rdev_to_array(rdev, mddev))
export_rdev(rdev, mddev);
export_rdev(rdev);
}
autorun_array(mddev);
mddev_unlock_and_resume(mddev);
@@ -7150,7 +7276,7 @@ static void autorun_devices(int part)
*/
rdev_for_each_list(rdev, tmp, &candidates) {
list_del_init(&rdev->same_set);
export_rdev(rdev, mddev);
export_rdev(rdev);
}
mddev_put(mddev);
}
@@ -7338,13 +7464,13 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
pr_warn("md: %pg has different UUID to %pg\n",
rdev->bdev,
rdev0->bdev);
export_rdev(rdev, mddev);
export_rdev(rdev);
return -EINVAL;
}
}
err = bind_rdev_to_array(rdev, mddev);
if (err)
export_rdev(rdev, mddev);
export_rdev(rdev);
return err;
}
@@ -7387,7 +7513,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
/* This was a hot-add request, but events doesn't
* match, so reject it.
*/
export_rdev(rdev, mddev);
export_rdev(rdev);
return -EINVAL;
}
@@ -7413,7 +7539,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
}
}
if (has_journal || mddev->bitmap) {
export_rdev(rdev, mddev);
export_rdev(rdev);
return -EBUSY;
}
set_bit(Journal, &rdev->flags);
@@ -7428,7 +7554,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
/* --add initiated by this node */
err = mddev->cluster_ops->add_new_disk(mddev, rdev);
if (err) {
export_rdev(rdev, mddev);
export_rdev(rdev);
return err;
}
}
@@ -7438,7 +7564,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
err = bind_rdev_to_array(rdev, mddev);
if (err)
export_rdev(rdev, mddev);
export_rdev(rdev);
if (mddev_is_clustered(mddev)) {
if (info->state & (1 << MD_DISK_CANDIDATE)) {
@@ -7501,7 +7627,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info)
err = bind_rdev_to_array(rdev, mddev);
if (err) {
export_rdev(rdev, mddev);
export_rdev(rdev);
return err;
}
}
@@ -7613,7 +7739,7 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
return 0;
abort_export:
export_rdev(rdev, mddev);
export_rdev(rdev);
return err;
}
@@ -10503,10 +10629,6 @@ static int __init md_init(void)
goto err_bitmap;
ret = -ENOMEM;
md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!md_wq)
goto err_wq;
md_misc_wq = alloc_workqueue("md_misc", WQ_PERCPU, 0);
if (!md_misc_wq)
goto err_misc_wq;
@@ -10531,8 +10653,6 @@ err_mdp:
err_md:
destroy_workqueue(md_misc_wq);
err_misc_wq:
destroy_workqueue(md_wq);
err_wq:
md_llbitmap_exit();
err_bitmap:
md_bitmap_exit();
@@ -10841,7 +10961,6 @@ static __exit void md_exit(void)
spin_unlock(&all_mddevs_lock);
destroy_workqueue(md_misc_wq);
destroy_workqueue(md_wq);
md_bitmap_exit();
}

View File

@@ -126,7 +126,6 @@ enum sync_action {
struct serial_in_rdev {
struct rb_root_cached serial_rb;
spinlock_t serial_lock;
wait_queue_head_t serial_io_wait;
};
/*
@@ -381,7 +380,11 @@ struct serial_info {
struct rb_node node;
sector_t start; /* start sector of rb node */
sector_t last; /* end sector of rb node */
sector_t wnode_start; /* address of waiting nodes on the same list */
sector_t _subtree_last; /* highest sector in subtree of rb node */
struct list_head list_node;
struct list_head waiters;
struct completion ready;
};
/*

View File

@@ -143,13 +143,13 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
}
err = -ENOMEM;
conf->strip_zone = kzalloc_objs(struct strip_zone, conf->nr_strip_zones);
conf->strip_zone = kvzalloc_objs(struct strip_zone, conf->nr_strip_zones);
if (!conf->strip_zone)
goto abort;
conf->devlist = kzalloc(array3_size(sizeof(struct md_rdev *),
conf->nr_strip_zones,
mddev->raid_disks),
GFP_KERNEL);
conf->devlist = kvzalloc(array3_size(sizeof(struct md_rdev *),
conf->nr_strip_zones,
mddev->raid_disks),
GFP_KERNEL);
if (!conf->devlist)
goto abort;
@@ -291,8 +291,8 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
return 0;
abort:
kfree(conf->strip_zone);
kfree(conf->devlist);
kvfree(conf->strip_zone);
kvfree(conf->devlist);
kfree(conf);
*private_conf = ERR_PTR(err);
return err;
@@ -373,8 +373,8 @@ static void raid0_free(struct mddev *mddev, void *priv)
{
struct r0conf *conf = priv;
kfree(conf->strip_zone);
kfree(conf->devlist);
kvfree(conf->strip_zone);
kvfree(conf->devlist);
kfree(conf);
}

View File

@@ -57,21 +57,29 @@ INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last,
START, LAST, static inline, raid1_rb);
static int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio,
struct serial_info *si, int idx)
struct serial_info *si)
{
unsigned long flags;
int ret = 0;
sector_t lo = r1_bio->sector;
sector_t hi = lo + r1_bio->sectors;
sector_t hi = lo + r1_bio->sectors - 1;
int idx = sector_to_idx(r1_bio->sector);
struct serial_in_rdev *serial = &rdev->serial[idx];
struct serial_info *head_si;
spin_lock_irqsave(&serial->serial_lock, flags);
/* collision happened */
if (raid1_rb_iter_first(&serial->serial_rb, lo, hi))
ret = -EBUSY;
else {
head_si = raid1_rb_iter_first(&serial->serial_rb, lo, hi);
if (head_si && head_si != si) {
si->start = lo;
si->last = hi;
si->wnode_start = head_si->wnode_start;
list_add_tail(&si->list_node, &head_si->waiters);
ret = -EBUSY;
} else if (!head_si) {
si->start = lo;
si->last = hi;
si->wnode_start = si->start;
raid1_rb_insert(si, &serial->serial_rb);
}
spin_unlock_irqrestore(&serial->serial_lock, flags);
@@ -83,19 +91,22 @@ static void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio)
{
struct mddev *mddev = rdev->mddev;
struct serial_info *si;
int idx = sector_to_idx(r1_bio->sector);
struct serial_in_rdev *serial = &rdev->serial[idx];
if (WARN_ON(!mddev->serial_info_pool))
return;
si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO);
wait_event(serial->serial_io_wait,
check_and_add_serial(rdev, r1_bio, si, idx) == 0);
INIT_LIST_HEAD(&si->waiters);
INIT_LIST_HEAD(&si->list_node);
init_completion(&si->ready);
while (check_and_add_serial(rdev, r1_bio, si)) {
wait_for_completion(&si->ready);
reinit_completion(&si->ready);
}
}
static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
{
struct serial_info *si;
struct serial_info *si, *iter_si;
unsigned long flags;
int found = 0;
struct mddev *mddev = rdev->mddev;
@@ -106,16 +117,28 @@ static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi)
for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi);
si; si = raid1_rb_iter_next(si, lo, hi)) {
if (si->start == lo && si->last == hi) {
raid1_rb_remove(si, &serial->serial_rb);
mempool_free(si, mddev->serial_info_pool);
found = 1;
break;
}
}
if (!found)
if (found) {
raid1_rb_remove(si, &serial->serial_rb);
if (!list_empty(&si->waiters)) {
list_for_each_entry(iter_si, &si->waiters, list_node) {
if (iter_si->wnode_start == si->wnode_start) {
list_del_init(&iter_si->list_node);
list_splice_init(&si->waiters, &iter_si->waiters);
raid1_rb_insert(iter_si, &serial->serial_rb);
complete(&iter_si->ready);
break;
}
}
}
mempool_free(si, mddev->serial_info_pool);
} else {
WARN(1, "The write IO is not recorded for serialization\n");
}
spin_unlock_irqrestore(&serial->serial_lock, flags);
wake_up(&serial->serial_io_wait);
}
/*
@@ -452,7 +475,7 @@ static void raid1_end_write_request(struct bio *bio)
int mirror = find_bio_disk(r1_bio, bio);
struct md_rdev *rdev = conf->mirrors[mirror].rdev;
sector_t lo = r1_bio->sector;
sector_t hi = r1_bio->sector + r1_bio->sectors;
sector_t hi = r1_bio->sector + r1_bio->sectors - 1;
bool ignore_error = !raid1_should_handle_error(bio) ||
(bio->bi_status && bio_op(bio) == REQ_OP_DISCARD);
@@ -1878,7 +1901,7 @@ static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev, int disk,
if (info->rdev)
return false;
if (bdev_nonrot(rdev->bdev)) {
if (!bdev_rot(rdev->bdev)) {
set_bit(Nonrot, &rdev->flags);
WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + 1);
}

View File

@@ -806,7 +806,7 @@ static struct md_rdev *read_balance(struct r10conf *conf,
if (!do_balance)
break;
nonrot = bdev_nonrot(rdev->bdev);
nonrot = !bdev_rot(rdev->bdev);
has_nonrot_disk |= nonrot;
pending = atomic_read(&rdev->nr_pending);
if (min_pending > pending && nonrot) {
@@ -1184,7 +1184,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
}
if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors)) {
raid_end_bio_io(r10_bio);
free_r10bio(r10_bio);
return;
}
@@ -1372,7 +1372,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
sectors = r10_bio->sectors;
if (!regular_request_wait(mddev, conf, bio, sectors)) {
raid_end_bio_io(r10_bio);
free_r10bio(r10_bio);
return;
}

View File

@@ -2002,15 +2002,27 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
return -ENOMEM;
while (mb_offset < le32_to_cpu(mb->meta_size)) {
sector_t payload_len;
payload = (void *)mb + mb_offset;
payload_flush = (void *)mb + mb_offset;
if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
payload_len = sizeof(struct r5l_payload_data_parity) +
(sector_t)sizeof(__le32) *
(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
if (mb_offset + payload_len > le32_to_cpu(mb->meta_size))
goto mismatch;
if (r5l_recovery_verify_data_checksum(
log, ctx, page, log_offset,
payload->checksum[0]) < 0)
goto mismatch;
} else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) {
payload_len = sizeof(struct r5l_payload_data_parity) +
(sector_t)sizeof(__le32) *
(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
if (mb_offset + payload_len > le32_to_cpu(mb->meta_size))
goto mismatch;
if (r5l_recovery_verify_data_checksum(
log, ctx, page, log_offset,
payload->checksum[0]) < 0)
@@ -2023,22 +2035,18 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log,
payload->checksum[1]) < 0)
goto mismatch;
} else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
/* nothing to do for R5LOG_PAYLOAD_FLUSH here */
payload_len = sizeof(struct r5l_payload_flush) +
(sector_t)le32_to_cpu(payload_flush->size);
if (mb_offset + payload_len > le32_to_cpu(mb->meta_size))
goto mismatch;
} else /* not R5LOG_PAYLOAD_DATA/PARITY/FLUSH */
goto mismatch;
if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
mb_offset += sizeof(struct r5l_payload_flush) +
le32_to_cpu(payload_flush->size);
} else {
/* DATA or PARITY payload */
if (le16_to_cpu(payload->header.type) != R5LOG_PAYLOAD_FLUSH) {
log_offset = r5l_ring_add(log, log_offset,
le32_to_cpu(payload->size));
mb_offset += sizeof(struct r5l_payload_data_parity) +
sizeof(__le32) *
(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
}
mb_offset += payload_len;
}
put_page(page);
@@ -2089,6 +2097,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
while (mb_offset < le32_to_cpu(mb->meta_size)) {
sector_t payload_len;
int dd;
payload = (void *)mb + mb_offset;
@@ -2097,6 +2106,12 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) {
int i, count;
payload_len = sizeof(struct r5l_payload_flush) +
(sector_t)le32_to_cpu(payload_flush->size);
if (mb_offset + payload_len >
le32_to_cpu(mb->meta_size))
return -EINVAL;
count = le32_to_cpu(payload_flush->size) / sizeof(__le64);
for (i = 0; i < count; ++i) {
stripe_sect = le64_to_cpu(payload_flush->flush_stripes[i]);
@@ -2110,12 +2125,17 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
}
}
mb_offset += sizeof(struct r5l_payload_flush) +
le32_to_cpu(payload_flush->size);
mb_offset += payload_len;
continue;
}
/* DATA or PARITY payload */
payload_len = sizeof(struct r5l_payload_data_parity) +
(sector_t)sizeof(__le32) *
(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
if (mb_offset + payload_len > le32_to_cpu(mb->meta_size))
return -EINVAL;
stripe_sect = (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) ?
raid5_compute_sector(
conf, le64_to_cpu(payload->location), 0, &dd,
@@ -2180,9 +2200,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
log_offset = r5l_ring_add(log, log_offset,
le32_to_cpu(payload->size));
mb_offset += sizeof(struct r5l_payload_data_parity) +
sizeof(__le32) *
(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
mb_offset += payload_len;
}
return 0;

View File

@@ -3916,6 +3916,8 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
break;
}
BUG_ON(other < 0);
if (test_bit(R5_LOCKED, &sh->dev[other].flags))
return 0;
pr_debug("Computing stripe %llu blocks %d,%d\n",
(unsigned long long)sh->sector,
disk_idx, other);
@@ -4594,20 +4596,6 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
async_tx_quiesce(&tx);
}
/*
* handle_stripe - do things to a stripe.
*
* We lock the stripe by setting STRIPE_ACTIVE and then examine the
* state of various bits to see what needs to be done.
* Possible results:
* return some read requests which now have data
* return some write requests which are safely on storage
* schedule a read on some buffers
* schedule a write of some buffers
* return confirmation of parity correctness
*
*/
static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
{
struct r5conf *conf = sh->raid_conf;
@@ -4901,6 +4889,18 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
set_bit(STRIPE_HANDLE, &head_sh->state);
}
/*
* handle_stripe - do things to a stripe.
*
* We lock the stripe by setting STRIPE_ACTIVE and then examine the
* state of various bits to see what needs to be done.
* Possible results:
* return some read requests which now have data
* return some write requests which are safely on storage
* schedule a read on some buffers
* schedule a write of some buffers
* return confirmation of parity correctness
*/
static void handle_stripe(struct stripe_head *sh)
{
struct stripe_head_state s;
@@ -6641,7 +6641,13 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
}
if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
raid5_release_stripe(sh);
int hash;
spin_lock_irq(&conf->device_lock);
hash = sh->hash_lock_index;
__release_stripe(conf, sh,
&conf->temp_inactive_list[hash]);
spin_unlock_irq(&conf->device_lock);
conf->retry_read_aligned = raid_bio;
conf->retry_read_offset = scnt;
return handled;
@@ -7541,7 +7547,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
rdev_for_each(rdev, mddev) {
if (test_bit(Journal, &rdev->flags))
continue;
if (bdev_nonrot(rdev->bdev)) {
if (!bdev_rot(rdev->bdev)) {
conf->batch_bio_dispatch = false;
break;
}
@@ -7780,6 +7786,7 @@ static int raid5_set_limits(struct mddev *mddev)
lim.logical_block_size = mddev->logical_block_size;
lim.io_min = mddev->chunk_sectors << 9;
lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded);
lim.chunk_sectors = lim.io_opt >> 9;
lim.features |= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE;
lim.discard_granularity = stripe;
lim.max_write_zeroes_sectors = 0;

View File

@@ -801,7 +801,6 @@ raid5_get_dev_page(struct stripe_head *sh, int disk_idx)
}
#endif
void md_raid5_kick_device(struct r5conf *conf);
int raid5_set_cache_size(struct mddev *mddev, int size);
sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous);
void raid5_release_stripe(struct stripe_head *sh);

View File

@@ -0,0 +1,6 @@
CONFIG_KUNIT=y
CONFIG_PCI=y
CONFIG_BLOCK=y
CONFIG_BLK_DEV_NVME=y
CONFIG_NVME_HOST_AUTH=y
CONFIG_NVME_AUTH_KUNIT_TEST=y

View File

@@ -7,9 +7,15 @@ config NVME_KEYRING
config NVME_AUTH
tristate
select CRYPTO
select CRYPTO_HMAC
select CRYPTO_SHA256
select CRYPTO_SHA512
select CRYPTO_DH
select CRYPTO_DH_RFC7919_GROUPS
select CRYPTO_HKDF
select CRYPTO_LIB_SHA256
select CRYPTO_LIB_SHA512
config NVME_AUTH_KUNIT_TEST
tristate "KUnit tests for NVMe authentication" if !KUNIT_ALL_TESTS
depends on KUNIT && NVME_AUTH
default KUNIT_ALL_TESTS
help
Enable KUnit tests for some of the common code for NVMe over Fabrics
In-Band Authentication.

View File

@@ -7,3 +7,5 @@ obj-$(CONFIG_NVME_KEYRING) += nvme-keyring.o
nvme-auth-y += auth.o
nvme-keyring-y += keyring.o
obj-$(CONFIG_NVME_AUTH_KUNIT_TEST) += tests/auth_kunit.o

View File

@@ -9,14 +9,11 @@
#include <linux/prandom.h>
#include <linux/scatterlist.h>
#include <linux/unaligned.h>
#include <crypto/hash.h>
#include <crypto/dh.h>
#include <crypto/hkdf.h>
#include <crypto/sha2.h>
#include <linux/nvme.h>
#include <linux/nvme-auth.h>
#define HKDF_MAX_HASHLEN 64
static u32 nvme_dhchap_seqnum;
static DEFINE_MUTEX(nvme_dhchap_mutex);
@@ -38,9 +35,9 @@ u32 nvme_auth_get_seqnum(void)
}
EXPORT_SYMBOL_GPL(nvme_auth_get_seqnum);
static struct nvme_auth_dhgroup_map {
const char name[16];
const char kpp[16];
static const struct nvme_auth_dhgroup_map {
char name[16];
char kpp[16];
} dhgroup_map[] = {
[NVME_AUTH_DHGROUP_NULL] = {
.name = "null", .kpp = "null" },
@@ -89,25 +86,21 @@ u8 nvme_auth_dhgroup_id(const char *dhgroup_name)
}
EXPORT_SYMBOL_GPL(nvme_auth_dhgroup_id);
static struct nvme_dhchap_hash_map {
static const struct nvme_dhchap_hash_map {
int len;
const char hmac[15];
const char digest[8];
char hmac[15];
} hash_map[] = {
[NVME_AUTH_HASH_SHA256] = {
.len = 32,
.hmac = "hmac(sha256)",
.digest = "sha256",
},
[NVME_AUTH_HASH_SHA384] = {
.len = 48,
.hmac = "hmac(sha384)",
.digest = "sha384",
},
[NVME_AUTH_HASH_SHA512] = {
.len = 64,
.hmac = "hmac(sha512)",
.digest = "sha512",
},
};
@@ -119,14 +112,6 @@ const char *nvme_auth_hmac_name(u8 hmac_id)
}
EXPORT_SYMBOL_GPL(nvme_auth_hmac_name);
const char *nvme_auth_digest_name(u8 hmac_id)
{
if (hmac_id >= ARRAY_SIZE(hash_map))
return NULL;
return hash_map[hmac_id].digest;
}
EXPORT_SYMBOL_GPL(nvme_auth_digest_name);
u8 nvme_auth_hmac_id(const char *hmac_name)
{
int i;
@@ -161,11 +146,10 @@ u32 nvme_auth_key_struct_size(u32 key_len)
}
EXPORT_SYMBOL_GPL(nvme_auth_key_struct_size);
struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
u8 key_hash)
struct nvme_dhchap_key *nvme_auth_extract_key(const char *secret, u8 key_hash)
{
struct nvme_dhchap_key *key;
unsigned char *p;
const char *p;
u32 crc;
int ret, key_len;
size_t allocated_len = strlen(secret);
@@ -183,14 +167,14 @@ struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
pr_debug("base64 key decoding error %d\n",
key_len);
ret = key_len;
goto out_free_secret;
goto out_free_key;
}
if (key_len != 36 && key_len != 52 &&
key_len != 68) {
pr_err("Invalid key len %d\n", key_len);
ret = -EINVAL;
goto out_free_secret;
goto out_free_key;
}
/* The last four bytes is the CRC in little-endian format */
@@ -205,12 +189,12 @@ struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret,
pr_err("key crc mismatch (key %08x, crc %08x)\n",
get_unaligned_le32(key->key + key_len), crc);
ret = -EKEYREJECTED;
goto out_free_secret;
goto out_free_key;
}
key->len = key_len;
key->hash = key_hash;
return key;
out_free_secret:
out_free_key:
nvme_auth_free_key(key);
return ERR_PTR(ret);
}
@@ -237,12 +221,106 @@ void nvme_auth_free_key(struct nvme_dhchap_key *key)
}
EXPORT_SYMBOL_GPL(nvme_auth_free_key);
struct nvme_dhchap_key *nvme_auth_transform_key(
struct nvme_dhchap_key *key, char *nqn)
/*
* Start computing an HMAC value, given the algorithm ID and raw key.
*
* The context should be zeroized at the end of its lifetime. The caller can do
* that implicitly by calling nvme_auth_hmac_final(), or explicitly (needed when
* a context is abandoned without finalizing it) by calling memzero_explicit().
*/
int nvme_auth_hmac_init(struct nvme_auth_hmac_ctx *hmac, u8 hmac_id,
const u8 *key, size_t key_len)
{
const char *hmac_name;
struct crypto_shash *key_tfm;
SHASH_DESC_ON_STACK(shash, key_tfm);
hmac->hmac_id = hmac_id;
switch (hmac_id) {
case NVME_AUTH_HASH_SHA256:
hmac_sha256_init_usingrawkey(&hmac->sha256, key, key_len);
return 0;
case NVME_AUTH_HASH_SHA384:
hmac_sha384_init_usingrawkey(&hmac->sha384, key, key_len);
return 0;
case NVME_AUTH_HASH_SHA512:
hmac_sha512_init_usingrawkey(&hmac->sha512, key, key_len);
return 0;
}
pr_warn("%s: invalid hash algorithm %d\n", __func__, hmac_id);
return -EINVAL;
}
EXPORT_SYMBOL_GPL(nvme_auth_hmac_init);
void nvme_auth_hmac_update(struct nvme_auth_hmac_ctx *hmac, const u8 *data,
size_t data_len)
{
switch (hmac->hmac_id) {
case NVME_AUTH_HASH_SHA256:
hmac_sha256_update(&hmac->sha256, data, data_len);
return;
case NVME_AUTH_HASH_SHA384:
hmac_sha384_update(&hmac->sha384, data, data_len);
return;
case NVME_AUTH_HASH_SHA512:
hmac_sha512_update(&hmac->sha512, data, data_len);
return;
}
/* Unreachable because nvme_auth_hmac_init() validated hmac_id */
WARN_ON_ONCE(1);
}
EXPORT_SYMBOL_GPL(nvme_auth_hmac_update);
/* Finish computing an HMAC value. Note that this zeroizes the HMAC context. */
void nvme_auth_hmac_final(struct nvme_auth_hmac_ctx *hmac, u8 *out)
{
switch (hmac->hmac_id) {
case NVME_AUTH_HASH_SHA256:
hmac_sha256_final(&hmac->sha256, out);
return;
case NVME_AUTH_HASH_SHA384:
hmac_sha384_final(&hmac->sha384, out);
return;
case NVME_AUTH_HASH_SHA512:
hmac_sha512_final(&hmac->sha512, out);
return;
}
/* Unreachable because nvme_auth_hmac_init() validated hmac_id */
WARN_ON_ONCE(1);
}
EXPORT_SYMBOL_GPL(nvme_auth_hmac_final);
static int nvme_auth_hmac(u8 hmac_id, const u8 *key, size_t key_len,
const u8 *data, size_t data_len, u8 *out)
{
struct nvme_auth_hmac_ctx hmac;
int ret;
ret = nvme_auth_hmac_init(&hmac, hmac_id, key, key_len);
if (ret == 0) {
nvme_auth_hmac_update(&hmac, data, data_len);
nvme_auth_hmac_final(&hmac, out);
}
return ret;
}
static int nvme_auth_hash(u8 hmac_id, const u8 *data, size_t data_len, u8 *out)
{
switch (hmac_id) {
case NVME_AUTH_HASH_SHA256:
sha256(data, data_len, out);
return 0;
case NVME_AUTH_HASH_SHA384:
sha384(data, data_len, out);
return 0;
case NVME_AUTH_HASH_SHA512:
sha512(data, data_len, out);
return 0;
}
pr_warn("%s: invalid hash algorithm %d\n", __func__, hmac_id);
return -EINVAL;
}
struct nvme_dhchap_key *nvme_auth_transform_key(
const struct nvme_dhchap_key *key, const char *nqn)
{
struct nvme_auth_hmac_ctx hmac;
struct nvme_dhchap_key *transformed_key;
int ret, key_len;
@@ -257,118 +335,33 @@ struct nvme_dhchap_key *nvme_auth_transform_key(
return ERR_PTR(-ENOMEM);
return transformed_key;
}
hmac_name = nvme_auth_hmac_name(key->hash);
if (!hmac_name) {
pr_warn("Invalid key hash id %d\n", key->hash);
return ERR_PTR(-EINVAL);
}
key_tfm = crypto_alloc_shash(hmac_name, 0, 0);
if (IS_ERR(key_tfm))
return ERR_CAST(key_tfm);
key_len = crypto_shash_digestsize(key_tfm);
ret = nvme_auth_hmac_init(&hmac, key->hash, key->key, key->len);
if (ret)
return ERR_PTR(ret);
key_len = nvme_auth_hmac_hash_len(key->hash);
transformed_key = nvme_auth_alloc_key(key_len, key->hash);
if (!transformed_key) {
ret = -ENOMEM;
goto out_free_key;
memzero_explicit(&hmac, sizeof(hmac));
return ERR_PTR(-ENOMEM);
}
shash->tfm = key_tfm;
ret = crypto_shash_setkey(key_tfm, key->key, key->len);
if (ret < 0)
goto out_free_transformed_key;
ret = crypto_shash_init(shash);
if (ret < 0)
goto out_free_transformed_key;
ret = crypto_shash_update(shash, nqn, strlen(nqn));
if (ret < 0)
goto out_free_transformed_key;
ret = crypto_shash_update(shash, "NVMe-over-Fabrics", 17);
if (ret < 0)
goto out_free_transformed_key;
ret = crypto_shash_final(shash, transformed_key->key);
if (ret < 0)
goto out_free_transformed_key;
crypto_free_shash(key_tfm);
nvme_auth_hmac_update(&hmac, nqn, strlen(nqn));
nvme_auth_hmac_update(&hmac, "NVMe-over-Fabrics", 17);
nvme_auth_hmac_final(&hmac, transformed_key->key);
return transformed_key;
out_free_transformed_key:
nvme_auth_free_key(transformed_key);
out_free_key:
crypto_free_shash(key_tfm);
return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(nvme_auth_transform_key);
static int nvme_auth_hash_skey(int hmac_id, u8 *skey, size_t skey_len, u8 *hkey)
int nvme_auth_augmented_challenge(u8 hmac_id, const u8 *skey, size_t skey_len,
const u8 *challenge, u8 *aug, size_t hlen)
{
const char *digest_name;
struct crypto_shash *tfm;
u8 hashed_key[NVME_AUTH_MAX_DIGEST_SIZE];
int ret;
digest_name = nvme_auth_digest_name(hmac_id);
if (!digest_name) {
pr_debug("%s: failed to get digest for %d\n", __func__,
hmac_id);
return -EINVAL;
}
tfm = crypto_alloc_shash(digest_name, 0, 0);
if (IS_ERR(tfm))
return -ENOMEM;
ret = crypto_shash_tfm_digest(tfm, skey, skey_len, hkey);
if (ret < 0)
pr_debug("%s: Failed to hash digest len %zu\n", __func__,
skey_len);
crypto_free_shash(tfm);
return ret;
}
int nvme_auth_augmented_challenge(u8 hmac_id, u8 *skey, size_t skey_len,
u8 *challenge, u8 *aug, size_t hlen)
{
struct crypto_shash *tfm;
u8 *hashed_key;
const char *hmac_name;
int ret;
hashed_key = kmalloc(hlen, GFP_KERNEL);
if (!hashed_key)
return -ENOMEM;
ret = nvme_auth_hash_skey(hmac_id, skey,
skey_len, hashed_key);
if (ret < 0)
goto out_free_key;
hmac_name = nvme_auth_hmac_name(hmac_id);
if (!hmac_name) {
pr_warn("%s: invalid hash algorithm %d\n",
__func__, hmac_id);
ret = -EINVAL;
goto out_free_key;
}
tfm = crypto_alloc_shash(hmac_name, 0, 0);
if (IS_ERR(tfm)) {
ret = PTR_ERR(tfm);
goto out_free_key;
}
ret = crypto_shash_setkey(tfm, hashed_key, hlen);
ret = nvme_auth_hash(hmac_id, skey, skey_len, hashed_key);
if (ret)
goto out_free_hash;
ret = crypto_shash_tfm_digest(tfm, challenge, hlen, aug);
out_free_hash:
crypto_free_shash(tfm);
out_free_key:
kfree_sensitive(hashed_key);
return ret;
ret = nvme_auth_hmac(hmac_id, hashed_key, hlen, challenge, hlen, aug);
memzero_explicit(hashed_key, sizeof(hashed_key));
return ret;
}
EXPORT_SYMBOL_GPL(nvme_auth_augmented_challenge);
@@ -411,7 +404,7 @@ int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm,
EXPORT_SYMBOL_GPL(nvme_auth_gen_pubkey);
int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm,
u8 *ctrl_key, size_t ctrl_key_len,
const u8 *ctrl_key, size_t ctrl_key_len,
u8 *sess_key, size_t sess_key_len)
{
struct kpp_request *req;
@@ -438,7 +431,7 @@ int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm,
}
EXPORT_SYMBOL_GPL(nvme_auth_gen_shared_secret);
int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key)
int nvme_auth_parse_key(const char *secret, struct nvme_dhchap_key **ret_key)
{
struct nvme_dhchap_key *key;
u8 key_hash;
@@ -461,7 +454,7 @@ int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key)
*ret_key = key;
return 0;
}
EXPORT_SYMBOL_GPL(nvme_auth_generate_key);
EXPORT_SYMBOL_GPL(nvme_auth_parse_key);
/**
* nvme_auth_generate_psk - Generate a PSK for TLS
@@ -486,66 +479,32 @@ EXPORT_SYMBOL_GPL(nvme_auth_generate_key);
* Returns 0 on success with a valid generated PSK pointer in @ret_psk and
* the length of @ret_psk in @ret_len, or a negative error number otherwise.
*/
int nvme_auth_generate_psk(u8 hmac_id, u8 *skey, size_t skey_len,
u8 *c1, u8 *c2, size_t hash_len, u8 **ret_psk, size_t *ret_len)
int nvme_auth_generate_psk(u8 hmac_id, const u8 *skey, size_t skey_len,
const u8 *c1, const u8 *c2, size_t hash_len,
u8 **ret_psk, size_t *ret_len)
{
struct crypto_shash *tfm;
SHASH_DESC_ON_STACK(shash, tfm);
size_t psk_len = nvme_auth_hmac_hash_len(hmac_id);
struct nvme_auth_hmac_ctx hmac;
u8 *psk;
const char *hmac_name;
int ret, psk_len;
int ret;
if (!c1 || !c2)
return -EINVAL;
hmac_name = nvme_auth_hmac_name(hmac_id);
if (!hmac_name) {
pr_warn("%s: invalid hash algorithm %d\n",
__func__, hmac_id);
return -EINVAL;
}
tfm = crypto_alloc_shash(hmac_name, 0, 0);
if (IS_ERR(tfm))
return PTR_ERR(tfm);
psk_len = crypto_shash_digestsize(tfm);
ret = nvme_auth_hmac_init(&hmac, hmac_id, skey, skey_len);
if (ret)
return ret;
psk = kzalloc(psk_len, GFP_KERNEL);
if (!psk) {
ret = -ENOMEM;
goto out_free_tfm;
memzero_explicit(&hmac, sizeof(hmac));
return -ENOMEM;
}
shash->tfm = tfm;
ret = crypto_shash_setkey(tfm, skey, skey_len);
if (ret)
goto out_free_psk;
ret = crypto_shash_init(shash);
if (ret)
goto out_free_psk;
ret = crypto_shash_update(shash, c1, hash_len);
if (ret)
goto out_free_psk;
ret = crypto_shash_update(shash, c2, hash_len);
if (ret)
goto out_free_psk;
ret = crypto_shash_final(shash, psk);
if (!ret) {
*ret_psk = psk;
*ret_len = psk_len;
}
out_free_psk:
if (ret)
kfree_sensitive(psk);
out_free_tfm:
crypto_free_shash(tfm);
return ret;
nvme_auth_hmac_update(&hmac, c1, hash_len);
nvme_auth_hmac_update(&hmac, c2, hash_len);
nvme_auth_hmac_final(&hmac, psk);
*ret_psk = psk;
*ret_len = psk_len;
return 0;
}
EXPORT_SYMBOL_GPL(nvme_auth_generate_psk);
@@ -584,158 +543,70 @@ EXPORT_SYMBOL_GPL(nvme_auth_generate_psk);
* Returns 0 on success with a valid digest pointer in @ret_digest, or a
* negative error number on failure.
*/
int nvme_auth_generate_digest(u8 hmac_id, u8 *psk, size_t psk_len,
char *subsysnqn, char *hostnqn, u8 **ret_digest)
int nvme_auth_generate_digest(u8 hmac_id, const u8 *psk, size_t psk_len,
const char *subsysnqn, const char *hostnqn,
char **ret_digest)
{
struct crypto_shash *tfm;
SHASH_DESC_ON_STACK(shash, tfm);
u8 *digest, *enc;
const char *hmac_name;
size_t digest_len, hmac_len;
struct nvme_auth_hmac_ctx hmac;
u8 digest[NVME_AUTH_MAX_DIGEST_SIZE];
size_t hash_len = nvme_auth_hmac_hash_len(hmac_id);
char *enc;
size_t enc_len;
int ret;
if (WARN_ON(!subsysnqn || !hostnqn))
return -EINVAL;
hmac_name = nvme_auth_hmac_name(hmac_id);
if (!hmac_name) {
if (hash_len == 0) {
pr_warn("%s: invalid hash algorithm %d\n",
__func__, hmac_id);
return -EINVAL;
}
switch (nvme_auth_hmac_hash_len(hmac_id)) {
switch (hash_len) {
case 32:
hmac_len = 44;
enc_len = 44;
break;
case 48:
hmac_len = 64;
enc_len = 64;
break;
default:
pr_warn("%s: invalid hash algorithm '%s'\n",
__func__, hmac_name);
__func__, nvme_auth_hmac_name(hmac_id));
return -EINVAL;
}
enc = kzalloc(hmac_len + 1, GFP_KERNEL);
if (!enc)
return -ENOMEM;
tfm = crypto_alloc_shash(hmac_name, 0, 0);
if (IS_ERR(tfm)) {
ret = PTR_ERR(tfm);
goto out_free_enc;
}
digest_len = crypto_shash_digestsize(tfm);
digest = kzalloc(digest_len, GFP_KERNEL);
if (!digest) {
enc = kzalloc(enc_len + 1, GFP_KERNEL);
if (!enc) {
ret = -ENOMEM;
goto out_free_tfm;
goto out;
}
shash->tfm = tfm;
ret = crypto_shash_setkey(tfm, psk, psk_len);
ret = nvme_auth_hmac_init(&hmac, hmac_id, psk, psk_len);
if (ret)
goto out_free_digest;
goto out;
nvme_auth_hmac_update(&hmac, hostnqn, strlen(hostnqn));
nvme_auth_hmac_update(&hmac, " ", 1);
nvme_auth_hmac_update(&hmac, subsysnqn, strlen(subsysnqn));
nvme_auth_hmac_update(&hmac, " NVMe-over-Fabrics", 18);
nvme_auth_hmac_final(&hmac, digest);
ret = crypto_shash_init(shash);
if (ret)
goto out_free_digest;
ret = crypto_shash_update(shash, hostnqn, strlen(hostnqn));
if (ret)
goto out_free_digest;
ret = crypto_shash_update(shash, " ", 1);
if (ret)
goto out_free_digest;
ret = crypto_shash_update(shash, subsysnqn, strlen(subsysnqn));
if (ret)
goto out_free_digest;
ret = crypto_shash_update(shash, " NVMe-over-Fabrics", 18);
if (ret)
goto out_free_digest;
ret = crypto_shash_final(shash, digest);
if (ret)
goto out_free_digest;
ret = base64_encode(digest, digest_len, enc, true, BASE64_STD);
if (ret < hmac_len) {
ret = base64_encode(digest, hash_len, enc, true, BASE64_STD);
if (ret < enc_len) {
ret = -ENOKEY;
goto out_free_digest;
goto out;
}
*ret_digest = enc;
ret = 0;
out_free_digest:
kfree_sensitive(digest);
out_free_tfm:
crypto_free_shash(tfm);
out_free_enc:
out:
if (ret)
kfree_sensitive(enc);
memzero_explicit(digest, sizeof(digest));
return ret;
}
EXPORT_SYMBOL_GPL(nvme_auth_generate_digest);
/**
* hkdf_expand_label - HKDF-Expand-Label (RFC 8846 section 7.1)
* @hmac_tfm: hash context keyed with pseudorandom key
* @label: ASCII label without "tls13 " prefix
* @labellen: length of @label
* @context: context bytes
* @contextlen: length of @context
* @okm: output keying material
* @okmlen: length of @okm
*
* Build the TLS 1.3 HkdfLabel structure and invoke hkdf_expand().
*
* Returns 0 on success with output keying material stored in @okm,
* or a negative errno value otherwise.
*/
static int hkdf_expand_label(struct crypto_shash *hmac_tfm,
const u8 *label, unsigned int labellen,
const u8 *context, unsigned int contextlen,
u8 *okm, unsigned int okmlen)
{
int err;
u8 *info;
unsigned int infolen;
const char *tls13_prefix = "tls13 ";
unsigned int prefixlen = strlen(tls13_prefix);
if (WARN_ON(labellen > (255 - prefixlen)))
return -EINVAL;
if (WARN_ON(contextlen > 255))
return -EINVAL;
infolen = 2 + (1 + prefixlen + labellen) + (1 + contextlen);
info = kzalloc(infolen, GFP_KERNEL);
if (!info)
return -ENOMEM;
/* HkdfLabel.Length */
put_unaligned_be16(okmlen, info);
/* HkdfLabel.Label */
info[2] = prefixlen + labellen;
memcpy(info + 3, tls13_prefix, prefixlen);
memcpy(info + 3 + prefixlen, label, labellen);
/* HkdfLabel.Context */
info[3 + prefixlen + labellen] = contextlen;
memcpy(info + 4 + prefixlen + labellen, context, contextlen);
err = hkdf_expand(hmac_tfm, info, infolen, okm, okmlen);
kfree_sensitive(info);
return err;
}
/**
* nvme_auth_derive_tls_psk - Derive TLS PSK
* @hmac_id: Hash function identifier
@@ -763,82 +634,92 @@ static int hkdf_expand_label(struct crypto_shash *hmac_tfm,
* Returns 0 on success with a valid psk pointer in @ret_psk or a negative
* error number otherwise.
*/
int nvme_auth_derive_tls_psk(int hmac_id, u8 *psk, size_t psk_len,
u8 *psk_digest, u8 **ret_psk)
int nvme_auth_derive_tls_psk(int hmac_id, const u8 *psk, size_t psk_len,
const char *psk_digest, u8 **ret_psk)
{
struct crypto_shash *hmac_tfm;
const char *hmac_name;
const char *label = "nvme-tls-psk";
static const char default_salt[HKDF_MAX_HASHLEN];
size_t prk_len;
const char *ctx;
unsigned char *prk, *tls_key;
static const u8 default_salt[NVME_AUTH_MAX_DIGEST_SIZE];
static const char label[] = "tls13 nvme-tls-psk";
const size_t label_len = sizeof(label) - 1;
u8 prk[NVME_AUTH_MAX_DIGEST_SIZE];
size_t hash_len, ctx_len;
u8 *hmac_data = NULL, *tls_key;
size_t i;
int ret;
hmac_name = nvme_auth_hmac_name(hmac_id);
if (!hmac_name) {
hash_len = nvme_auth_hmac_hash_len(hmac_id);
if (hash_len == 0) {
pr_warn("%s: invalid hash algorithm %d\n",
__func__, hmac_id);
return -EINVAL;
}
if (hmac_id == NVME_AUTH_HASH_SHA512) {
pr_warn("%s: unsupported hash algorithm %s\n",
__func__, hmac_name);
__func__, nvme_auth_hmac_name(hmac_id));
return -EINVAL;
}
hmac_tfm = crypto_alloc_shash(hmac_name, 0, 0);
if (IS_ERR(hmac_tfm))
return PTR_ERR(hmac_tfm);
prk_len = crypto_shash_digestsize(hmac_tfm);
prk = kzalloc(prk_len, GFP_KERNEL);
if (!prk) {
ret = -ENOMEM;
goto out_free_shash;
if (psk_len != hash_len) {
pr_warn("%s: unexpected psk_len %zu\n", __func__, psk_len);
return -EINVAL;
}
if (WARN_ON(prk_len > HKDF_MAX_HASHLEN)) {
/* HKDF-Extract */
ret = nvme_auth_hmac(hmac_id, default_salt, hash_len, psk, psk_len,
prk);
if (ret)
goto out;
/*
* HKDF-Expand-Label (RFC 8446 section 7.1), with output length equal to
* the hash length (so only a single HMAC operation is needed)
*/
hmac_data = kmalloc(/* output length */ 2 +
/* label */ 1 + label_len +
/* context (max) */ 1 + 3 + 1 + strlen(psk_digest) +
/* counter */ 1,
GFP_KERNEL);
if (!hmac_data) {
ret = -ENOMEM;
goto out;
}
/* output length */
i = 0;
hmac_data[i++] = hash_len >> 8;
hmac_data[i++] = hash_len;
/* label */
static_assert(label_len <= 255);
hmac_data[i] = label_len;
memcpy(&hmac_data[i + 1], label, label_len);
i += 1 + label_len;
/* context */
ctx_len = sprintf(&hmac_data[i + 1], "%02d %s", hmac_id, psk_digest);
if (ctx_len > 255) {
ret = -EINVAL;
goto out_free_prk;
goto out;
}
ret = hkdf_extract(hmac_tfm, psk, psk_len,
default_salt, prk_len, prk);
if (ret)
goto out_free_prk;
hmac_data[i] = ctx_len;
i += 1 + ctx_len;
ret = crypto_shash_setkey(hmac_tfm, prk, prk_len);
if (ret)
goto out_free_prk;
ctx = kasprintf(GFP_KERNEL, "%02d %s", hmac_id, psk_digest);
if (!ctx) {
ret = -ENOMEM;
goto out_free_prk;
}
/* counter (this overwrites the NUL terminator written by sprintf) */
hmac_data[i++] = 1;
tls_key = kzalloc(psk_len, GFP_KERNEL);
if (!tls_key) {
ret = -ENOMEM;
goto out_free_ctx;
goto out;
}
ret = hkdf_expand_label(hmac_tfm,
label, strlen(label),
ctx, strlen(ctx),
tls_key, psk_len);
ret = nvme_auth_hmac(hmac_id, prk, hash_len, hmac_data, i, tls_key);
if (ret) {
kfree(tls_key);
goto out_free_ctx;
kfree_sensitive(tls_key);
goto out;
}
*ret_psk = tls_key;
out_free_ctx:
kfree(ctx);
out_free_prk:
kfree(prk);
out_free_shash:
crypto_free_shash(hmac_tfm);
out:
kfree_sensitive(hmac_data);
memzero_explicit(prk, sizeof(prk));
return ret;
}
EXPORT_SYMBOL_GPL(nvme_auth_derive_tls_psk);

View File

@@ -0,0 +1,175 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Unit tests for NVMe authentication functions
*
* Copyright 2026 Google LLC
*/
#include <crypto/sha2.h>
#include <kunit/test.h>
#include <linux/nvme.h>
#include <linux/nvme-auth.h>
#include <linux/slab.h>
/*
 * Expected outputs for one HMAC algorithm when deriving a TLS PSK from the
 * fixed inputs set up in test_nvme_auth_derive_tls_psk().
 */
struct nvme_auth_test_values {
	u8 hmac_id;		/* NVME_AUTH_HASH_* identifier under test */
	size_t hash_len;	/* digest length of that hash, in bytes */
	/* expected intermediate PSK from nvme_auth_generate_psk() */
	u8 expected_psk[NVME_AUTH_MAX_DIGEST_SIZE];
	/*
	 * Expected base64 digest string from nvme_auth_generate_digest(), or
	 * NULL if that function does not support this hash (the test then
	 * only checks for -EINVAL and stops early).
	 */
	char *expected_psk_digest;
	/* expected final TLS PSK from nvme_auth_derive_tls_psk() */
	u8 expected_tls_psk[NVME_AUTH_MAX_DIGEST_SIZE];
};
/* KUnit deferred-action callback: signature-compatible wrapper for kfree(). */
static void kfree_action(void *ptr)
{
	kfree(ptr);
}
/*
 * Arrange for ptr to be kfree()d when the test finishes (on any exit path).
 * Aborts the test if the action cannot be registered; in that case
 * kunit_add_action_or_reset() has already freed ptr itself.
 */
static void kunit_add_kfree_action(struct kunit *test, void *ptr)
{
	int err;

	err = kunit_add_action_or_reset(test, kfree_action, ptr);
	KUNIT_ASSERT_EQ(test, 0, err);
}
/*
 * Test the derivation of a TLS PSK from the initial skey. The vals parameter
 * gives the expected value of tls_psk as well as the intermediate values psk
 * and psk_digest. The inputs are implicitly the fixed values set below.
 *
 * Exercises the full chain:
 *   nvme_auth_generate_psk() -> nvme_auth_generate_digest() ->
 *   nvme_auth_derive_tls_psk()
 */
static void
test_nvme_auth_derive_tls_psk(struct kunit *test,
			      const struct nvme_auth_test_values *vals)
{
	const u8 hmac_id = vals->hmac_id;
	const size_t hash_len = vals->hash_len;
	/* The session key is one digest long for these tests. */
	const size_t skey_len = hash_len;
	u8 skey[NVME_AUTH_MAX_DIGEST_SIZE];
	u8 c1[NVME_AUTH_MAX_DIGEST_SIZE];
	u8 c2[NVME_AUTH_MAX_DIGEST_SIZE];
	const char *subsysnqn = "subsysnqn";
	const char *hostnqn = "hostnqn";
	u8 *psk = NULL, *tls_psk = NULL;
	char *psk_digest = NULL;
	size_t psk_len;
	int ret;

	/*
	 * Deterministic input patterns: ascending bytes from 'A' for the
	 * session key, ascending from 0 for challenge c1, descending from
	 * 0xff for challenge c2. These fix the expected outputs in vals.
	 */
	for (int i = 0; i < NVME_AUTH_MAX_DIGEST_SIZE; i++) {
		skey[i] = 'A' + i;
		c1[i] = i;
		c2[i] = 0xff - i;
	}

	/* Step 1: derive the intermediate PSK from skey and both challenges. */
	ret = nvme_auth_generate_psk(hmac_id, skey, skey_len, c1, c2, hash_len,
				     &psk, &psk_len);
	/*
	 * Register cleanup before the first assert so a failing assertion
	 * (which exits the test) does not leak the allocation.
	 */
	kunit_add_kfree_action(test, psk);
	KUNIT_ASSERT_EQ(test, 0, ret);
	KUNIT_ASSERT_EQ(test, hash_len, psk_len);
	KUNIT_ASSERT_MEMEQ(test, vals->expected_psk, psk, psk_len);

	/* Step 2: compute the PSK digest over the fixed NQN pair. */
	ret = nvme_auth_generate_digest(hmac_id, psk, psk_len, subsysnqn,
					hostnqn, &psk_digest);
	kunit_add_kfree_action(test, psk_digest);
	if (vals->expected_psk_digest == NULL) {
		/*
		 * Algorithm has an ID assigned but is not supported by
		 * nvme_auth_generate_digest().
		 */
		KUNIT_ASSERT_EQ(test, -EINVAL, ret);
		return;
	}
	KUNIT_ASSERT_EQ(test, 0, ret);
	KUNIT_ASSERT_STREQ(test, vals->expected_psk_digest, psk_digest);

	/* Step 3: derive the final TLS PSK from the PSK and its digest. */
	ret = nvme_auth_derive_tls_psk(hmac_id, psk, psk_len, psk_digest,
				       &tls_psk);
	kunit_add_kfree_action(test, tls_psk);
	KUNIT_ASSERT_EQ(test, 0, ret);
	KUNIT_ASSERT_MEMEQ(test, vals->expected_tls_psk, tls_psk, psk_len);
}
/* TLS PSK derivation with HMAC-SHA-256: full chain, known-answer vectors. */
static void test_nvme_auth_derive_tls_psk_hmac_sha256(struct kunit *test)
{
	static const struct nvme_auth_test_values vals = {
		.hmac_id = NVME_AUTH_HASH_SHA256,
		.hash_len = SHA256_DIGEST_SIZE,
		.expected_psk = {
			0x17, 0x33, 0xc5, 0x9f, 0xa7, 0xf4, 0x8f, 0xcf,
			0x37, 0xf5, 0xf2, 0x6f, 0xc4, 0xff, 0x02, 0x68,
			0xad, 0x4f, 0x78, 0xe0, 0x30, 0xf4, 0xf3, 0xb0,
			0xbf, 0xd1, 0xd4, 0x7e, 0x7b, 0xb1, 0x44, 0x7a,
		},
		.expected_psk_digest = "OldoKuTfKddMuyCznAZojkWD7P4D9/AtzDzLimtOxqI=",
		.expected_tls_psk = {
			0x3c, 0x17, 0xda, 0x62, 0x84, 0x74, 0xa0, 0x4d,
			0x22, 0x47, 0xc4, 0xca, 0xb4, 0x79, 0x68, 0xc9,
			0x15, 0x38, 0x81, 0x93, 0xf7, 0xc0, 0x71, 0xbd,
			0x94, 0x89, 0xcc, 0x36, 0x66, 0xcd, 0x7c, 0xc8,
		},
	};

	test_nvme_auth_derive_tls_psk(test, &vals);
}
/* TLS PSK derivation with HMAC-SHA-384: full chain, known-answer vectors. */
static void test_nvme_auth_derive_tls_psk_hmac_sha384(struct kunit *test)
{
	static const struct nvme_auth_test_values vals = {
		.hmac_id = NVME_AUTH_HASH_SHA384,
		.hash_len = SHA384_DIGEST_SIZE,
		.expected_psk = {
			0xf1, 0x4b, 0x2d, 0xd3, 0x23, 0x4c, 0x45, 0x96,
			0x94, 0xd3, 0xbc, 0x63, 0xf8, 0x96, 0x8b, 0xd6,
			0xb3, 0x7c, 0x2c, 0x6d, 0xe8, 0x49, 0xe2, 0x2e,
			0x11, 0x87, 0x49, 0x00, 0x1c, 0xe4, 0xbb, 0xe8,
			0x64, 0x0b, 0x9e, 0x3a, 0x74, 0x8c, 0xb1, 0x1c,
			0xe4, 0xb1, 0xd7, 0x1d, 0x35, 0x9c, 0xce, 0x39,
		},
		.expected_psk_digest = "cffMWk8TSS7HOQebjgYEIkrPrjWPV4JE5cdPB8WhEvY4JBW5YynKyv66XscN4A9n",
		.expected_tls_psk = {
			0x27, 0x74, 0x75, 0x32, 0x33, 0x53, 0x7b, 0x3f,
			0xa5, 0x0e, 0xb7, 0xd1, 0x6a, 0x8e, 0x43, 0x45,
			0x7d, 0x85, 0xf4, 0x90, 0x6c, 0x00, 0x5b, 0x22,
			0x36, 0x61, 0x6c, 0x5d, 0x80, 0x93, 0x9d, 0x08,
			0x98, 0xff, 0xf1, 0x5b, 0xb8, 0xb7, 0x71, 0x19,
			0xd2, 0xbe, 0x0a, 0xac, 0x42, 0x3e, 0x75, 0x90,
		},
	};

	test_nvme_auth_derive_tls_psk(test, &vals);
}
/*
 * HMAC-SHA-512: only the first step (PSK generation) has an expected value.
 * expected_psk_digest is NULL, so the shared helper verifies that
 * nvme_auth_generate_digest() rejects the hash with -EINVAL and stops there.
 */
static void test_nvme_auth_derive_tls_psk_hmac_sha512(struct kunit *test)
{
	static const struct nvme_auth_test_values vals = {
		.hmac_id = NVME_AUTH_HASH_SHA512,
		.hash_len = SHA512_DIGEST_SIZE,
		.expected_psk = {
			0x9c, 0x9f, 0x08, 0x9a, 0x61, 0x8b, 0x47, 0xd2,
			0xd7, 0x5f, 0x4b, 0x6c, 0x28, 0x07, 0x04, 0x24,
			0x48, 0x7b, 0x44, 0x5d, 0xd9, 0x6e, 0x70, 0xc4,
			0xc0, 0x9b, 0x55, 0xe8, 0xb6, 0x00, 0x01, 0x52,
			0xa3, 0x36, 0x3c, 0x34, 0x54, 0x04, 0x3f, 0x38,
			0xf0, 0xb8, 0x50, 0x36, 0xde, 0xd4, 0x06, 0x55,
			0x35, 0x0a, 0xa8, 0x7b, 0x8b, 0x6a, 0x28, 0x2b,
			0x5c, 0x1a, 0xca, 0xe1, 0x62, 0x33, 0xdd, 0x5b,
		},
		/* nvme_auth_generate_digest() doesn't support SHA-512 yet. */
		.expected_psk_digest = NULL,
	};

	test_nvme_auth_derive_tls_psk(test, &vals);
}
/* One case per DH-HMAC-CHAP hash algorithm with an assigned ID. */
static struct kunit_case nvme_auth_test_cases[] = {
	KUNIT_CASE(test_nvme_auth_derive_tls_psk_hmac_sha256),
	KUNIT_CASE(test_nvme_auth_derive_tls_psk_hmac_sha384),
	KUNIT_CASE(test_nvme_auth_derive_tls_psk_hmac_sha512),
	{},
};

static struct kunit_suite nvme_auth_test_suite = {
	.name = "nvme-auth",
	.test_cases = nvme_auth_test_cases,
};
/* Registers the suite with the KUnit executor / module init machinery. */
kunit_test_suite(nvme_auth_test_suite);

MODULE_DESCRIPTION("Unit tests for NVMe authentication functions");
MODULE_LICENSE("GPL");

View File

@@ -7,7 +7,6 @@
#include <linux/base64.h>
#include <linux/prandom.h>
#include <linux/unaligned.h>
#include <crypto/hash.h>
#include <crypto/dh.h>
#include "nvme.h"
#include "fabrics.h"
@@ -22,7 +21,6 @@ struct nvme_dhchap_queue_context {
struct list_head entry;
struct work_struct auth_work;
struct nvme_ctrl *ctrl;
struct crypto_shash *shash_tfm;
struct crypto_kpp *dh_tfm;
struct nvme_dhchap_key *transformed_key;
void *buf;
@@ -38,9 +36,9 @@ struct nvme_dhchap_queue_context {
u8 hash_id;
u8 sc_c;
size_t hash_len;
u8 c1[64];
u8 c2[64];
u8 response[64];
u8 c1[NVME_AUTH_MAX_DIGEST_SIZE];
u8 c2[NVME_AUTH_MAX_DIGEST_SIZE];
u8 response[NVME_AUTH_MAX_DIGEST_SIZE];
u8 *ctrl_key;
u8 *host_key;
u8 *sess_key;
@@ -125,6 +123,8 @@ static int nvme_auth_set_dhchap_negotiate_data(struct nvme_ctrl *ctrl,
{
struct nvmf_auth_dhchap_negotiate_data *data = chap->buf;
size_t size = sizeof(*data) + sizeof(union nvmf_auth_protocol);
u8 dh_list_offset = NVME_AUTH_DHCHAP_MAX_DH_IDS;
u8 *idlist = data->auth_protocol[0].dhchap.idlist;
if (size > CHAP_BUF_SIZE) {
chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
@@ -141,21 +141,22 @@ static int nvme_auth_set_dhchap_negotiate_data(struct nvme_ctrl *ctrl,
data->sc_c = NVME_AUTH_SECP_NEWTLSPSK;
} else
data->sc_c = NVME_AUTH_SECP_NOSC;
chap->sc_c = data->sc_c;
data->napd = 1;
data->auth_protocol[0].dhchap.authid = NVME_AUTH_DHCHAP_AUTH_ID;
data->auth_protocol[0].dhchap.halen = 3;
data->auth_protocol[0].dhchap.dhlen = 6;
data->auth_protocol[0].dhchap.idlist[0] = NVME_AUTH_HASH_SHA256;
data->auth_protocol[0].dhchap.idlist[1] = NVME_AUTH_HASH_SHA384;
data->auth_protocol[0].dhchap.idlist[2] = NVME_AUTH_HASH_SHA512;
data->auth_protocol[0].dhchap.idlist[30] = NVME_AUTH_DHGROUP_NULL;
data->auth_protocol[0].dhchap.idlist[31] = NVME_AUTH_DHGROUP_2048;
data->auth_protocol[0].dhchap.idlist[32] = NVME_AUTH_DHGROUP_3072;
data->auth_protocol[0].dhchap.idlist[33] = NVME_AUTH_DHGROUP_4096;
data->auth_protocol[0].dhchap.idlist[34] = NVME_AUTH_DHGROUP_6144;
data->auth_protocol[0].dhchap.idlist[35] = NVME_AUTH_DHGROUP_8192;
chap->sc_c = data->sc_c;
idlist[0] = NVME_AUTH_HASH_SHA256;
idlist[1] = NVME_AUTH_HASH_SHA384;
idlist[2] = NVME_AUTH_HASH_SHA512;
if (chap->sc_c == NVME_AUTH_SECP_NOSC)
idlist[dh_list_offset++] = NVME_AUTH_DHGROUP_NULL;
idlist[dh_list_offset++] = NVME_AUTH_DHGROUP_2048;
idlist[dh_list_offset++] = NVME_AUTH_DHGROUP_3072;
idlist[dh_list_offset++] = NVME_AUTH_DHGROUP_4096;
idlist[dh_list_offset++] = NVME_AUTH_DHGROUP_6144;
idlist[dh_list_offset++] = NVME_AUTH_DHGROUP_8192;
data->auth_protocol[0].dhchap.dhlen =
dh_list_offset - NVME_AUTH_DHCHAP_MAX_DH_IDS;
return size;
}
@@ -183,38 +184,17 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl,
return -EPROTO;
}
if (chap->hash_id == data->hashid && chap->shash_tfm &&
!strcmp(crypto_shash_alg_name(chap->shash_tfm), hmac_name) &&
crypto_shash_digestsize(chap->shash_tfm) == data->hl) {
if (chap->hash_id == data->hashid && chap->hash_len == data->hl) {
dev_dbg(ctrl->device,
"qid %d: reuse existing hash %s\n",
chap->qid, hmac_name);
goto select_kpp;
}
/* Reset if hash cannot be reused */
if (chap->shash_tfm) {
crypto_free_shash(chap->shash_tfm);
chap->hash_id = 0;
chap->hash_len = 0;
}
chap->shash_tfm = crypto_alloc_shash(hmac_name, 0,
CRYPTO_ALG_ALLOCATES_MEMORY);
if (IS_ERR(chap->shash_tfm)) {
dev_warn(ctrl->device,
"qid %d: failed to allocate hash %s, error %ld\n",
chap->qid, hmac_name, PTR_ERR(chap->shash_tfm));
chap->shash_tfm = NULL;
chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED;
return -ENOMEM;
}
if (crypto_shash_digestsize(chap->shash_tfm) != data->hl) {
if (nvme_auth_hmac_hash_len(data->hashid) != data->hl) {
dev_warn(ctrl->device,
"qid %d: invalid hash length %d\n",
chap->qid, data->hl);
crypto_free_shash(chap->shash_tfm);
chap->shash_tfm = NULL;
chap->status = NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE;
return -EPROTO;
}
@@ -434,7 +414,7 @@ static int nvme_auth_set_dhchap_failure2_data(struct nvme_ctrl *ctrl,
static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl,
struct nvme_dhchap_queue_context *chap)
{
SHASH_DESC_ON_STACK(shash, chap->shash_tfm);
struct nvme_auth_hmac_ctx hmac;
u8 buf[4], *challenge = chap->c1;
int ret;
@@ -454,13 +434,11 @@ static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl,
__func__, chap->qid);
}
ret = crypto_shash_setkey(chap->shash_tfm,
chap->transformed_key->key, chap->transformed_key->len);
if (ret) {
dev_warn(ctrl->device, "qid %d: failed to set key, error %d\n",
chap->qid, ret);
ret = nvme_auth_hmac_init(&hmac, chap->hash_id,
chap->transformed_key->key,
chap->transformed_key->len);
if (ret)
goto out;
}
if (chap->dh_tfm) {
challenge = kmalloc(chap->hash_len, GFP_KERNEL);
@@ -477,51 +455,36 @@ static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl,
goto out;
}
shash->tfm = chap->shash_tfm;
ret = crypto_shash_init(shash);
if (ret)
goto out;
ret = crypto_shash_update(shash, challenge, chap->hash_len);
if (ret)
goto out;
nvme_auth_hmac_update(&hmac, challenge, chap->hash_len);
put_unaligned_le32(chap->s1, buf);
ret = crypto_shash_update(shash, buf, 4);
if (ret)
goto out;
nvme_auth_hmac_update(&hmac, buf, 4);
put_unaligned_le16(chap->transaction, buf);
ret = crypto_shash_update(shash, buf, 2);
if (ret)
goto out;
nvme_auth_hmac_update(&hmac, buf, 2);
*buf = chap->sc_c;
ret = crypto_shash_update(shash, buf, 1);
if (ret)
goto out;
ret = crypto_shash_update(shash, "HostHost", 8);
if (ret)
goto out;
ret = crypto_shash_update(shash, ctrl->opts->host->nqn,
strlen(ctrl->opts->host->nqn));
if (ret)
goto out;
nvme_auth_hmac_update(&hmac, buf, 1);
nvme_auth_hmac_update(&hmac, "HostHost", 8);
nvme_auth_hmac_update(&hmac, ctrl->opts->host->nqn,
strlen(ctrl->opts->host->nqn));
memset(buf, 0, sizeof(buf));
ret = crypto_shash_update(shash, buf, 1);
if (ret)
goto out;
ret = crypto_shash_update(shash, ctrl->opts->subsysnqn,
strlen(ctrl->opts->subsysnqn));
if (ret)
goto out;
ret = crypto_shash_final(shash, chap->response);
nvme_auth_hmac_update(&hmac, buf, 1);
nvme_auth_hmac_update(&hmac, ctrl->opts->subsysnqn,
strlen(ctrl->opts->subsysnqn));
nvme_auth_hmac_final(&hmac, chap->response);
ret = 0;
out:
if (challenge != chap->c1)
kfree(challenge);
memzero_explicit(&hmac, sizeof(hmac));
return ret;
}
static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl,
struct nvme_dhchap_queue_context *chap)
{
SHASH_DESC_ON_STACK(shash, chap->shash_tfm);
struct nvme_auth_hmac_ctx hmac;
struct nvme_dhchap_key *transformed_key;
u8 buf[4], *challenge = chap->c2;
int ret;
@@ -533,10 +496,10 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl,
return ret;
}
ret = crypto_shash_setkey(chap->shash_tfm,
transformed_key->key, transformed_key->len);
ret = nvme_auth_hmac_init(&hmac, chap->hash_id, transformed_key->key,
transformed_key->len);
if (ret) {
dev_warn(ctrl->device, "qid %d: failed to set key, error %d\n",
dev_warn(ctrl->device, "qid %d: failed to init hmac, error %d\n",
chap->qid, ret);
goto out;
}
@@ -563,43 +526,29 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl,
__func__, chap->qid, ctrl->opts->subsysnqn);
dev_dbg(ctrl->device, "%s: qid %d hostnqn %s\n",
__func__, chap->qid, ctrl->opts->host->nqn);
shash->tfm = chap->shash_tfm;
ret = crypto_shash_init(shash);
if (ret)
goto out;
ret = crypto_shash_update(shash, challenge, chap->hash_len);
if (ret)
goto out;
nvme_auth_hmac_update(&hmac, challenge, chap->hash_len);
put_unaligned_le32(chap->s2, buf);
ret = crypto_shash_update(shash, buf, 4);
if (ret)
goto out;
nvme_auth_hmac_update(&hmac, buf, 4);
put_unaligned_le16(chap->transaction, buf);
ret = crypto_shash_update(shash, buf, 2);
if (ret)
goto out;
nvme_auth_hmac_update(&hmac, buf, 2);
memset(buf, 0, 4);
ret = crypto_shash_update(shash, buf, 1);
if (ret)
goto out;
ret = crypto_shash_update(shash, "Controller", 10);
if (ret)
goto out;
ret = crypto_shash_update(shash, ctrl->opts->subsysnqn,
strlen(ctrl->opts->subsysnqn));
if (ret)
goto out;
ret = crypto_shash_update(shash, buf, 1);
if (ret)
goto out;
ret = crypto_shash_update(shash, ctrl->opts->host->nqn,
strlen(ctrl->opts->host->nqn));
if (ret)
goto out;
ret = crypto_shash_final(shash, chap->response);
nvme_auth_hmac_update(&hmac, buf, 1);
nvme_auth_hmac_update(&hmac, "Controller", 10);
nvme_auth_hmac_update(&hmac, ctrl->opts->subsysnqn,
strlen(ctrl->opts->subsysnqn));
nvme_auth_hmac_update(&hmac, buf, 1);
nvme_auth_hmac_update(&hmac, ctrl->opts->host->nqn,
strlen(ctrl->opts->host->nqn));
nvme_auth_hmac_final(&hmac, chap->response);
ret = 0;
out:
if (challenge != chap->c2)
kfree(challenge);
memzero_explicit(&hmac, sizeof(hmac));
nvme_auth_free_key(transformed_key);
return ret;
}
@@ -689,8 +638,6 @@ static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap)
{
nvme_auth_reset_dhchap(chap);
chap->authenticated = false;
if (chap->shash_tfm)
crypto_free_shash(chap->shash_tfm);
if (chap->dh_tfm)
crypto_free_kpp(chap->dh_tfm);
}
@@ -708,7 +655,8 @@ EXPORT_SYMBOL_GPL(nvme_auth_revoke_tls_key);
static int nvme_auth_secure_concat(struct nvme_ctrl *ctrl,
struct nvme_dhchap_queue_context *chap)
{
u8 *psk, *digest, *tls_psk;
u8 *psk, *tls_psk;
char *digest;
struct key *tls_key;
size_t psk_len;
int ret = 0;
@@ -1071,12 +1019,11 @@ int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl)
INIT_WORK(&ctrl->dhchap_auth_work, nvme_ctrl_auth_work);
if (!ctrl->opts)
return 0;
ret = nvme_auth_generate_key(ctrl->opts->dhchap_secret,
&ctrl->host_key);
ret = nvme_auth_parse_key(ctrl->opts->dhchap_secret, &ctrl->host_key);
if (ret)
return ret;
ret = nvme_auth_generate_key(ctrl->opts->dhchap_ctrl_secret,
&ctrl->ctrl_key);
ret = nvme_auth_parse_key(ctrl->opts->dhchap_ctrl_secret,
&ctrl->ctrl_key);
if (ret)
goto err_free_dhchap_secret;

View File

@@ -1875,6 +1875,7 @@ static bool nvme_init_integrity(struct nvme_ns_head *head,
break;
}
bi->flags |= BLK_SPLIT_INTERVAL_CAPABLE;
bi->metadata_size = head->ms;
if (bi->csum_type) {
bi->pi_tuple_size = head->pi_size;
@@ -1883,26 +1884,6 @@ static bool nvme_init_integrity(struct nvme_ns_head *head,
return true;
}
static void nvme_config_discard(struct nvme_ns *ns, struct queue_limits *lim)
{
struct nvme_ctrl *ctrl = ns->ctrl;
if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns->head, UINT_MAX))
lim->max_hw_discard_sectors =
nvme_lba_to_sect(ns->head, ctrl->dmrsl);
else if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
lim->max_hw_discard_sectors = UINT_MAX;
else
lim->max_hw_discard_sectors = 0;
lim->discard_granularity = lim->logical_block_size;
if (ctrl->dmrl)
lim->max_discard_segments = ctrl->dmrl;
else
lim->max_discard_segments = NVME_DSM_MAX_RANGES;
}
static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
{
return uuid_equal(&a->uuid, &b->uuid) &&
@@ -2078,12 +2059,15 @@ static void nvme_set_ctrl_limits(struct nvme_ctrl *ctrl,
}
static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
struct queue_limits *lim)
struct nvme_id_ns_nvm *nvm, struct queue_limits *lim)
{
struct nvme_ns_head *head = ns->head;
struct nvme_ctrl *ctrl = ns->ctrl;
u32 bs = 1U << head->lba_shift;
u32 atomic_bs, phys_bs, io_opt = 0;
u32 npdg = 1, npda = 1;
bool valid = true;
u8 optperf;
/*
* The block layer can't support LBA sizes larger than the page size
@@ -2098,7 +2082,12 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
phys_bs = bs;
atomic_bs = nvme_configure_atomic_write(ns, id, lim, bs);
if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
optperf = id->nsfeat >> NVME_NS_FEAT_OPTPERF_SHIFT;
if (ctrl->vs >= NVME_VS(2, 1, 0))
optperf &= NVME_NS_FEAT_OPTPERF_MASK_2_1;
else
optperf &= NVME_NS_FEAT_OPTPERF_MASK;
if (optperf) {
/* NPWG = Namespace Preferred Write Granularity */
phys_bs = bs * (1 + le16_to_cpu(id->npwg));
/* NOWS = Namespace Optimal Write Size */
@@ -2115,11 +2104,54 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
lim->physical_block_size = min(phys_bs, atomic_bs);
lim->io_min = phys_bs;
lim->io_opt = io_opt;
if ((ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) &&
(ns->ctrl->oncs & NVME_CTRL_ONCS_DSM))
if ((ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) &&
(ctrl->oncs & NVME_CTRL_ONCS_DSM))
lim->max_write_zeroes_sectors = UINT_MAX;
else
lim->max_write_zeroes_sectors = ns->ctrl->max_zeroes_sectors;
lim->max_write_zeroes_sectors = ctrl->max_zeroes_sectors;
if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns->head, UINT_MAX))
lim->max_hw_discard_sectors =
nvme_lba_to_sect(ns->head, ctrl->dmrsl);
else if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
lim->max_hw_discard_sectors = UINT_MAX;
else
lim->max_hw_discard_sectors = 0;
/*
* NVMe namespaces advertise both a preferred deallocate granularity
* (for a discard length) and alignment (for a discard starting offset).
* However, Linux block devices advertise a single discard_granularity.
* From NVM Command Set specification 1.1 section 5.2.2, the NPDGL/NPDAL
* fields in the NVM Command Set Specific Identify Namespace structure
* are preferred to NPDG/NPDA in the Identify Namespace structure since
* they can represent larger values. However, NPDGL or NPDAL may be 0 if
* unsupported. NPDG and NPDA are 0's based.
* From Figure 115 of NVM Command Set specification 1.1, NPDGL and NPDAL
* are supported if the high bit of OPTPERF is set. NPDG is supported if
* the low bit of OPTPERF is set. NPDA is supported if either is set.
* NPDG should be a multiple of NPDA, and likewise NPDGL should be a
* multiple of NPDAL, but the spec doesn't say anything about NPDG vs.
* NPDAL or NPDGL vs. NPDA. So compute the maximum instead of assuming
* NPDG(L) is the larger. If neither NPDG, NPDGL, NPDA, nor NPDAL are
* supported, default the discard_granularity to the logical block size.
*/
if (optperf & 0x2 && nvm && nvm->npdgl)
npdg = le32_to_cpu(nvm->npdgl);
else if (optperf & 0x1)
npdg = from0based(id->npdg);
if (optperf & 0x2 && nvm && nvm->npdal)
npda = le32_to_cpu(nvm->npdal);
else if (optperf)
npda = from0based(id->npda);
if (check_mul_overflow(max(npdg, npda), lim->logical_block_size,
&lim->discard_granularity))
lim->discard_granularity = lim->logical_block_size;
if (ctrl->dmrl)
lim->max_discard_segments = ctrl->dmrl;
else
lim->max_discard_segments = NVME_DSM_MAX_RANGES;
return valid;
}
@@ -2353,7 +2385,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
}
lbaf = nvme_lbaf_index(id->flbas);
if (ns->ctrl->ctratt & NVME_CTRL_ATTR_ELBAS) {
if (nvme_id_cns_ok(ns->ctrl, NVME_ID_CNS_CS_NS)) {
ret = nvme_identify_ns_nvm(ns->ctrl, info->nsid, &nvm);
if (ret < 0)
goto out;
@@ -2381,10 +2413,9 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
nvme_set_ctrl_limits(ns->ctrl, &lim, false);
nvme_configure_metadata(ns->ctrl, ns->head, id, nvm, info);
nvme_set_chunk_sectors(ns, id, &lim);
if (!nvme_update_disk_info(ns, id, &lim))
if (!nvme_update_disk_info(ns, id, nvm, &lim))
capacity = 0;
nvme_config_discard(ns, &lim);
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
ns->head->ids.csi == NVME_CSI_ZNS)
nvme_update_zone_info(ns, &lim, &zi);
@@ -3388,7 +3419,7 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
ctrl->dmrl = id->dmrl;
ctrl->dmrsl = le32_to_cpu(id->dmrsl);
if (id->wzsl)
if (id->wzsl && !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES))
ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl);
free_data:

View File

@@ -154,21 +154,8 @@ void nvme_failover_req(struct request *req)
}
spin_lock_irqsave(&ns->head->requeue_lock, flags);
for (bio = req->bio; bio; bio = bio->bi_next) {
for (bio = req->bio; bio; bio = bio->bi_next)
bio_set_dev(bio, ns->head->disk->part0);
if (bio->bi_opf & REQ_POLLED) {
bio->bi_opf &= ~REQ_POLLED;
bio->bi_cookie = BLK_QC_T_NONE;
}
/*
* The alternate request queue that we may end up submitting
* the bio to may be frozen temporarily, in this case REQ_NOWAIT
* will fail the I/O immediately with EAGAIN to the issuer.
* We are not in the issuer context which cannot block. Clear
* the flag to avoid spurious EAGAIN I/O failures.
*/
bio->bi_opf &= ~REQ_NOWAIT;
}
blk_steal_bios(&ns->head->requeue_list, req);
spin_unlock_irqrestore(&ns->head->requeue_lock, flags);

View File

@@ -762,6 +762,12 @@ static inline u32 nvme_bytes_to_numd(size_t len)
return (len >> 2) - 1;
}
/*
 * Decode a 2-byte "0's based"/"0-based" field: the raw little-endian value
 * stores N - 1, so add one to recover the usable count. Returns u32 so the
 * maximum raw value (0xffff) does not wrap when incremented.
 */
static inline u32 from0based(__le16 value)
{
	return (u32)le16_to_cpu(value) + 1;
}
static inline bool nvme_is_ana_error(u16 status)
{
switch (status & NVME_SCT_SC_MASK) {

View File

@@ -4178,6 +4178,8 @@ static const struct pci_device_id nvme_id_table[] = {
.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
{ PCI_DEVICE(0x2646, 0x501E), /* KINGSTON OM3PGP4xxxxQ OS21011 NVMe SSD */
.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
{ PCI_DEVICE(0x2646, 0x502F), /* KINGSTON OM3SGP4xxxxK NVMe SSD */
.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
{ PCI_DEVICE(0x1f40, 0x1202), /* Netac Technologies Co. NV3000 NVMe SSD */
.driver_data = NVME_QUIRK_BOGUS_NID, },
{ PCI_DEVICE(0x1f40, 0x5236), /* Netac Technologies Co. NV7000 NVMe SSD */

View File

@@ -658,7 +658,7 @@ static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev,
struct nvme_dhchap_key *key, *host_key;
int ret;
ret = nvme_auth_generate_key(dhchap_secret, &key);
ret = nvme_auth_parse_key(dhchap_secret, &key);
if (ret) {
kfree(dhchap_secret);
return ret;
@@ -716,7 +716,7 @@ static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev,
struct nvme_dhchap_key *key, *ctrl_key;
int ret;
ret = nvme_auth_generate_key(dhchap_secret, &key);
ret = nvme_auth_parse_key(dhchap_secret, &key);
if (ret) {
kfree(dhchap_secret);
return ret;
@@ -829,7 +829,49 @@ static ssize_t tls_configured_key_show(struct device *dev,
return sysfs_emit(buf, "%08x\n", key_serial(key));
}
static DEVICE_ATTR_RO(tls_configured_key);
static ssize_t tls_configured_key_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
int error, qid;
error = kstrtoint(buf, 10, &qid);
if (error)
return error;
/*
* We currently only allow userspace to write a `0` indicating
* generate a new key.
*/
if (qid)
return -EINVAL;
if (!ctrl->opts || !ctrl->opts->concat)
return -EOPNOTSUPP;
error = nvme_auth_negotiate(ctrl, 0);
if (error < 0) {
nvme_reset_ctrl(ctrl);
return error;
}
error = nvme_auth_wait(ctrl, 0);
if (error < 0) {
nvme_reset_ctrl(ctrl);
return error;
}
/*
* We need to reset the TLS connection, so let's just
* reset the controller.
*/
nvme_reset_ctrl(ctrl);
return count;
}
static DEVICE_ATTR_RW(tls_configured_key);
static ssize_t tls_keyring_show(struct device *dev,
struct device_attribute *attr, char *buf)
@@ -861,7 +903,7 @@ static umode_t nvme_tls_attrs_are_visible(struct kobject *kobj,
!ctrl->opts->tls && !ctrl->opts->concat)
return 0;
if (a == &dev_attr_tls_configured_key.attr &&
(!ctrl->opts->tls_key || ctrl->opts->concat))
!ctrl->opts->concat)
return 0;
if (a == &dev_attr_tls_keyring.attr &&
!ctrl->opts->keyring)

View File

@@ -1057,6 +1057,8 @@ static void nvme_execute_identify_ns_nvm(struct nvmet_req *req)
status = NVME_SC_INTERNAL;
goto out;
}
if (req->ns->bdev)
nvmet_bdev_set_nvm_limits(req->ns->bdev, id);
status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
kfree(id);
out:
@@ -1603,7 +1605,7 @@ void nvmet_execute_keep_alive(struct nvmet_req *req)
pr_debug("ctrl %d update keep-alive timer for %d secs\n",
ctrl->cntlid, ctrl->kato);
mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ);
mod_delayed_work(system_percpu_wq, &ctrl->ka_work, ctrl->kato * HZ);
out:
nvmet_req_complete(req, status);
}

View File

@@ -9,7 +9,6 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <crypto/hash.h>
#include <linux/crc32.h>
#include <linux/base64.h>
#include <linux/ctype.h>
@@ -45,15 +44,6 @@ int nvmet_auth_set_key(struct nvmet_host *host, const char *secret,
key_hash);
return -EINVAL;
}
if (key_hash > 0) {
/* Validate selected hash algorithm */
const char *hmac = nvme_auth_hmac_name(key_hash);
if (!crypto_has_shash(hmac, 0, 0)) {
pr_err("DH-HMAC-CHAP hash %s unsupported\n", hmac);
return -ENOTSUPP;
}
}
dhchap_secret = kstrdup(secret, GFP_KERNEL);
if (!dhchap_secret)
return -ENOMEM;
@@ -140,7 +130,7 @@ int nvmet_setup_dhgroup(struct nvmet_ctrl *ctrl, u8 dhgroup_id)
return ret;
}
u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq)
u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, bool reset)
{
int ret = 0;
struct nvmet_host_link *p;
@@ -166,7 +156,7 @@ u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq)
goto out_unlock;
}
if (nvmet_queue_tls_keyid(sq)) {
if (!reset && nvmet_queue_tls_keyid(sq)) {
pr_debug("host %s tls enabled\n", ctrl->hostnqn);
goto out_unlock;
}
@@ -292,47 +282,30 @@ bool nvmet_check_auth_status(struct nvmet_req *req)
int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
unsigned int shash_len)
{
struct crypto_shash *shash_tfm;
SHASH_DESC_ON_STACK(shash, shash_tfm);
struct nvme_auth_hmac_ctx hmac;
struct nvmet_ctrl *ctrl = req->sq->ctrl;
const char *hash_name;
u8 *challenge = req->sq->dhchap_c1;
struct nvme_dhchap_key *transformed_key;
u8 buf[4];
int ret;
hash_name = nvme_auth_hmac_name(ctrl->shash_id);
if (!hash_name) {
pr_warn("Hash ID %d invalid\n", ctrl->shash_id);
return -EINVAL;
}
shash_tfm = crypto_alloc_shash(hash_name, 0, 0);
if (IS_ERR(shash_tfm)) {
pr_err("failed to allocate shash %s\n", hash_name);
return PTR_ERR(shash_tfm);
}
if (shash_len != crypto_shash_digestsize(shash_tfm)) {
pr_err("%s: hash len mismatch (len %d digest %d)\n",
__func__, shash_len,
crypto_shash_digestsize(shash_tfm));
ret = -EINVAL;
goto out_free_tfm;
}
transformed_key = nvme_auth_transform_key(ctrl->host_key,
ctrl->hostnqn);
if (IS_ERR(transformed_key)) {
ret = PTR_ERR(transformed_key);
goto out_free_tfm;
}
if (IS_ERR(transformed_key))
return PTR_ERR(transformed_key);
ret = crypto_shash_setkey(shash_tfm, transformed_key->key,
ret = nvme_auth_hmac_init(&hmac, ctrl->shash_id, transformed_key->key,
transformed_key->len);
if (ret)
goto out_free_response;
if (shash_len != nvme_auth_hmac_hash_len(ctrl->shash_id)) {
pr_err("%s: hash len mismatch (len %u digest %zu)\n", __func__,
shash_len, nvme_auth_hmac_hash_len(ctrl->shash_id));
ret = -EINVAL;
goto out_free_response;
}
if (ctrl->dh_gid != NVME_AUTH_DHGROUP_NULL) {
challenge = kmalloc(shash_len, GFP_KERNEL);
if (!challenge) {
@@ -345,101 +318,67 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
req->sq->dhchap_c1,
challenge, shash_len);
if (ret)
goto out;
goto out_free_challenge;
}
pr_debug("ctrl %d qid %d host response seq %u transaction %d\n",
ctrl->cntlid, req->sq->qid, req->sq->dhchap_s1,
req->sq->dhchap_tid);
shash->tfm = shash_tfm;
ret = crypto_shash_init(shash);
if (ret)
goto out;
ret = crypto_shash_update(shash, challenge, shash_len);
if (ret)
goto out;
nvme_auth_hmac_update(&hmac, challenge, shash_len);
put_unaligned_le32(req->sq->dhchap_s1, buf);
ret = crypto_shash_update(shash, buf, 4);
if (ret)
goto out;
nvme_auth_hmac_update(&hmac, buf, 4);
put_unaligned_le16(req->sq->dhchap_tid, buf);
ret = crypto_shash_update(shash, buf, 2);
if (ret)
goto out;
nvme_auth_hmac_update(&hmac, buf, 2);
*buf = req->sq->sc_c;
ret = crypto_shash_update(shash, buf, 1);
if (ret)
goto out;
ret = crypto_shash_update(shash, "HostHost", 8);
if (ret)
goto out;
nvme_auth_hmac_update(&hmac, buf, 1);
nvme_auth_hmac_update(&hmac, "HostHost", 8);
memset(buf, 0, 4);
ret = crypto_shash_update(shash, ctrl->hostnqn, strlen(ctrl->hostnqn));
if (ret)
goto out;
ret = crypto_shash_update(shash, buf, 1);
if (ret)
goto out;
ret = crypto_shash_update(shash, ctrl->subsys->subsysnqn,
strlen(ctrl->subsys->subsysnqn));
if (ret)
goto out;
ret = crypto_shash_final(shash, response);
out:
nvme_auth_hmac_update(&hmac, ctrl->hostnqn, strlen(ctrl->hostnqn));
nvme_auth_hmac_update(&hmac, buf, 1);
nvme_auth_hmac_update(&hmac, ctrl->subsys->subsysnqn,
strlen(ctrl->subsys->subsysnqn));
nvme_auth_hmac_final(&hmac, response);
ret = 0;
out_free_challenge:
if (challenge != req->sq->dhchap_c1)
kfree(challenge);
out_free_response:
memzero_explicit(&hmac, sizeof(hmac));
nvme_auth_free_key(transformed_key);
out_free_tfm:
crypto_free_shash(shash_tfm);
return ret;
}
int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response,
unsigned int shash_len)
{
struct crypto_shash *shash_tfm;
struct shash_desc *shash;
struct nvme_auth_hmac_ctx hmac;
struct nvmet_ctrl *ctrl = req->sq->ctrl;
const char *hash_name;
u8 *challenge = req->sq->dhchap_c2;
struct nvme_dhchap_key *transformed_key;
u8 buf[4];
int ret;
hash_name = nvme_auth_hmac_name(ctrl->shash_id);
if (!hash_name) {
pr_warn("Hash ID %d invalid\n", ctrl->shash_id);
return -EINVAL;
}
shash_tfm = crypto_alloc_shash(hash_name, 0, 0);
if (IS_ERR(shash_tfm)) {
pr_err("failed to allocate shash %s\n", hash_name);
return PTR_ERR(shash_tfm);
}
if (shash_len != crypto_shash_digestsize(shash_tfm)) {
pr_debug("%s: hash len mismatch (len %d digest %d)\n",
__func__, shash_len,
crypto_shash_digestsize(shash_tfm));
ret = -EINVAL;
goto out_free_tfm;
}
transformed_key = nvme_auth_transform_key(ctrl->ctrl_key,
ctrl->subsys->subsysnqn);
if (IS_ERR(transformed_key)) {
ret = PTR_ERR(transformed_key);
goto out_free_tfm;
}
if (IS_ERR(transformed_key))
return PTR_ERR(transformed_key);
ret = crypto_shash_setkey(shash_tfm, transformed_key->key,
ret = nvme_auth_hmac_init(&hmac, ctrl->shash_id, transformed_key->key,
transformed_key->len);
if (ret)
goto out_free_response;
if (shash_len != nvme_auth_hmac_hash_len(ctrl->shash_id)) {
pr_err("%s: hash len mismatch (len %u digest %zu)\n", __func__,
shash_len, nvme_auth_hmac_hash_len(ctrl->shash_id));
ret = -EINVAL;
goto out_free_response;
}
if (ctrl->dh_gid != NVME_AUTH_DHGROUP_NULL) {
challenge = kmalloc(shash_len, GFP_KERNEL);
if (!challenge) {
@@ -455,55 +394,29 @@ int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response,
goto out_free_challenge;
}
shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(shash_tfm),
GFP_KERNEL);
if (!shash) {
ret = -ENOMEM;
goto out_free_challenge;
}
shash->tfm = shash_tfm;
nvme_auth_hmac_update(&hmac, challenge, shash_len);
ret = crypto_shash_init(shash);
if (ret)
goto out;
ret = crypto_shash_update(shash, challenge, shash_len);
if (ret)
goto out;
put_unaligned_le32(req->sq->dhchap_s2, buf);
ret = crypto_shash_update(shash, buf, 4);
if (ret)
goto out;
nvme_auth_hmac_update(&hmac, buf, 4);
put_unaligned_le16(req->sq->dhchap_tid, buf);
ret = crypto_shash_update(shash, buf, 2);
if (ret)
goto out;
nvme_auth_hmac_update(&hmac, buf, 2);
memset(buf, 0, 4);
ret = crypto_shash_update(shash, buf, 1);
if (ret)
goto out;
ret = crypto_shash_update(shash, "Controller", 10);
if (ret)
goto out;
ret = crypto_shash_update(shash, ctrl->subsys->subsysnqn,
strlen(ctrl->subsys->subsysnqn));
if (ret)
goto out;
ret = crypto_shash_update(shash, buf, 1);
if (ret)
goto out;
ret = crypto_shash_update(shash, ctrl->hostnqn, strlen(ctrl->hostnqn));
if (ret)
goto out;
ret = crypto_shash_final(shash, response);
out:
kfree(shash);
nvme_auth_hmac_update(&hmac, buf, 1);
nvme_auth_hmac_update(&hmac, "Controller", 10);
nvme_auth_hmac_update(&hmac, ctrl->subsys->subsysnqn,
strlen(ctrl->subsys->subsysnqn));
nvme_auth_hmac_update(&hmac, buf, 1);
nvme_auth_hmac_update(&hmac, ctrl->hostnqn, strlen(ctrl->hostnqn));
nvme_auth_hmac_final(&hmac, response);
ret = 0;
out_free_challenge:
if (challenge != req->sq->dhchap_c2)
kfree(challenge);
out_free_response:
memzero_explicit(&hmac, sizeof(hmac));
nvme_auth_free_key(transformed_key);
out_free_tfm:
crypto_free_shash(shash_tfm);
return ret;
}
@@ -531,7 +444,7 @@ int nvmet_auth_ctrl_exponential(struct nvmet_req *req,
}
int nvmet_auth_ctrl_sesskey(struct nvmet_req *req,
u8 *pkey, int pkey_size)
const u8 *pkey, int pkey_size)
{
struct nvmet_ctrl *ctrl = req->sq->ctrl;
int ret;
@@ -557,7 +470,8 @@ int nvmet_auth_ctrl_sesskey(struct nvmet_req *req,
void nvmet_auth_insert_psk(struct nvmet_sq *sq)
{
int hash_len = nvme_auth_hmac_hash_len(sq->ctrl->shash_id);
u8 *psk, *digest, *tls_psk;
u8 *psk, *tls_psk;
char *digest;
size_t psk_len;
int ret;
#ifdef CONFIG_NVME_TARGET_TCP_TLS

View File

@@ -17,7 +17,6 @@
#include <linux/nvme-auth.h>
#endif
#include <linux/nvme-keyring.h>
#include <crypto/hash.h>
#include <crypto/kpp.h>
#include <linux/nospec.h>
@@ -2181,8 +2180,6 @@ static ssize_t nvmet_host_dhchap_hash_store(struct config_item *item,
hmac_id = nvme_auth_hmac_id(page);
if (hmac_id == NVME_AUTH_HASH_INVALID)
return -EINVAL;
if (!crypto_has_shash(nvme_auth_hmac_name(hmac_id), 0, 0))
return -ENOTSUPP;
host->dhchap_hash_id = hmac_id;
return count;
}

View File

@@ -1688,7 +1688,7 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args)
if (args->hostid)
uuid_copy(&ctrl->hostid, args->hostid);
dhchap_status = nvmet_setup_auth(ctrl, args->sq);
dhchap_status = nvmet_setup_auth(ctrl, args->sq, false);
if (dhchap_status) {
pr_err("Failed to setup authentication, dhchap status %u\n",
dhchap_status);
@@ -1944,12 +1944,13 @@ static int __init nvmet_init(void)
if (!nvmet_bvec_cache)
return -ENOMEM;
zbd_wq = alloc_workqueue("nvmet-zbd-wq", WQ_MEM_RECLAIM, 0);
zbd_wq = alloc_workqueue("nvmet-zbd-wq", WQ_MEM_RECLAIM | WQ_PERCPU,
0);
if (!zbd_wq)
goto out_destroy_bvec_cache;
buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq",
WQ_MEM_RECLAIM, 0);
WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!buffered_io_wq)
goto out_free_zbd_work_queue;

View File

@@ -8,7 +8,6 @@
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/nvme-auth.h>
#include <crypto/hash.h>
#include <crypto/kpp.h>
#include "nvmet.h"
@@ -75,8 +74,7 @@ static u8 nvmet_auth_negotiate(struct nvmet_req *req, void *d)
for (i = 0; i < data->auth_protocol[0].dhchap.halen; i++) {
u8 host_hmac_id = data->auth_protocol[0].dhchap.idlist[i];
if (!fallback_hash_id &&
crypto_has_shash(nvme_auth_hmac_name(host_hmac_id), 0, 0))
if (!fallback_hash_id && nvme_auth_hmac_hash_len(host_hmac_id))
fallback_hash_id = host_hmac_id;
if (ctrl->shash_id != host_hmac_id)
continue;
@@ -293,7 +291,8 @@ void nvmet_execute_auth_send(struct nvmet_req *req)
pr_debug("%s: ctrl %d qid %d reset negotiation\n",
__func__, ctrl->cntlid, req->sq->qid);
if (!req->sq->qid) {
dhchap_status = nvmet_setup_auth(ctrl, req->sq);
dhchap_status = nvmet_setup_auth(ctrl, req->sq,
true);
if (dhchap_status) {
pr_err("ctrl %d qid 0 failed to setup re-authentication\n",
ctrl->cntlid);
@@ -391,14 +390,15 @@ done:
req->sq->dhchap_step != NVME_AUTH_DHCHAP_MESSAGE_FAILURE2) {
unsigned long auth_expire_secs = ctrl->kato ? ctrl->kato : 120;
mod_delayed_work(system_wq, &req->sq->auth_expired_work,
mod_delayed_work(system_percpu_wq, &req->sq->auth_expired_work,
auth_expire_secs * HZ);
goto complete;
}
/* Final states, clear up variables */
nvmet_auth_sq_free(req->sq);
if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE2)
if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE2) {
nvmet_auth_sq_free(req->sq);
nvmet_ctrl_fatal_error(ctrl);
}
complete:
nvmet_req_complete(req, status);
@@ -574,9 +574,7 @@ void nvmet_execute_auth_receive(struct nvmet_req *req)
status = nvmet_copy_to_sgl(req, 0, d, al);
kfree(d);
done:
if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2)
nvmet_auth_sq_free(req->sq);
else if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE1) {
if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE1) {
nvmet_auth_sq_free(req->sq);
nvmet_ctrl_fatal_error(ctrl);
}

View File

@@ -792,9 +792,9 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc,
if (!queue)
return NULL;
queue->work_q = alloc_workqueue("ntfc%d.%d.%d", 0, 0,
assoc->tgtport->fc_target_port.port_num,
assoc->a_id, qid);
queue->work_q = alloc_workqueue("ntfc%d.%d.%d", WQ_PERCPU, 0,
assoc->tgtport->fc_target_port.port_num,
assoc->a_id, qid);
if (!queue->work_q)
goto out_free_queue;

View File

@@ -30,11 +30,11 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id)
id->nacwu = lpp0b;
/*
* Bit 4 indicates that the fields NPWG, NPWA, NPDG, NPDA, and
* NOWS are defined for this namespace and should be used by
* the host for I/O optimization.
* OPTPERF = 11b indicates that the fields NPWG, NPWA, NPDG, NPDA,
* NPDGL, NPDAL, and NOWS are defined for this namespace and should be
* used by the host for I/O optimization.
*/
id->nsfeat |= 1 << 4;
id->nsfeat |= 0x3 << NVME_NS_FEAT_OPTPERF_SHIFT;
/* NPWG = Namespace Preferred Write Granularity. 0's based */
id->npwg = to0based(bdev_io_min(bdev) / bdev_logical_block_size(bdev));
/* NPWA = Namespace Preferred Write Alignment. 0's based */
@@ -52,6 +52,17 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id)
id->dlfeat = (1 << 3) | 0x1;
}
void nvmet_bdev_set_nvm_limits(struct block_device *bdev,
struct nvme_id_ns_nvm *id)
{
/*
* NPDGL = Namespace Preferred Deallocate Granularity Large
* NPDAL = Namespace Preferred Deallocate Alignment Large
*/
id->npdgl = id->npdal = cpu_to_le32(bdev_discard_granularity(bdev) /
bdev_logical_block_size(bdev));
}
void nvmet_bdev_ns_disable(struct nvmet_ns *ns)
{
if (ns->bdev_file) {

View File

@@ -419,7 +419,6 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl)
{
if (ctrl->ctrl.queue_count > 1) {
nvme_quiesce_io_queues(&ctrl->ctrl);
nvme_cancel_tagset(&ctrl->ctrl);
nvme_loop_destroy_io_queues(ctrl);
}
@@ -427,7 +426,6 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl)
if (nvme_ctrl_state(&ctrl->ctrl) == NVME_CTRL_LIVE)
nvme_disable_ctrl(&ctrl->ctrl, true);
nvme_cancel_admin_tagset(&ctrl->ctrl);
nvme_loop_destroy_admin_queue(ctrl);
}

View File

@@ -550,6 +550,8 @@ void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl);
u16 nvmet_parse_connect_cmd(struct nvmet_req *req);
u32 nvmet_connect_cmd_data_len(struct nvmet_req *req);
void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id);
void nvmet_bdev_set_nvm_limits(struct block_device *bdev,
struct nvme_id_ns_nvm *id);
u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req);
u16 nvmet_file_parse_io_cmd(struct nvmet_req *req);
u16 nvmet_bdev_zns_parse_io_cmd(struct nvmet_req *req);
@@ -896,7 +898,7 @@ void nvmet_execute_auth_receive(struct nvmet_req *req);
int nvmet_auth_set_key(struct nvmet_host *host, const char *secret,
bool set_ctrl);
int nvmet_auth_set_host_hash(struct nvmet_host *host, const char *hash);
u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq);
u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, bool reset);
void nvmet_auth_sq_init(struct nvmet_sq *sq);
void nvmet_destroy_auth(struct nvmet_ctrl *ctrl);
void nvmet_auth_sq_free(struct nvmet_sq *sq);
@@ -913,11 +915,11 @@ static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq)
int nvmet_auth_ctrl_exponential(struct nvmet_req *req,
u8 *buf, int buf_size);
int nvmet_auth_ctrl_sesskey(struct nvmet_req *req,
u8 *buf, int buf_size);
const u8 *pkey, int pkey_size);
void nvmet_auth_insert_psk(struct nvmet_sq *sq);
#else
static inline u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl,
struct nvmet_sq *sq)
struct nvmet_sq *sq, bool reset)
{
return 0;
}

View File

@@ -2225,7 +2225,7 @@ static int __init nvmet_tcp_init(void)
int ret;
nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq",
WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_PERCPU, 0);
if (!nvmet_tcp_wq)
return -ENOMEM;

View File

@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/bsg.h>
#include <linux/io_uring/cmd.h>
#include <scsi/scsi.h>
#include <scsi/scsi_ioctl.h>
#include <scsi/scsi_cmnd.h>
@@ -9,6 +10,178 @@
#define uptr64(val) ((void __user *)(uintptr_t)(val))
/*
* Per-command BSG SCSI PDU stored in io_uring_cmd.pdu[32].
* Holds temporary state between submission, completion and task_work.
*/
struct scsi_bsg_uring_cmd_pdu {
struct bio *bio; /* mapped user buffer, unmap in task work */
struct request *req; /* block request, freed in task work */
u64 response_addr; /* user space response buffer address */
};
static_assert(sizeof(struct scsi_bsg_uring_cmd_pdu) <= sizeof_field(struct io_uring_cmd, pdu));
static inline struct scsi_bsg_uring_cmd_pdu *scsi_bsg_uring_cmd_pdu(
struct io_uring_cmd *ioucmd)
{
return io_uring_cmd_to_pdu(ioucmd, struct scsi_bsg_uring_cmd_pdu);
}
/* Task work: build res2 (layout in uapi/linux/bsg.h) and copy sense to user. */
static void scsi_bsg_uring_task_cb(struct io_tw_req tw_req, io_tw_token_t tw)
{
struct io_uring_cmd *ioucmd = io_uring_cmd_from_tw(tw_req);
struct scsi_bsg_uring_cmd_pdu *pdu = scsi_bsg_uring_cmd_pdu(ioucmd);
struct request *rq = pdu->req;
struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq);
u64 res2;
int ret = 0;
u8 driver_status = 0;
u8 sense_len_wr = 0;
if (pdu->bio)
blk_rq_unmap_user(pdu->bio);
if (scsi_status_is_check_condition(scmd->result)) {
driver_status = DRIVER_SENSE;
if (pdu->response_addr)
sense_len_wr = min_t(u8, scmd->sense_len,
SCSI_SENSE_BUFFERSIZE);
}
if (sense_len_wr) {
if (copy_to_user(uptr64(pdu->response_addr), scmd->sense_buffer,
sense_len_wr))
ret = -EFAULT;
}
res2 = bsg_scsi_res2_build(status_byte(scmd->result), driver_status,
host_byte(scmd->result), sense_len_wr,
scmd->resid_len);
blk_mq_free_request(rq);
io_uring_cmd_done32(ioucmd, ret, res2,
IO_URING_CMD_TASK_WORK_ISSUE_FLAGS);
}
static enum rq_end_io_ret scsi_bsg_uring_cmd_done(struct request *req,
blk_status_t status,
const struct io_comp_batch *iocb)
{
struct io_uring_cmd *ioucmd = req->end_io_data;
io_uring_cmd_do_in_task_lazy(ioucmd, scsi_bsg_uring_task_cb);
return RQ_END_IO_NONE;
}
static int scsi_bsg_map_user_buffer(struct request *req,
struct io_uring_cmd *ioucmd,
unsigned int issue_flags, gfp_t gfp_mask)
{
const struct bsg_uring_cmd *cmd = io_uring_sqe128_cmd(ioucmd->sqe, struct bsg_uring_cmd);
bool is_write = cmd->dout_xfer_len > 0;
u64 buf_addr = is_write ? cmd->dout_xferp : cmd->din_xferp;
unsigned long buf_len = is_write ? cmd->dout_xfer_len : cmd->din_xfer_len;
struct iov_iter iter;
int ret;
if (ioucmd->flags & IORING_URING_CMD_FIXED) {
ret = io_uring_cmd_import_fixed(buf_addr, buf_len,
is_write ? WRITE : READ,
&iter, ioucmd, issue_flags);
if (ret < 0)
return ret;
ret = blk_rq_map_user_iov(req->q, req, NULL, &iter, gfp_mask);
} else {
ret = blk_rq_map_user(req->q, req, NULL, uptr64(buf_addr),
buf_len, gfp_mask);
}
return ret;
}
static int scsi_bsg_uring_cmd(struct request_queue *q, struct io_uring_cmd *ioucmd,
unsigned int issue_flags, bool open_for_write)
{
struct scsi_bsg_uring_cmd_pdu *pdu = scsi_bsg_uring_cmd_pdu(ioucmd);
const struct bsg_uring_cmd *cmd = io_uring_sqe128_cmd(ioucmd->sqe, struct bsg_uring_cmd);
struct scsi_cmnd *scmd;
struct request *req;
blk_mq_req_flags_t blk_flags = 0;
gfp_t gfp_mask = GFP_KERNEL;
int ret;
if (cmd->protocol != BSG_PROTOCOL_SCSI ||
cmd->subprotocol != BSG_SUB_PROTOCOL_SCSI_CMD)
return -EINVAL;
if (!cmd->request || cmd->request_len == 0)
return -EINVAL;
if (cmd->dout_xfer_len && cmd->din_xfer_len) {
pr_warn_once("BIDI support in bsg has been removed.\n");
return -EOPNOTSUPP;
}
if (cmd->dout_iovec_count > 0 || cmd->din_iovec_count > 0)
return -EOPNOTSUPP;
if (issue_flags & IO_URING_F_NONBLOCK) {
blk_flags = BLK_MQ_REQ_NOWAIT;
gfp_mask = GFP_NOWAIT;
}
req = scsi_alloc_request(q, cmd->dout_xfer_len ?
REQ_OP_DRV_OUT : REQ_OP_DRV_IN, blk_flags);
if (IS_ERR(req))
return PTR_ERR(req);
scmd = blk_mq_rq_to_pdu(req);
if (cmd->request_len > sizeof(scmd->cmnd)) {
ret = -EINVAL;
goto out_free_req;
}
scmd->cmd_len = cmd->request_len;
scmd->allowed = SG_DEFAULT_RETRIES;
if (copy_from_user(scmd->cmnd, uptr64(cmd->request), cmd->request_len)) {
ret = -EFAULT;
goto out_free_req;
}
if (!scsi_cmd_allowed(scmd->cmnd, open_for_write)) {
ret = -EPERM;
goto out_free_req;
}
pdu->response_addr = cmd->response;
scmd->sense_len = cmd->max_response_len ?
min(cmd->max_response_len, SCSI_SENSE_BUFFERSIZE) : SCSI_SENSE_BUFFERSIZE;
if (cmd->dout_xfer_len || cmd->din_xfer_len) {
ret = scsi_bsg_map_user_buffer(req, ioucmd, issue_flags, gfp_mask);
if (ret)
goto out_free_req;
pdu->bio = req->bio;
} else {
pdu->bio = NULL;
}
req->timeout = cmd->timeout_ms ?
msecs_to_jiffies(cmd->timeout_ms) : BLK_DEFAULT_SG_TIMEOUT;
req->end_io = scsi_bsg_uring_cmd_done;
req->end_io_data = ioucmd;
pdu->req = req;
blk_execute_rq_nowait(req, false);
return -EIOCBQUEUED;
out_free_req:
blk_mq_free_request(req);
return ret;
}
static int scsi_bsg_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
bool open_for_write, unsigned int timeout)
{
@@ -99,5 +272,6 @@ out_put_request:
struct bsg_device *scsi_bsg_register_queue(struct scsi_device *sdev)
{
return bsg_register_queue(sdev->request_queue, &sdev->sdev_gendev,
dev_name(&sdev->sdev_gendev), scsi_bsg_sg_io_fn);
dev_name(&sdev->sdev_gendev), scsi_bsg_sg_io_fn,
scsi_bsg_uring_cmd);
}

View File

@@ -173,7 +173,7 @@ static int fd_configure_device(struct se_device *dev)
*/
dev->dev_attrib.max_write_same_len = 0xFFFF;
if (bdev_nonrot(bdev))
if (!bdev_rot(bdev))
dev->dev_attrib.is_nonrot = 1;
} else {
if (!(fd_dev->fbd_flags & FBDF_HAS_SIZE)) {

View File

@@ -148,7 +148,7 @@ static int iblock_configure_device(struct se_device *dev)
else
dev->dev_attrib.max_write_same_len = 0xFFFF;
if (bdev_nonrot(bd))
if (!bdev_rot(bd))
dev->dev_attrib.is_nonrot = 1;
target_configure_write_atomic_from_bdev(&dev->dev_attrib, bd);

View File

@@ -694,7 +694,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
}
if (!bdev_nonrot(file_bdev(bdev_file)))
if (bdev_rot(file_bdev(bdev_file)))
fs_devices->rotating = true;
if (bdev_max_discard_sectors(file_bdev(bdev_file)))
@@ -2919,7 +2919,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
if (!bdev_nonrot(device->bdev))
if (bdev_rot(device->bdev))
fs_devices->rotating = true;
orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);

View File

@@ -73,7 +73,7 @@ static int mbt_mb_init(struct super_block *sb)
ext4_fsblk_t block;
int ret;
/* needed by ext4_mb_init->bdev_nonrot(sb->s_bdev) */
/* needed by ext4_mb_init->bdev_rot(sb->s_bdev) */
sb->s_bdev = kzalloc_obj(*sb->s_bdev);
if (sb->s_bdev == NULL)
return -ENOMEM;

View File

@@ -3840,7 +3840,7 @@ int ext4_mb_init(struct super_block *sb)
spin_lock_init(&lg->lg_prealloc_lock);
}
if (bdev_nonrot(sb->s_bdev))
if (!bdev_rot(sb->s_bdev))
sbi->s_mb_max_linear_groups = 0;
else
sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT;

View File

@@ -670,7 +670,6 @@ xfs_zone_gc_start_chunk(
struct xfs_inode *ip;
struct bio *bio;
xfs_daddr_t daddr;
unsigned int len;
bool is_seq;
if (xfs_is_shutdown(mp))
@@ -685,15 +684,16 @@ xfs_zone_gc_start_chunk(
return false;
}
len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
bio = bio_alloc_bioset(bdev,
min(howmany(len, XFS_GC_BUF_SIZE) + 1, XFS_GC_NR_BUFS),
REQ_OP_READ, GFP_NOFS, &data->bio_set);
/*
* Scratch allocation can wrap around to the same buffer again,
* provision an extra bvec for that case.
*/
bio = bio_alloc_bioset(bdev, XFS_GC_NR_BUFS + 1, REQ_OP_READ, GFP_NOFS,
&data->bio_set);
chunk = container_of(bio, struct xfs_gc_bio, bio);
chunk->ip = ip;
chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
chunk->len = len;
chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
chunk->old_startblock =
xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
chunk->new_daddr = daddr;
@@ -707,8 +707,9 @@ xfs_zone_gc_start_chunk(
bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
bio->bi_end_io = xfs_zone_gc_end_io;
xfs_zone_gc_add_data(chunk);
data->scratch_head = (data->scratch_head + len) % data->scratch_size;
data->scratch_available -= len;
data->scratch_head =
(data->scratch_head + chunk->len) % data->scratch_size;
data->scratch_available -= chunk->len;
XFS_STATS_INC(mp, xs_gc_read_calls);
@@ -899,9 +900,10 @@ out:
static void
xfs_submit_zone_reset_bio(
struct xfs_rtgroup *rtg,
struct bio *bio)
struct bio *bio,
void *priv)
{
struct xfs_rtgroup *rtg = priv;
struct xfs_mount *mp = rtg_mount(rtg);
trace_xfs_zone_reset(rtg);
@@ -933,26 +935,16 @@ xfs_submit_zone_reset_bio(
submit_bio(bio);
}
static void xfs_bio_wait_endio(struct bio *bio)
{
complete(bio->bi_private);
}
int
xfs_zone_gc_reset_sync(
struct xfs_rtgroup *rtg)
{
DECLARE_COMPLETION_ONSTACK(done);
struct bio bio;
int error;
bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
REQ_OP_ZONE_RESET | REQ_SYNC);
bio.bi_private = &done;
bio.bi_end_io = xfs_bio_wait_endio;
xfs_submit_zone_reset_bio(rtg, &bio);
wait_for_completion_io(&done);
bio_await(&bio, rtg, xfs_submit_zone_reset_bio);
error = blk_status_to_errno(bio.bi_status);
bio_uninit(&bio);
return error;
@@ -989,7 +981,7 @@ xfs_zone_gc_reset_zones(
chunk->data = data;
WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
list_add_tail(&chunk->entry, &data->resetting);
xfs_submit_zone_reset_bio(rtg, bio);
xfs_submit_zone_reset_bio(bio, rtg);
} while (next);
}

View File

@@ -1,20 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* HKDF: HMAC-based Key Derivation Function (HKDF), RFC 5869
*
* Extracted from fs/crypto/hkdf.c, which has
* Copyright 2019 Google LLC
*/
#ifndef _CRYPTO_HKDF_H
#define _CRYPTO_HKDF_H
#include <crypto/hash.h>
int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm,
unsigned int ikmlen, const u8 *salt, unsigned int saltlen,
u8 *prk);
int hkdf_expand(struct crypto_shash *hmac_tfm,
const u8 *info, unsigned int infolen,
u8 *okm, unsigned int okmlen);
#endif

View File

@@ -350,8 +350,7 @@ extern void bioset_exit(struct bio_set *);
extern int biovec_init_pool(mempool_t *pool, int pool_entries);
struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
blk_opf_t opf, gfp_t gfp_mask,
struct bio_set *bs);
blk_opf_t opf, gfp_t gfp, struct bio_set *bs);
struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask);
extern void bio_put(struct bio *);
@@ -433,6 +432,8 @@ extern void bio_uninit(struct bio *);
void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf);
void bio_reuse(struct bio *bio, blk_opf_t opf);
void bio_chain(struct bio *, struct bio *);
void bio_await(struct bio *bio, void *priv,
void (*submit)(struct bio *bio, void *priv));
int __must_check bio_add_page(struct bio *bio, struct page *page, unsigned len,
unsigned off);

View File

@@ -14,6 +14,7 @@ enum blk_integrity_flags {
BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2,
BLK_INTEGRITY_REF_TAG = 1 << 3,
BLK_INTEGRITY_STACKED = 1 << 4,
BLK_SPLIT_INTERVAL_CAPABLE = 1 << 5,
};
const char *blk_integrity_profile_name(struct blk_integrity *bi);

View File

@@ -13,6 +13,7 @@
#include <linux/minmax.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/completion.h>
#include <linux/wait.h>
#include <linux/bio.h>
#include <linux/gfp.h>
@@ -201,10 +202,14 @@ struct gendisk {
u8 __rcu *zones_cond;
unsigned int zone_wplugs_hash_bits;
atomic_t nr_zone_wplugs;
spinlock_t zone_wplugs_lock;
spinlock_t zone_wplugs_hash_lock;
struct mempool *zone_wplugs_pool;
struct hlist_head *zone_wplugs_hash;
struct workqueue_struct *zone_wplugs_wq;
spinlock_t zone_wplugs_list_lock;
struct list_head zone_wplugs_list;
struct task_struct *zone_wplugs_worker;
struct completion zone_wplugs_worker_bio_done;
#endif /* CONFIG_BLK_DEV_ZONED */
#if IS_ENABLED(CONFIG_CDROM)
@@ -503,7 +508,7 @@ struct request_queue {
/* hw dispatch queues */
unsigned int nr_hw_queues;
struct blk_mq_hw_ctx * __rcu *queue_hw_ctx;
struct blk_mq_hw_ctx * __rcu *queue_hw_ctx __counted_by_ptr(nr_hw_queues);
struct percpu_ref q_usage_counter;
struct lock_class_key io_lock_cls_key;
@@ -669,6 +674,7 @@ enum {
QUEUE_FLAG_NO_ELV_SWITCH, /* can't switch elevator any more */
QUEUE_FLAG_QOS_ENABLED, /* qos is enabled */
QUEUE_FLAG_BIO_ISSUE_TIME, /* record bio->issue_time_ns */
QUEUE_FLAG_ZONED_QD1_WRITES, /* Limit zoned devices writes to QD=1 */
QUEUE_FLAG_MAX
};
@@ -708,6 +714,8 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q);
test_bit(QUEUE_FLAG_DISABLE_WBT_DEF, &(q)->queue_flags)
#define blk_queue_no_elv_switch(q) \
test_bit(QUEUE_FLAG_NO_ELV_SWITCH, &(q)->queue_flags)
#define blk_queue_zoned_qd1_writes(q) \
test_bit(QUEUE_FLAG_ZONED_QD1_WRITES, &(q)->queue_flags)
extern void blk_set_pm_only(struct request_queue *q);
extern void blk_clear_pm_only(struct request_queue *q);
@@ -1468,11 +1476,6 @@ static inline bool bdev_rot(struct block_device *bdev)
return blk_queue_rot(bdev_get_queue(bdev));
}
static inline bool bdev_nonrot(struct block_device *bdev)
{
return !bdev_rot(bdev);
}
static inline bool bdev_synchronous(struct block_device *bdev)
{
return bdev->bd_disk->queue->limits.features & BLK_FEAT_SYNCHRONOUS;

View File

@@ -7,13 +7,17 @@
struct bsg_device;
struct device;
struct request_queue;
struct io_uring_cmd;
typedef int (bsg_sg_io_fn)(struct request_queue *, struct sg_io_v4 *hdr,
bool open_for_write, unsigned int timeout);
typedef int (bsg_uring_cmd_fn)(struct request_queue *q, struct io_uring_cmd *ioucmd,
unsigned int issue_flags, bool open_for_write);
struct bsg_device *bsg_register_queue(struct request_queue *q,
struct device *parent, const char *name,
bsg_sg_io_fn *sg_io_fn);
bsg_sg_io_fn *sg_io_fn, bsg_uring_cmd_fn *uring_cmd_fn);
void bsg_unregister_queue(struct bsg_device *bcd);
#endif /* _LINUX_BSG_H */

View File

@@ -203,15 +203,6 @@ static inline void bvec_iter_advance_single(const struct bio_vec *bv,
((bvl = mp_bvec_iter_bvec((bio_vec), (iter))), 1); \
bvec_iter_advance_single((bio_vec), &(iter), (bvl).bv_len))
/* for iterating one bio from start to end */
#define BVEC_ITER_ALL_INIT (struct bvec_iter) \
{ \
.bi_sector = 0, \
.bi_size = UINT_MAX, \
.bi_idx = 0, \
.bi_bvec_done = 0, \
}
static inline struct bio_vec *bvec_init_iter_all(struct bvec_iter_all *iter_all)
{
iter_all->done = 0;

Some files were not shown because too many files have changed in this diff Show More