From b26f29b6692f362a343f7cce2e716a16a8fee488 Mon Sep 17 00:00:00 2001 From: Ondrej Kozina Date: Fri, 6 Feb 2026 15:17:55 +0100 Subject: [PATCH 001/146] sed-opal: add UID of Locking Table. As described in ch. 6.3, Table 240 in TCG Storage Architecture Core Specification document. It's also referenced in TCG Storage Opal SSC Feature Set: Single User Mode document, ch. 3.1.1.1 Reactivate method. It will be used later in Reactivate method implemetation for sed-opal interface. Signed-off-by: Ondrej Kozina Reviewed-and-tested-by: Milan Broz Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/opal_proto.h | 1 + block/sed-opal.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/block/opal_proto.h b/block/opal_proto.h index d247a457bf6e..3dfba3de7be1 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -125,6 +125,7 @@ enum opal_uid { OPAL_LOCKING_INFO_TABLE, OPAL_ENTERPRISE_LOCKING_INFO_TABLE, OPAL_DATASTORE, + OPAL_LOCKING_TABLE, /* C_PIN_TABLE object ID's */ OPAL_C_PIN_MSID, OPAL_C_PIN_SID, diff --git a/block/sed-opal.c b/block/sed-opal.c index 3ded1ca723ca..83bee47aa29f 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -160,6 +160,8 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = { { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x00 }, [OPAL_DATASTORE] = { 0x00, 0x00, 0x10, 0x01, 0x00, 0x00, 0x00, 0x00 }, + [OPAL_LOCKING_TABLE] = + { 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x00 }, /* C_PIN_TABLE object ID's */ [OPAL_C_PIN_MSID] = From a184058fb4d3cfcce1a2b4e021451dcc2e88d9c6 Mon Sep 17 00:00:00 2001 From: Ondrej Kozina Date: Fri, 6 Feb 2026 15:17:56 +0100 Subject: [PATCH 002/146] sed-opal: add RangeStartRangeLengthPolicy parameter. As desribed in ch. 3.1.1.1.1.2 of TCG Storage Opal SSC Feature Set: Single User Mode document. To be used later in Reactivate method implementation and in function for retrieving SUM device status. 
Signed-off-by: Ondrej Kozina Reviewed-and-tested-by: Milan Broz Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/opal_proto.h | 1 + 1 file changed, 1 insertion(+) diff --git a/block/opal_proto.h b/block/opal_proto.h index 3dfba3de7be1..b9877eacfe91 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -225,6 +225,7 @@ enum opal_lockingstate { enum opal_parameter { OPAL_SUM_SET_LIST = 0x060000, + OPAL_SUM_RANGE_POLICY = 0x060001, }; enum opal_revertlsp { From c6c9dc91cb5fd30d2e11e7f2ae570e614b013ee1 Mon Sep 17 00:00:00 2001 From: Ondrej Kozina Date: Fri, 6 Feb 2026 15:17:57 +0100 Subject: [PATCH 003/146] sed-opal: add Admin1PIN parameter. As desribed in ch. 3.1.1.1.1.3 of TCG Storage Opal SSC Feature Set: Single User Mode document. To be used later in Reactivate method implementation. Signed-off-by: Ondrej Kozina Reviewed-and-tested-by: Milan Broz Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/opal_proto.h | 1 + 1 file changed, 1 insertion(+) diff --git a/block/opal_proto.h b/block/opal_proto.h index b9877eacfe91..3ccee5977c10 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -226,6 +226,7 @@ enum opal_lockingstate { enum opal_parameter { OPAL_SUM_SET_LIST = 0x060000, OPAL_SUM_RANGE_POLICY = 0x060001, + OPAL_SUM_ADMIN1_PIN = 0x060002, }; enum opal_revertlsp { From aca086ff27c3f67e81617e4b063d1126544a4f19 Mon Sep 17 00:00:00 2001 From: Ondrej Kozina Date: Fri, 6 Feb 2026 15:17:58 +0100 Subject: [PATCH 004/146] sed-opal: add IOC_OPAL_REACTIVATE_LSP. This adds the 'Reactivate' method as described in the "TCG Storage Opal SSC Feature Set: Single User Mode" document (ch. 3.1.1.1). The method enables switching an already active SED OPAL2 device, with appropriate firmware support for Single User Mode (SUM), to or from SUM. 
Signed-off-by: Ondrej Kozina Reviewed-and-tested-by: Milan Broz Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/opal_proto.h | 1 + block/sed-opal.c | 99 +++++++++++++++++++++++++++++++++++ include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 14 +++++ 4 files changed, 115 insertions(+) diff --git a/block/opal_proto.h b/block/opal_proto.h index 3ccee5977c10..d138785b8198 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -155,6 +155,7 @@ enum opal_method { OPAL_AUTHENTICATE, OPAL_RANDOM, OPAL_ERASE, + OPAL_REACTIVATE, }; enum opal_token { diff --git a/block/sed-opal.c b/block/sed-opal.c index 83bee47aa29f..5d06f5f433bf 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -220,6 +220,8 @@ static const u8 opalmethod[][OPAL_METHOD_LENGTH] = { { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x06, 0x01 }, [OPAL_ERASE] = { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x08, 0x03 }, + [OPAL_REACTIVATE] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x08, 0x01 }, }; static int end_opal_session_error(struct opal_dev *dev); @@ -2287,6 +2289,74 @@ static int activate_lsp(struct opal_dev *dev, void *data) return finalize_and_send(dev, parse_and_check_status); } +static int reactivate_lsp(struct opal_dev *dev, void *data) +{ + struct opal_lr_react *opal_react = data; + u8 user_lr[OPAL_UID_LENGTH]; + int err, i; + + err = cmd_start(dev, opaluid[OPAL_THISSP_UID], + opalmethod[OPAL_REACTIVATE]); + + if (err) { + pr_debug("Error building Reactivate LockingSP command.\n"); + return err; + } + + /* + * If neither 'entire_table' nor 'num_lrs' is set, the device + * gets reactivated with SUM disabled. Only Admin1PIN will change + * if set. + */ + if (opal_react->entire_table) { + /* Entire Locking table (all locking ranges) will be put in SUM. 
*/ + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u64(&err, dev, OPAL_SUM_SET_LIST); + add_token_bytestring(&err, dev, opaluid[OPAL_LOCKING_TABLE], OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_ENDNAME); + } else if (opal_react->num_lrs) { + /* Subset of Locking table (selected locking range(s)) to be put in SUM */ + err = build_locking_range(user_lr, sizeof(user_lr), + opal_react->lr[0]); + if (err) + return err; + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u64(&err, dev, OPAL_SUM_SET_LIST); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH); + for (i = 1; i < opal_react->num_lrs; i++) { + user_lr[7] = opal_react->lr[i]; + add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH); + } + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + } + + /* Skipping the rangle policy parameter is same as setting its value to zero */ + if (opal_react->range_policy && (opal_react->num_lrs || opal_react->entire_table)) { + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u64(&err, dev, OPAL_SUM_RANGE_POLICY); + add_token_u8(&err, dev, 1); + add_token_u8(&err, dev, OPAL_ENDNAME); + } + + /* + * Optional parameter. If set, it changes the Admin1 PIN even when SUM + * is being disabled. 
+ */ + if (opal_react->new_admin_key.key_len) { + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u64(&err, dev, OPAL_SUM_ADMIN1_PIN); + add_token_bytestring(&err, dev, opal_react->new_admin_key.key, + opal_react->new_admin_key.key_len); + add_token_u8(&err, dev, OPAL_ENDNAME); + } + + return finalize_and_send(dev, parse_and_check_status); +} + /* Determine if we're in the Manufactured Inactive or Active state */ static int get_lsp_lifecycle(struct opal_dev *dev, void *data) { @@ -2957,6 +3027,32 @@ static int opal_activate_lsp(struct opal_dev *dev, return ret; } +static int opal_reactivate_lsp(struct opal_dev *dev, + struct opal_lr_react *opal_lr_react) +{ + const struct opal_step active_steps[] = { + { start_admin1LSP_opal_session, &opal_lr_react->key }, + { reactivate_lsp, opal_lr_react }, + /* No end_opal_session. The controller terminates the session */ + }; + int ret; + + /* use either 'entire_table' parameter or set of locking ranges */ + if (opal_lr_react->num_lrs > OPAL_MAX_LRS || + (opal_lr_react->num_lrs && opal_lr_react->entire_table)) + return -EINVAL; + + ret = opal_get_key(dev, &opal_lr_react->key); + if (ret) + return ret; + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, active_steps, ARRAY_SIZE(active_steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + static int opal_setup_locking_range(struct opal_dev *dev, struct opal_user_lr_setup *opal_lrs) { @@ -3315,6 +3411,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_SET_SID_PW: ret = opal_set_new_sid_pw(dev, p); break; + case IOC_OPAL_REACTIVATE_LSP: + ret = opal_reactivate_lsp(dev, p); + break; default: break; diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 80f33a93f944..2ae5e6b0ac21 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -53,6 +53,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_DISCOVERY: case IOC_OPAL_REVERT_LSP: case 
IOC_OPAL_SET_SID_PW: + case IOC_OPAL_REACTIVATE_LSP: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 9025dd5a4f0f..d03e590b6501 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -74,6 +74,19 @@ struct opal_lr_act { __u8 align[2]; /* Align to 8 byte boundary */ }; +struct opal_lr_react { + struct opal_key key; + struct opal_key new_admin_key; /* Set new Admin1 PIN if key_len is > 0 */ + __u8 num_lrs; /* + * Configure selected ranges (from lr[]) in SUM. + * If num_lrs > 0 the 'entire_table' must be 0 + */ + __u8 lr[OPAL_MAX_LRS]; + __u8 range_policy; /* Set RangeStartRangeLengthPolicy parameter */ + __u8 entire_table; /* Set all locking objects in SUM */ + __u8 align[4]; /* Align to 8 byte boundary */ +}; + struct opal_session_info { __u32 sum; __u32 who; @@ -216,5 +229,6 @@ struct opal_revert_lsp { #define IOC_OPAL_DISCOVERY _IOW('p', 239, struct opal_discovery) #define IOC_OPAL_REVERT_LSP _IOW('p', 240, struct opal_revert_lsp) #define IOC_OPAL_SET_SID_PW _IOW('p', 241, struct opal_new_pw) +#define IOC_OPAL_REACTIVATE_LSP _IOW('p', 242, struct opal_lr_react) #endif /* _UAPI_SED_OPAL_H */ From 8ff71e6b961beea2ab25850b285287a3350bce92 Mon Sep 17 00:00:00 2001 From: Ondrej Kozina Date: Fri, 6 Feb 2026 15:17:59 +0100 Subject: [PATCH 005/146] sed-opal: refactor (split) IOC_OPAL_LR_SETUP internals. IOC_OPAL_LR_SETUP is used to set up a locking range entirely under a single authority (usually Admin1), but for Single User Mode (SUM), the permissions for attributes (RangeStart, RangeLength) and (ReadLockEnable, WriteLockEnable, ReadLocked, WriteLocked) may be split between two different authorities. Typically, it is Admin1 for the former and the User associated with the LockingRange in SUM for the latter. This commit only splits the internals in preparation for the introduction of separate ioctls for setting RangeStart, RangeLength and the rest using new ioctl calls. 
Signed-off-by: Ondrej Kozina Reviewed-and-tested-by: Milan Broz Signed-off-by: Jens Axboe --- block/sed-opal.c | 85 +++++++++++++++++++++++++++++------------------- 1 file changed, 51 insertions(+), 34 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index 5d06f5f433bf..7be72f621952 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -1518,7 +1518,7 @@ static inline int enable_global_lr(struct opal_dev *dev, u8 *uid, return err; } -static int setup_locking_range(struct opal_dev *dev, void *data) +static int setup_enable_range(struct opal_dev *dev, void *data) { u8 uid[OPAL_UID_LENGTH]; struct opal_user_lr_setup *setup = data; @@ -1532,38 +1532,47 @@ static int setup_locking_range(struct opal_dev *dev, void *data) if (lr == 0) err = enable_global_lr(dev, uid, setup); - else { - err = cmd_start(dev, uid, opalmethod[OPAL_SET]); - - add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_u8(&err, dev, OPAL_VALUES); - add_token_u8(&err, dev, OPAL_STARTLIST); - - add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_u8(&err, dev, OPAL_RANGESTART); - add_token_u64(&err, dev, setup->range_start); - add_token_u8(&err, dev, OPAL_ENDNAME); - - add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_u8(&err, dev, OPAL_RANGELENGTH); - add_token_u64(&err, dev, setup->range_length); - add_token_u8(&err, dev, OPAL_ENDNAME); - - add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_u8(&err, dev, OPAL_READLOCKENABLED); - add_token_u64(&err, dev, !!setup->RLE); - add_token_u8(&err, dev, OPAL_ENDNAME); - - add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_u8(&err, dev, OPAL_WRITELOCKENABLED); - add_token_u64(&err, dev, !!setup->WLE); - add_token_u8(&err, dev, OPAL_ENDNAME); - - add_token_u8(&err, dev, OPAL_ENDLIST); - add_token_u8(&err, dev, OPAL_ENDNAME); - } + else + err = generic_lr_enable_disable(dev, uid, !!setup->RLE, !!setup->WLE, 0, 0); if (err) { - pr_debug("Error building Setup Locking range command.\n"); + pr_debug("Failed to create enable lr command.\n"); + 
return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int setup_locking_range_start_length(struct opal_dev *dev, void *data) +{ + int err; + u8 uid[OPAL_UID_LENGTH]; + struct opal_user_lr_setup *setup = data; + + err = build_locking_range(uid, sizeof(uid), setup->session.opal_key.lr); + if (err) + return err; + + err = cmd_start(dev, uid, opalmethod[OPAL_SET]); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_RANGESTART); + add_token_u64(&err, dev, setup->range_start); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_RANGELENGTH); + add_token_u64(&err, dev, setup->range_length); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + + if (err) { + pr_debug("Error building Setup Locking RangeStartLength command.\n"); return err; } @@ -3058,7 +3067,12 @@ static int opal_setup_locking_range(struct opal_dev *dev, { const struct opal_step lr_steps[] = { { start_auth_opal_session, &opal_lrs->session }, - { setup_locking_range, opal_lrs }, + { setup_locking_range_start_length, opal_lrs }, + { setup_enable_range, opal_lrs }, + { end_opal_session, } + }, lr_global_steps[] = { + { start_auth_opal_session, &opal_lrs->session }, + { setup_enable_range, opal_lrs }, { end_opal_session, } }; int ret; @@ -3068,7 +3082,10 @@ static int opal_setup_locking_range(struct opal_dev *dev, return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); - ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps)); + if (opal_lrs->session.opal_key.lr == 0) + ret = execute_steps(dev, lr_global_steps, ARRAY_SIZE(lr_global_steps)); + else + ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps)); mutex_unlock(&dev->dev_lock); return ret; From 
8e3d34a7ce7386b01947dd649bd24775544e4d3e Mon Sep 17 00:00:00 2001 From: Ondrej Kozina Date: Fri, 6 Feb 2026 15:18:00 +0100 Subject: [PATCH 006/146] sed-opal: add IOC_OPAL_LR_SET_START_LEN ioctl. This ioctl is used to set up locking range start (offset) and locking range length attributes only. In Single User Mode (SUM), if the RangeStartRangeLengthPolicy parameter is set in the 'Reactivate' method, only Admin authority maintains the locking range length and start (offset) attributes of Locking objects set up for SUM. All other attributes from struct opal_user_lr_setup (RLE - read locking enabled, WLE - write locking enabled) shall remain in possession of the User authority associated with the Locking object set for SUM. Therefore, we need a separate function for setting up locking range start and locking range length because it may require two different authorities (and sessions) if the RangeStartRangeLengthPolicy attribute is set. With the IOC_OPAL_LR_SET_START_LEN ioctl, the opal_user_lr_setup members 'RLE' and 'WLE' of the ioctl argument are ignored. 
Signed-off-by: Ondrej Kozina Reviewed-and-tested-by: Milan Broz Signed-off-by: Jens Axboe --- block/sed-opal.c | 28 ++++++++++++++++++++++++++++ include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 1 + 3 files changed, 30 insertions(+) diff --git a/block/sed-opal.c b/block/sed-opal.c index 7be72f621952..55c8a0953d78 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -3091,6 +3091,31 @@ static int opal_setup_locking_range(struct opal_dev *dev, return ret; } +static int opal_setup_locking_range_start_length(struct opal_dev *dev, + struct opal_user_lr_setup *opal_lrs) +{ + const struct opal_step lr_steps[] = { + { start_auth_opal_session, &opal_lrs->session }, + { setup_locking_range_start_length, opal_lrs }, + { end_opal_session, } + }; + int ret; + + /* we can not set global locking range offset or length */ + if (opal_lrs->session.opal_key.lr == 0) + return -EINVAL; + + ret = opal_get_key(dev, &opal_lrs->session.opal_key); + if (ret) + return ret; + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + static int opal_locking_range_status(struct opal_dev *dev, struct opal_lr_status *opal_lrst, void __user *data) @@ -3431,6 +3456,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_REACTIVATE_LSP: ret = opal_reactivate_lsp(dev, p); break; + case IOC_OPAL_LR_SET_START_LEN: + ret = opal_setup_locking_range_start_length(dev, p); + break; default: break; diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 2ae5e6b0ac21..a0df6819b0a9 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -54,6 +54,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_REVERT_LSP: case IOC_OPAL_SET_SID_PW: case IOC_OPAL_REACTIVATE_LSP: + case IOC_OPAL_LR_SET_START_LEN: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 
d03e590b6501..82de38f3fbeb 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -230,5 +230,6 @@ struct opal_revert_lsp { #define IOC_OPAL_REVERT_LSP _IOW('p', 240, struct opal_revert_lsp) #define IOC_OPAL_SET_SID_PW _IOW('p', 241, struct opal_new_pw) #define IOC_OPAL_REACTIVATE_LSP _IOW('p', 242, struct opal_lr_react) +#define IOC_OPAL_LR_SET_START_LEN _IOW('p', 243, struct opal_user_lr_setup) #endif /* _UAPI_SED_OPAL_H */ From a441a9d22433fea561de131e27fff41715c2d186 Mon Sep 17 00:00:00 2001 From: Ondrej Kozina Date: Fri, 6 Feb 2026 15:18:01 +0100 Subject: [PATCH 007/146] sed-opal: add IOC_OPAL_ENABLE_DISABLE_LR. This ioctl is used to set up RLE (read lock enabled) and WLE (write lock enabled) parameters of the Locking object. In Single User Mode (SUM), if the RangeStartRangeLengthPolicy parameter is set in the 'Reactivate' method, only Admin authority maintains the locking range length and start (offset) attributes of Locking objects set up for SUM. All other attributes from struct opal_user_lr_setup (RLE - read locking enabled, WLE - write locking enabled) shall remain in possession of the User authority associated with the Locking object set for SUM. With the IOC_OPAL_ENABLE_DISABLE_LR ioctl, the opal_user_lr_setup members 'range_start' and 'range_length' of the ioctl argument are ignored. 
Signed-off-by: Ondrej Kozina Reviewed-and-tested-by: Milan Broz Signed-off-by: Jens Axboe --- block/sed-opal.c | 24 ++++++++++++++++++++++++ include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 1 + 3 files changed, 26 insertions(+) diff --git a/block/sed-opal.c b/block/sed-opal.c index 55c8a0953d78..53a73422911e 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -3116,6 +3116,27 @@ static int opal_setup_locking_range_start_length(struct opal_dev *dev, return ret; } +static int opal_enable_disable_range(struct opal_dev *dev, + struct opal_user_lr_setup *opal_lrs) +{ + const struct opal_step lr_steps[] = { + { start_auth_opal_session, &opal_lrs->session }, + { setup_enable_range, opal_lrs }, + { end_opal_session, } + }; + int ret; + + ret = opal_get_key(dev, &opal_lrs->session.opal_key); + if (ret) + return ret; + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + static int opal_locking_range_status(struct opal_dev *dev, struct opal_lr_status *opal_lrst, void __user *data) @@ -3459,6 +3480,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_LR_SET_START_LEN: ret = opal_setup_locking_range_start_length(dev, p); break; + case IOC_OPAL_ENABLE_DISABLE_LR: + ret = opal_enable_disable_range(dev, p); + break; default: break; diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index a0df6819b0a9..1d63479838cf 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -55,6 +55,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_SET_SID_PW: case IOC_OPAL_REACTIVATE_LSP: case IOC_OPAL_LR_SET_START_LEN: + case IOC_OPAL_ENABLE_DISABLE_LR: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 82de38f3fbeb..bde023ae2295 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -231,5 +231,6 @@ 
struct opal_revert_lsp { #define IOC_OPAL_SET_SID_PW _IOW('p', 241, struct opal_new_pw) #define IOC_OPAL_REACTIVATE_LSP _IOW('p', 242, struct opal_lr_react) #define IOC_OPAL_LR_SET_START_LEN _IOW('p', 243, struct opal_user_lr_setup) +#define IOC_OPAL_ENABLE_DISABLE_LR _IOW('p', 244, struct opal_user_lr_setup) #endif /* _UAPI_SED_OPAL_H */ From 661025cdbc976eadbdfb4c8fcf6d4ead5c67e645 Mon Sep 17 00:00:00 2001 From: Ondrej Kozina Date: Fri, 6 Feb 2026 15:18:02 +0100 Subject: [PATCH 008/146] sed-opal: increase column attribute type size to 64 bits. Change the column parameter in response_get_column() from u8 to u64 to support the full range of column identifiers. Signed-off-by: Ondrej Kozina Reviewed-and-tested-by: Milan Broz Signed-off-by: Jens Axboe --- block/sed-opal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/sed-opal.c b/block/sed-opal.c index 53a73422911e..6146a1b30421 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -1581,7 +1581,7 @@ static int setup_locking_range_start_length(struct opal_dev *dev, void *data) static int response_get_column(const struct parsed_resp *resp, int *iter, - u8 column, + u64 column, u64 *value) { const struct opal_resp_tok *tok; @@ -1599,7 +1599,7 @@ static int response_get_column(const struct parsed_resp *resp, n++; if (response_get_u64(resp, n) != column) { - pr_debug("Token %d does not match expected column %u.\n", + pr_debug("Token %d does not match expected column %llu.\n", n, column); return OPAL_INVAL_PARAM; } From 0cc9293bccb234552b81c3ebc074f5839f019e01 Mon Sep 17 00:00:00 2001 From: Ondrej Kozina Date: Fri, 6 Feb 2026 15:18:03 +0100 Subject: [PATCH 009/146] sed-opal: add IOC_OPAL_GET_SUM_STATUS ioctl. This adds a function for retrieving the set of Locking objects enabled for Single User Mode (SUM) and the value of the RangeStartRangeLengthPolicy parameter. 
It retrieves data from the LockingInfo table, specifically the columns SingleUserModeRanges and RangeStartLengthPolicy, which were added according to the TCG Opal Feature Set: Single User Mode, as described in chapters 4.4.3.1 and 4.4.3.2. Signed-off-by: Ondrej Kozina Reviewed-and-tested-by: Milan Broz Signed-off-by: Jens Axboe --- block/sed-opal.c | 159 ++++++++++++++++++++++++++++++++++ include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 13 +++ 3 files changed, 173 insertions(+) diff --git a/block/sed-opal.c b/block/sed-opal.c index 6146a1b30421..c34d19e91201 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -1757,6 +1757,12 @@ static int start_anybodyASP_opal_session(struct opal_dev *dev, void *data) OPAL_ADMINSP_UID, NULL, 0); } +static int start_anybodyLSP_opal_session(struct opal_dev *dev, void *data) +{ + return start_generic_opal_session(dev, OPAL_ANYBODY_UID, + OPAL_LOCKINGSP_UID, NULL, 0); +} + static int start_SIDASP_opal_session(struct opal_dev *dev, void *data) { int ret; @@ -3389,6 +3395,156 @@ static int opal_get_geometry(struct opal_dev *dev, void __user *data) return 0; } +static int get_sum_ranges(struct opal_dev *dev, void *data) +{ + const char *lr_uid; + size_t lr_uid_len; + u64 val; + const struct opal_resp_tok *tok; + int err, tok_n = 2; + struct opal_sum_ranges *sranges = data; + const __u8 lr_all[OPAL_MAX_LRS] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 }; + + err = generic_get_columns(dev, opaluid[OPAL_LOCKING_INFO_TABLE], OPAL_SUM_SET_LIST, + OPAL_SUM_RANGE_POLICY); + if (err) { + pr_debug("Couldn't get locking info table columns %d to %d.\n", + OPAL_SUM_SET_LIST, OPAL_SUM_RANGE_POLICY); + return err; + } + + tok = response_get_token(&dev->parsed, tok_n); + if (IS_ERR(tok)) + return PTR_ERR(tok); + + if (!response_token_matches(tok, OPAL_STARTNAME)) { + pr_debug("Unexpected response token type %d.\n", tok_n); + return OPAL_INVAL_PARAM; + } + tok_n++; + + if (response_get_u64(&dev->parsed, tok_n) != OPAL_SUM_SET_LIST) { + 
pr_debug("Token %d does not match expected column %u.\n", + tok_n, OPAL_SUM_SET_LIST); + return OPAL_INVAL_PARAM; + } + tok_n++; + + tok = response_get_token(&dev->parsed, tok_n); + if (IS_ERR(tok)) + return PTR_ERR(tok); + + /* + * The OPAL_SUM_SET_LIST response contains two distinct values: + * + * - the list of individual locking ranges (UIDs) put in SUM. The list + * may also be empty signaling the SUM is disabled. + * + * - the Locking table UID if the entire Locking table is put in SUM. + */ + if (response_token_matches(tok, OPAL_STARTLIST)) { + sranges->num_lrs = 0; + + tok_n++; + tok = response_get_token(&dev->parsed, tok_n); + if (IS_ERR(tok)) + return PTR_ERR(tok); + + while (!response_token_matches(tok, OPAL_ENDLIST)) { + lr_uid_len = response_get_string(&dev->parsed, tok_n, &lr_uid); + if (lr_uid_len != OPAL_UID_LENGTH) { + pr_debug("Unexpected response token type %d.\n", tok_n); + return OPAL_INVAL_PARAM; + } + + if (memcmp(lr_uid, opaluid[OPAL_LOCKINGRANGE_GLOBAL], OPAL_UID_LENGTH)) { + if (lr_uid[5] != LOCKING_RANGE_NON_GLOBAL) { + pr_debug("Unexpected byte %d at LR UUID position 5.\n", + lr_uid[5]); + return OPAL_INVAL_PARAM; + } + sranges->lr[sranges->num_lrs++] = lr_uid[7]; + } else + sranges->lr[sranges->num_lrs++] = 0; + + tok_n++; + tok = response_get_token(&dev->parsed, tok_n); + if (IS_ERR(tok)) + return PTR_ERR(tok); + } + } else { + /* Only OPAL_LOCKING_TABLE UID is an alternative to OPAL_STARTLIST here. 
*/ + lr_uid_len = response_get_string(&dev->parsed, tok_n, &lr_uid); + if (lr_uid_len != OPAL_UID_LENGTH) { + pr_debug("Unexpected response token type %d.\n", tok_n); + return OPAL_INVAL_PARAM; + } + + if (memcmp(lr_uid, opaluid[OPAL_LOCKING_TABLE], OPAL_UID_LENGTH)) { + pr_debug("Unexpected response UID.\n"); + return OPAL_INVAL_PARAM; + } + + /* sed-opal kernel API already provides following limit in Activate command */ + sranges->num_lrs = OPAL_MAX_LRS; + memcpy(sranges->lr, lr_all, OPAL_MAX_LRS); + } + tok_n++; + + tok = response_get_token(&dev->parsed, tok_n); + if (IS_ERR(tok)) + return PTR_ERR(tok); + + if (!response_token_matches(tok, OPAL_ENDNAME)) { + pr_debug("Unexpected response token type %d.\n", tok_n); + return OPAL_INVAL_PARAM; + } + tok_n++; + + err = response_get_column(&dev->parsed, &tok_n, OPAL_SUM_RANGE_POLICY, &val); + if (err) + return err; + + sranges->range_policy = val ? 1 : 0; + + return 0; +} + +static int opal_get_sum_ranges(struct opal_dev *dev, struct opal_sum_ranges *opal_sum_rngs, + void __user *data) +{ + const struct opal_step admin_steps[] = { + { start_admin1LSP_opal_session, &opal_sum_rngs->key }, + { get_sum_ranges, opal_sum_rngs }, + { end_opal_session, } + }, anybody_steps[] = { + { start_anybodyLSP_opal_session, NULL }, + { get_sum_ranges, opal_sum_rngs }, + { end_opal_session, } + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + if (opal_sum_rngs->key.key_len) + /* Use Admin1 session (authenticated by PIN) to retrieve LockingInfo columns */ + ret = execute_steps(dev, admin_steps, ARRAY_SIZE(admin_steps)); + else + /* Use Anybody session (no key) to retrieve LockingInfo columns */ + ret = execute_steps(dev, anybody_steps, ARRAY_SIZE(anybody_steps)); + mutex_unlock(&dev->dev_lock); + + /* skip session info when copying back to uspace */ + if (!ret && copy_to_user(data + offsetof(struct opal_sum_ranges, num_lrs), + (void *)opal_sum_rngs + offsetof(struct opal_sum_ranges, num_lrs), + 
sizeof(*opal_sum_rngs) - offsetof(struct opal_sum_ranges, num_lrs))) { + pr_debug("Error copying SUM ranges info to userspace\n"); + return -EFAULT; + } + + return ret; +} + int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) { void *p; @@ -3483,6 +3639,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_ENABLE_DISABLE_LR: ret = opal_enable_disable_range(dev, p); break; + case IOC_OPAL_GET_SUM_STATUS: + ret = opal_get_sum_ranges(dev, p, arg); + break; default: break; diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index 1d63479838cf..aa006edb612b 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -56,6 +56,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_REACTIVATE_LSP: case IOC_OPAL_LR_SET_START_LEN: case IOC_OPAL_ENABLE_DISABLE_LR: + case IOC_OPAL_GET_SUM_STATUS: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index bde023ae2295..9830298ec51c 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -111,6 +111,18 @@ struct opal_lr_status { __u8 align[4]; }; +struct opal_sum_ranges { + /* + * Initiate Admin1 session if key_len > 0, + * use Anybody session otherwise. 
+ */ + struct opal_key key; + __u8 num_lrs; + __u8 lr[OPAL_MAX_LRS]; + __u8 range_policy; + __u8 align[5]; /* Align to 8 byte boundary */ +}; + struct opal_lock_unlock { struct opal_session_info session; __u32 l_state; @@ -232,5 +244,6 @@ struct opal_revert_lsp { #define IOC_OPAL_REACTIVATE_LSP _IOW('p', 242, struct opal_lr_react) #define IOC_OPAL_LR_SET_START_LEN _IOW('p', 243, struct opal_user_lr_setup) #define IOC_OPAL_ENABLE_DISABLE_LR _IOW('p', 244, struct opal_user_lr_setup) +#define IOC_OPAL_GET_SUM_STATUS _IOW('p', 245, struct opal_sum_ranges) #endif /* _UAPI_SED_OPAL_H */ From 0ee8ab5d4dc51704be1157470f3df8090629f9fc Mon Sep 17 00:00:00 2001 From: Bill Wendling Date: Wed, 25 Feb 2026 20:51:05 +0000 Subject: [PATCH 010/146] block: annotate struct request_queue with __counted_by_ptr The queue_hw_ctx field in struct request_queue is an array of pointers to struct blk_mq_hw_ctx. The number of elements in this array is tracked by the nr_hw_queues field. The array is allocated in __blk_mq_realloc_hw_ctxs() using kcalloc_node() with set->nr_hw_queues elements. q->nr_hw_queues is subsequently updated to set->nr_hw_queues. When growing the array, the new array is assigned to queue_hw_ctx before nr_hw_queues is updated. This is safe because nr_hw_queues (the old smaller count) is used for bounds checking, which is within the new larger allocation. When shrinking the array, nr_hw_queues is updated to the smaller value, while queue_hw_ctx retains the larger allocation. This is also safe as the count is within the allocation bounds. Annotating queue_hw_ctx with __counted_by_ptr(nr_hw_queues) allows the compiler (with kSAN) to verify that accesses to queue_hw_ctx are within the valid range defined by nr_hw_queues. This patch was generated by CodeMender and reviewed by Bill Wendling. Tested by running blktests. 
Reviewed-by: Daniel Wagner Signed-off-by: Bill Wendling [axboe: massage commit message] Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d463b9b5a0a5..540c2c6c9afd 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -502,7 +502,7 @@ struct request_queue { /* hw dispatch queues */ unsigned int nr_hw_queues; - struct blk_mq_hw_ctx * __rcu *queue_hw_ctx; + struct blk_mq_hw_ctx * __rcu *queue_hw_ctx __counted_by_ptr(nr_hw_queues); struct percpu_ref q_usage_counter; struct lock_class_key io_lock_cls_key; From b7d4ffb510373cc6ecf16022dd0e510a023034fb Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 27 Feb 2026 22:19:44 +0900 Subject: [PATCH 011/146] block: fix zone write plug removal Commit 7b295187287e ("block: Do not remove zone write plugs still in use") modified disk_should_remove_zone_wplug() to add a check on the reference count of a zone write plug to prevent removing zone write plugs from a disk hash table when the plugs are still being referenced by BIOs or requests in-flight. However, this check does not take into account that a BIO completion may happen right after its submission by a zone write plug BIO work, and before the zone write plug BIO work releases the zone write plug reference count. This situation leads to disk_should_remove_zone_wplug() returning false as in this case the zone write plug reference count is at least equal to 3. If the BIO that completes in such manner transitioned the zone to the FULL condition, the zone write plug for the FULL zone will remain in the disk hash table. Furthermore, relying on a particular value of a zone write plug reference count to set the BLK_ZONE_WPLUG_UNHASHED flag is fragile as reading the atomic reference count and doing a comparison with some value is not overall atomic at all. 
Address these issues by reworking the reference counting of zone write plugs so that removing plugs from a disk hash table can be done directly from disk_put_zone_wplug() when the last reference on a plug is dropped. To do so, replace the function disk_remove_zone_wplug() with disk_mark_zone_wplug_dead(). This new function sets the zone write plug flag BLK_ZONE_WPLUG_DEAD (which replaces BLK_ZONE_WPLUG_UNHASHED) and drops the initial reference on the zone write plug taken when the plug was added to the disk hash table. This function is called either for zones that are empty or full, or directly in the case of a forced plug removal (e.g. when the disk hash table is being destroyed on disk removal). With this change, disk_should_remove_zone_wplug() is also removed. disk_put_zone_wplug() is modified to call the function disk_free_zone_wplug() to remove a zone write plug from a disk hash table and free the plug structure (with a call_rcu()), when the last reference on a zone write plug is dropped. disk_free_zone_wplug() always checks that the BLK_ZONE_WPLUG_DEAD flag is set. In order to avoid having multiple zone write plugs for the same zone in the disk hash table, disk_get_and_lock_zone_wplug() checked for the BLK_ZONE_WPLUG_UNHASHED flag. This check is removed and a check for the new BLK_ZONE_WPLUG_DEAD flag is added to blk_zone_wplug_handle_write(). With this change, we continue preventing adding multiple zone write plugs for the same zone and at the same time reinforce checks on the user behavior by failing new incoming write BIOs targeting a zone that is marked as dead. This case can happen only if the user erroneously issues write BIOs to zones that are full, or to zones that are currently being reset or finished. 
Fixes: 7b295187287e ("block: Do not remove zone write plugs still in use") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-zoned.c | 151 +++++++++++++++++----------------------------- 1 file changed, 57 insertions(+), 94 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 9d1dd6ccfad7..6e3ef181e837 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -99,17 +99,17 @@ static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk) * being executed or the zone write plug bio list is not empty. * - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone * write pointer offset and need to update it. - * - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed - * from the disk hash table and that the initial reference to the zone - * write plug set when the plug was first added to the hash table has been - * dropped. This flag is set when a zone is reset, finished or become full, - * to prevent new references to the zone write plug to be taken for - * newly incoming BIOs. A zone write plug flagged with this flag will be - * freed once all remaining references from BIOs or functions are dropped. + * - BLK_ZONE_WPLUG_DEAD: Indicates that the zone write plug will be + * removed from the disk hash table of zone write plugs when the last + * reference on the zone write plug is dropped. If set, this flag also + * indicates that the initial extra reference on the zone write plug was + * dropped, meaning that the reference count indicates the current number of + * active users (code context or BIOs and requests in flight). This flag is + * set when a zone is reset, finished or becomes full. 
*/ #define BLK_ZONE_WPLUG_PLUGGED (1U << 0) #define BLK_ZONE_WPLUG_NEED_WP_UPDATE (1U << 1) -#define BLK_ZONE_WPLUG_UNHASHED (1U << 2) +#define BLK_ZONE_WPLUG_DEAD (1U << 2) /** * blk_zone_cond_str - Return a zone condition name string @@ -587,64 +587,15 @@ static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head) mempool_free(zwplug, zwplug->disk->zone_wplugs_pool); } -static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug) -{ - if (refcount_dec_and_test(&zwplug->ref)) { - WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list)); - WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED); - WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)); - - call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu); - } -} - -static inline bool disk_should_remove_zone_wplug(struct gendisk *disk, - struct blk_zone_wplug *zwplug) -{ - lockdep_assert_held(&zwplug->lock); - - /* If the zone write plug was already removed, we are done. */ - if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) - return false; - - /* If the zone write plug is still plugged, it cannot be removed. */ - if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) - return false; - - /* - * Completions of BIOs with blk_zone_write_plug_bio_endio() may - * happen after handling a request completion with - * blk_zone_write_plug_finish_request() (e.g. with split BIOs - * that are chained). In such case, disk_zone_wplug_unplug_bio() - * should not attempt to remove the zone write plug until all BIO - * completions are seen. Check by looking at the zone write plug - * reference count, which is 2 when the plug is unused (one reference - * taken when the plug was allocated and another reference taken by the - * caller context). - */ - if (refcount_read(&zwplug->ref) > 2) - return false; - - /* We can remove zone write plugs for zones that are empty or full. 
*/ - return !zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug); -} - -static void disk_remove_zone_wplug(struct gendisk *disk, - struct blk_zone_wplug *zwplug) +static void disk_free_zone_wplug(struct blk_zone_wplug *zwplug) { + struct gendisk *disk = zwplug->disk; unsigned long flags; - /* If the zone write plug was already removed, we have nothing to do. */ - if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) - return; + WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_DEAD)); + WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED); + WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list)); - /* - * Mark the zone write plug as unhashed and drop the extra reference we - * took when the plug was inserted in the hash table. Also update the - * disk zone condition array with the current condition of the zone - * write plug. - */ - zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED; spin_lock_irqsave(&disk->zone_wplugs_lock, flags); blk_zone_set_cond(rcu_dereference_check(disk->zones_cond, lockdep_is_held(&disk->zone_wplugs_lock)), @@ -652,7 +603,29 @@ static void disk_remove_zone_wplug(struct gendisk *disk, hlist_del_init_rcu(&zwplug->node); atomic_dec(&disk->nr_zone_wplugs); spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); - disk_put_zone_wplug(zwplug); + + call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu); +} + +static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug) +{ + if (refcount_dec_and_test(&zwplug->ref)) + disk_free_zone_wplug(zwplug); +} + +/* + * Flag the zone write plug as dead and drop the initial reference we got when + * the zone write plug was added to the hash table. The zone write plug will be + * unhashed when its last reference is dropped. 
+ */ +static void disk_mark_zone_wplug_dead(struct blk_zone_wplug *zwplug) +{ + lockdep_assert_held(&zwplug->lock); + + if (!(zwplug->flags & BLK_ZONE_WPLUG_DEAD)) { + zwplug->flags |= BLK_ZONE_WPLUG_DEAD; + disk_put_zone_wplug(zwplug); + } } static void blk_zone_wplug_bio_work(struct work_struct *work); @@ -672,18 +645,7 @@ static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk, again: zwplug = disk_get_zone_wplug(disk, sector); if (zwplug) { - /* - * Check that a BIO completion or a zone reset or finish - * operation has not already removed the zone write plug from - * the hash table and dropped its reference count. In such case, - * we need to get a new plug so start over from the beginning. - */ spin_lock_irqsave(&zwplug->lock, *flags); - if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) { - spin_unlock_irqrestore(&zwplug->lock, *flags); - disk_put_zone_wplug(zwplug); - goto again; - } return zwplug; } @@ -788,14 +750,8 @@ static void disk_zone_wplug_set_wp_offset(struct gendisk *disk, disk_zone_wplug_update_cond(disk, zwplug); disk_zone_wplug_abort(zwplug); - - /* - * The zone write plug now has no BIO plugged: remove it from the - * hash table so that it cannot be seen. The plug will be freed - * when the last reference is dropped. - */ - if (disk_should_remove_zone_wplug(disk, zwplug)) - disk_remove_zone_wplug(disk, zwplug); + if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug)) + disk_mark_zone_wplug_dead(zwplug); } static unsigned int blk_zone_wp_offset(struct blk_zone *zone) @@ -1447,6 +1403,19 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) return true; } + /* + * If we got a zone write plug marked as dead, then the user is issuing + * writes to a full zone, or without synchronizing with zone reset or + * zone finish operations. In such case, fail the BIO to signal this + * invalid usage. 
+ */ + if (zwplug->flags & BLK_ZONE_WPLUG_DEAD) { + spin_unlock_irqrestore(&zwplug->lock, flags); + disk_put_zone_wplug(zwplug); + bio_io_error(bio); + return true; + } + /* Indicate that this BIO is being handled using zone write plugging. */ bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING); @@ -1527,7 +1496,7 @@ static void blk_zone_wplug_handle_native_zone_append(struct bio *bio) disk->disk_name, zwplug->zone_no); disk_zone_wplug_abort(zwplug); } - disk_remove_zone_wplug(disk, zwplug); + disk_mark_zone_wplug_dead(zwplug); spin_unlock_irqrestore(&zwplug->lock, flags); disk_put_zone_wplug(zwplug); @@ -1630,14 +1599,8 @@ static void disk_zone_wplug_unplug_bio(struct gendisk *disk, } zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; - - /* - * If the zone is full (it was fully written or finished, or empty - * (it was reset), remove its zone write plug from the hash table. - */ - if (disk_should_remove_zone_wplug(disk, zwplug)) - disk_remove_zone_wplug(disk, zwplug); - + if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug)) + disk_mark_zone_wplug_dead(zwplug); spin_unlock_irqrestore(&zwplug->lock, flags); } @@ -1848,9 +1811,9 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) while (!hlist_empty(&disk->zone_wplugs_hash[i])) { zwplug = hlist_entry(disk->zone_wplugs_hash[i].first, struct blk_zone_wplug, node); - refcount_inc(&zwplug->ref); - disk_remove_zone_wplug(disk, zwplug); - disk_put_zone_wplug(zwplug); + spin_lock_irq(&zwplug->lock); + disk_mark_zone_wplug_dead(zwplug); + spin_unlock_irq(&zwplug->lock); } } From 0a8b8af896e0ef83e188e1fe20f98f2bbb1c2459 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 27 Feb 2026 22:19:45 +0900 Subject: [PATCH 012/146] block: fix zone write plugs refcount handling in disk_zone_wplug_schedule_bio_work() The function disk_zone_wplug_schedule_bio_work() always takes a reference on the zone write plug of the BIO work being scheduled. 
This ensures that the zone write plug cannot be freed while the BIO work is being scheduled but has not run yet. However, this unconditional reference taking is fragile since the reference taken is released by the BIO work blk_zone_wplug_bio_work() function, which implies that there always must be a 1:1 relation between the work being scheduled and the work running. Make sure to drop the reference taken when scheduling the BIO work if the work is already scheduled, that is, when queue_work() returns false. Fixes: 9e78c38ab30b ("block: Hold a reference on zone write plugs to schedule submission") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-zoned.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 6e3ef181e837..7aae3c236cad 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -1154,13 +1154,17 @@ static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk, lockdep_assert_held(&zwplug->lock); /* - * Take a reference on the zone write plug and schedule the submission - * of the next plugged BIO. blk_zone_wplug_bio_work() will release the - * reference we take here. + * Schedule the submission of the next plugged BIO. Taking a reference + * to the zone write plug is required as the bio_work belongs to the + * plug, and thus we must ensure that the write plug does not go away + * while the work is being scheduled but has not run yet. + * blk_zone_wplug_bio_work() will release the reference we take here, + * and we also drop this reference if the work is already scheduled. 
*/ WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)); refcount_inc(&zwplug->ref); - queue_work(disk->zone_wplugs_wq, &zwplug->bio_work); + if (!queue_work(disk->zone_wplugs_wq, &zwplug->bio_work)) + disk_put_zone_wplug(zwplug); } static inline void disk_zone_wplug_add_bio(struct gendisk *disk, From 1084e41deeada93eebfd83572cf29029c24e5443 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 27 Feb 2026 22:19:46 +0900 Subject: [PATCH 013/146] block: rename and simplify disk_get_and_lock_zone_wplug() disk_get_and_lock_zone_wplug() always returns a zone write plug with the plug lock held. This is unnecessary since this function does not look at the fields of existing plugs, and new plugs need to be locked only after their insertion in the disk hash table, when they are being used. Remove the zone write plug locking from disk_get_and_lock_zone_wplug() and rename this function disk_get_or_alloc_zone_wplug(). blk_zone_wplug_handle_write() is modified to add locking of the zone write plug after calling disk_get_or_alloc_zone_wplug() and before starting to use the plug. This change also simplifies blk_revalidate_seq_zone() as unlocking the plug becomes unnecessary. Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-zoned.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 7aae3c236cad..185651a0d617 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -631,23 +631,20 @@ static void disk_mark_zone_wplug_dead(struct blk_zone_wplug *zwplug) static void blk_zone_wplug_bio_work(struct work_struct *work); /* - * Get a reference on the write plug for the zone containing @sector. - * If the plug does not exist, it is allocated and hashed. - * Return a pointer to the zone write plug with the plug spinlock held. + * Get a zone write plug for the zone containing @sector. 
+ * If the plug does not exist, it is allocated and inserted in the disk hash + * table. */ -static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk, - sector_t sector, gfp_t gfp_mask, - unsigned long *flags) +static struct blk_zone_wplug *disk_get_or_alloc_zone_wplug(struct gendisk *disk, + sector_t sector, gfp_t gfp_mask) { unsigned int zno = disk_zone_no(disk, sector); struct blk_zone_wplug *zwplug; again: zwplug = disk_get_zone_wplug(disk, sector); - if (zwplug) { - spin_lock_irqsave(&zwplug->lock, *flags); + if (zwplug) return zwplug; - } /* * Allocate and initialize a zone write plug with an extra reference @@ -668,15 +665,12 @@ again: INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work); zwplug->disk = disk; - spin_lock_irqsave(&zwplug->lock, *flags); - /* * Insert the new zone write plug in the hash table. This can fail only * if another context already inserted a plug. Retry from the beginning * in such case. */ if (!disk_insert_zone_wplug(disk, zwplug)) { - spin_unlock_irqrestore(&zwplug->lock, *flags); mempool_free(zwplug, disk->zone_wplugs_pool); goto again; } @@ -1398,7 +1392,7 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) if (bio->bi_opf & REQ_NOWAIT) gfp_mask = GFP_NOWAIT; - zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags); + zwplug = disk_get_or_alloc_zone_wplug(disk, sector, gfp_mask); if (!zwplug) { if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); @@ -1407,6 +1401,8 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) return true; } + spin_lock_irqsave(&zwplug->lock, flags); + /* * If we got a zone write plug marked as dead, then the user is issuing * writes to a full zone, or without synchronizing with zone reset or @@ -2045,7 +2041,6 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, struct gendisk *disk = args->disk; struct blk_zone_wplug *zwplug; unsigned int wp_offset; - unsigned long flags; /* * 
Remember the capacity of the first sequential zone and check @@ -2075,10 +2070,9 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, if (!wp_offset || wp_offset >= zone->capacity) return 0; - zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags); + zwplug = disk_get_or_alloc_zone_wplug(disk, zone->wp, GFP_NOIO); if (!zwplug) return -ENOMEM; - spin_unlock_irqrestore(&zwplug->lock, flags); disk_put_zone_wplug(zwplug); return 0; From c30e8c4bb0e088068a7aae2d98882ec1cfa57d4c Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 27 Feb 2026 22:19:47 +0900 Subject: [PATCH 014/146] block: remove disk_zone_is_full() The helper function disk_zone_is_full() is only used in disk_zone_wplug_is_full(). So remove it and open code it directly in this single caller. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-zoned.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 185651a0d617..26c2aa79faf6 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -492,18 +492,12 @@ static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone) return zone->start + zone->len >= get_capacity(disk); } -static bool disk_zone_is_full(struct gendisk *disk, - unsigned int zno, unsigned int offset_in_zone) -{ - if (zno < disk->nr_zones - 1) - return offset_in_zone >= disk->zone_capacity; - return offset_in_zone >= disk->last_zone_capacity; -} - static bool disk_zone_wplug_is_full(struct gendisk *disk, struct blk_zone_wplug *zwplug) { - return disk_zone_is_full(disk, zwplug->zone_no, zwplug->wp_offset); + if (zwplug->zone_no < disk->nr_zones - 1) + return zwplug->wp_offset >= disk->zone_capacity; + return zwplug->wp_offset >= disk->last_zone_capacity; } static bool disk_insert_zone_wplug(struct gendisk *disk, From 
b7cbc30e93e3a64ea058230f6d0c764d6d80276f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 27 Feb 2026 22:19:48 +0900 Subject: [PATCH 015/146] block: rename struct gendisk zone_wplugs_lock field Rename struct gendisk zone_wplugs_lock field to zone_wplugs_hash_lock to clearly indicate that this is the spinlock used for manipulating the hash table of zone write plugs. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-zoned.c | 23 ++++++++++++----------- include/linux/blkdev.h | 2 +- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 26c2aa79faf6..78810e726222 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -514,10 +514,11 @@ static bool disk_insert_zone_wplug(struct gendisk *disk, * are racing with other submission context, so we may already have a * zone write plug for the same zone. */ - spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags); hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) { if (zwplg->zone_no == zwplug->zone_no) { - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, + flags); return false; } } @@ -529,7 +530,7 @@ static bool disk_insert_zone_wplug(struct gendisk *disk, * necessarilly in the active condition. 
*/ zones_cond = rcu_dereference_check(disk->zones_cond, - lockdep_is_held(&disk->zone_wplugs_lock)); + lockdep_is_held(&disk->zone_wplugs_hash_lock)); if (zones_cond) zwplug->cond = zones_cond[zwplug->zone_no]; else @@ -537,7 +538,7 @@ static bool disk_insert_zone_wplug(struct gendisk *disk, hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]); atomic_inc(&disk->nr_zone_wplugs); - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags); return true; } @@ -590,13 +591,13 @@ static void disk_free_zone_wplug(struct blk_zone_wplug *zwplug) WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED); WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list)); - spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags); blk_zone_set_cond(rcu_dereference_check(disk->zones_cond, - lockdep_is_held(&disk->zone_wplugs_lock)), + lockdep_is_held(&disk->zone_wplugs_hash_lock)), zwplug->zone_no, zwplug->cond); hlist_del_init_rcu(&zwplug->node); atomic_dec(&disk->nr_zone_wplugs); - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags); call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu); } @@ -1739,7 +1740,7 @@ put_zwplug: void disk_init_zone_resources(struct gendisk *disk) { - spin_lock_init(&disk->zone_wplugs_lock); + spin_lock_init(&disk->zone_wplugs_hash_lock); } /* @@ -1829,10 +1830,10 @@ static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond) { unsigned long flags; - spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags); zones_cond = rcu_replace_pointer(disk->zones_cond, zones_cond, - lockdep_is_held(&disk->zone_wplugs_lock)); - spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + lockdep_is_held(&disk->zone_wplugs_hash_lock)); + spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags); kfree_rcu_mightsleep(zones_cond); } 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 540c2c6c9afd..a49a1e38c6e7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -200,7 +200,7 @@ struct gendisk { u8 __rcu *zones_cond; unsigned int zone_wplugs_hash_bits; atomic_t nr_zone_wplugs; - spinlock_t zone_wplugs_lock; + spinlock_t zone_wplugs_hash_lock; struct mempool *zone_wplugs_pool; struct hlist_head *zone_wplugs_hash; struct workqueue_struct *zone_wplugs_wq; From 1365b6904fd050bf22ab9f3df375a396de5837a1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 27 Feb 2026 22:19:49 +0900 Subject: [PATCH 016/146] block: allow submitting all zone writes from a single context In order to maintain sequential write patterns per zone with zoned block devices, zone write plugging issues only a single write BIO per zone at any time. This works well but has the side effect that when large sequential write streams are issued by the user and these streams cross zone boundaries, the device ends up receiving a discontiguous set of write commands for different zones. The same also happens when a user writes simultaneously at high queue depth multiple zones: the device does not see all sequential writes per zone and receives discontiguous writes to different zones. While this does not affect the performance of solid state zoned block devices, when using an SMR HDD, this pattern change from sequential writes to discontiguous writes to different zones significantly increases head seek which results in degraded write throughput. In order to reduce this seek overhead for rotational media devices, introduce a per disk zone write plugs kernel thread to issue all write BIOs to zones. This single zone write issuing context is enabled for any zoned block device that has a request queue flagged with the new QUEUE_ZONED_QD1_WRITES flag. The flag QUEUE_ZONED_QD1_WRITES is visible as the sysfs queue attribute zoned_qd1_writes for zoned devices. 
For regular block devices, this attribute is not visible. For zoned block devices, a user can override the default value set to force the global write maximum queue depth of 1 for a zoned block device, or clear this attribute to fall back to the default behavior of zone write plugging which limits writes to QD=1 per sequential zone. Writing to a zoned block device flagged with QUEUE_ZONED_QD1_WRITES is implemented using a list of zone write plugs that have a non-empty BIO list. Listed zone write plugs are processed by the disk zone write plugs worker kthread in FIFO order, and all BIOs of a zone write plug are processed before switching to the next listed zone write plug. A newly submitted BIO for a non-FULL zone write plug that is not yet listed causes the addition of the zone write plug at the end of the disk list of zone write plugs. Since the write BIOs queued in a zone write plug BIO list are necessarily sequential, for rotational media, using the single zone write plugs kthread to issue all BIOs maintains a sequential write pattern and thus reduces seek overhead and improves write throughput. This processing essentially results in always writing to HDDs at QD=1, which is not an issue for HDDs operating with write caching enabled. Performance with write cache disabled is also not degraded thanks to the efficient write handling of modern SMR HDDs. A disk list of zone write plugs is defined using the new struct gendisk zone_wplugs_list, and accesses to this list are protected using the zone_wplugs_list_lock spinlock. The per disk kthread (zone_wplugs_worker) code is implemented by the function disk_zone_wplugs_worker(). A reference on listed zone write plugs is always held until all BIOs of the zone write plug are processed by the worker kthread. BIO issuing at QD=1 is driven using a completion structure (zone_wplugs_worker_bio_done) and calls to blk_io_wait(). 
With this change, performance when sequentially writing the zones of a 30 TB SMR SATA HDD connected to an AHCI adapter changes as follows (1MiB direct I/Os, results in MB/s unit): +--------------------+ | Write BW (MB/s) | +------------------+----------+---------+ | Sequential write | Baseline | Patched | | Queue Depth | 6.19-rc8 | | +------------------+----------+---------+ | 1 | 244 | 245 | | 2 | 244 | 245 | | 4 | 245 | 245 | | 8 | 242 | 245 | | 16 | 222 | 246 | | 32 | 211 | 245 | | 64 | 193 | 244 | | 128 | 112 | 246 | +------------------+----------+---------+ With the current code (baseline), as the sequential write stream crosses a zone boundary, higher queue depth creates a gap between the last IO to the previous zone and the first IOs to the following zones, causing head seeks and degrading performance. Using the disk zone write plugs worker thread, this pattern disappears and the maximum throughput of the drive is maintained, leading to over 100% improvements in throughput for high queue depth write. Using 16 fio jobs all writing to randomly chosen zones at QD=32 with 1 MiB direct IOs, write throughput also increases significantly. +--------------------+ | Write BW (MB/s) | +------------------+----------+---------+ | Random write | Baseline | Patched | | Number of zones | 6.19-rc7 | | +------------------+----------+---------+ | 1 | 191 | 192 | | 2 | 101 | 128 | | 4 | 115 | 123 | | 8 | 90 | 120 | | 16 | 64 | 115 | | 32 | 58 | 105 | | 64 | 56 | 101 | | 128 | 55 | 99 | +------------------+----------+---------+ Tests using XFS shows that buffered write speed with 8 jobs writing files increases by 12% to 35% depending on the workload. +--------------------+ | Write BW (MB/s) | +------------------+----------+---------+ | Workload | Baseline | Patched | | | 6.19-rc7 | | +------------------+----------+---------+ | 256MiB file size | 212 | 238 | +------------------+----------+---------+ | 4MiB .. 
128 MiB | 213 | 243 | | random file size | | | +------------------+----------+---------+ | 2MiB .. 8 MiB | 179 | 242 | | random file size | | | +------------------+----------+---------+ Performance gains are even more significant when using an HBA that limits the maximum size of commands to a small value, e.g. HBAs controlled with the mpi3mr driver limit commands to a maximum of 1 MiB. In such case, the write throughput gains are over 40%. +--------------------+ | Write BW (MB/s) | +------------------+----------+---------+ | Workload | Baseline | Patched | | | 6.19-rc7 | | +------------------+----------+---------+ | 256MiB file size | 175 | 245 | +------------------+----------+---------+ | 4MiB .. 128 MiB | 174 | 244 | | random file size | | | +------------------+----------+---------+ | 2MiB .. 8 MiB | 171 | 243 | | random file size | | | +------------------+----------+---------+ Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 1 + block/blk-sysfs.c | 35 +++++++- block/blk-zoned.c | 190 ++++++++++++++++++++++++++++++++++++----- include/linux/blkdev.h | 8 ++ 4 files changed, 212 insertions(+), 22 deletions(-) diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 28167c9baa55..047ec887456b 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -97,6 +97,7 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(NO_ELV_SWITCH), QUEUE_FLAG_NAME(QOS_ENABLED), QUEUE_FLAG_NAME(BIO_ISSUE_TIME), + QUEUE_FLAG_NAME(ZONED_QD1_WRITES), }; #undef QUEUE_FLAG_NAME diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 55a1bbfef7d4..ca8033e6d699 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -390,6 +390,36 @@ static ssize_t queue_nr_zones_show(struct gendisk *disk, char *page) return queue_var_show(disk_nr_zones(disk), page); } +static ssize_t queue_zoned_qd1_writes_show(struct gendisk *disk, char *page) 
+{ + return queue_var_show(!!blk_queue_zoned_qd1_writes(disk->queue), + page); +} + +static ssize_t queue_zoned_qd1_writes_store(struct gendisk *disk, + const char *page, size_t count) +{ + struct request_queue *q = disk->queue; + unsigned long qd1_writes; + unsigned int memflags; + ssize_t ret; + + ret = queue_var_store(&qd1_writes, page, count); + if (ret < 0) + return ret; + + memflags = blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + if (qd1_writes) + blk_queue_flag_set(QUEUE_FLAG_ZONED_QD1_WRITES, q); + else + blk_queue_flag_clear(QUEUE_FLAG_ZONED_QD1_WRITES, q); + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q, memflags); + + return count; +} + static ssize_t queue_iostats_passthrough_show(struct gendisk *disk, char *page) { return queue_var_show(!!blk_queue_passthrough_stat(disk->queue), page); @@ -617,6 +647,7 @@ QUEUE_LIM_RO_ENTRY(queue_max_zone_append_sectors, "zone_append_max_bytes"); QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); QUEUE_LIM_RO_ENTRY(queue_zoned, "zoned"); +QUEUE_RW_ENTRY(queue_zoned_qd1_writes, "zoned_qd1_writes"); QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones"); QUEUE_LIM_RO_ENTRY(queue_max_open_zones, "max_open_zones"); QUEUE_LIM_RO_ENTRY(queue_max_active_zones, "max_active_zones"); @@ -754,6 +785,7 @@ static struct attribute *queue_attrs[] = { &queue_nomerges_entry.attr, &queue_poll_entry.attr, &queue_poll_delay_entry.attr, + &queue_zoned_qd1_writes_entry.attr, NULL, }; @@ -786,7 +818,8 @@ static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, struct request_queue *q = disk->queue; if ((attr == &queue_max_open_zones_entry.attr || - attr == &queue_max_active_zones_entry.attr) && + attr == &queue_max_active_zones_entry.attr || + attr == &queue_zoned_qd1_writes_entry.attr) && !blk_queue_is_zoned(q)) return 0; diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 78810e726222..e1a23c8b676d 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -16,6 +16,8 @@ #include 
#include #include +#include +#include #include @@ -40,6 +42,8 @@ static const char *const zone_cond_name[] = { /* * Per-zone write plug. * @node: hlist_node structure for managing the plug using a hash table. + * @entry: list_head structure for listing the plug in the disk list of active + * zone write plugs. * @bio_list: The list of BIOs that are currently plugged. * @bio_work: Work struct to handle issuing of plugged BIOs * @rcu_head: RCU head to free zone write plugs with an RCU grace period. @@ -62,6 +66,7 @@ static const char *const zone_cond_name[] = { */ struct blk_zone_wplug { struct hlist_node node; + struct list_head entry; struct bio_list bio_list; struct work_struct bio_work; struct rcu_head rcu_head; @@ -623,7 +628,19 @@ static void disk_mark_zone_wplug_dead(struct blk_zone_wplug *zwplug) } } -static void blk_zone_wplug_bio_work(struct work_struct *work); +static bool disk_zone_wplug_submit_bio(struct gendisk *disk, + struct blk_zone_wplug *zwplug); + +static void blk_zone_wplug_bio_work(struct work_struct *work) +{ + struct blk_zone_wplug *zwplug = + container_of(work, struct blk_zone_wplug, bio_work); + + disk_zone_wplug_submit_bio(zwplug->disk, zwplug); + + /* Drop the reference we took in disk_zone_wplug_schedule_work(). */ + disk_put_zone_wplug(zwplug); +} /* * Get a zone write plug for the zone containing @sector. 
@@ -658,6 +675,7 @@ again: zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector); bio_list_init(&zwplug->bio_list); INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work); + INIT_LIST_HEAD(&zwplug->entry); zwplug->disk = disk; /* @@ -690,6 +708,7 @@ static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug, */ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) { + struct gendisk *disk = zwplug->disk; struct bio *bio; lockdep_assert_held(&zwplug->lock); @@ -703,6 +722,20 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) blk_zone_wplug_bio_io_error(zwplug, bio); zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; + + /* + * If we are using the per disk zone write plugs worker thread, remove + * the zone write plug from the work list and drop the reference we + * took when the zone write plug was added to that list. + */ + if (blk_queue_zoned_qd1_writes(disk->queue)) { + spin_lock(&disk->zone_wplugs_list_lock); + if (!list_empty(&zwplug->entry)) { + list_del_init(&zwplug->entry); + disk_put_zone_wplug(zwplug); + } + spin_unlock(&disk->zone_wplugs_list_lock); + } } /* @@ -1137,8 +1170,8 @@ void blk_zone_mgmt_bio_endio(struct bio *bio) } } -static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk, - struct blk_zone_wplug *zwplug) +static void disk_zone_wplug_schedule_work(struct gendisk *disk, + struct blk_zone_wplug *zwplug) { lockdep_assert_held(&zwplug->lock); @@ -1151,6 +1184,7 @@ static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk, * and we also drop this reference if the work is already scheduled. 
*/ WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)); + WARN_ON_ONCE(blk_queue_zoned_qd1_writes(disk->queue)); refcount_inc(&zwplug->ref); if (!queue_work(disk->zone_wplugs_wq, &zwplug->bio_work)) disk_put_zone_wplug(zwplug); @@ -1190,6 +1224,22 @@ static inline void disk_zone_wplug_add_bio(struct gendisk *disk, bio_list_add(&zwplug->bio_list, bio); trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no, bio->bi_iter.bi_sector, bio_sectors(bio)); + + /* + * If we are using the disk zone write plugs worker instead of the per + * zone write plug BIO work, add the zone write plug to the work list + * if it is not already there. Make sure to also get an extra reference + * on the zone write plug so that it does not go away until it is + * removed from the work list. + */ + if (blk_queue_zoned_qd1_writes(disk->queue)) { + spin_lock(&disk->zone_wplugs_list_lock); + if (list_empty(&zwplug->entry)) { + list_add_tail(&zwplug->entry, &disk->zone_wplugs_list); + refcount_inc(&zwplug->ref); + } + spin_unlock(&disk->zone_wplugs_list_lock); + } } /* @@ -1423,6 +1473,13 @@ static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) goto queue_bio; } + /* + * For rotational devices, we will use the gendisk zone write plugs + * work instead of the per zone write plug BIO work, so queue the BIO. + */ + if (blk_queue_zoned_qd1_writes(disk->queue)) + goto queue_bio; + /* If the zone is already plugged, add the BIO to the BIO plug list. 
*/ if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) goto queue_bio; @@ -1445,7 +1502,10 @@ queue_bio: if (!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)) { zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; - disk_zone_wplug_schedule_bio_work(disk, zwplug); + if (blk_queue_zoned_qd1_writes(disk->queue)) + wake_up_process(disk->zone_wplugs_worker); + else + disk_zone_wplug_schedule_work(disk, zwplug); } spin_unlock_irqrestore(&zwplug->lock, flags); @@ -1586,16 +1646,22 @@ static void disk_zone_wplug_unplug_bio(struct gendisk *disk, spin_lock_irqsave(&zwplug->lock, flags); - /* Schedule submission of the next plugged BIO if we have one. */ - if (!bio_list_empty(&zwplug->bio_list)) { - disk_zone_wplug_schedule_bio_work(disk, zwplug); - spin_unlock_irqrestore(&zwplug->lock, flags); - return; - } + /* + * For rotational devices, signal the BIO completion to the zone write + * plug work. Otherwise, schedule submission of the next plugged BIO + * if we have one. + */ + if (bio_list_empty(&zwplug->bio_list)) + zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; + + if (blk_queue_zoned_qd1_writes(disk->queue)) + complete(&disk->zone_wplugs_worker_bio_done); + else if (!bio_list_empty(&zwplug->bio_list)) + disk_zone_wplug_schedule_work(disk, zwplug); - zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug)) disk_mark_zone_wplug_dead(zwplug); + spin_unlock_irqrestore(&zwplug->lock, flags); } @@ -1685,10 +1751,9 @@ void blk_zone_write_plug_finish_request(struct request *req) disk_put_zone_wplug(zwplug); } -static void blk_zone_wplug_bio_work(struct work_struct *work) +static bool disk_zone_wplug_submit_bio(struct gendisk *disk, + struct blk_zone_wplug *zwplug) { - struct blk_zone_wplug *zwplug = - container_of(work, struct blk_zone_wplug, bio_work); struct block_device *bdev; unsigned long flags; struct bio *bio; @@ -1704,7 +1769,7 @@ again: if (!bio) { zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; spin_unlock_irqrestore(&zwplug->lock, flags); - goto 
put_zwplug; + return false; } trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no, @@ -1718,14 +1783,15 @@ again: goto again; } - bdev = bio->bi_bdev; - /* * blk-mq devices will reuse the extra reference on the request queue * usage counter we took when the BIO was plugged, but the submission * path for BIO-based devices will not do that. So drop this extra * reference here. */ + if (blk_queue_zoned_qd1_writes(disk->queue)) + reinit_completion(&disk->zone_wplugs_worker_bio_done); + bdev = bio->bi_bdev; if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) { bdev->bd_disk->fops->submit_bio(bio); blk_queue_exit(bdev->bd_disk->queue); @@ -1733,14 +1799,78 @@ again: blk_mq_submit_bio(bio); } -put_zwplug: - /* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */ - disk_put_zone_wplug(zwplug); + return true; +} + +static struct blk_zone_wplug *disk_get_zone_wplugs_work(struct gendisk *disk) +{ + struct blk_zone_wplug *zwplug; + + spin_lock_irq(&disk->zone_wplugs_list_lock); + zwplug = list_first_entry_or_null(&disk->zone_wplugs_list, + struct blk_zone_wplug, entry); + if (zwplug) + list_del_init(&zwplug->entry); + spin_unlock_irq(&disk->zone_wplugs_list_lock); + + return zwplug; +} + +static int disk_zone_wplugs_worker(void *data) +{ + struct gendisk *disk = data; + struct blk_zone_wplug *zwplug; + unsigned int noio_flag; + + noio_flag = memalloc_noio_save(); + set_user_nice(current, MIN_NICE); + set_freezable(); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE); + + zwplug = disk_get_zone_wplugs_work(disk); + if (zwplug) { + /* + * Process all BIOs of this zone write plug and then + * drop the reference we took when adding the zone write + * plug to the active list. + */ + set_current_state(TASK_RUNNING); + while (disk_zone_wplug_submit_bio(disk, zwplug)) + blk_wait_io(&disk->zone_wplugs_worker_bio_done); + disk_put_zone_wplug(zwplug); + continue; + } + + /* + * Only sleep if nothing sets the state to running. 
Else check + * for zone write plugs work again as a newly submitted BIO + * might have added a zone write plug to the work list. + */ + if (get_current_state() == TASK_RUNNING) { + try_to_freeze(); + } else { + if (kthread_should_stop()) { + set_current_state(TASK_RUNNING); + break; + } + schedule(); + } + } + + WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list)); + memalloc_noio_restore(noio_flag); + + return 0; } void disk_init_zone_resources(struct gendisk *disk) { spin_lock_init(&disk->zone_wplugs_hash_lock); + spin_lock_init(&disk->zone_wplugs_list_lock); + INIT_LIST_HEAD(&disk->zone_wplugs_list); + init_completion(&disk->zone_wplugs_worker_bio_done); } /* @@ -1756,6 +1886,7 @@ static int disk_alloc_zone_resources(struct gendisk *disk, unsigned int pool_size) { unsigned int i; + int ret = -ENOMEM; atomic_set(&disk->nr_zone_wplugs, 0); disk->zone_wplugs_hash_bits = @@ -1781,8 +1912,21 @@ static int disk_alloc_zone_resources(struct gendisk *disk, if (!disk->zone_wplugs_wq) goto destroy_pool; + disk->zone_wplugs_worker = + kthread_create(disk_zone_wplugs_worker, disk, + "%s_zwplugs_worker", disk->disk_name); + if (IS_ERR(disk->zone_wplugs_worker)) { + ret = PTR_ERR(disk->zone_wplugs_worker); + disk->zone_wplugs_worker = NULL; + goto destroy_wq; + } + wake_up_process(disk->zone_wplugs_worker); + return 0; +destroy_wq: + destroy_workqueue(disk->zone_wplugs_wq); + disk->zone_wplugs_wq = NULL; destroy_pool: mempool_destroy(disk->zone_wplugs_pool); disk->zone_wplugs_pool = NULL; @@ -1790,7 +1934,7 @@ free_hash: kfree(disk->zone_wplugs_hash); disk->zone_wplugs_hash = NULL; disk->zone_wplugs_hash_bits = 0; - return -ENOMEM; + return ret; } static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) @@ -1840,6 +1984,10 @@ static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond) void disk_free_zone_resources(struct gendisk *disk) { + if (disk->zone_wplugs_worker) + kthread_stop(disk->zone_wplugs_worker); + 
WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list)); + if (disk->zone_wplugs_wq) { destroy_workqueue(disk->zone_wplugs_wq); disk->zone_wplugs_wq = NULL; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a49a1e38c6e7..ef6457487d23 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -204,6 +205,10 @@ struct gendisk { struct mempool *zone_wplugs_pool; struct hlist_head *zone_wplugs_hash; struct workqueue_struct *zone_wplugs_wq; + spinlock_t zone_wplugs_list_lock; + struct list_head zone_wplugs_list; + struct task_struct *zone_wplugs_worker; + struct completion zone_wplugs_worker_bio_done; #endif /* CONFIG_BLK_DEV_ZONED */ #if IS_ENABLED(CONFIG_CDROM) @@ -668,6 +673,7 @@ enum { QUEUE_FLAG_NO_ELV_SWITCH, /* can't switch elevator any more */ QUEUE_FLAG_QOS_ENABLED, /* qos is enabled */ QUEUE_FLAG_BIO_ISSUE_TIME, /* record bio->issue_time_ns */ + QUEUE_FLAG_ZONED_QD1_WRITES, /* Limit zoned devices writes to QD=1 */ QUEUE_FLAG_MAX }; @@ -707,6 +713,8 @@ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); test_bit(QUEUE_FLAG_DISABLE_WBT_DEF, &(q)->queue_flags) #define blk_queue_no_elv_switch(q) \ test_bit(QUEUE_FLAG_NO_ELV_SWITCH, &(q)->queue_flags) +#define blk_queue_zoned_qd1_writes(q) \ + test_bit(QUEUE_FLAG_ZONED_QD1_WRITES, &(q)->queue_flags) extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); From 3d9782f62fb7c2c9ec3020c579425d634559d600 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 27 Feb 2026 22:19:50 +0900 Subject: [PATCH 017/146] block: default to QD=1 writes for blk-mq rotational zoned devices For blk-mq rotational zoned block devices (e.g. SMR HDDs), default to having zone write plugging limit write operations to a maximum queue depth of 1 for all zones. This significantly reduce write seek overhead and improves SMR HDD write throughput. 
For remotely connected disks with a very high network latency this features might not be useful. However, remotely connected zoned devices are rare at the moment, and we cannot know the round trip latency to pick a good default for network attached devices. System administrators can however disable this feature in that case. For BIO based (non blk-mq) rotational zoned block devices, the device driver (e.g. a DM target driver) can directly set an appropriate default. Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index ca8033e6d699..878b8a4b55bb 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -967,6 +967,14 @@ int blk_register_queue(struct gendisk *disk) blk_mq_debugfs_register(q); blk_debugfs_unlock(q, memflags); + /* + * For blk-mq rotational zoned devices, default to using QD=1 + * writes. For non-mq rotational zoned devices, the device driver can + * set an appropriate default. + */ + if (queue_is_mq(q) && blk_queue_rot(q) && blk_queue_is_zoned(q)) + blk_queue_flag_set(QUEUE_FLAG_ZONED_QD1_WRITES, q); + ret = disk_register_independent_access_ranges(disk); if (ret) goto out_debugfs_remove; From b0e497db68ae5b0af8e9a0bd1a761607757d5dfe Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 27 Feb 2026 22:19:51 +0900 Subject: [PATCH 018/146] Documentation: ABI: stable: document the zoned_qd1_writes attribute Update the documentation file Documentation/ABI/stable/sysfs-block to describe the zoned_qd1_writes sysfs queue attribute file. 
Signed-off-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- Documentation/ABI/stable/sysfs-block | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index 09a9d4aca0fd..900b3fc4c72d 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -886,6 +886,21 @@ Description: zone commands, they will be treated as regular block devices and zoned will report "none". +What: /sys/block//queue/zoned_qd1_writes +Date: January 2026 +Contact: Damien Le Moal +Description: + [RW] zoned_qd1_writes indicates if write operations to a zoned + block device are being handled using a single issuer context (a + kernel thread) operating at a maximum queue depth of 1. This + attribute is visible only for zoned block devices. The default + value for zoned block devices that are not rotational devices + (e.g. ZNS SSDs or zoned UFS devices) is 0. For rotational zoned + block devices (e.g. SMR HDDs) the default value is 1. Since + this default may not be appropriate for some devices, e.g. + remotely connected devices over high latency networks, the user + can disable this feature by setting this attribute to 0. + What: /sys/block//hidden Date: March 2023 From d0e5fc70620266f975cfa5137795a8c1697ba362 Mon Sep 17 00:00:00 2001 From: John Garry Date: Fri, 27 Feb 2026 10:44:38 +0000 Subject: [PATCH 019/146] block: Correct comments on bio_alloc_clone() and bio_init_clone() Correct the comments that the cloned bio must be freed before the memory pointed to by @bio_src->bi_io_vecs (is freed). Christoph Hellwig contributed most the of the update wording. 
Signed-off-by: John Garry Reviewed-by: Damien Le Moal Signed-off-by: Jens Axboe --- block/bio.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/block/bio.c b/block/bio.c index d80d5d26804e..eadf4c1e9994 100644 --- a/block/bio.c +++ b/block/bio.c @@ -897,10 +897,11 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) * @gfp: allocation priority * @bs: bio_set to allocate from * - * Allocate a new bio that is a clone of @bio_src. The caller owns the returned - * bio, but not the actual data it points to. - * - * The caller must ensure that the return bio is not freed before @bio_src. + * Allocate a new bio that is a clone of @bio_src. This reuses the bio_vecs + * pointed to by @bio_src->bi_io_vec, and clones the iterator pointing to + * the current position in it. The caller owns the returned bio, but not + * the bio_vecs, and must ensure the bio is freed before the memory + * pointed to by @bio_Src->bi_io_vecs. */ struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src, gfp_t gfp, struct bio_set *bs) @@ -929,9 +930,7 @@ EXPORT_SYMBOL(bio_alloc_clone); * @gfp: allocation priority * * Initialize a new bio in caller provided memory that is a clone of @bio_src. - * The caller owns the returned bio, but not the actual data it points to. - * - * The caller must ensure that @bio_src is not freed before @bio. + * The same bio_vecs reuse and bio lifetime rules as bio_alloc_clone() apply. */ int bio_init_clone(struct block_device *bdev, struct bio *bio, struct bio *bio_src, gfp_t gfp) From ecd92cfec5349876d6a80f8188ea98c5920094b6 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 26 Feb 2026 16:54:48 +0900 Subject: [PATCH 020/146] block: remove bdev_nonrot() bdev_nonrot() is simply the negative return value of bdev_rot(). So replace all call sites of bdev_nonrot() with calls to bdev_rot() and remove bdev_nonrot(). Signed-off-by: Damien Le Moal Reviewed-by: Martin K. 
Petersen Reviewed-by: Paul Menzel Signed-off-by: Jens Axboe --- drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 2 +- drivers/md/raid5.c | 2 +- drivers/target/target_core_file.c | 2 +- drivers/target/target_core_iblock.c | 2 +- fs/btrfs/volumes.c | 4 ++-- fs/ext4/mballoc-test.c | 2 +- fs/ext4/mballoc.c | 2 +- include/linux/blkdev.h | 5 ----- mm/swapfile.c | 2 +- 10 files changed, 10 insertions(+), 15 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 181400e147c0..cda6af0712b9 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1878,7 +1878,7 @@ static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev, int disk, if (info->rdev) return false; - if (bdev_nonrot(rdev->bdev)) { + if (!bdev_rot(rdev->bdev)) { set_bit(Nonrot, &rdev->flags); WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + 1); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 0653b5d8545a..cfbd345805ca 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -806,7 +806,7 @@ static struct md_rdev *read_balance(struct r10conf *conf, if (!do_balance) break; - nonrot = bdev_nonrot(rdev->bdev); + nonrot = !bdev_rot(rdev->bdev); has_nonrot_disk |= nonrot; pending = atomic_read(&rdev->nr_pending); if (min_pending > pending && nonrot) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a8e8d431071b..ba9d6d05b089 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7541,7 +7541,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) rdev_for_each(rdev, mddev) { if (test_bit(Journal, &rdev->flags)) continue; - if (bdev_nonrot(rdev->bdev)) { + if (!bdev_rot(rdev->bdev)) { conf->batch_bio_dispatch = false; break; } diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index 3ae1f7137d9d..d6e3e5214652 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -173,7 +173,7 @@ static int fd_configure_device(struct se_device *dev) */ dev->dev_attrib.max_write_same_len = 0xFFFF; - 
if (bdev_nonrot(bdev)) + if (!bdev_rot(bdev)) dev->dev_attrib.is_nonrot = 1; } else { if (!(fd_dev->fbd_flags & FBDF_HAS_SIZE)) { diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 3c92f94497b4..1087d1d17c36 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -148,7 +148,7 @@ static int iblock_configure_device(struct se_device *dev) else dev->dev_attrib.max_write_same_len = 0xFFFF; - if (bdev_nonrot(bd)) + if (!bdev_rot(bd)) dev->dev_attrib.is_nonrot = 1; target_configure_write_atomic_from_bdev(&dev->dev_attrib, bd); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 648bb09fc416..353c9caa8ab9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -694,7 +694,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); } - if (!bdev_nonrot(file_bdev(bdev_file))) + if (bdev_rot(file_bdev(bdev_file))) fs_devices->rotating = true; if (bdev_max_discard_sectors(file_bdev(bdev_file))) @@ -2919,7 +2919,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path atomic64_add(device->total_bytes, &fs_info->free_chunk_space); - if (!bdev_nonrot(device->bdev)) + if (bdev_rot(device->bdev)) fs_devices->rotating = true; orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c index 9fbdf6a09489..b9f22e3a8d5c 100644 --- a/fs/ext4/mballoc-test.c +++ b/fs/ext4/mballoc-test.c @@ -72,7 +72,7 @@ static int mbt_mb_init(struct super_block *sb) ext4_fsblk_t block; int ret; - /* needed by ext4_mb_init->bdev_nonrot(sb->s_bdev) */ + /* needed by ext4_mb_init->bdev_rot(sb->s_bdev) */ sb->s_bdev = kzalloc_obj(*sb->s_bdev); if (sb->s_bdev == NULL) return -ENOMEM; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 20e9fdaf4301..8a4dfe19878c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3836,7 +3836,7 @@ int 
ext4_mb_init(struct super_block *sb) spin_lock_init(&lg->lg_prealloc_lock); } - if (bdev_nonrot(sb->s_bdev)) + if (!bdev_rot(sb->s_bdev)) sbi->s_mb_max_linear_groups = 0; else sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ef6457487d23..8d93d8e356d8 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1475,11 +1475,6 @@ static inline bool bdev_rot(struct block_device *bdev) return blk_queue_rot(bdev_get_queue(bdev)); } -static inline bool bdev_nonrot(struct block_device *bdev) -{ - return !bdev_rot(bdev); -} - static inline bool bdev_synchronous(struct block_device *bdev) { return bdev->bd_disk->queue->limits.features & BLK_FEAT_SYNCHRONOUS; diff --git a/mm/swapfile.c b/mm/swapfile.c index 94af29d1de88..60e21414624b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3460,7 +3460,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (si->bdev && bdev_synchronous(si->bdev)) si->flags |= SWP_SYNCHRONOUS_IO; - if (si->bdev && bdev_nonrot(si->bdev)) { + if (si->bdev && !bdev_rot(si->bdev)) { si->flags |= SWP_SOLIDSTATE; } else { atomic_inc(&nr_rotate_swap); From b2c45ced591e6cf947560d2d290a51855926b774 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 25 Feb 2026 19:12:42 -0800 Subject: [PATCH 021/146] block: move bio queue-transition flag fixups into blk_steal_bios() blk_steal_bios() transfers bios from a request to a bio_list when the request is requeued to a different queue. The NVMe multipath failover path (nvme_failover_req) currently open-codes clearing of REQ_POLLED, bi_cookie, and REQ_NOWAIT on each bio before calling blk_steal_bios(). Move these fixups into blk_steal_bios() itself so that any caller automatically gets correct flag state when bios cross queue boundaries. Simplify nvme_failover_req() accordingly. 
Signed-off-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Link: https://patch.msgid.link/20260226031243.87200-2-kch@nvidia.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 17 +++++++++++++++++ drivers/nvme/host/multipath.c | 15 +-------------- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index a047faf3b0ec..4aebc6b479ef 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3424,6 +3424,23 @@ EXPORT_SYMBOL_GPL(blk_rq_prep_clone); */ void blk_steal_bios(struct bio_list *list, struct request *rq) { + struct bio *bio; + + for (bio = rq->bio; bio; bio = bio->bi_next) { + if (bio->bi_opf & REQ_POLLED) { + bio->bi_opf &= ~REQ_POLLED; + bio->bi_cookie = BLK_QC_T_NONE; + } + /* + * The alternate request queue that we may end up submitting + * the bio to may be frozen temporarily, in this case REQ_NOWAIT + * will fail the I/O immediately with EAGAIN to the issuer. + * We are not in the issuer context which cannot block. Clear + * the flag to avoid spurious EAGAIN I/O failures. + */ + bio->bi_opf &= ~REQ_NOWAIT; + } + if (rq->bio) { if (list->tail) list->tail->bi_next = rq->bio; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index fc6800a9f7f9..ba00f0b72b85 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -154,21 +154,8 @@ void nvme_failover_req(struct request *req) } spin_lock_irqsave(&ns->head->requeue_lock, flags); - for (bio = req->bio; bio; bio = bio->bi_next) { + for (bio = req->bio; bio; bio = bio->bi_next) bio_set_dev(bio, ns->head->disk->part0); - if (bio->bi_opf & REQ_POLLED) { - bio->bi_opf &= ~REQ_POLLED; - bio->bi_cookie = BLK_QC_T_NONE; - } - /* - * The alternate request queue that we may end up submitting - * the bio to may be frozen temporarily, in this case REQ_NOWAIT - * will fail the I/O immediately with EAGAIN to the issuer. - * We are not in the issuer context which cannot block. Clear - * the flag to avoid spurious EAGAIN I/O failures. 
- */ - bio->bi_opf &= ~REQ_NOWAIT; - } blk_steal_bios(&ns->head->requeue_list, req); spin_unlock_irqrestore(&ns->head->requeue_lock, flags); From daa6c79858e9ca75c548452bf71db8a9e61bde42 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Wed, 25 Feb 2026 19:12:43 -0800 Subject: [PATCH 022/146] block: clear BIO_QOS flags in blk_steal_bios() When a bio goes through the rq_qos infrastructure on a path's request queue, it gets BIO_QOS_THROTTLED or BIO_QOS_MERGED flags set. These flags indicate that rq_qos_done_bio() should be called on completion to update rq_qos accounting. During path failover in nvme_failover_req(), the bio's bi_bdev is redirected from the failed path's disk to the multipath head's disk via bio_set_dev(). However, the BIO_QOS flags are not cleared. When the bio eventually completes (either successfully via a new path or with an error via bio_io_error()), rq_qos_done_bio() checks for these flags and calls __rq_qos_done_bio(q->rq_qos, bio) where q is obtained from the bio's current bi_bdev - which is now the multipath head's queue, not the original path's queue. The multipath head's queue does not have rq_qos enabled (q->rq_qos is NULL), but the code assumes that if BIO_QOS_* flags are set, q->rq_qos must be valid. This breaks when a bio is moved between queues during NVMe multipath failover, leading to a NULL pointer dereference. 
Execution Context timeline :- * =====> dd process context [USER] dd process [SYSCALL] write() - dd process context submit_bio() nvme_ns_head_submit_bio() - path selection blk_mq_submit_bio() #### QOS FLAGS SET HERE [USER] dd waits or returns ==== I/O in flight on NVMe hardware ===== ===== End of submission path ==== ------------------------------------------------------ * dd ====> Interrupt context; [IRQ] NVMe completion interrupt nvme_irq() nvme_complete_rq() nvme_failover_req() ### BIO MOVED TO HEAD spin_lock_irqsave (atomic section) bio_set_dev() changes bi_bdev ### BUG: QOS flags NOT cleared kblockd_schedule_work() * Interrupt context =====> kblockd workqueue [WQ] kblockd workqueue - kworker process nvme_requeue_work() submit_bio_noacct() nvme_ns_head_submit_bio() nvme_find_path() returns NULL bio_io_error() bio_endio() rq_qos_done_bio() ### CRASH ### KERNEL PANIC / OOPS Crash from blktests nvme/058 (rapid namespace remapping): [ 1339.636033] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 1339.641025] nvme nvme4: rescanning namespaces. 
[ 1339.642064] #PF: supervisor read access in kernel mode [ 1339.642067] #PF: error_code(0x0000) - not-present page [ 1339.642070] PGD 0 P4D 0 [ 1339.642073] Oops: Oops: 0000 [#1] SMP NOPTI [ 1339.642078] CPU: 35 UID: 0 PID: 4579 Comm: kworker/35:2H Tainted: G O N 6.17.0-rc3nvme+ #5 PREEMPT(voluntary) [ 1339.642084] Tainted: [O]=OOT_MODULE, [N]=TEST [ 1339.673446] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 [ 1339.682359] Workqueue: kblockd nvme_requeue_work [nvme_core] [ 1339.686613] RIP: 0010:__rq_qos_done_bio+0xd/0x40 [ 1339.690161] Code: 75 dd 5b 5d 41 5c c3 cc cc cc cc 66 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 55 48 89 f5 53 48 89 fb <48> 8b 03 48 8b 40 30 48 85 c0 74 0b 48 89 ee 48 89 df ff d0 0f 1f [ 1339.703691] RSP: 0018:ffffc900066f3c90 EFLAGS: 00010202 [ 1339.706844] RAX: ffff888148b9ef00 RBX: 0000000000000000 RCX: 0000000000000000 [ 1339.711136] RDX: 00000000000001c0 RSI: ffff8882aaab8a80 RDI: 0000000000000000 [ 1339.715691] RBP: ffff8882aaab8a80 R08: 0000000000000000 R09: 0000000000000000 [ 1339.720472] R10: 0000000000000000 R11: fefefefefefefeff R12: ffff8882aa3b6010 [ 1339.724650] R13: 0000000000000000 R14: ffff8882338bcef0 R15: ffff8882aa3b6020 [ 1339.729029] FS: 0000000000000000(0000) GS:ffff88985c0cf000(0000) knlGS:0000000000000000 [ 1339.734525] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1339.738563] CR2: 0000000000000000 CR3: 0000000111045000 CR4: 0000000000350ef0 [ 1339.742750] DR0: ffffffff845ccbec DR1: ffffffff845ccbed DR2: ffffffff845ccbee [ 1339.745630] DR3: ffffffff845ccbef DR6: 00000000ffff0ff0 DR7: 0000000000000600 [ 1339.748488] Call Trace: [ 1339.749512] [ 1339.750449] bio_endio+0x71/0x2e0 [ 1339.751833] nvme_ns_head_submit_bio+0x290/0x320 [nvme_core] [ 1339.754073] __submit_bio+0x222/0x5e0 [ 1339.755623] ? rcu_is_watching+0xd/0x40 [ 1339.757201] ? 
submit_bio_noacct_nocheck+0x131/0x370 [ 1339.759210] submit_bio_noacct_nocheck+0x131/0x370 [ 1339.761189] ? submit_bio_noacct+0x20/0x620 [ 1339.762849] nvme_requeue_work+0x4b/0x60 [nvme_core] [ 1339.764828] process_one_work+0x20e/0x630 [ 1339.766528] worker_thread+0x184/0x330 [ 1339.768129] ? __pfx_worker_thread+0x10/0x10 [ 1339.769942] kthread+0x10a/0x250 [ 1339.771263] ? __pfx_kthread+0x10/0x10 [ 1339.772776] ? __pfx_kthread+0x10/0x10 [ 1339.774381] ret_from_fork+0x273/0x2e0 [ 1339.775948] ? __pfx_kthread+0x10/0x10 [ 1339.777504] ret_from_fork_asm+0x1a/0x30 [ 1339.779163] Fix this by clearing both BIO_QOS_THROTTLED and BIO_QOS_MERGED flags when bios are redirected to the multipath head in nvme_failover_req(). This is consistent with the existing code that clears REQ_POLLED and REQ_NOWAIT flags when the bio changes queues. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Link: https://patch.msgid.link/20260226031243.87200-3-kch@nvidia.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index 4aebc6b479ef..4c5c16cce4f8 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3439,6 +3439,8 @@ void blk_steal_bios(struct bio_list *list, struct request *rq) * the flag to avoid spurious EAGAIN I/O failures. */ bio->bi_opf &= ~REQ_NOWAIT; + bio_clear_flag(bio, BIO_QOS_THROTTLED); + bio_clear_flag(bio, BIO_QOS_MERGED); } if (rq->bio) { From 3dbaacf6ab68f81e3375fe769a2ecdbd3ce386fd Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 11 Mar 2026 11:28:37 +0800 Subject: [PATCH 023/146] blk-cgroup: wait for blkcg cleanup before initializing new disk When a queue is shared across disk rebind (e.g., SCSI unbind/bind), the previous disk's blkcg state is cleaned up asynchronously via disk_release() -> blkcg_exit_disk(). 
If the new disk's blkcg_init_disk() runs before that cleanup finishes, we may overwrite q->root_blkg while the old one is still alive, and radix_tree_insert() in blkg_create() fails with -EEXIST because the old blkg entries still occupy the same queue id slot in blkcg->blkg_tree. This causes the sd probe to fail with -ENOMEM. Fix it by waiting in blkcg_init_disk() for root_blkg to become NULL, which indicates the previous disk's blkcg cleanup has completed. Fixes: 1059699f87eb ("block: move blkcg initialization/destroy into disk allocation/release handler") Cc: Yi Zhang Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Link: https://patch.msgid.link/20260311032837.2368714-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b70096497d38..2d7b18eb7291 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -611,6 +612,8 @@ restart: q->root_blkg = NULL; spin_unlock_irq(&q->queue_lock); + + wake_up_var(&q->root_blkg); } static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) @@ -1498,6 +1501,18 @@ int blkcg_init_disk(struct gendisk *disk) struct blkcg_gq *new_blkg, *blkg; bool preloaded; + /* + * If the queue is shared across disk rebind (e.g., SCSI), the + * previous disk's blkcg state is cleaned up asynchronously via + * disk_release() -> blkcg_exit_disk(). Wait for that cleanup to + * finish (indicated by root_blkg becoming NULL) before setting up + * new blkcg state. Otherwise, we may overwrite q->root_blkg while + * the old one is still alive, and radix_tree_insert() in + * blkg_create() will fail with -EEXIST because the old entries + * still occupy the same queue id slot in blkcg->blkg_tree. 
+ */ + wait_var_event(&q->root_blkg, !READ_ONCE(q->root_blkg)); + new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL); if (!new_blkg) return -ENOMEM; From 203247c5cb972af5d46bdb7d41ef40078048810b Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 13 Mar 2026 07:47:00 -0700 Subject: [PATCH 024/146] blk-integrity: support arbitrary buffer alignment A bio segment may have partial interval block data with the rest continuing into the next segments because direct-io data payloads only need to align in memory to the device's DMA limits. At the same time, the protection information may also be split in multiple segments. The most likely way that may happen is if two requests merge, or if we're directly using the io_uring user metadata. The generate/verify, however, only ever accessed the first bip_vec. Further, it may be possible to unalign the protection fields from the user space buffer, or if there are odd additional opaque bytes in front or in back of the protection information metadata region. Change up the iteration to allow spanning multiple segments. This patch is mostly a re-write of the protection information handling to allow any arbitrary alignments, so it's probably easier to review the end result rather than the diff. Many controllers are not able to handle interval data composed of multiple segments when PI is used, so this patch introduces a new integrity limit that a low level driver can set to notify that it is capable, default to false. The nvme driver is the first one to enable it in this patch. Everyone else will force DMA alignment to the logical block size as before to ensure interval data is always aligned within a single segment. Reviewed-by: Martin K. 
Petersen Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Link: https://patch.msgid.link/20260313144701.1221652-2-kbusch@meta.com Signed-off-by: Jens Axboe --- block/blk-settings.c | 12 +- block/t10-pi.c | 854 +++++++++++++++++++--------------- drivers/nvme/host/core.c | 1 + include/linux/blk-integrity.h | 1 + 4 files changed, 484 insertions(+), 384 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index dabfab97fbab..78c83817b9d3 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -189,11 +189,11 @@ static int blk_validate_integrity_limits(struct queue_limits *lim) } /* - * The PI generation / validation helpers do not expect intervals to - * straddle multiple bio_vecs. Enforce alignment so that those are + * Some IO controllers can not handle data intervals straddling + * multiple bio_vecs. For those, enforce alignment so that those are * never generated, and that each buffer is aligned as expected. */ - if (bi->csum_type) { + if (!(bi->flags & BLK_SPLIT_INTERVAL_CAPABLE) && bi->csum_type) { lim->dma_alignment = max(lim->dma_alignment, (1U << bi->interval_exp) - 1); } @@ -992,10 +992,14 @@ bool queue_limits_stack_integrity(struct queue_limits *t, if ((ti->flags & BLK_INTEGRITY_REF_TAG) != (bi->flags & BLK_INTEGRITY_REF_TAG)) goto incompatible; + if ((ti->flags & BLK_SPLIT_INTERVAL_CAPABLE) && + !(bi->flags & BLK_SPLIT_INTERVAL_CAPABLE)) + ti->flags &= ~BLK_SPLIT_INTERVAL_CAPABLE; } else { ti->flags = BLK_INTEGRITY_STACKED; ti->flags |= (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) | - (bi->flags & BLK_INTEGRITY_REF_TAG); + (bi->flags & BLK_INTEGRITY_REF_TAG) | + (bi->flags & BLK_SPLIT_INTERVAL_CAPABLE); ti->csum_type = bi->csum_type; ti->pi_tuple_size = bi->pi_tuple_size; ti->metadata_size = bi->metadata_size; diff --git a/block/t10-pi.c b/block/t10-pi.c index d27be6041fd3..a19b4e102a83 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -12,230 +12,115 @@ #include #include "blk.h" -struct blk_integrity_iter { - void 
*prot_buf; - void *data_buf; - sector_t seed; - unsigned int data_size; - unsigned short interval; - const char *disk_name; +#define APP_TAG_ESCAPE 0xffff +#define REF_TAG_ESCAPE 0xffffffff + +/* + * This union is used for onstack allocations when the pi field is split across + * segments. blk_validate_integrity_limits() guarantees pi_tuple_size matches + * the sizeof one of these two types. + */ +union pi_tuple { + struct crc64_pi_tuple crc64_pi; + struct t10_pi_tuple t10_pi; }; -static __be16 t10_pi_csum(__be16 csum, void *data, unsigned int len, - unsigned char csum_type) +struct blk_integrity_iter { + struct bio *bio; + struct bio_integrity_payload *bip; + struct blk_integrity *bi; + struct bvec_iter data_iter; + struct bvec_iter prot_iter; + unsigned int interval_remaining; + u64 seed; + u64 csum; +}; + +static void blk_calculate_guard(struct blk_integrity_iter *iter, void *data, + unsigned int len) { - if (csum_type == BLK_INTEGRITY_CSUM_IP) - return (__force __be16)ip_compute_csum(data, len); - return cpu_to_be16(crc_t10dif_update(be16_to_cpu(csum), data, len)); + switch (iter->bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + iter->csum = crc64_nvme(iter->csum, data, len); + break; + case BLK_INTEGRITY_CSUM_CRC: + iter->csum = crc_t10dif_update(iter->csum, data, len); + break; + case BLK_INTEGRITY_CSUM_IP: + iter->csum = (__force u32)csum_partial(data, len, + (__force __wsum)iter->csum); + break; + default: + WARN_ON_ONCE(1); + iter->csum = U64_MAX; + break; + } +} + +static void blk_integrity_csum_finish(struct blk_integrity_iter *iter) +{ + switch (iter->bi->csum_type) { + case BLK_INTEGRITY_CSUM_IP: + iter->csum = (__force u16)csum_fold((__force __wsum)iter->csum); + break; + default: + break; + } } /* - * Type 1 and Type 2 protection use the same format: 16 bit guard tag, - * 16 bit app tag, 32 bit reference tag. Type 3 does not define the ref - * tag. 
+ * Update the csum for formats that have metadata padding in front of the data + * integrity field */ -static void t10_pi_generate(struct blk_integrity_iter *iter, - struct blk_integrity *bi) +static void blk_integrity_csum_offset(struct blk_integrity_iter *iter) { - u8 offset = bi->pi_offset; - unsigned int i; + unsigned int offset = iter->bi->pi_offset; + struct bio_vec *bvec = iter->bip->bip_vec; - for (i = 0 ; i < iter->data_size ; i += iter->interval) { - struct t10_pi_tuple *pi = iter->prot_buf + offset; + while (offset > 0) { + struct bio_vec pbv = bvec_iter_bvec(bvec, iter->prot_iter); + unsigned int len = min(pbv.bv_len, offset); + void *prot_buf = bvec_kmap_local(&pbv); - pi->guard_tag = t10_pi_csum(0, iter->data_buf, iter->interval, - bi->csum_type); - if (offset) - pi->guard_tag = t10_pi_csum(pi->guard_tag, - iter->prot_buf, offset, bi->csum_type); - pi->app_tag = 0; + blk_calculate_guard(iter, prot_buf, len); + kunmap_local(prot_buf); + offset -= len; + bvec_iter_advance_single(bvec, &iter->prot_iter, len); + } + blk_integrity_csum_finish(iter); +} - if (bi->flags & BLK_INTEGRITY_REF_TAG) - pi->ref_tag = cpu_to_be32(lower_32_bits(iter->seed)); - else - pi->ref_tag = 0; +static void blk_integrity_copy_from_tuple(struct bio_integrity_payload *bip, + struct bvec_iter *iter, void *tuple, + unsigned int tuple_size) +{ + while (tuple_size) { + struct bio_vec pbv = bvec_iter_bvec(bip->bip_vec, *iter); + unsigned int len = min(tuple_size, pbv.bv_len); + void *prot_buf = bvec_kmap_local(&pbv); - iter->data_buf += iter->interval; - iter->prot_buf += bi->metadata_size; - iter->seed++; + memcpy(prot_buf, tuple, len); + kunmap_local(prot_buf); + bvec_iter_advance_single(bip->bip_vec, iter, len); + tuple_size -= len; + tuple += len; } } -static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, - struct blk_integrity *bi) +static void blk_integrity_copy_to_tuple(struct bio_integrity_payload *bip, + struct bvec_iter *iter, void *tuple, + unsigned int 
tuple_size) { - u8 offset = bi->pi_offset; - unsigned int i; + while (tuple_size) { + struct bio_vec pbv = bvec_iter_bvec(bip->bip_vec, *iter); + unsigned int len = min(tuple_size, pbv.bv_len); + void *prot_buf = bvec_kmap_local(&pbv); - for (i = 0 ; i < iter->data_size ; i += iter->interval) { - struct t10_pi_tuple *pi = iter->prot_buf + offset; - __be16 csum; - - if (bi->flags & BLK_INTEGRITY_REF_TAG) { - if (pi->app_tag == T10_PI_APP_ESCAPE) - goto next; - - if (be32_to_cpu(pi->ref_tag) != - lower_32_bits(iter->seed)) { - pr_err("%s: ref tag error at location %llu " \ - "(rcvd %u)\n", iter->disk_name, - (unsigned long long) - iter->seed, be32_to_cpu(pi->ref_tag)); - return BLK_STS_PROTECTION; - } - } else { - if (pi->app_tag == T10_PI_APP_ESCAPE && - pi->ref_tag == T10_PI_REF_ESCAPE) - goto next; - } - - csum = t10_pi_csum(0, iter->data_buf, iter->interval, - bi->csum_type); - if (offset) - csum = t10_pi_csum(csum, iter->prot_buf, offset, - bi->csum_type); - - if (pi->guard_tag != csum) { - pr_err("%s: guard tag error at sector %llu " \ - "(rcvd %04x, want %04x)\n", iter->disk_name, - (unsigned long long)iter->seed, - be16_to_cpu(pi->guard_tag), be16_to_cpu(csum)); - return BLK_STS_PROTECTION; - } - -next: - iter->data_buf += iter->interval; - iter->prot_buf += bi->metadata_size; - iter->seed++; - } - - return BLK_STS_OK; -} - -/** - * t10_pi_type1_prepare - prepare PI prior submitting request to device - * @rq: request with PI that should be prepared - * - * For Type 1/Type 2, the virtual start sector is the one that was - * originally submitted by the block layer for the ref_tag usage. Due to - * partitioning, MD/DM cloning, etc. the actual physical start sector is - * likely to be different. Remap protection information to match the - * physical LBA. 
- */ -static void t10_pi_type1_prepare(struct request *rq) -{ - struct blk_integrity *bi = &rq->q->limits.integrity; - const int tuple_sz = bi->metadata_size; - u32 ref_tag = t10_pi_ref_tag(rq); - u8 offset = bi->pi_offset; - struct bio *bio; - - __rq_for_each_bio(bio, rq) { - struct bio_integrity_payload *bip = bio_integrity(bio); - u32 virt = bip_get_seed(bip) & 0xffffffff; - struct bio_vec iv; - struct bvec_iter iter; - - /* Already remapped? */ - if (bip->bip_flags & BIP_MAPPED_INTEGRITY) - break; - - bip_for_each_vec(iv, bip, iter) { - unsigned int j; - void *p; - - p = bvec_kmap_local(&iv); - for (j = 0; j < iv.bv_len; j += tuple_sz) { - struct t10_pi_tuple *pi = p + offset; - - if (be32_to_cpu(pi->ref_tag) == virt) - pi->ref_tag = cpu_to_be32(ref_tag); - virt++; - ref_tag++; - p += tuple_sz; - } - kunmap_local(p); - } - - bip->bip_flags |= BIP_MAPPED_INTEGRITY; - } -} - -/** - * t10_pi_type1_complete - prepare PI prior returning request to the blk layer - * @rq: request with PI that should be prepared - * @nr_bytes: total bytes to prepare - * - * For Type 1/Type 2, the virtual start sector is the one that was - * originally submitted by the block layer for the ref_tag usage. Due to - * partitioning, MD/DM cloning, etc. the actual physical start sector is - * likely to be different. Since the physical start sector was submitted - * to the device, we should remap it back to virtual values expected by the - * block layer. 
- */ -static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) -{ - struct blk_integrity *bi = &rq->q->limits.integrity; - unsigned intervals = nr_bytes >> bi->interval_exp; - const int tuple_sz = bi->metadata_size; - u32 ref_tag = t10_pi_ref_tag(rq); - u8 offset = bi->pi_offset; - struct bio *bio; - - __rq_for_each_bio(bio, rq) { - struct bio_integrity_payload *bip = bio_integrity(bio); - u32 virt = bip_get_seed(bip) & 0xffffffff; - struct bio_vec iv; - struct bvec_iter iter; - - bip_for_each_vec(iv, bip, iter) { - unsigned int j; - void *p; - - p = bvec_kmap_local(&iv); - for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { - struct t10_pi_tuple *pi = p + offset; - - if (be32_to_cpu(pi->ref_tag) == ref_tag) - pi->ref_tag = cpu_to_be32(virt); - virt++; - ref_tag++; - intervals--; - p += tuple_sz; - } - kunmap_local(p); - } - } -} - -static __be64 ext_pi_crc64(u64 crc, void *data, unsigned int len) -{ - return cpu_to_be64(crc64_nvme(crc, data, len)); -} - -static void ext_pi_crc64_generate(struct blk_integrity_iter *iter, - struct blk_integrity *bi) -{ - u8 offset = bi->pi_offset; - unsigned int i; - - for (i = 0 ; i < iter->data_size ; i += iter->interval) { - struct crc64_pi_tuple *pi = iter->prot_buf + offset; - - pi->guard_tag = ext_pi_crc64(0, iter->data_buf, iter->interval); - if (offset) - pi->guard_tag = ext_pi_crc64(be64_to_cpu(pi->guard_tag), - iter->prot_buf, offset); - pi->app_tag = 0; - - if (bi->flags & BLK_INTEGRITY_REF_TAG) - put_unaligned_be48(iter->seed, pi->ref_tag); - else - put_unaligned_be48(0ULL, pi->ref_tag); - - iter->data_buf += iter->interval; - iter->prot_buf += bi->metadata_size; - iter->seed++; + memcpy(tuple, prot_buf, len); + kunmap_local(prot_buf); + bvec_iter_advance_single(bip->bip_vec, iter, len); + tuple_size -= len; + tuple += len; } } @@ -246,228 +131,437 @@ static bool ext_pi_ref_escape(const u8 ref_tag[6]) return memcmp(ref_tag, ref_escape, sizeof(ref_escape)) == 0; } -static blk_status_t 
ext_pi_crc64_verify(struct blk_integrity_iter *iter, - struct blk_integrity *bi) +static blk_status_t blk_verify_ext_pi(struct blk_integrity_iter *iter, + struct crc64_pi_tuple *pi) { - u8 offset = bi->pi_offset; - unsigned int i; + u64 seed = lower_48_bits(iter->seed); + u64 guard = get_unaligned_be64(&pi->guard_tag); + u64 ref = get_unaligned_be48(pi->ref_tag); + u16 app = get_unaligned_be16(&pi->app_tag); - for (i = 0; i < iter->data_size; i += iter->interval) { - struct crc64_pi_tuple *pi = iter->prot_buf + offset; - u64 ref, seed; - __be64 csum; - - if (bi->flags & BLK_INTEGRITY_REF_TAG) { - if (pi->app_tag == T10_PI_APP_ESCAPE) - goto next; - - ref = get_unaligned_be48(pi->ref_tag); - seed = lower_48_bits(iter->seed); - if (ref != seed) { - pr_err("%s: ref tag error at location %llu (rcvd %llu)\n", - iter->disk_name, seed, ref); - return BLK_STS_PROTECTION; - } - } else { - if (pi->app_tag == T10_PI_APP_ESCAPE && - ext_pi_ref_escape(pi->ref_tag)) - goto next; - } - - csum = ext_pi_crc64(0, iter->data_buf, iter->interval); - if (offset) - csum = ext_pi_crc64(be64_to_cpu(csum), iter->prot_buf, - offset); - - if (pi->guard_tag != csum) { - pr_err("%s: guard tag error at sector %llu " \ - "(rcvd %016llx, want %016llx)\n", - iter->disk_name, (unsigned long long)iter->seed, - be64_to_cpu(pi->guard_tag), be64_to_cpu(csum)); + if (iter->bi->flags & BLK_INTEGRITY_REF_TAG) { + if (app == APP_TAG_ESCAPE) + return BLK_STS_OK; + if (ref != seed) { + pr_err("%s: ref tag error at location %llu (rcvd %llu)\n", + iter->bio->bi_bdev->bd_disk->disk_name, seed, + ref); return BLK_STS_PROTECTION; } + } else if (app == APP_TAG_ESCAPE && ext_pi_ref_escape(pi->ref_tag)) { + return BLK_STS_OK; + } -next: - iter->data_buf += iter->interval; - iter->prot_buf += bi->metadata_size; - iter->seed++; + if (guard != iter->csum) { + pr_err("%s: guard tag error at sector %llu (rcvd %016llx, want %016llx)\n", + iter->bio->bi_bdev->bd_disk->disk_name, iter->seed, + guard, iter->csum); + return 
BLK_STS_PROTECTION; } return BLK_STS_OK; } -static void ext_pi_type1_prepare(struct request *rq) +static blk_status_t blk_verify_pi(struct blk_integrity_iter *iter, + struct t10_pi_tuple *pi, u16 guard) { - struct blk_integrity *bi = &rq->q->limits.integrity; - const int tuple_sz = bi->metadata_size; - u64 ref_tag = ext_pi_ref_tag(rq); - u8 offset = bi->pi_offset; - struct bio *bio; + u32 seed = lower_32_bits(iter->seed); + u32 ref = get_unaligned_be32(&pi->ref_tag); + u16 app = get_unaligned_be16(&pi->app_tag); - __rq_for_each_bio(bio, rq) { - struct bio_integrity_payload *bip = bio_integrity(bio); - u64 virt = lower_48_bits(bip_get_seed(bip)); - struct bio_vec iv; - struct bvec_iter iter; - - /* Already remapped? */ - if (bip->bip_flags & BIP_MAPPED_INTEGRITY) - break; - - bip_for_each_vec(iv, bip, iter) { - unsigned int j; - void *p; - - p = bvec_kmap_local(&iv); - for (j = 0; j < iv.bv_len; j += tuple_sz) { - struct crc64_pi_tuple *pi = p + offset; - u64 ref = get_unaligned_be48(pi->ref_tag); - - if (ref == virt) - put_unaligned_be48(ref_tag, pi->ref_tag); - virt++; - ref_tag++; - p += tuple_sz; - } - kunmap_local(p); + if (iter->bi->flags & BLK_INTEGRITY_REF_TAG) { + if (app == APP_TAG_ESCAPE) + return BLK_STS_OK; + if (ref != seed) { + pr_err("%s: ref tag error at location %u (rcvd %u)\n", + iter->bio->bi_bdev->bd_disk->disk_name, seed, + ref); + return BLK_STS_PROTECTION; } + } else if (app == APP_TAG_ESCAPE && ref == REF_TAG_ESCAPE) { + return BLK_STS_OK; + } - bip->bip_flags |= BIP_MAPPED_INTEGRITY; + if (guard != (u16)iter->csum) { + pr_err("%s: guard tag error at sector %llu (rcvd %04x, want %04x)\n", + iter->bio->bi_bdev->bd_disk->disk_name, iter->seed, + guard, (u16)iter->csum); + return BLK_STS_PROTECTION; + } + + return BLK_STS_OK; +} + +static blk_status_t blk_verify_t10_pi(struct blk_integrity_iter *iter, + struct t10_pi_tuple *pi) +{ + u16 guard = get_unaligned_be16(&pi->guard_tag); + + return blk_verify_pi(iter, pi, guard); +} + +static 
blk_status_t blk_verify_ip_pi(struct blk_integrity_iter *iter, + struct t10_pi_tuple *pi) +{ + u16 guard = get_unaligned((u16 *)&pi->guard_tag); + + return blk_verify_pi(iter, pi, guard); +} + +static blk_status_t blk_integrity_verify(struct blk_integrity_iter *iter, + union pi_tuple *tuple) +{ + switch (iter->bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + return blk_verify_ext_pi(iter, &tuple->crc64_pi); + case BLK_INTEGRITY_CSUM_CRC: + return blk_verify_t10_pi(iter, &tuple->t10_pi); + case BLK_INTEGRITY_CSUM_IP: + return blk_verify_ip_pi(iter, &tuple->t10_pi); + default: + return BLK_STS_OK; } } -static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes) +static void blk_set_ext_pi(struct blk_integrity_iter *iter, + struct crc64_pi_tuple *pi) { - struct blk_integrity *bi = &rq->q->limits.integrity; - unsigned intervals = nr_bytes >> bi->interval_exp; - const int tuple_sz = bi->metadata_size; - u64 ref_tag = ext_pi_ref_tag(rq); - u8 offset = bi->pi_offset; - struct bio *bio; + put_unaligned_be64(iter->csum, &pi->guard_tag); + put_unaligned_be16(0, &pi->app_tag); + put_unaligned_be48(iter->seed, &pi->ref_tag); +} - __rq_for_each_bio(bio, rq) { - struct bio_integrity_payload *bip = bio_integrity(bio); - u64 virt = lower_48_bits(bip_get_seed(bip)); - struct bio_vec iv; - struct bvec_iter iter; +static void blk_set_pi(struct blk_integrity_iter *iter, + struct t10_pi_tuple *pi, __be16 csum) +{ + put_unaligned(csum, &pi->guard_tag); + put_unaligned_be16(0, &pi->app_tag); + put_unaligned_be32(iter->seed, &pi->ref_tag); +} - bip_for_each_vec(iv, bip, iter) { - unsigned int j; - void *p; +static void blk_set_t10_pi(struct blk_integrity_iter *iter, + struct t10_pi_tuple *pi) +{ + blk_set_pi(iter, pi, cpu_to_be16((u16)iter->csum)); +} - p = bvec_kmap_local(&iv); - for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { - struct crc64_pi_tuple *pi = p + offset; - u64 ref = get_unaligned_be48(pi->ref_tag); +static void blk_set_ip_pi(struct 
blk_integrity_iter *iter, + struct t10_pi_tuple *pi) +{ + blk_set_pi(iter, pi, (__force __be16)(u16)iter->csum); +} - if (ref == ref_tag) - put_unaligned_be48(virt, pi->ref_tag); - virt++; - ref_tag++; - intervals--; - p += tuple_sz; - } - kunmap_local(p); - } +static void blk_integrity_set(struct blk_integrity_iter *iter, + union pi_tuple *tuple) +{ + switch (iter->bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + return blk_set_ext_pi(iter, &tuple->crc64_pi); + case BLK_INTEGRITY_CSUM_CRC: + return blk_set_t10_pi(iter, &tuple->t10_pi); + case BLK_INTEGRITY_CSUM_IP: + return blk_set_ip_pi(iter, &tuple->t10_pi); + default: + WARN_ON_ONCE(1); + return; } } +static blk_status_t blk_integrity_interval(struct blk_integrity_iter *iter, + bool verify) +{ + blk_status_t ret = BLK_STS_OK; + union pi_tuple tuple; + void *ptuple = &tuple; + struct bio_vec pbv; + + blk_integrity_csum_offset(iter); + pbv = bvec_iter_bvec(iter->bip->bip_vec, iter->prot_iter); + if (pbv.bv_len >= iter->bi->pi_tuple_size) { + ptuple = bvec_kmap_local(&pbv); + bvec_iter_advance_single(iter->bip->bip_vec, &iter->prot_iter, + iter->bi->metadata_size - iter->bi->pi_offset); + } else if (verify) { + blk_integrity_copy_to_tuple(iter->bip, &iter->prot_iter, + ptuple, iter->bi->pi_tuple_size); + } + + if (verify) + ret = blk_integrity_verify(iter, ptuple); + else + blk_integrity_set(iter, ptuple); + + if (likely(ptuple != &tuple)) { + kunmap_local(ptuple); + } else if (!verify) { + blk_integrity_copy_from_tuple(iter->bip, &iter->prot_iter, + ptuple, iter->bi->pi_tuple_size); + } + + iter->interval_remaining = 1 << iter->bi->interval_exp; + iter->csum = 0; + iter->seed++; + return ret; +} + +static blk_status_t blk_integrity_iterate(struct bio *bio, + struct bvec_iter *data_iter, + bool verify) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct bio_integrity_payload *bip = bio_integrity(bio); + struct blk_integrity_iter iter = { + .bio = bio, + .bip = bip, + .bi = bi, + 
.data_iter = *data_iter, + .prot_iter = bip->bip_iter, + .interval_remaining = 1 << bi->interval_exp, + .seed = data_iter->bi_sector, + .csum = 0, + }; + blk_status_t ret = BLK_STS_OK; + + while (iter.data_iter.bi_size && ret == BLK_STS_OK) { + struct bio_vec bv = bvec_iter_bvec(iter.bio->bi_io_vec, + iter.data_iter); + void *kaddr = bvec_kmap_local(&bv); + void *data = kaddr; + unsigned int len; + + bvec_iter_advance_single(iter.bio->bi_io_vec, &iter.data_iter, + bv.bv_len); + while (bv.bv_len && ret == BLK_STS_OK) { + len = min(iter.interval_remaining, bv.bv_len); + blk_calculate_guard(&iter, data, len); + bv.bv_len -= len; + data += len; + iter.interval_remaining -= len; + if (!iter.interval_remaining) + ret = blk_integrity_interval(&iter, verify); + } + kunmap_local(kaddr); + } + + return ret; +} + void bio_integrity_generate(struct bio *bio) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); - struct bio_integrity_payload *bip = bio_integrity(bio); - struct blk_integrity_iter iter; - struct bvec_iter bviter; - struct bio_vec bv; - iter.disk_name = bio->bi_bdev->bd_disk->disk_name; - iter.interval = 1 << bi->interval_exp; - iter.seed = bio->bi_iter.bi_sector; - iter.prot_buf = bvec_virt(bip->bip_vec); - bio_for_each_segment(bv, bio, bviter) { - void *kaddr = bvec_kmap_local(&bv); - - iter.data_buf = kaddr; - iter.data_size = bv.bv_len; - switch (bi->csum_type) { - case BLK_INTEGRITY_CSUM_CRC64: - ext_pi_crc64_generate(&iter, bi); - break; - case BLK_INTEGRITY_CSUM_CRC: - case BLK_INTEGRITY_CSUM_IP: - t10_pi_generate(&iter, bi); - break; - default: - break; - } - kunmap_local(kaddr); + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + blk_integrity_iterate(bio, &bio->bi_iter, false); + break; + default: + break; } } blk_status_t bio_integrity_verify(struct bio *bio, struct bvec_iter *saved_iter) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); - 
struct bio_integrity_payload *bip = bio_integrity(bio); - struct blk_integrity_iter iter; - struct bvec_iter bviter; - struct bio_vec bv; - /* - * At the moment verify is called bi_iter has been advanced during split - * and completion, so use the copy created during submission here. - */ - iter.disk_name = bio->bi_bdev->bd_disk->disk_name; - iter.interval = 1 << bi->interval_exp; - iter.seed = saved_iter->bi_sector; - iter.prot_buf = bvec_virt(bip->bip_vec); - __bio_for_each_segment(bv, bio, bviter, *saved_iter) { - void *kaddr = bvec_kmap_local(&bv); - blk_status_t ret = BLK_STS_OK; - - iter.data_buf = kaddr; - iter.data_size = bv.bv_len; - switch (bi->csum_type) { - case BLK_INTEGRITY_CSUM_CRC64: - ret = ext_pi_crc64_verify(&iter, bi); - break; - case BLK_INTEGRITY_CSUM_CRC: - case BLK_INTEGRITY_CSUM_IP: - ret = t10_pi_verify(&iter, bi); - break; - default: - break; - } - kunmap_local(kaddr); - - if (ret) - return ret; + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + return blk_integrity_iterate(bio, saved_iter, true); + default: + break; } return BLK_STS_OK; } -void blk_integrity_prepare(struct request *rq) +/* + * Advance @iter past the protection offset for protection formats that + * contain front padding on the metadata region. 
+ */ +static void blk_pi_advance_offset(struct blk_integrity *bi, + struct bio_integrity_payload *bip, + struct bvec_iter *iter) +{ + unsigned int offset = bi->pi_offset; + + while (offset > 0) { + struct bio_vec bv = mp_bvec_iter_bvec(bip->bip_vec, *iter); + unsigned int len = min(bv.bv_len, offset); + + bvec_iter_advance_single(bip->bip_vec, iter, len); + offset -= len; + } +} + +static void *blk_tuple_remap_begin(union pi_tuple *tuple, + struct blk_integrity *bi, + struct bio_integrity_payload *bip, + struct bvec_iter *iter) +{ + struct bvec_iter titer; + struct bio_vec pbv; + + blk_pi_advance_offset(bi, bip, iter); + pbv = bvec_iter_bvec(bip->bip_vec, *iter); + if (likely(pbv.bv_len >= bi->pi_tuple_size)) + return bvec_kmap_local(&pbv); + + /* + * We need to preserve the state of the original iter for the + * copy_from_tuple at the end, so make a temp iter for here. + */ + titer = *iter; + blk_integrity_copy_to_tuple(bip, &titer, tuple, bi->pi_tuple_size); + return tuple; +} + +static void blk_tuple_remap_end(union pi_tuple *tuple, void *ptuple, + struct blk_integrity *bi, + struct bio_integrity_payload *bip, + struct bvec_iter *iter) +{ + unsigned int len = bi->metadata_size - bi->pi_offset; + + if (likely(ptuple != tuple)) { + kunmap_local(ptuple); + } else { + blk_integrity_copy_from_tuple(bip, iter, ptuple, + bi->pi_tuple_size); + len -= bi->pi_tuple_size; + } + + bvec_iter_advance(bip->bip_vec, iter, len); +} + +static void blk_set_ext_unmap_ref(struct crc64_pi_tuple *pi, u64 virt, + u64 ref_tag) +{ + u64 ref = get_unaligned_be48(&pi->ref_tag); + + if (ref == lower_48_bits(ref_tag) && ref != lower_48_bits(virt)) + put_unaligned_be48(virt, pi->ref_tag); +} + +static void blk_set_t10_unmap_ref(struct t10_pi_tuple *pi, u32 virt, + u32 ref_tag) +{ + u32 ref = get_unaligned_be32(&pi->ref_tag); + + if (ref == ref_tag && ref != virt) + put_unaligned_be32(virt, &pi->ref_tag); +} + +static void blk_reftag_remap_complete(struct blk_integrity *bi, + union pi_tuple 
*tuple, u64 virt, u64 ref) +{ + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + blk_set_ext_unmap_ref(&tuple->crc64_pi, virt, ref); + break; + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + blk_set_t10_unmap_ref(&tuple->t10_pi, virt, ref); + break; + default: + WARN_ON_ONCE(1); + break; + } +} + +static void blk_set_ext_map_ref(struct crc64_pi_tuple *pi, u64 virt, + u64 ref_tag) +{ + u64 ref = get_unaligned_be48(&pi->ref_tag); + + if (ref == lower_48_bits(virt) && ref != ref_tag) + put_unaligned_be48(ref_tag, pi->ref_tag); +} + +static void blk_set_t10_map_ref(struct t10_pi_tuple *pi, u32 virt, u32 ref_tag) +{ + u32 ref = get_unaligned_be32(&pi->ref_tag); + + if (ref == virt && ref != ref_tag) + put_unaligned_be32(ref_tag, &pi->ref_tag); +} + +static void blk_reftag_remap_prepare(struct blk_integrity *bi, + union pi_tuple *tuple, + u64 virt, u64 ref) +{ + switch (bi->csum_type) { + case BLK_INTEGRITY_CSUM_CRC64: + blk_set_ext_map_ref(&tuple->crc64_pi, virt, ref); + break; + case BLK_INTEGRITY_CSUM_CRC: + case BLK_INTEGRITY_CSUM_IP: + blk_set_t10_map_ref(&tuple->t10_pi, virt, ref); + break; + default: + WARN_ON_ONCE(1); + break; + } +} + +static void __blk_reftag_remap(struct bio *bio, struct blk_integrity *bi, + unsigned *intervals, u64 *ref, bool prep) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + struct bvec_iter iter = bip->bip_iter; + u64 virt = bip_get_seed(bip); + union pi_tuple *ptuple; + union pi_tuple tuple; + + if (prep && bip->bip_flags & BIP_MAPPED_INTEGRITY) { + *ref += bio->bi_iter.bi_size >> bi->interval_exp; + return; + } + + while (iter.bi_size && *intervals) { + ptuple = blk_tuple_remap_begin(&tuple, bi, bip, &iter); + + if (prep) + blk_reftag_remap_prepare(bi, ptuple, virt, *ref); + else + blk_reftag_remap_complete(bi, ptuple, virt, *ref); + + blk_tuple_remap_end(&tuple, ptuple, bi, bip, &iter); + (*intervals)--; + (*ref)++; + virt++; + } + + if (prep) + bip->bip_flags |= BIP_MAPPED_INTEGRITY; +} + 
+static void blk_integrity_remap(struct request *rq, unsigned int nr_bytes, + bool prep) { struct blk_integrity *bi = &rq->q->limits.integrity; + u64 ref = blk_rq_pos(rq) >> (bi->interval_exp - SECTOR_SHIFT); + unsigned intervals = nr_bytes >> bi->interval_exp; + struct bio *bio; if (!(bi->flags & BLK_INTEGRITY_REF_TAG)) return; - if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC64) - ext_pi_type1_prepare(rq); - else - t10_pi_type1_prepare(rq); + __rq_for_each_bio(bio, rq) { + __blk_reftag_remap(bio, bi, &intervals, &ref, prep); + if (!intervals) + break; + } +} + +void blk_integrity_prepare(struct request *rq) +{ + blk_integrity_remap(rq, blk_rq_bytes(rq), true); } void blk_integrity_complete(struct request *rq, unsigned int nr_bytes) { - struct blk_integrity *bi = &rq->q->limits.integrity; - - if (!(bi->flags & BLK_INTEGRITY_REF_TAG)) - return; - - if (bi->csum_type == BLK_INTEGRITY_CSUM_CRC64) - ext_pi_type1_complete(rq, nr_bytes); - else - t10_pi_type1_complete(rq, nr_bytes); + blk_integrity_remap(rq, nr_bytes, false); } diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 58bf432ec5e6..3de52f1d2723 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1875,6 +1875,7 @@ static bool nvme_init_integrity(struct nvme_ns_head *head, break; } + bi->flags |= BLK_SPLIT_INTERVAL_CAPABLE; bi->metadata_size = head->ms; if (bi->csum_type) { bi->pi_tuple_size = head->pi_size; diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index ea6d7d322ae3..b1b530613c34 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -14,6 +14,7 @@ enum blk_integrity_flags { BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2, BLK_INTEGRITY_REF_TAG = 1 << 3, BLK_INTEGRITY_STACKED = 1 << 4, + BLK_SPLIT_INTERVAL_CAPABLE = 1 << 5, }; const char *blk_integrity_profile_name(struct blk_integrity *bi); From 5d540162059598c3f79e12f96064825cc91f0f9c Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 13 Mar 2026 07:47:01 -0700 
Subject: [PATCH 025/146] ublk: report BLK_SPLIT_INTERVAL_CAPABLE The ublk driver doesn't access request integrity buffers directly, it only copies them to/from the ublk server in ublk_copy_user_integrity(). ublk_copy_user_integrity() uses bio_for_each_integrity_vec() to walk all the integrity segments. ublk devices are therefore capable of handling requests with integrity intervals split across segments. Set BLK_SPLIT_INTERVAL_CAPABLE in the struct blk_integrity flags for ublk devices to opt out of the integrity-interval dma_alignment limit. Reviewed-by: Ming Lei Reviewed-by: Keith Busch Reviewed-by: Martin K. Petersen Signed-off-by: Caleb Sander Mateos Signed-off-by: Keith Busch Link: https://patch.msgid.link/20260313144701.1221652-3-kbusch@meta.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 004f367243b6..34ed4f6a02ef 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -808,7 +808,7 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub) static int ublk_integrity_flags(u32 flags) { - int ret_flags = 0; + int ret_flags = BLK_SPLIT_INTERVAL_CAPABLE; if (flags & LBMD_PI_CAP_INTEGRITY) { flags &= ~LBMD_PI_CAP_INTEGRITY; From eff0d74c6c8fd358bc9474c05002e51fa5aa56ad Mon Sep 17 00:00:00 2001 From: Chen Cheng Date: Tue, 10 Feb 2026 21:38:47 +0800 Subject: [PATCH 026/146] md: suppress spurious superblock update error message for dm-raid dm-raid has external metadata management (mddev->external = 1) and no persistent superblock (mddev->persistent = 0). For these arrays, there's no superblock to update, so the error message is spurious. 
The error appears as: md_update_sb: can't update sb for read-only array md0 Fixes: 8c9e376b9d1a ("md: warn about updating super block failure") Reported-by: Tj Closes: https://lore.kernel.org/all/20260128082430.96788-1-tj.iam.tj@proton.me/ Signed-off-by: Chen Cheng Reviewed-by: Paul Menzel Link: https://lore.kernel.org/linux-raid/20260210133847.269986-1-chencheng@fnnas.com Signed-off-by: Yu Kuai --- drivers/md/md.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 3ce6f9e9d38e..c2cc2302d727 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2788,7 +2788,9 @@ void md_update_sb(struct mddev *mddev, int force_change) if (!md_is_rdwr(mddev)) { if (force_change) set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); - pr_err("%s: can't update sb for read-only array %s\n", __func__, mdname(mddev)); + if (!mddev_is_dm(mddev)) + pr_err_ratelimited("%s: can't update sb for read-only array %s\n", + __func__, mdname(mddev)); return; } From 7d96f3120a7fb7210d21b520c5b6f495da6ba436 Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Mon, 2 Mar 2026 19:56:19 -0500 Subject: [PATCH 027/146] md/raid10: fix deadlock with check operation and nowait requests When an array check is running it will raise the barrier at which point normal requests will become blocked and increment the nr_pending value to signal there is work pending inside of wait_barrier(). NOWAIT requests do not block and so will return immediately with an error, and additionally do not increment nr_pending in wait_barrier(). Upstream change commit 43806c3d5b9b ("raid10: cleanup memleak at raid10_make_request") added a call to raid_end_bio_io() to fix a memory leak when NOWAIT requests hit this condition. raid_end_bio_io() eventually calls allow_barrier() and it will unconditionally do an atomic_dec_and_test(&conf->nr_pending) even though the corresponding increment on nr_pending didn't happen in the NOWAIT case. 
This can be easily seen by starting a check operation while an application is doing nowait IO on the same array. This results in a deadlocked state due to nr_pending value underflowing and so the md resync thread gets stuck waiting for nr_pending to == 0. Output of r10conf state of the array when we hit this condition: crash> struct r10conf barrier = 1, nr_pending = { counter = -41 }, nr_waiting = 15, nr_queued = 0, Example of md_sync thread stuck waiting on raise_barrier() and other requests stuck in wait_barrier(): md1_resync [<0>] raise_barrier+0xce/0x1c0 [<0>] raid10_sync_request+0x1ca/0x1ed0 [<0>] md_do_sync+0x779/0x1110 [<0>] md_thread+0x90/0x160 [<0>] kthread+0xbe/0xf0 [<0>] ret_from_fork+0x34/0x50 [<0>] ret_from_fork_asm+0x1a/0x30 kworker/u1040:2+flush-253:4 [<0>] wait_barrier+0x1de/0x220 [<0>] regular_request_wait+0x30/0x180 [<0>] raid10_make_request+0x261/0x1000 [<0>] md_handle_request+0x13b/0x230 [<0>] __submit_bio+0x107/0x1f0 [<0>] submit_bio_noacct_nocheck+0x16f/0x390 [<0>] ext4_io_submit+0x24/0x40 [<0>] ext4_do_writepages+0x254/0xc80 [<0>] ext4_writepages+0x84/0x120 [<0>] do_writepages+0x7a/0x260 [<0>] __writeback_single_inode+0x3d/0x300 [<0>] writeback_sb_inodes+0x1dd/0x470 [<0>] __writeback_inodes_wb+0x4c/0xe0 [<0>] wb_writeback+0x18b/0x2d0 [<0>] wb_workfn+0x2a1/0x400 [<0>] process_one_work+0x149/0x330 [<0>] worker_thread+0x2d2/0x410 [<0>] kthread+0xbe/0xf0 [<0>] ret_from_fork+0x34/0x50 [<0>] ret_from_fork_asm+0x1a/0x30 Fixes: 43806c3d5b9b ("raid10: cleanup memleak at raid10_make_request") Cc: stable@vger.kernel.org Signed-off-by: Josh Hunt Link: https://lore.kernel.org/linux-raid/20260303005619.1352958-1-johunt@akamai.com Signed-off-by: Yu Kuai --- drivers/md/raid10.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index cfbd345805ca..4901ebe45c87 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1184,7 +1184,7 @@ static void raid10_read_request(struct mddev *mddev, struct 
bio *bio, } if (!regular_request_wait(mddev, conf, bio, r10_bio->sectors)) { - raid_end_bio_io(r10_bio); + free_r10bio(r10_bio); return; } @@ -1372,7 +1372,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, sectors = r10_bio->sectors; if (!regular_request_wait(mddev, conf, bio, sectors)) { - raid_end_bio_io(r10_bio); + free_r10bio(r10_bio); return; } From d51e1668fad6d7d34feea5735264929aabb95975 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 23 Feb 2026 11:58:34 +0800 Subject: [PATCH 028/146] md/raid5: set chunk_sectors to enable full stripe I/O splitting Set chunk_sectors to the full stripe width (io_opt) so that the block layer splits I/O at full stripe boundaries. This ensures that large writes are aligned to full stripes, avoiding the read-modify-write overhead that occurs with partial stripe writes in RAID-5/6. When chunk_sectors is set, the block layer's bio splitting logic in get_max_io_size() uses blk_boundary_sectors_left() to limit I/O size to the boundary. This naturally aligns split bios to full stripe boundaries, enabling more efficient full stripe writes. 
Test results with 24-disk RAID5 (chunk_size=64k): dd if=/dev/zero of=/dev/md0 bs=10M oflag=direct Before: 461 MB/s After: 520 MB/s (+12.8%) Link: https://lore.kernel.org/linux-raid/20260223035834.3132498-1-yukuai@fnnas.com Suggested-by: Christoph Hellwig Reviewed-by: Paul Menzel Reviewed-by: Christoph Hellwig Signed-off-by: Yu Kuai --- drivers/md/raid5.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ba9d6d05b089..2ec6dd6ddd93 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7780,6 +7780,7 @@ static int raid5_set_limits(struct mddev *mddev) lim.logical_block_size = mddev->logical_block_size; lim.io_min = mddev->chunk_sectors << 9; lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded); + lim.chunk_sectors = lim.io_opt >> 9; lim.features |= BLK_FEAT_RAID_PARTIAL_STRIPES_EXPENSIVE; lim.discard_granularity = stripe; lim.max_write_zeroes_sectors = 0; From 7701e68b5072faa03a8f30b4081dc16df9092381 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 23 Feb 2026 10:40:34 +0800 Subject: [PATCH 029/146] md/md-llbitmap: skip reading rdevs that are not in_sync When reading bitmap pages from member disks, the code iterates through all rdevs and attempts to read from the first available one. However, it only checks for raid_disk assignment and Faulty flag, missing the In_sync flag check. This can cause bitmap data to be read from spare disks that are still being rebuilt and don't have valid bitmap information yet. Reading stale or uninitialized bitmap data from such disks can lead to incorrect dirty bit tracking, potentially causing data corruption during recovery or normal operation. Add the In_sync flag check to ensure bitmap pages are only read from fully synchronized member disks that have valid bitmap data. 
Cc: stable@vger.kernel.org Fixes: 5ab829f1971d ("md/md-llbitmap: introduce new lockless bitmap") Link: https://lore.kernel.org/linux-raid/20260223024038.3084853-2-yukuai@fnnas.com Signed-off-by: Yu Kuai --- drivers/md/md-llbitmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/md-llbitmap.c b/drivers/md/md-llbitmap.c index bf398d7476b3..6b2d27de1528 100644 --- a/drivers/md/md-llbitmap.c +++ b/drivers/md/md-llbitmap.c @@ -459,7 +459,8 @@ static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx) rdev_for_each(rdev, mddev) { sector_t sector; - if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags) || + !test_bit(In_sync, &rdev->flags)) continue; sector = mddev->bitmap_info.offset + From ef4ca3d4bf09716cff9ba00eb0351deadc8417ab Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 23 Feb 2026 10:40:35 +0800 Subject: [PATCH 030/146] md/md-llbitmap: raise barrier before state machine transition Move the barrier raise operation before calling llbitmap_state_machine() in both llbitmap_start_write() and llbitmap_start_discard(). This ensures the barrier is in place before any state transitions occur, preventing potential race conditions where the state machine could complete before the barrier is properly raised. 
Cc: stable@vger.kernel.org Fixes: 5ab829f1971d ("md/md-llbitmap: introduce new lockless bitmap") Link: https://lore.kernel.org/linux-raid/20260223024038.3084853-3-yukuai@fnnas.com Signed-off-by: Yu Kuai --- drivers/md/md-llbitmap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/md-llbitmap.c b/drivers/md/md-llbitmap.c index 6b2d27de1528..cdfecaca216b 100644 --- a/drivers/md/md-llbitmap.c +++ b/drivers/md/md-llbitmap.c @@ -1070,12 +1070,12 @@ static void llbitmap_start_write(struct mddev *mddev, sector_t offset, int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; - llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite); - while (page_start <= page_end) { llbitmap_raise_barrier(llbitmap, page_start); page_start++; } + + llbitmap_state_machine(llbitmap, start, end, BitmapActionStartwrite); } static void llbitmap_end_write(struct mddev *mddev, sector_t offset, @@ -1102,12 +1102,12 @@ static void llbitmap_start_discard(struct mddev *mddev, sector_t offset, int page_start = (start + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; int page_end = (end + BITMAP_DATA_OFFSET) >> PAGE_SHIFT; - llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard); - while (page_start <= page_end) { llbitmap_raise_barrier(llbitmap, page_start); page_start++; } + + llbitmap_state_machine(llbitmap, start, end, BitmapActionDiscard); } static void llbitmap_end_discard(struct mddev *mddev, sector_t offset, From fed406f3c1c2feb97adcbc557218713c5f7ec6a7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Mar 2026 17:11:29 +0100 Subject: [PATCH 031/146] block: mark bvec_{alloc,free} static Only used in bio.c these days. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni -ck Reviewed-by: Johannes Thumshirn Reviewed-by: Martin K. 
Petersen Link: https://patch.msgid.link/20260316161144.1607877-2-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 7 +++++-- block/blk.h | 5 ----- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/block/bio.c b/block/bio.c index bf1f3670e85a..6131ccb7284a 100644 --- a/block/bio.c +++ b/block/bio.c @@ -34,6 +34,8 @@ struct bio_alloc_cache { unsigned int nr_irq; }; +#define BIO_INLINE_VECS 4 + static struct biovec_slab { int nr_vecs; char *name; @@ -159,7 +161,8 @@ out: mutex_unlock(&bio_slab_lock); } -void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs) +static void bvec_free(struct mempool *pool, struct bio_vec *bv, + unsigned short nr_vecs) { BUG_ON(nr_vecs > BIO_MAX_VECS); @@ -179,7 +182,7 @@ static inline gfp_t bvec_alloc_gfp(gfp_t gfp) __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; } -struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, +static struct bio_vec *bvec_alloc(struct mempool *pool, unsigned short *nr_vecs, gfp_t gfp_mask) { struct biovec_slab *bvs = biovec_slab(*nr_vecs); diff --git a/block/blk.h b/block/blk.h index c5b2115b9ea4..103cb1d0b9cb 100644 --- a/block/blk.h +++ b/block/blk.h @@ -108,11 +108,6 @@ static inline void blk_wait_io(struct completion *done) struct block_device *blkdev_get_no_open(dev_t dev, bool autoload); void blkdev_put_no_open(struct block_device *bdev); -#define BIO_INLINE_VECS 4 -struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, - gfp_t gfp_mask); -void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs); - bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, struct page *page, unsigned len, unsigned offset); From b520c4eef83dd406591431f936de0908c3ed7fb9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Mar 2026 17:11:30 +0100 Subject: [PATCH 032/146] block: split bio_alloc_bioset more clearly into a fast and slowpath bio_alloc_bioset tries non-waiting slab allocations first for the bio and bvec 
array, but does so in a somewhat convoluted way. Restructure the function so that it first open codes these slab allocations, and then falls back to the mempools with the original gfp mask. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni -ck Reviewed-by: Martin K. Petersen Link: https://patch.msgid.link/20260316161144.1607877-3-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 190 ++++++++++++++++++-------------------------- include/linux/bio.h | 3 +- 2 files changed, 79 insertions(+), 114 deletions(-) diff --git a/block/bio.c b/block/bio.c index 6131ccb7284a..5982bf069cef 100644 --- a/block/bio.c +++ b/block/bio.c @@ -176,43 +176,12 @@ static void bvec_free(struct mempool *pool, struct bio_vec *bv, * Make the first allocation restricted and don't dump info on allocation * failures, since we'll fall back to the mempool in case of failure. */ -static inline gfp_t bvec_alloc_gfp(gfp_t gfp) +static inline gfp_t try_alloc_gfp(gfp_t gfp) { return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; } -static struct bio_vec *bvec_alloc(struct mempool *pool, unsigned short *nr_vecs, - gfp_t gfp_mask) -{ - struct biovec_slab *bvs = biovec_slab(*nr_vecs); - - if (WARN_ON_ONCE(!bvs)) - return NULL; - - /* - * Upgrade the nr_vecs request to take full advantage of the allocation. - * We also rely on this in the bvec_free path. - */ - *nr_vecs = bvs->nr_vecs; - - /* - * Try a slab allocation first for all smaller allocations. If that - * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool. - * The mempool is sized to handle up to BIO_MAX_VECS entries. 
- */ - if (*nr_vecs < BIO_MAX_VECS) { - struct bio_vec *bvl; - - bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask)); - if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM)) - return bvl; - *nr_vecs = BIO_MAX_VECS; - } - - return mempool_alloc(pool, gfp_mask); -} - void bio_uninit(struct bio *bio) { #ifdef CONFIG_BLK_CGROUP @@ -433,13 +402,31 @@ static void bio_alloc_rescue(struct work_struct *work) } } +/* + * submit_bio_noacct() converts recursion to iteration; this means if we're + * running beneath it, any bios we allocate and submit will not be submitted + * (and thus freed) until after we return. + * + * This exposes us to a potential deadlock if we allocate multiple bios from the + * same bio_set while running underneath submit_bio_noacct(). If we were to + * allocate multiple bios (say a stacking block driver that was splitting bios), + * we would deadlock if we exhausted the mempool's reserve. + * + * We solve this, and guarantee forward progress by punting the bios on + * current->bio_list to a per bio_set rescuer workqueue before blocking to wait + * for elements being returned to the mempool. 
+ */ static void punt_bios_to_rescuer(struct bio_set *bs) { struct bio_list punt, nopunt; struct bio *bio; - if (WARN_ON_ONCE(!bs->rescue_workqueue)) + if (!current->bio_list || !bs->rescue_workqueue) return; + if (bio_list_empty(¤t->bio_list[0]) && + bio_list_empty(¤t->bio_list[1])) + return; + /* * In order to guarantee forward progress we must punt only bios that * were allocated from this bio_set; otherwise, if there was a bio on @@ -486,9 +473,7 @@ static void bio_alloc_irq_cache_splice(struct bio_alloc_cache *cache) local_irq_restore(flags); } -static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, - unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp, - struct bio_set *bs) +static struct bio *bio_alloc_percpu_cache(struct bio_set *bs) { struct bio_alloc_cache *cache; struct bio *bio; @@ -506,11 +491,6 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, cache->free_list = bio->bi_next; cache->nr--; put_cpu(); - - if (nr_vecs) - bio_init_inline(bio, bdev, nr_vecs, opf); - else - bio_init(bio, bdev, NULL, nr_vecs, opf); bio->bi_pool = bs; return bio; } @@ -520,7 +500,7 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, * @bdev: block device to allocate the bio for (can be %NULL) * @nr_vecs: number of bvecs to pre-allocate * @opf: operation and flags for bio - * @gfp_mask: the GFP_* mask given to the slab allocator + * @gfp: the GFP_* mask given to the slab allocator * @bs: the bio_set to allocate from. * * Allocate a bio from the mempools in @bs. @@ -550,91 +530,77 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, * Returns: Pointer to new bio on success, NULL on failure. 
*/ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, - blk_opf_t opf, gfp_t gfp_mask, - struct bio_set *bs) + blk_opf_t opf, gfp_t gfp, struct bio_set *bs) { - gfp_t saved_gfp = gfp_mask; - struct bio *bio; + struct bio_vec *bvecs = NULL; + struct bio *bio = NULL; + gfp_t saved_gfp = gfp; void *p; /* should not use nobvec bioset for nr_vecs > 0 */ if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_vecs > 0)) return NULL; + gfp = try_alloc_gfp(gfp); if (bs->cache && nr_vecs <= BIO_INLINE_VECS) { - opf |= REQ_ALLOC_CACHE; - bio = bio_alloc_percpu_cache(bdev, nr_vecs, opf, - gfp_mask, bs); - if (bio) - return bio; /* - * No cached bio available, bio returned below marked with - * REQ_ALLOC_CACHE to participate in per-cpu alloc cache. + * Set REQ_ALLOC_CACHE even if no cached bio is available to + * return the allocated bio to the percpu cache when done. */ - } else - opf &= ~REQ_ALLOC_CACHE; - - /* - * submit_bio_noacct() converts recursion to iteration; this means if - * we're running beneath it, any bios we allocate and submit will not be - * submitted (and thus freed) until after we return. - * - * This exposes us to a potential deadlock if we allocate multiple bios - * from the same bio_set() while running underneath submit_bio_noacct(). - * If we were to allocate multiple bios (say a stacking block driver - * that was splitting bios), we would deadlock if we exhausted the - * mempool's reserve. - * - * We solve this, and guarantee forward progress, with a rescuer - * workqueue per bio_set. If we go to allocate and there are bios on - * current->bio_list, we first try the allocation without - * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be - * blocking to the rescuer workqueue before we retry with the original - * gfp_flags. 
- */ - if (current->bio_list && - (!bio_list_empty(¤t->bio_list[0]) || - !bio_list_empty(¤t->bio_list[1])) && - bs->rescue_workqueue) - gfp_mask &= ~__GFP_DIRECT_RECLAIM; - - p = mempool_alloc(&bs->bio_pool, gfp_mask); - if (!p && gfp_mask != saved_gfp) { - punt_bios_to_rescuer(bs); - gfp_mask = saved_gfp; - p = mempool_alloc(&bs->bio_pool, gfp_mask); - } - if (unlikely(!p)) - return NULL; - if (!mempool_is_saturated(&bs->bio_pool)) - opf &= ~REQ_ALLOC_CACHE; - - bio = p + bs->front_pad; - if (nr_vecs > BIO_INLINE_VECS) { - struct bio_vec *bvl = NULL; - - bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask); - if (!bvl && gfp_mask != saved_gfp) { - punt_bios_to_rescuer(bs); - gfp_mask = saved_gfp; - bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask); - } - if (unlikely(!bvl)) - goto err_free; - - bio_init(bio, bdev, bvl, nr_vecs, opf); - } else if (nr_vecs) { - bio_init_inline(bio, bdev, BIO_INLINE_VECS, opf); + opf |= REQ_ALLOC_CACHE; + bio = bio_alloc_percpu_cache(bs); } else { - bio_init(bio, bdev, NULL, 0, opf); + opf &= ~REQ_ALLOC_CACHE; + p = kmem_cache_alloc(bs->bio_slab, gfp); + if (p) + bio = p + bs->front_pad; } + if (bio && nr_vecs > BIO_INLINE_VECS) { + struct biovec_slab *bvs = biovec_slab(nr_vecs); + + /* + * Upgrade nr_vecs to take full advantage of the allocation. + * We also rely on this in bvec_free(). + */ + nr_vecs = bvs->nr_vecs; + bvecs = kmem_cache_alloc(bvs->slab, gfp); + if (unlikely(!bvecs)) { + kmem_cache_free(bs->bio_slab, p); + bio = NULL; + } + } + + if (unlikely(!bio)) { + /* + * Give up if we are not allow to sleep as non-blocking mempool + * allocations just go back to the slab allocation. + */ + if (!(saved_gfp & __GFP_DIRECT_RECLAIM)) + return NULL; + + punt_bios_to_rescuer(bs); + + /* + * Don't rob the mempools by returning to the per-CPU cache if + * we're tight on memory. 
+ */ + opf &= ~REQ_ALLOC_CACHE; + + p = mempool_alloc(&bs->bio_pool, gfp); + bio = p + bs->front_pad; + if (nr_vecs > BIO_INLINE_VECS) { + nr_vecs = BIO_MAX_VECS; + bvecs = mempool_alloc(&bs->bvec_pool, gfp); + } + } + + if (nr_vecs && nr_vecs <= BIO_INLINE_VECS) + bio_init_inline(bio, bdev, nr_vecs, opf); + else + bio_init(bio, bdev, bvecs, nr_vecs, opf); bio->bi_pool = bs; return bio; - -err_free: - mempool_free(p, &bs->bio_pool); - return NULL; } EXPORT_SYMBOL(bio_alloc_bioset); diff --git a/include/linux/bio.h b/include/linux/bio.h index 9693a0d6fefe..984844d2870b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -350,8 +350,7 @@ extern void bioset_exit(struct bio_set *); extern int biovec_init_pool(mempool_t *pool, int pool_entries); struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, - blk_opf_t opf, gfp_t gfp_mask, - struct bio_set *bs); + blk_opf_t opf, gfp_t gfp, struct bio_set *bs); struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask); extern void bio_put(struct bio *); From e80fd7a08940093aad5ea247a42046b57709a7bd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Mar 2026 17:11:31 +0100 Subject: [PATCH 033/146] block: remove bvec_free bvec_free is only called by bio_free, so inline it there. Signed-off-by: Christoph Hellwig Reviewed-by: Chaitanya Kulkarni -ck Reviewed-by: Johannes Thumshirn Reviewed-by: Martin K. 
Petersen Link: https://patch.msgid.link/20260316161144.1607877-4-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/block/bio.c b/block/bio.c index 5982bf069cef..b58bce6b5fea 100644 --- a/block/bio.c +++ b/block/bio.c @@ -161,17 +161,6 @@ out: mutex_unlock(&bio_slab_lock); } -static void bvec_free(struct mempool *pool, struct bio_vec *bv, - unsigned short nr_vecs) -{ - BUG_ON(nr_vecs > BIO_MAX_VECS); - - if (nr_vecs == BIO_MAX_VECS) - mempool_free(bv, pool); - else if (nr_vecs > BIO_INLINE_VECS) - kmem_cache_free(biovec_slab(nr_vecs)->slab, bv); -} - /* * Make the first allocation restricted and don't dump info on allocation * failures, since we'll fall back to the mempool in case of failure. @@ -203,9 +192,14 @@ static void bio_free(struct bio *bio) void *p = bio; WARN_ON_ONCE(!bs); + WARN_ON_ONCE(bio->bi_max_vecs > BIO_MAX_VECS); bio_uninit(bio); - bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs); + if (bio->bi_max_vecs == BIO_MAX_VECS) + mempool_free(bio->bi_io_vec, &bs->bvec_pool); + else if (bio->bi_max_vecs > BIO_INLINE_VECS) + kmem_cache_free(biovec_slab(bio->bi_max_vecs)->slab, + bio->bi_io_vec); mempool_free(p - bs->front_pad, &bs->bio_pool); } @@ -561,7 +555,7 @@ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, /* * Upgrade nr_vecs to take full advantage of the allocation. - * We also rely on this in bvec_free(). + * We also rely on this in bio_free(). */ nr_vecs = bvs->nr_vecs; bvecs = kmem_cache_alloc(bvs->slab, gfp); From 223983874d0366ac12d30eab3b633d699bdf763e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 16 Mar 2026 23:43:51 +0100 Subject: [PATCH 034/146] block: make queue_sysfs_entry instances const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The queue_sysfs_entry structures are never modified, mark them as const. 
Signed-off-by: Thomas Weißschuh Reviewed-by: John Garry Link: https://patch.msgid.link/20260316-b4-sysfs-const-attr-block-v1-1-a35d73b986b0@weissschuh.net Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 878b8a4b55bb..f22c1f253eb3 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -581,27 +581,27 @@ static int queue_wc_store(struct gendisk *disk, const char *page, return 0; } -#define QUEUE_RO_ENTRY(_prefix, _name) \ -static struct queue_sysfs_entry _prefix##_entry = { \ - .attr = { .name = _name, .mode = 0444 }, \ - .show = _prefix##_show, \ +#define QUEUE_RO_ENTRY(_prefix, _name) \ +static const struct queue_sysfs_entry _prefix##_entry = { \ + .attr = { .name = _name, .mode = 0444 }, \ + .show = _prefix##_show, \ }; -#define QUEUE_RW_ENTRY(_prefix, _name) \ -static struct queue_sysfs_entry _prefix##_entry = { \ - .attr = { .name = _name, .mode = 0644 }, \ - .show = _prefix##_show, \ - .store = _prefix##_store, \ +#define QUEUE_RW_ENTRY(_prefix, _name) \ +static const struct queue_sysfs_entry _prefix##_entry = { \ + .attr = { .name = _name, .mode = 0644 }, \ + .show = _prefix##_show, \ + .store = _prefix##_store, \ }; #define QUEUE_LIM_RO_ENTRY(_prefix, _name) \ -static struct queue_sysfs_entry _prefix##_entry = { \ +static const struct queue_sysfs_entry _prefix##_entry = { \ .attr = { .name = _name, .mode = 0444 }, \ .show_limit = _prefix##_show, \ } #define QUEUE_LIM_RW_ENTRY(_prefix, _name) \ -static struct queue_sysfs_entry _prefix##_entry = { \ +static const struct queue_sysfs_entry _prefix##_entry = { \ .attr = { .name = _name, .mode = 0644 }, \ .show_limit = _prefix##_show, \ .store_limit = _prefix##_store, \ @@ -665,7 +665,7 @@ QUEUE_LIM_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); QUEUE_LIM_RO_ENTRY(queue_dma_alignment, "dma_alignment"); /* legacy alias for logical_block_size: */ 
-static struct queue_sysfs_entry queue_hw_sector_size_entry = { +static const struct queue_sysfs_entry queue_hw_sector_size_entry = { .attr = {.name = "hw_sector_size", .mode = 0444 }, .show_limit = queue_logical_block_size_show, }; @@ -731,7 +731,7 @@ QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); #endif /* Common attributes for bio-based and request-based queues. */ -static struct attribute *queue_attrs[] = { +static const struct attribute *const queue_attrs[] = { /* * Attributes which are protected with q->limits_lock. */ @@ -791,7 +791,7 @@ static struct attribute *queue_attrs[] = { }; /* Request-based queue attributes that are not relevant for bio-based queues. */ -static struct attribute *blk_mq_queue_attrs[] = { +static const struct attribute *const blk_mq_queue_attrs[] = { /* * Attributes which require some form of locking other than * q->sysfs_lock. @@ -811,7 +811,7 @@ static struct attribute *blk_mq_queue_attrs[] = { NULL, }; -static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, +static umode_t queue_attr_visible(struct kobject *kobj, const struct attribute *attr, int n) { struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); @@ -827,7 +827,7 @@ static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, } static umode_t blk_mq_queue_attr_visible(struct kobject *kobj, - struct attribute *attr, int n) + const struct attribute *attr, int n) { struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); struct request_queue *q = disk->queue; @@ -841,17 +841,17 @@ static umode_t blk_mq_queue_attr_visible(struct kobject *kobj, return attr->mode; } -static struct attribute_group queue_attr_group = { - .attrs = queue_attrs, - .is_visible = queue_attr_visible, +static const struct attribute_group queue_attr_group = { + .attrs_const = queue_attrs, + .is_visible_const = queue_attr_visible, }; -static struct attribute_group blk_mq_queue_attr_group = { - .attrs = blk_mq_queue_attrs, - 
.is_visible = blk_mq_queue_attr_visible, +static const struct attribute_group blk_mq_queue_attr_group = { + .attrs_const = blk_mq_queue_attrs, + .is_visible_const = blk_mq_queue_attr_visible, }; -#define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr) +#define to_queue(atr) container_of_const((atr), struct queue_sysfs_entry, attr) static ssize_t queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) From 3c9122630953520e2a2b6c6a28751da23457e4ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 16 Mar 2026 23:43:52 +0100 Subject: [PATCH 035/146] block: ia-ranges: make blk_ia_range_sysfs_entry instances const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The blk_ia_range_sysfs_entry structures are never modified, mark them as const. Signed-off-by: Thomas Weißschuh Reviewed-by: John Garry Link: https://patch.msgid.link/20260316-b4-sysfs-const-attr-block-v1-2-a35d73b986b0@weissschuh.net Signed-off-by: Jens Axboe --- block/blk-ia-ranges.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c index d479f5481b66..7be8b58893c9 100644 --- a/block/blk-ia-ranges.c +++ b/block/blk-ia-ranges.c @@ -30,17 +30,17 @@ struct blk_ia_range_sysfs_entry { ssize_t (*show)(struct blk_independent_access_range *iar, char *buf); }; -static struct blk_ia_range_sysfs_entry blk_ia_range_sector_entry = { +static const struct blk_ia_range_sysfs_entry blk_ia_range_sector_entry = { .attr = { .name = "sector", .mode = 0444 }, .show = blk_ia_range_sector_show, }; -static struct blk_ia_range_sysfs_entry blk_ia_range_nr_sectors_entry = { +static const struct blk_ia_range_sysfs_entry blk_ia_range_nr_sectors_entry = { .attr = { .name = "nr_sectors", .mode = 0444 }, .show = blk_ia_range_nr_sectors_show, }; -static struct attribute *blk_ia_range_attrs[] = { +static const struct attribute *const blk_ia_range_attrs[] = { 
&blk_ia_range_sector_entry.attr, &blk_ia_range_nr_sectors_entry.attr, NULL, From f00d826f1b8ee39a4e9283f2eb537f5b49e07829 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 16 Mar 2026 23:43:53 +0100 Subject: [PATCH 036/146] blk-crypto: make blk_crypto_attr instances const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The blk_crypto_attrs structures are never modified, mark them as const. Signed-off-by: Thomas Weißschuh Reviewed-by: John Garry > Link: https://patch.msgid.link/20260316-b4-sysfs-const-attr-block-v1-3-a35d73b986b0@weissschuh.net Signed-off-by: Jens Axboe --- block/blk-crypto-sysfs.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/block/blk-crypto-sysfs.c b/block/blk-crypto-sysfs.c index ea7a0b85a46f..b069c418b6cc 100644 --- a/block/blk-crypto-sysfs.c +++ b/block/blk-crypto-sysfs.c @@ -18,7 +18,7 @@ struct blk_crypto_kobj { struct blk_crypto_attr { struct attribute attr; ssize_t (*show)(struct blk_crypto_profile *profile, - struct blk_crypto_attr *attr, char *page); + const struct blk_crypto_attr *attr, char *page); }; static struct blk_crypto_profile *kobj_to_crypto_profile(struct kobject *kobj) @@ -26,39 +26,39 @@ static struct blk_crypto_profile *kobj_to_crypto_profile(struct kobject *kobj) return container_of(kobj, struct blk_crypto_kobj, kobj)->profile; } -static struct blk_crypto_attr *attr_to_crypto_attr(struct attribute *attr) +static const struct blk_crypto_attr *attr_to_crypto_attr(const struct attribute *attr) { - return container_of(attr, struct blk_crypto_attr, attr); + return container_of_const(attr, struct blk_crypto_attr, attr); } static ssize_t hw_wrapped_keys_show(struct blk_crypto_profile *profile, - struct blk_crypto_attr *attr, char *page) + const struct blk_crypto_attr *attr, char *page) { /* Always show supported, since the file doesn't exist otherwise. 
*/ return sysfs_emit(page, "supported\n"); } static ssize_t max_dun_bits_show(struct blk_crypto_profile *profile, - struct blk_crypto_attr *attr, char *page) + const struct blk_crypto_attr *attr, char *page) { return sysfs_emit(page, "%u\n", 8 * profile->max_dun_bytes_supported); } static ssize_t num_keyslots_show(struct blk_crypto_profile *profile, - struct blk_crypto_attr *attr, char *page) + const struct blk_crypto_attr *attr, char *page) { return sysfs_emit(page, "%u\n", profile->num_slots); } static ssize_t raw_keys_show(struct blk_crypto_profile *profile, - struct blk_crypto_attr *attr, char *page) + const struct blk_crypto_attr *attr, char *page) { /* Always show supported, since the file doesn't exist otherwise. */ return sysfs_emit(page, "supported\n"); } #define BLK_CRYPTO_RO_ATTR(_name) \ - static struct blk_crypto_attr _name##_attr = __ATTR_RO(_name) + static const struct blk_crypto_attr _name##_attr = __ATTR_RO(_name) BLK_CRYPTO_RO_ATTR(hw_wrapped_keys); BLK_CRYPTO_RO_ATTR(max_dun_bits); @@ -66,10 +66,10 @@ BLK_CRYPTO_RO_ATTR(num_keyslots); BLK_CRYPTO_RO_ATTR(raw_keys); static umode_t blk_crypto_is_visible(struct kobject *kobj, - struct attribute *attr, int n) + const struct attribute *attr, int n) { struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj); - struct blk_crypto_attr *a = attr_to_crypto_attr(attr); + const struct blk_crypto_attr *a = attr_to_crypto_attr(attr); if (a == &hw_wrapped_keys_attr && !(profile->key_types_supported & BLK_CRYPTO_KEY_TYPE_HW_WRAPPED)) @@ -81,7 +81,7 @@ static umode_t blk_crypto_is_visible(struct kobject *kobj, return 0444; } -static struct attribute *blk_crypto_attrs[] = { +static const struct attribute *const blk_crypto_attrs[] = { &hw_wrapped_keys_attr.attr, &max_dun_bits_attr.attr, &num_keyslots_attr.attr, @@ -90,8 +90,8 @@ static struct attribute *blk_crypto_attrs[] = { }; static const struct attribute_group blk_crypto_attr_group = { - .attrs = blk_crypto_attrs, - .is_visible = blk_crypto_is_visible, 
+ .attrs_const = blk_crypto_attrs, + .is_visible_const = blk_crypto_is_visible, }; /* @@ -99,13 +99,13 @@ static const struct attribute_group blk_crypto_attr_group = { * modes, these are initialized at boot time by blk_crypto_sysfs_init(). */ static struct blk_crypto_attr __blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX]; -static struct attribute *blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX + 1]; +static const struct attribute *blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX + 1]; static umode_t blk_crypto_mode_is_visible(struct kobject *kobj, - struct attribute *attr, int n) + const struct attribute *attr, int n) { struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj); - struct blk_crypto_attr *a = attr_to_crypto_attr(attr); + const struct blk_crypto_attr *a = attr_to_crypto_attr(attr); int mode_num = a - __blk_crypto_mode_attrs; if (profile->modes_supported[mode_num]) @@ -114,7 +114,7 @@ static umode_t blk_crypto_mode_is_visible(struct kobject *kobj, } static ssize_t blk_crypto_mode_show(struct blk_crypto_profile *profile, - struct blk_crypto_attr *attr, char *page) + const struct blk_crypto_attr *attr, char *page) { int mode_num = attr - __blk_crypto_mode_attrs; @@ -123,8 +123,8 @@ static ssize_t blk_crypto_mode_show(struct blk_crypto_profile *profile, static const struct attribute_group blk_crypto_modes_attr_group = { .name = "modes", - .attrs = blk_crypto_mode_attrs, - .is_visible = blk_crypto_mode_is_visible, + .attrs_const = blk_crypto_mode_attrs, + .is_visible_const = blk_crypto_mode_is_visible, }; static const struct attribute_group *blk_crypto_attr_groups[] = { @@ -137,7 +137,7 @@ static ssize_t blk_crypto_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj); - struct blk_crypto_attr *a = attr_to_crypto_attr(attr); + const struct blk_crypto_attr *a = attr_to_crypto_attr(attr); return a->show(profile, a, page); } From 3141e0e536b43ab3555737cb2ee6ea1ed0aff69f Mon 
Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Mon, 16 Mar 2026 23:43:54 +0100 Subject: [PATCH 037/146] blk-mq: make blk_mq_hw_ctx_sysfs_entry instances const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The blk_mq_hw_ctx_sysfs_entry structures are never modified, mark them as const. Signed-off-by: Thomas Weißschuh Reviewed-by: John Garry Link: https://patch.msgid.link/20260316-b4-sysfs-const-attr-block-v1-4-a35d73b986b0@weissschuh.net Signed-off-by: Jens Axboe --- block/blk-mq-sysfs.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 58ec293373c6..895397831ecc 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -53,7 +53,7 @@ static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, struct request_queue *q; ssize_t res; - entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); + entry = container_of_const(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); q = hctx->queue; @@ -101,20 +101,20 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) return pos + ret; } -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = { +static const struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = { .attr = {.name = "nr_tags", .mode = 0444 }, .show = blk_mq_hw_sysfs_nr_tags_show, }; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_reserved_tags = { +static const struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_reserved_tags = { .attr = {.name = "nr_reserved_tags", .mode = 0444 }, .show = blk_mq_hw_sysfs_nr_reserved_tags_show, }; -static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = { +static const struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = { .attr = {.name = "cpu_list", .mode = 0444 }, .show = blk_mq_hw_sysfs_cpus_show, }; -static struct attribute *default_hw_ctx_attrs[] = { 
+static const struct attribute *const default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_nr_tags.attr, &blk_mq_hw_sysfs_nr_reserved_tags.attr, &blk_mq_hw_sysfs_cpus.attr, From 643893647cac7317bafca4040dd0cfb815b510d4 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 17 Mar 2026 18:36:34 +1030 Subject: [PATCH 038/146] block: reject zero length in bio_add_page() The function bio_add_page() returns the number of bytes added to the bio, and if that failed it should return 0. However there is a special quirk, if a caller is passing a page with length 0, that function will always return 0 but with different results: - The page is added to the bio If there is an available bvec slot or the folio can be merged with the last bvec. The return value 0 is just the length passed in, which is also 0. - The page is not added to the bio If the page is not mergeable with the last bvec, or there is no bvec slot available. The return value 0 means the page is not added into the bio. Unfortunately the caller is not able to distinguish the above two cases, and will treat the 0 return value as a page addition failure. In that case, this can lead to the double releasing of the last page: - By the bio cleanup Which normally goes through every page of the bio, including the last page which is added into the bio. - By the caller Which believes the page is not added into the bio, thus would manually release the page. I do not think anyone should call bio_add_folio()/bio_add_page() with zero length, but idiots like me can still show up. So add an extra WARN_ON_ONCE() check for zero length and reject it early to avoid double freeing.
Signed-off-by: Qu Wenruo Link: https://patch.msgid.link/bc2223c080f38d0b63f968f605c918181c840f40.1773734749.git.wqu@suse.com Signed-off-by: Jens Axboe --- block/bio.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/bio.c b/block/bio.c index b58bce6b5fea..5057047194c4 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1026,6 +1026,8 @@ int bio_add_page(struct bio *bio, struct page *page, { if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; + if (WARN_ON_ONCE(len == 0)) + return 0; if (bio->bi_iter.bi_size > BIO_MAX_SIZE - len) return 0; From 7da9261bab0a82bdbc4aafd2ad4bc3529b7cb772 Mon Sep 17 00:00:00 2001 From: Yang Xiuwei Date: Tue, 17 Mar 2026 15:22:24 +0800 Subject: [PATCH 039/146] bsg: add bsg_uring_cmd uapi structure Add the bsg_uring_cmd structure to the BSG UAPI header to support io_uring-based SCSI passthrough operations via IORING_OP_URING_CMD. Signed-off-by: Yang Xiuwei Reviewed-by: Bart Van Assche Link: https://patch.msgid.link/20260317072226.2598233-2-yangxiuwei@kylinos.cn Signed-off-by: Jens Axboe --- include/uapi/linux/bsg.h | 75 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/include/uapi/linux/bsg.h b/include/uapi/linux/bsg.h index cd6302def5ed..6cff77f5b857 100644 --- a/include/uapi/linux/bsg.h +++ b/include/uapi/linux/bsg.h @@ -2,6 +2,9 @@ #ifndef _UAPIBSG_H #define _UAPIBSG_H +#ifdef __KERNEL__ +#include +#endif /* __KERNEL__ */ #include #define BSG_PROTOCOL_SCSI 0 @@ -63,5 +66,77 @@ struct sg_io_v4 { __u32 padding; }; +struct bsg_uring_cmd { + __u64 request; /* [i], [*i] command descriptor address */ + __u32 request_len; /* [i] command descriptor length in bytes */ + __u32 protocol; /* [i] protocol type (BSG_PROTOCOL_*) */ + __u32 subprotocol; /* [i] subprotocol type (BSG_SUB_PROTOCOL_*) */ + __u32 max_response_len; /* [i] response buffer size in bytes */ + + __u64 response; /* [i], [*o] response data address */ + __u64 dout_xferp; /* [i], [*i] */ + __u32 dout_xfer_len; /* [i] bytes to be 
transferred to device */ + __u32 dout_iovec_count; /* [i] 0 -> "flat" dout transfer else + * dout_xferp points to array of iovec + */ + __u64 din_xferp; /* [i], [*o] */ + __u32 din_xfer_len; /* [i] bytes to be transferred from device */ + __u32 din_iovec_count; /* [i] 0 -> "flat" din transfer */ + + __u32 timeout_ms; /* [i] timeout in milliseconds */ + __u8 reserved[12]; /* reserved for future extension */ +}; + +#ifdef __KERNEL__ +/* Must match IORING_OP_URING_CMD payload size (e.g. SQE128). */ +static_assert(sizeof(struct bsg_uring_cmd) == 80); +#endif /* __KERNEL__ */ + + +/* + * SCSI BSG io_uring completion (res2, 64-bit) + * + * When using BSG_PROTOCOL_SCSI + BSG_SUB_PROTOCOL_SCSI_CMD with + * IORING_OP_URING_CMD, the completion queue entry (CQE) contains: + * - result: errno (0 on success) + * - res2: packed SCSI status + * + * res2 bit layout: + * [0..7] device_status (SCSI status byte, e.g. CHECK_CONDITION) + * [8..15] driver_status (e.g. DRIVER_SENSE when sense data is valid) + * [16..23] host_status (e.g. 
DID_OK, DID_TIME_OUT) + * [24..31] sense_len_wr (bytes of sense data written to response buffer) + * [32..63] resid_len (residual transfer length) + */ +static inline __u8 bsg_scsi_res2_device_status(__u64 res2) +{ + return res2 & 0xff; +} +static inline __u8 bsg_scsi_res2_driver_status(__u64 res2) +{ + return res2 >> 8; +} +static inline __u8 bsg_scsi_res2_host_status(__u64 res2) +{ + return res2 >> 16; +} +static inline __u8 bsg_scsi_res2_sense_len(__u64 res2) +{ + return res2 >> 24; +} +static inline __u32 bsg_scsi_res2_resid_len(__u64 res2) +{ + return res2 >> 32; +} +static inline __u64 bsg_scsi_res2_build(__u8 device_status, __u8 driver_status, + __u8 host_status, __u8 sense_len_wr, + __u32 resid_len) +{ + return ((__u64)(__u32)(resid_len) << 32) | + ((__u64)sense_len_wr << 24) | + ((__u64)host_status << 16) | + ((__u64)driver_status << 8) | + (__u64)device_status; +} #endif /* _UAPIBSG_H */ From a1e97ce80d9f41d0bb83951d758ff6fe49f3de60 Mon Sep 17 00:00:00 2001 From: Yang Xiuwei Date: Tue, 17 Mar 2026 15:22:25 +0800 Subject: [PATCH 040/146] bsg: add io_uring command support to generic layer Add an io_uring command handler to the generic BSG layer. The new .uring_cmd file operation validates io_uring features and delegates handling to a per-queue bsg_uring_cmd_fn callback. Extend bsg_register_queue() so transport drivers can register both sg_io and io_uring command handlers. 
Signed-off-by: Yang Xiuwei Reviewed-by: Bart Van Assche Link: https://patch.msgid.link/20260317072226.2598233-3-yangxiuwei@kylinos.cn Signed-off-by: Jens Axboe --- block/bsg-lib.c | 2 +- block/bsg.c | 33 ++++++++++++++++++++++++++++++++- drivers/scsi/scsi_bsg.c | 10 +++++++++- include/linux/bsg.h | 6 +++++- 4 files changed, 47 insertions(+), 4 deletions(-) diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 20cd0ef3c394..fdb4b290ca68 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -393,7 +393,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT); - bset->bd = bsg_register_queue(q, dev, name, bsg_transport_sg_io_fn); + bset->bd = bsg_register_queue(q, dev, name, bsg_transport_sg_io_fn, NULL); if (IS_ERR(bset->bd)) { ret = PTR_ERR(bset->bd); goto out_cleanup_queue; diff --git a/block/bsg.c b/block/bsg.c index e0af6206ed28..82aaf3cee582 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -28,6 +29,7 @@ struct bsg_device { unsigned int timeout; unsigned int reserved_size; bsg_sg_io_fn *sg_io_fn; + bsg_uring_cmd_fn *uring_cmd_fn; }; static inline struct bsg_device *to_bsg_device(struct inode *inode) @@ -158,11 +160,38 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } } +static int bsg_check_uring_features(unsigned int issue_flags) +{ + /* BSG passthrough requires big SQE/CQE support */ + if ((issue_flags & (IO_URING_F_SQE128|IO_URING_F_CQE32)) != + (IO_URING_F_SQE128|IO_URING_F_CQE32)) + return -EOPNOTSUPP; + return 0; +} + +static int bsg_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags) +{ + struct bsg_device *bd = to_bsg_device(file_inode(ioucmd->file)); + bool open_for_write = ioucmd->file->f_mode & FMODE_WRITE; + struct request_queue *q = bd->queue; + int ret; + + ret = bsg_check_uring_features(issue_flags); + if (ret) + return ret; + + if (!bd->uring_cmd_fn) + return 
-EOPNOTSUPP; + + return bd->uring_cmd_fn(q, ioucmd, issue_flags, open_for_write); +} + static const struct file_operations bsg_fops = { .open = bsg_open, .release = bsg_release, .unlocked_ioctl = bsg_ioctl, .compat_ioctl = compat_ptr_ioctl, + .uring_cmd = bsg_uring_cmd, .owner = THIS_MODULE, .llseek = default_llseek, }; @@ -187,7 +216,8 @@ void bsg_unregister_queue(struct bsg_device *bd) EXPORT_SYMBOL_GPL(bsg_unregister_queue); struct bsg_device *bsg_register_queue(struct request_queue *q, - struct device *parent, const char *name, bsg_sg_io_fn *sg_io_fn) + struct device *parent, const char *name, bsg_sg_io_fn *sg_io_fn, + bsg_uring_cmd_fn *uring_cmd_fn) { struct bsg_device *bd; int ret; @@ -199,6 +229,7 @@ struct bsg_device *bsg_register_queue(struct request_queue *q, bd->reserved_size = INT_MAX; bd->queue = q; bd->sg_io_fn = sg_io_fn; + bd->uring_cmd_fn = uring_cmd_fn; ret = ida_alloc_max(&bsg_minor_ida, BSG_MAX_DEVS - 1, GFP_KERNEL); if (ret < 0) { diff --git a/drivers/scsi/scsi_bsg.c b/drivers/scsi/scsi_bsg.c index a9a9ec086a7e..4d57e524e141 100644 --- a/drivers/scsi/scsi_bsg.c +++ b/drivers/scsi/scsi_bsg.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include #include @@ -9,6 +10,12 @@ #define uptr64(val) ((void __user *)(uintptr_t)(val)) +static int scsi_bsg_uring_cmd(struct request_queue *q, struct io_uring_cmd *ioucmd, + unsigned int issue_flags, bool open_for_write) +{ + return -EOPNOTSUPP; +} + static int scsi_bsg_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr, bool open_for_write, unsigned int timeout) { @@ -99,5 +106,6 @@ out_put_request: struct bsg_device *scsi_bsg_register_queue(struct scsi_device *sdev) { return bsg_register_queue(sdev->request_queue, &sdev->sdev_gendev, - dev_name(&sdev->sdev_gendev), scsi_bsg_sg_io_fn); + dev_name(&sdev->sdev_gendev), scsi_bsg_sg_io_fn, + scsi_bsg_uring_cmd); } diff --git a/include/linux/bsg.h b/include/linux/bsg.h index ee2df73edf83..162730bfc2d8 100644 --- 
a/include/linux/bsg.h +++ b/include/linux/bsg.h @@ -7,13 +7,17 @@ struct bsg_device; struct device; struct request_queue; +struct io_uring_cmd; typedef int (bsg_sg_io_fn)(struct request_queue *, struct sg_io_v4 *hdr, bool open_for_write, unsigned int timeout); +typedef int (bsg_uring_cmd_fn)(struct request_queue *q, struct io_uring_cmd *ioucmd, + unsigned int issue_flags, bool open_for_write); + struct bsg_device *bsg_register_queue(struct request_queue *q, struct device *parent, const char *name, - bsg_sg_io_fn *sg_io_fn); + bsg_sg_io_fn *sg_io_fn, bsg_uring_cmd_fn *uring_cmd_fn); void bsg_unregister_queue(struct bsg_device *bcd); #endif /* _LINUX_BSG_H */ From 7b6d3255e7f8c6df2d21504c47808e3ce84649ac Mon Sep 17 00:00:00 2001 From: Yang Xiuwei Date: Tue, 17 Mar 2026 15:22:26 +0800 Subject: [PATCH 041/146] scsi: bsg: add io_uring passthrough handler Implement the SCSI-specific io_uring command handler for BSG using struct bsg_uring_cmd. The handler builds a SCSI request from the io_uring command, maps user buffers (including fixed buffers), and completes asynchronously via a request end_io callback and task_work. Completion returns a 32-bit status and packed residual/sense information via CQE res and res2, and supports IO_URING_F_NONBLOCK. Signed-off-by: Yang Xiuwei Reviewed-by: Bart Van Assche Link: https://patch.msgid.link/20260317072226.2598233-4-yangxiuwei@kylinos.cn Signed-off-by: Jens Axboe --- drivers/scsi/scsi_bsg.c | 168 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 167 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_bsg.c b/drivers/scsi/scsi_bsg.c index 4d57e524e141..c3ce497a3b94 100644 --- a/drivers/scsi/scsi_bsg.c +++ b/drivers/scsi/scsi_bsg.c @@ -10,10 +10,176 @@ #define uptr64(val) ((void __user *)(uintptr_t)(val)) +/* + * Per-command BSG SCSI PDU stored in io_uring_cmd.pdu[32]. + * Holds temporary state between submission, completion and task_work. 
+ */ +struct scsi_bsg_uring_cmd_pdu { + struct bio *bio; /* mapped user buffer, unmap in task work */ + struct request *req; /* block request, freed in task work */ + u64 response_addr; /* user space response buffer address */ +}; +static_assert(sizeof(struct scsi_bsg_uring_cmd_pdu) <= sizeof_field(struct io_uring_cmd, pdu)); + +static inline struct scsi_bsg_uring_cmd_pdu *scsi_bsg_uring_cmd_pdu( + struct io_uring_cmd *ioucmd) +{ + return io_uring_cmd_to_pdu(ioucmd, struct scsi_bsg_uring_cmd_pdu); +} + +/* Task work: build res2 (layout in uapi/linux/bsg.h) and copy sense to user. */ +static void scsi_bsg_uring_task_cb(struct io_tw_req tw_req, io_tw_token_t tw) +{ + struct io_uring_cmd *ioucmd = io_uring_cmd_from_tw(tw_req); + struct scsi_bsg_uring_cmd_pdu *pdu = scsi_bsg_uring_cmd_pdu(ioucmd); + struct request *rq = pdu->req; + struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(rq); + u64 res2; + int ret = 0; + u8 driver_status = 0; + u8 sense_len_wr = 0; + + if (pdu->bio) + blk_rq_unmap_user(pdu->bio); + + if (scsi_status_is_check_condition(scmd->result)) { + driver_status = DRIVER_SENSE; + if (pdu->response_addr) + sense_len_wr = min_t(u8, scmd->sense_len, + SCSI_SENSE_BUFFERSIZE); + } + + if (sense_len_wr) { + if (copy_to_user(uptr64(pdu->response_addr), scmd->sense_buffer, + sense_len_wr)) + ret = -EFAULT; + } + + res2 = bsg_scsi_res2_build(status_byte(scmd->result), driver_status, + host_byte(scmd->result), sense_len_wr, + scmd->resid_len); + + blk_mq_free_request(rq); + io_uring_cmd_done32(ioucmd, ret, res2, + IO_URING_CMD_TASK_WORK_ISSUE_FLAGS); +} + +static enum rq_end_io_ret scsi_bsg_uring_cmd_done(struct request *req, + blk_status_t status, + const struct io_comp_batch *iocb) +{ + struct io_uring_cmd *ioucmd = req->end_io_data; + + io_uring_cmd_do_in_task_lazy(ioucmd, scsi_bsg_uring_task_cb); + return RQ_END_IO_NONE; +} + +static int scsi_bsg_map_user_buffer(struct request *req, + struct io_uring_cmd *ioucmd, + unsigned int issue_flags, gfp_t gfp_mask) +{ + const 
struct bsg_uring_cmd *cmd = io_uring_sqe128_cmd(ioucmd->sqe, struct bsg_uring_cmd); + bool is_write = cmd->dout_xfer_len > 0; + u64 buf_addr = is_write ? cmd->dout_xferp : cmd->din_xferp; + unsigned long buf_len = is_write ? cmd->dout_xfer_len : cmd->din_xfer_len; + struct iov_iter iter; + int ret; + + if (ioucmd->flags & IORING_URING_CMD_FIXED) { + ret = io_uring_cmd_import_fixed(buf_addr, buf_len, + is_write ? WRITE : READ, + &iter, ioucmd, issue_flags); + if (ret < 0) + return ret; + ret = blk_rq_map_user_iov(req->q, req, NULL, &iter, gfp_mask); + } else { + ret = blk_rq_map_user(req->q, req, NULL, uptr64(buf_addr), + buf_len, gfp_mask); + } + + return ret; +} + static int scsi_bsg_uring_cmd(struct request_queue *q, struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool open_for_write) { - return -EOPNOTSUPP; + struct scsi_bsg_uring_cmd_pdu *pdu = scsi_bsg_uring_cmd_pdu(ioucmd); + const struct bsg_uring_cmd *cmd = io_uring_sqe128_cmd(ioucmd->sqe, struct bsg_uring_cmd); + struct scsi_cmnd *scmd; + struct request *req; + blk_mq_req_flags_t blk_flags = 0; + gfp_t gfp_mask = GFP_KERNEL; + int ret; + + if (cmd->protocol != BSG_PROTOCOL_SCSI || + cmd->subprotocol != BSG_SUB_PROTOCOL_SCSI_CMD) + return -EINVAL; + + if (!cmd->request || cmd->request_len == 0) + return -EINVAL; + + if (cmd->dout_xfer_len && cmd->din_xfer_len) { + pr_warn_once("BIDI support in bsg has been removed.\n"); + return -EOPNOTSUPP; + } + + if (cmd->dout_iovec_count > 0 || cmd->din_iovec_count > 0) + return -EOPNOTSUPP; + + if (issue_flags & IO_URING_F_NONBLOCK) { + blk_flags = BLK_MQ_REQ_NOWAIT; + gfp_mask = GFP_NOWAIT; + } + + req = scsi_alloc_request(q, cmd->dout_xfer_len ? 
+ REQ_OP_DRV_OUT : REQ_OP_DRV_IN, blk_flags); + if (IS_ERR(req)) + return PTR_ERR(req); + + scmd = blk_mq_rq_to_pdu(req); + scmd->cmd_len = cmd->request_len; + if (scmd->cmd_len > sizeof(scmd->cmnd)) { + ret = -EINVAL; + goto out_free_req; + } + scmd->allowed = SG_DEFAULT_RETRIES; + + if (copy_from_user(scmd->cmnd, uptr64(cmd->request), cmd->request_len)) { + ret = -EFAULT; + goto out_free_req; + } + + if (!scsi_cmd_allowed(scmd->cmnd, open_for_write)) { + ret = -EPERM; + goto out_free_req; + } + + pdu->response_addr = cmd->response; + scmd->sense_len = cmd->max_response_len ? + min(cmd->max_response_len, SCSI_SENSE_BUFFERSIZE) : SCSI_SENSE_BUFFERSIZE; + + if (cmd->dout_xfer_len || cmd->din_xfer_len) { + ret = scsi_bsg_map_user_buffer(req, ioucmd, issue_flags, gfp_mask); + if (ret) + goto out_free_req; + pdu->bio = req->bio; + } else { + pdu->bio = NULL; + } + + req->timeout = cmd->timeout_ms ? + msecs_to_jiffies(cmd->timeout_ms) : BLK_DEFAULT_SG_TIMEOUT; + + req->end_io = scsi_bsg_uring_cmd_done; + req->end_io_data = ioucmd; + pdu->req = req; + + blk_execute_rq_nowait(req, false); + return -EIOCBQUEUED; + +out_free_req: + blk_mq_free_request(req); + return ret; } static int scsi_bsg_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr, From c2d466b9fe1913f8dbe2701156c38719c94188f7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 20 Mar 2026 17:48:44 -0700 Subject: [PATCH 042/146] block: partitions: Replace pp_buf with struct seq_buf In preparation for removing the strlcat API[1], replace the char *pp_buf with a struct seq_buf, which tracks the current write position and remaining space internally. This allows for: - Direct use of seq_buf_printf() in place of snprintf()+strlcat() pairs, eliminating local tmp buffers throughout. - Adjacent strlcat() calls that build strings piece-by-piece (e.g., strlcat("["); strlcat(name); strlcat("]")) to be collapsed into single seq_buf_printf() calls. 
- Simpler call sites: seq_buf_puts() takes only the buffer and string, with no need to pass PAGE_SIZE at every call. The backing buffer allocation is unchanged (__get_free_page), and the output path uses seq_buf_str() to NUL-terminate before passing to printk(). Link: https://github.com/KSPP/linux/issues/370 [1] Cc: Andy Shevchenko Cc: Josh Law Signed-off-by: Kees Cook Reviewed-by: Josh Law Link: https://patch.msgid.link/20260321004840.work.670-kees@kernel.org Signed-off-by: Jens Axboe --- block/partitions/acorn.c | 32 ++++++++---------- block/partitions/aix.c | 21 +++++------- block/partitions/amiga.c | 35 +++++++++----------- block/partitions/atari.c | 12 +++---- block/partitions/check.h | 8 ++--- block/partitions/cmdline.c | 6 ++-- block/partitions/core.c | 20 ++++++------ block/partitions/efi.c | 2 +- block/partitions/ibm.c | 27 ++++++--------- block/partitions/karma.c | 2 +- block/partitions/ldm.c | 4 +-- block/partitions/mac.c | 4 +-- block/partitions/msdos.c | 67 +++++++++++++------------------------- block/partitions/of.c | 6 ++-- block/partitions/osf.c | 2 +- block/partitions/sgi.c | 2 +- block/partitions/sun.c | 2 +- block/partitions/sysv68.c | 9 ++--- block/partitions/ultrix.c | 2 +- 19 files changed, 106 insertions(+), 157 deletions(-) diff --git a/block/partitions/acorn.c b/block/partitions/acorn.c index d2fc122d7426..9f7389f174d0 100644 --- a/block/partitions/acorn.c +++ b/block/partitions/acorn.c @@ -40,9 +40,7 @@ adfs_partition(struct parsed_partitions *state, char *name, char *data, (le32_to_cpu(dr->disc_size) >> 9); if (name) { - strlcat(state->pp_buf, " [", PAGE_SIZE); - strlcat(state->pp_buf, name, PAGE_SIZE); - strlcat(state->pp_buf, "]", PAGE_SIZE); + seq_buf_printf(&state->pp_buf, " [%s]", name); } put_partition(state, slot, first_sector, nr_sects); return dr; @@ -78,14 +76,14 @@ static int riscix_partition(struct parsed_partitions *state, if (!rr) return -1; - strlcat(state->pp_buf, " [RISCiX]", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " 
[RISCiX]"); if (rr->magic == RISCIX_MAGIC) { unsigned long size = nr_sects > 2 ? 2 : nr_sects; int part; - strlcat(state->pp_buf, " <", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " <"); put_partition(state, slot++, first_sect, size); for (part = 0; part < 8; part++) { @@ -94,13 +92,11 @@ static int riscix_partition(struct parsed_partitions *state, put_partition(state, slot++, le32_to_cpu(rr->part[part].start), le32_to_cpu(rr->part[part].length)); - strlcat(state->pp_buf, "(", PAGE_SIZE); - strlcat(state->pp_buf, rr->part[part].name, PAGE_SIZE); - strlcat(state->pp_buf, ")", PAGE_SIZE); + seq_buf_printf(&state->pp_buf, "(%s)", rr->part[part].name); } } - strlcat(state->pp_buf, " >\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " >\n"); } else { put_partition(state, slot++, first_sect, nr_sects); } @@ -130,7 +126,7 @@ static int linux_partition(struct parsed_partitions *state, struct linux_part *linuxp; unsigned long size = nr_sects > 2 ? 2 : nr_sects; - strlcat(state->pp_buf, " [Linux]", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " [Linux]"); put_partition(state, slot++, first_sect, size); @@ -138,7 +134,7 @@ static int linux_partition(struct parsed_partitions *state, if (!linuxp) return -1; - strlcat(state->pp_buf, " <", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " <"); while (linuxp->magic == cpu_to_le32(LINUX_NATIVE_MAGIC) || linuxp->magic == cpu_to_le32(LINUX_SWAP_MAGIC)) { if (slot == state->limit) @@ -148,7 +144,7 @@ static int linux_partition(struct parsed_partitions *state, le32_to_cpu(linuxp->nr_sects)); linuxp ++; } - strlcat(state->pp_buf, " >", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " >"); put_dev_sector(sect); return slot; @@ -293,7 +289,7 @@ int adfspart_check_ADFS(struct parsed_partitions *state) break; } } - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); return 1; } #endif @@ -366,7 +362,7 @@ int adfspart_check_ICS(struct parsed_partitions *state) return 0; } - strlcat(state->pp_buf, " [ICS]", PAGE_SIZE); + 
seq_buf_puts(&state->pp_buf, " [ICS]"); for (slot = 1, p = (const struct ics_part *)data; p->size; p++) { u32 start = le32_to_cpu(p->start); @@ -400,7 +396,7 @@ int adfspart_check_ICS(struct parsed_partitions *state) } put_dev_sector(sect); - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); return 1; } #endif @@ -460,7 +456,7 @@ int adfspart_check_POWERTEC(struct parsed_partitions *state) return 0; } - strlcat(state->pp_buf, " [POWERTEC]", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " [POWERTEC]"); for (i = 0, p = (const struct ptec_part *)data; i < 12; i++, p++) { u32 start = le32_to_cpu(p->start); @@ -471,7 +467,7 @@ int adfspart_check_POWERTEC(struct parsed_partitions *state) } put_dev_sector(sect); - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); return 1; } #endif @@ -542,7 +538,7 @@ int adfspart_check_EESOX(struct parsed_partitions *state) size = get_capacity(state->disk); put_partition(state, slot++, start, size - start); - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); } return i ? 
1 : 0; diff --git a/block/partitions/aix.c b/block/partitions/aix.c index a886cefbefbb..29b8f4cebb63 100644 --- a/block/partitions/aix.c +++ b/block/partitions/aix.c @@ -173,24 +173,22 @@ int aix_partition(struct parsed_partitions *state) if (d) { struct lvm_rec *p = (struct lvm_rec *)d; u16 lvm_version = be16_to_cpu(p->version); - char tmp[64]; if (lvm_version == 1) { int pp_size_log2 = be16_to_cpu(p->pp_size); pp_bytes_size = 1 << pp_size_log2; pp_blocks_size = pp_bytes_size / 512; - snprintf(tmp, sizeof(tmp), - " AIX LVM header version %u found\n", - lvm_version); + seq_buf_printf(&state->pp_buf, + " AIX LVM header version %u found\n", + lvm_version); vgda_len = be32_to_cpu(p->vgda_len); vgda_sector = be32_to_cpu(p->vgda_psn[0]); } else { - snprintf(tmp, sizeof(tmp), - " unsupported AIX LVM version %d found\n", - lvm_version); + seq_buf_printf(&state->pp_buf, + " unsupported AIX LVM version %d found\n", + lvm_version); } - strlcat(state->pp_buf, tmp, PAGE_SIZE); put_dev_sector(sect); } if (vgda_sector && (d = read_part_sector(state, vgda_sector, §))) { @@ -251,14 +249,11 @@ int aix_partition(struct parsed_partitions *state) continue; } if (lp_ix == lvip[lv_ix].pps_per_lv) { - char tmp[70]; - put_partition(state, lv_ix + 1, (i + 1 - lp_ix) * pp_blocks_size + psn_part1, lvip[lv_ix].pps_per_lv * pp_blocks_size); - snprintf(tmp, sizeof(tmp), " <%s>\n", - n[lv_ix].name); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, " <%s>\n", + n[lv_ix].name); lvip[lv_ix].lv_is_contiguous = 1; ret = 1; next_lp_ix = 1; diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c index 506921095412..8325046a14eb 100644 --- a/block/partitions/amiga.c +++ b/block/partitions/amiga.c @@ -81,13 +81,8 @@ int amiga_partition(struct parsed_partitions *state) /* blksize is blocks per 512 byte standard block */ blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512; - { - char tmp[7 + 10 + 1 + 1]; - - /* Be more informative */ - snprintf(tmp, sizeof(tmp), " RDSK 
(%d)", blksize * 512); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - } + /* Be more informative */ + seq_buf_printf(&state->pp_buf, " RDSK (%d)", blksize * 512); blk = be32_to_cpu(rdb->rdb_PartitionList); put_dev_sector(sect); for (part = 1; (s32) blk>0 && part<=16; part++, put_dev_sector(sect)) { @@ -179,27 +174,27 @@ int amiga_partition(struct parsed_partitions *state) { /* Be even more informative to aid mounting */ char dostype[4]; - char tmp[42]; __be32 *dt = (__be32 *)dostype; *dt = pb->pb_Environment[16]; if (dostype[3] < ' ') - snprintf(tmp, sizeof(tmp), " (%c%c%c^%c)", - dostype[0], dostype[1], - dostype[2], dostype[3] + '@' ); + seq_buf_printf(&state->pp_buf, + " (%c%c%c^%c)", + dostype[0], dostype[1], + dostype[2], + dostype[3] + '@'); else - snprintf(tmp, sizeof(tmp), " (%c%c%c%c)", - dostype[0], dostype[1], - dostype[2], dostype[3]); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - snprintf(tmp, sizeof(tmp), "(res %d spb %d)", - be32_to_cpu(pb->pb_Environment[6]), - be32_to_cpu(pb->pb_Environment[4])); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, + " (%c%c%c%c)", + dostype[0], dostype[1], + dostype[2], dostype[3]); + seq_buf_printf(&state->pp_buf, "(res %d spb %d)", + be32_to_cpu(pb->pb_Environment[6]), + be32_to_cpu(pb->pb_Environment[4])); } res = 1; } - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); rdb_done: return res; diff --git a/block/partitions/atari.c b/block/partitions/atari.c index 9655c728262a..2438d1448f38 100644 --- a/block/partitions/atari.c +++ b/block/partitions/atari.c @@ -70,7 +70,7 @@ int atari_partition(struct parsed_partitions *state) } pi = &rs->part[0]; - strlcat(state->pp_buf, " AHDI", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " AHDI"); for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) { struct rootsector *xrs; Sector sect2; @@ -89,7 +89,7 @@ int atari_partition(struct parsed_partitions *state) #ifdef ICD_PARTS part_fmt = 1; #endif - 
strlcat(state->pp_buf, " XGM<", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " XGM<"); partsect = extensect = be32_to_cpu(pi->st); while (1) { xrs = read_part_sector(state, partsect, §2); @@ -128,14 +128,14 @@ int atari_partition(struct parsed_partitions *state) break; } } - strlcat(state->pp_buf, " >", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " >"); } #ifdef ICD_PARTS if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */ pi = &rs->icdpart[0]; /* sanity check: no ICD format if first partition invalid */ if (OK_id(pi->id)) { - strlcat(state->pp_buf, " ICD<", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " ICD<"); for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) { /* accept only GEM,BGM,RAW,LNX,SWP partitions */ if (!((pi->flg & 1) && OK_id(pi->id))) @@ -144,13 +144,13 @@ int atari_partition(struct parsed_partitions *state) be32_to_cpu(pi->st), be32_to_cpu(pi->siz)); } - strlcat(state->pp_buf, " >", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " >"); } } #endif put_dev_sector(sect); - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); return 1; } diff --git a/block/partitions/check.h b/block/partitions/check.h index e5c1c61eb353..b0997467b61a 100644 --- a/block/partitions/check.h +++ b/block/partitions/check.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include #include +#include #include "../blk.h" /* @@ -20,7 +21,7 @@ struct parsed_partitions { int next; int limit; bool access_beyond_eod; - char *pp_buf; + struct seq_buf pp_buf; }; typedef struct { @@ -37,12 +38,9 @@ static inline void put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) { if (n < p->limit) { - char tmp[1 + BDEVNAME_SIZE + 10 + 1]; - p->parts[n].from = from; p->parts[n].size = size; - snprintf(tmp, sizeof(tmp), " %s%d", p->name, n); - strlcat(p->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&p->pp_buf, " %s%d", p->name, n); } } diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c index 
a2b1870c3fd4..4fd52ed154b4 100644 --- a/block/partitions/cmdline.c +++ b/block/partitions/cmdline.c @@ -229,7 +229,6 @@ static int add_part(int slot, struct cmdline_subpart *subpart, struct parsed_partitions *state) { struct partition_meta_info *info; - char tmp[sizeof(info->volname) + 4]; if (slot >= state->limit) return 1; @@ -244,8 +243,7 @@ static int add_part(int slot, struct cmdline_subpart *subpart, strscpy(info->volname, subpart->name, sizeof(info->volname)); - snprintf(tmp, sizeof(tmp), "(%s)", info->volname); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, "(%s)", info->volname); state->parts[slot].has_info = true; @@ -379,7 +377,7 @@ int cmdline_partition(struct parsed_partitions *state) cmdline_parts_set(parts, disk_size, state); cmdline_parts_verifier(1, state); - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); return 1; } diff --git a/block/partitions/core.c b/block/partitions/core.c index 740228750aaf..3b5928836c69 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -123,16 +123,16 @@ static struct parsed_partitions *check_partition(struct gendisk *hd) state = allocate_partitions(hd); if (!state) return NULL; - state->pp_buf = (char *)__get_free_page(GFP_KERNEL); - if (!state->pp_buf) { + state->pp_buf.buffer = (char *)__get_free_page(GFP_KERNEL); + if (!state->pp_buf.buffer) { free_partitions(state); return NULL; } - state->pp_buf[0] = '\0'; + seq_buf_init(&state->pp_buf, state->pp_buf.buffer, PAGE_SIZE); state->disk = hd; strscpy(state->name, hd->disk_name); - snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); + seq_buf_printf(&state->pp_buf, " %s:", state->name); if (isdigit(state->name[strlen(state->name)-1])) sprintf(state->name, "p"); @@ -151,9 +151,9 @@ static struct parsed_partitions *check_partition(struct gendisk *hd) } if (res > 0) { - printk(KERN_INFO "%s", state->pp_buf); + printk(KERN_INFO "%s", seq_buf_str(&state->pp_buf)); - free_page((unsigned 
long)state->pp_buf); + free_page((unsigned long)state->pp_buf.buffer); return state; } if (state->access_beyond_eod) @@ -164,12 +164,12 @@ static struct parsed_partitions *check_partition(struct gendisk *hd) if (err) res = err; if (res) { - strlcat(state->pp_buf, - " unable to read partition table\n", PAGE_SIZE); - printk(KERN_INFO "%s", state->pp_buf); + seq_buf_puts(&state->pp_buf, + " unable to read partition table\n"); + printk(KERN_INFO "%s", seq_buf_str(&state->pp_buf)); } - free_page((unsigned long)state->pp_buf); + free_page((unsigned long)state->pp_buf.buffer); free_partitions(state); return ERR_PTR(res); } diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 75474fb3848e..9865d59093fa 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -751,6 +751,6 @@ int efi_partition(struct parsed_partitions *state) } kfree(ptes); kfree(gpt); - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); return 1; } diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c index 9311ad5fb95d..54047e722a9d 100644 --- a/block/partitions/ibm.c +++ b/block/partitions/ibm.c @@ -173,15 +173,13 @@ static int find_vol1_partitions(struct parsed_partitions *state, { sector_t blk; int counter; - char tmp[64]; Sector sect; unsigned char *data; loff_t offset, size; struct vtoc_format1_label f1; int secperblk; - snprintf(tmp, sizeof(tmp), "VOL1/%8s:", name); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, "VOL1/%8s:", name); /* * get start of VTOC from the disk label and then search for format1 * and format8 labels @@ -219,7 +217,7 @@ static int find_vol1_partitions(struct parsed_partitions *state, blk++; data = read_part_sector(state, blk * secperblk, §); } - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); if (!data) return -1; @@ -237,11 +235,9 @@ static int find_lnx1_partitions(struct parsed_partitions *state, dasd_information2_t *info) { loff_t offset, geo_size, size; - 
char tmp[64]; int secperblk; - snprintf(tmp, sizeof(tmp), "LNX1/%8s:", name); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, "LNX1/%8s:", name); secperblk = blocksize >> 9; if (label->lnx.ldl_version == 0xf2) { size = label->lnx.formatted_blocks * secperblk; @@ -258,7 +254,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state, size = nr_sectors; if (size != geo_size) { if (!info) { - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); return 1; } if (!strcmp(info->type, "ECKD")) @@ -270,7 +266,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state, /* first and only partition starts in the first block after the label */ offset = labelsect + secperblk; put_partition(state, 1, offset, size - offset); - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); return 1; } @@ -282,7 +278,6 @@ static int find_cms1_partitions(struct parsed_partitions *state, sector_t labelsect) { loff_t offset, size; - char tmp[64]; int secperblk; /* @@ -291,14 +286,12 @@ static int find_cms1_partitions(struct parsed_partitions *state, blocksize = label->cms.block_size; secperblk = blocksize >> 9; if (label->cms.disk_offset != 0) { - snprintf(tmp, sizeof(tmp), "CMS1/%8s(MDSK):", name); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, "CMS1/%8s(MDSK):", name); /* disk is reserved minidisk */ offset = label->cms.disk_offset * secperblk; size = (label->cms.block_count - 1) * secperblk; } else { - snprintf(tmp, sizeof(tmp), "CMS1/%8s:", name); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, "CMS1/%8s:", name); /* * Special case for FBA devices: * If an FBA device is CMS formatted with blocksize > 512 byte @@ -314,7 +307,7 @@ static int find_cms1_partitions(struct parsed_partitions *state, } put_partition(state, 1, offset, size-offset); - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); return 1; } @@ 
-391,11 +384,11 @@ int ibm_partition(struct parsed_partitions *state) */ res = 1; if (info->format == DASD_FORMAT_LDL) { - strlcat(state->pp_buf, "(nonl)", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "(nonl)"); size = nr_sectors; offset = (info->label_block + 1) * (blocksize >> 9); put_partition(state, 1, offset, size-offset); - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); } } else res = 0; diff --git a/block/partitions/karma.c b/block/partitions/karma.c index 4d93512f4bd4..a4e3c5050177 100644 --- a/block/partitions/karma.c +++ b/block/partitions/karma.c @@ -53,7 +53,7 @@ int karma_partition(struct parsed_partitions *state) } slot++; } - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); put_dev_sector(sect); return 1; } diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c index 776b4ad95091..c0bdcae58a3e 100644 --- a/block/partitions/ldm.c +++ b/block/partitions/ldm.c @@ -582,7 +582,7 @@ static bool ldm_create_data_partitions (struct parsed_partitions *pp, return false; } - strlcat(pp->pp_buf, " [LDM]", PAGE_SIZE); + seq_buf_puts(&pp->pp_buf, " [LDM]"); /* Create the data partitions */ list_for_each (item, &ldb->v_part) { @@ -597,7 +597,7 @@ static bool ldm_create_data_partitions (struct parsed_partitions *pp, part_num++; } - strlcat(pp->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&pp->pp_buf, "\n"); return true; } diff --git a/block/partitions/mac.c b/block/partitions/mac.c index b02530d98629..df03ca428e15 100644 --- a/block/partitions/mac.c +++ b/block/partitions/mac.c @@ -86,7 +86,7 @@ int mac_partition(struct parsed_partitions *state) if (blocks_in_map >= state->limit) blocks_in_map = state->limit - 1; - strlcat(state->pp_buf, " [mac]", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " [mac]"); for (slot = 1; slot <= blocks_in_map; ++slot) { int pos = slot * secsize; put_dev_sector(sect); @@ -152,6 +152,6 @@ int mac_partition(struct parsed_partitions *state) #endif put_dev_sector(sect); - 
strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); return 1; } diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 073be78ba0b0..200ea53ea6a2 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -263,18 +263,11 @@ static void parse_solaris_x86(struct parsed_partitions *state, put_dev_sector(sect); return; } - { - char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1]; - - snprintf(tmp, sizeof(tmp), " %s%d: name, origin); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - } + seq_buf_printf(&state->pp_buf, " %s%d: name, origin); if (le32_to_cpu(v->v_version) != 1) { - char tmp[64]; - - snprintf(tmp, sizeof(tmp), " cannot handle version %d vtoc>\n", - le32_to_cpu(v->v_version)); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, + " cannot handle version %d vtoc>\n", + le32_to_cpu(v->v_version)); put_dev_sector(sect); return; } @@ -282,12 +275,10 @@ static void parse_solaris_x86(struct parsed_partitions *state, max_nparts = le16_to_cpu(v->v_nparts) > 8 ? 
SOLARIS_X86_NUMSLICE : 8; for (i = 0; i < max_nparts && state->next < state->limit; i++) { struct solaris_x86_slice *s = &v->v_slice[i]; - char tmp[3 + 10 + 1 + 1]; if (s->s_size == 0) continue; - snprintf(tmp, sizeof(tmp), " [s%d]", i); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, " [s%d]", i); /* solaris partitions are relative to current MS-DOS * one; must add the offset of the current partition */ put_partition(state, state->next++, @@ -295,7 +286,7 @@ static void parse_solaris_x86(struct parsed_partitions *state, le32_to_cpu(s->s_size)); } put_dev_sector(sect); - strlcat(state->pp_buf, " >\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " >\n"); #endif } @@ -359,7 +350,6 @@ static void parse_bsd(struct parsed_partitions *state, Sector sect; struct bsd_disklabel *l; struct bsd_partition *p; - char tmp[64]; l = read_part_sector(state, offset + 1, §); if (!l) @@ -369,8 +359,7 @@ static void parse_bsd(struct parsed_partitions *state, return; } - snprintf(tmp, sizeof(tmp), " %s%d: <%s:", state->name, origin, flavour); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, " %s%d: <%s:", state->name, origin, flavour); if (le16_to_cpu(l->d_npartitions) < max_partitions) max_partitions = le16_to_cpu(l->d_npartitions); @@ -391,18 +380,16 @@ static void parse_bsd(struct parsed_partitions *state, /* full parent partition, we have it already */ continue; if (offset > bsd_start || offset+size < bsd_start+bsd_size) { - strlcat(state->pp_buf, "bad subpartition - ignored\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "bad subpartition - ignored\n"); continue; } put_partition(state, state->next++, bsd_start, bsd_size); } put_dev_sector(sect); - if (le16_to_cpu(l->d_npartitions) > max_partitions) { - snprintf(tmp, sizeof(tmp), " (ignored %d more)", - le16_to_cpu(l->d_npartitions) - max_partitions); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - } - strlcat(state->pp_buf, " >\n", PAGE_SIZE); + if (le16_to_cpu(l->d_npartitions) > 
max_partitions) + seq_buf_printf(&state->pp_buf, " (ignored %d more)", + le16_to_cpu(l->d_npartitions) - max_partitions); + seq_buf_puts(&state->pp_buf, " >\n"); } #endif @@ -496,12 +483,7 @@ static void parse_unixware(struct parsed_partitions *state, put_dev_sector(sect); return; } - { - char tmp[1 + BDEVNAME_SIZE + 10 + 12 + 1]; - - snprintf(tmp, sizeof(tmp), " %s%d: name, origin); - strlcat(state->pp_buf, tmp, PAGE_SIZE); - } + seq_buf_printf(&state->pp_buf, " %s%d: name, origin); p = &l->vtoc.v_slice[1]; /* I omit the 0th slice as it is the same as whole disk. */ while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) { @@ -515,7 +497,7 @@ static void parse_unixware(struct parsed_partitions *state, p++; } put_dev_sector(sect); - strlcat(state->pp_buf, " >\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " >\n"); #endif } @@ -546,10 +528,7 @@ static void parse_minix(struct parsed_partitions *state, * the normal boot sector. */ if (msdos_magic_present(data + 510) && p->sys_ind == MINIX_PARTITION) { /* subpartition table present */ - char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1]; - - snprintf(tmp, sizeof(tmp), " %s%d: name, origin); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, " %s%d: name, origin); for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) { if (state->next == state->limit) break; @@ -558,7 +537,7 @@ static void parse_minix(struct parsed_partitions *state, put_partition(state, state->next++, start_sect(p), nr_sects(p)); } - strlcat(state->pp_buf, " >\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " >\n"); } put_dev_sector(sect); #endif /* CONFIG_MINIX_SUBPARTITION */ @@ -602,7 +581,7 @@ int msdos_partition(struct parsed_partitions *state) #ifdef CONFIG_AIX_PARTITION return aix_partition(state); #else - strlcat(state->pp_buf, " [AIX]", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " [AIX]"); return 0; #endif } @@ -629,7 +608,7 @@ int msdos_partition(struct parsed_partitions *state) fb = (struct fat_boot_sector *) data; if (slot == 1 && 
fb->reserved && fb->fats && fat_valid_media(fb->media)) { - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); put_dev_sector(sect); return 1; } else { @@ -678,9 +657,9 @@ int msdos_partition(struct parsed_partitions *state) n = min(size, max(sector_size, n)); put_partition(state, slot, start, n); - strlcat(state->pp_buf, " <", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " <"); parse_extended(state, start, size, disksig); - strlcat(state->pp_buf, " >", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, " >"); continue; } put_partition(state, slot, start, size); @@ -688,12 +667,12 @@ int msdos_partition(struct parsed_partitions *state) if (p->sys_ind == LINUX_RAID_PARTITION) state->parts[slot].flags = ADDPART_FLAG_RAID; if (p->sys_ind == DM6_PARTITION) - strlcat(state->pp_buf, "[DM]", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "[DM]"); if (p->sys_ind == EZD_PARTITION) - strlcat(state->pp_buf, "[EZD]", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "[EZD]"); } - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); /* second pass - output for each on a separate line */ p = (struct msdos_partition *) (0x1be + data); diff --git a/block/partitions/of.c b/block/partitions/of.c index 4e760fdffb3f..c22b60661098 100644 --- a/block/partitions/of.c +++ b/block/partitions/of.c @@ -36,7 +36,6 @@ static void add_of_partition(struct parsed_partitions *state, int slot, struct device_node *np) { struct partition_meta_info *info; - char tmp[sizeof(info->volname) + 4]; const char *partname; int len; @@ -63,8 +62,7 @@ static void add_of_partition(struct parsed_partitions *state, int slot, partname = of_get_property(np, "name", &len); strscpy(info->volname, partname, sizeof(info->volname)); - snprintf(tmp, sizeof(tmp), "(%s)", info->volname); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, "(%s)", info->volname); } int of_partition(struct parsed_partitions *state) @@ -104,7 +102,7 @@ int of_partition(struct 
parsed_partitions *state) slot++; } - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); return 1; } diff --git a/block/partitions/osf.c b/block/partitions/osf.c index 84560d0765ed..2a692584dba9 100644 --- a/block/partitions/osf.c +++ b/block/partitions/osf.c @@ -81,7 +81,7 @@ int osf_partition(struct parsed_partitions *state) le32_to_cpu(partition->p_size)); slot++; } - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); put_dev_sector(sect); return 1; } diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c index b5ecddd5181a..2383ca63cd66 100644 --- a/block/partitions/sgi.c +++ b/block/partitions/sgi.c @@ -79,7 +79,7 @@ int sgi_partition(struct parsed_partitions *state) } slot++; } - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); put_dev_sector(sect); return 1; } diff --git a/block/partitions/sun.c b/block/partitions/sun.c index 2419af76120f..92c645fcd2e0 100644 --- a/block/partitions/sun.c +++ b/block/partitions/sun.c @@ -121,7 +121,7 @@ int sun_partition(struct parsed_partitions *state) } slot++; } - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); put_dev_sector(sect); return 1; } diff --git a/block/partitions/sysv68.c b/block/partitions/sysv68.c index 6f6257fd4eb4..470e0f9de7be 100644 --- a/block/partitions/sysv68.c +++ b/block/partitions/sysv68.c @@ -54,7 +54,6 @@ int sysv68_partition(struct parsed_partitions *state) unsigned char *data; struct dkblk0 *b; struct slice *slice; - char tmp[64]; data = read_part_sector(state, 0, §); if (!data) @@ -74,8 +73,7 @@ int sysv68_partition(struct parsed_partitions *state) return -1; slices -= 1; /* last slice is the whole disk */ - snprintf(tmp, sizeof(tmp), "sysV68: %s(s%u)", state->name, slices); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, "sysV68: %s(s%u)", state->name, slices); slice = (struct slice *)data; for (i = 0; i < slices; i++, slice++) { if (slot == 
state->limit) @@ -84,12 +82,11 @@ int sysv68_partition(struct parsed_partitions *state) put_partition(state, slot, be32_to_cpu(slice->blkoff), be32_to_cpu(slice->nblocks)); - snprintf(tmp, sizeof(tmp), "(s%u)", i); - strlcat(state->pp_buf, tmp, PAGE_SIZE); + seq_buf_printf(&state->pp_buf, "(s%u)", i); } slot++; } - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); put_dev_sector(sect); return 1; } diff --git a/block/partitions/ultrix.c b/block/partitions/ultrix.c index 4aaa81043ca0..b4b9ddc57a5d 100644 --- a/block/partitions/ultrix.c +++ b/block/partitions/ultrix.c @@ -39,7 +39,7 @@ int ultrix_partition(struct parsed_partitions *state) label->pt_part[i].pi_blkoff, label->pt_part[i].pi_nblocks); put_dev_sector(sect); - strlcat(state->pp_buf, "\n", PAGE_SIZE); + seq_buf_puts(&state->pp_buf, "\n"); return 1; } else { put_dev_sector(sect); From 52e4324935be917f8f3267354b3cc06bb8ffcec1 Mon Sep 17 00:00:00 2001 From: FengWei Shih Date: Thu, 19 Mar 2026 13:33:51 +0800 Subject: [PATCH 043/146] md/raid5: skip 2-failure compute when other disk is R5_LOCKED When skip_copy is enabled on a doubly-degraded RAID6, a device that is being written to will be in R5_LOCKED state with R5_UPTODATE cleared. If a new read triggers fetch_block() while the write is still in flight, the 2-failure compute path may select this locked device as a compute target because it is not R5_UPTODATE. Because skip_copy makes the device page point directly to the bio page, reconstructing data into it might be risky. Also, since the compute marks the device R5_UPTODATE, it triggers WARN_ON in ops_run_io() which checks that R5_SkipCopy and R5_UPTODATE are not both set. 
This can be reproduced by running small-range concurrent read/write on a doubly-degraded RAID6 with skip_copy enabled, for example: mdadm -C /dev/md0 -l6 -n6 -R -f /dev/loop[0-3] missing missing echo 1 > /sys/block/md0/md/skip_copy fio --filename=/dev/md0 --rw=randrw --bs=4k --numjobs=8 \ --iodepth=32 --size=4M --runtime=30 --time_based --direct=1 Fix by checking R5_LOCKED before proceeding with the compute. The compute will be retried once the lock is cleared on IO completion. Signed-off-by: FengWei Shih Reviewed-by: Yu Kuai Link: https://lore.kernel.org/linux-raid/20260319053351.3676794-1-dannyshih@synology.com/ Signed-off-by: Yu Kuai --- drivers/md/raid5.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2ec6dd6ddd93..ddac1be2648f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3916,6 +3916,8 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, break; } BUG_ON(other < 0); + if (test_bit(R5_LOCKED, &sh->dev[other].flags)) + return 0; pr_debug("Computing stripe %llu blocks %d,%d\n", (unsigned long long)sh->sector, disk_idx, other); From de3544d2e5ea99064498de3c21ba490155864657 Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Thu, 5 Mar 2026 09:18:33 +0800 Subject: [PATCH 044/146] md/raid1: fix the comparing region of interval tree Interval tree uses [start, end] as a region which is stored in the tree. In raid1, it uses the wrong end value. For example: bio(A,B) is too big and needs to be split to bio1(A,C-1), bio2(C,B). The region of bio1 is [A,C] and the region of bio2 is [C,B]. So bio1 and bio2 overlap, which is not right. Fix this problem by using the right end value of the region. 
Fixes: d0d2d8ba0494 ("md/raid1: introduce wait_for_serialization") Signed-off-by: Xiao Ni Link: https://lore.kernel.org/linux-raid/20260305011839.5118-2-xni@redhat.com/ Signed-off-by: Yu Kuai --- drivers/md/raid1.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index cda6af0712b9..16f671ab12c0 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -62,7 +62,7 @@ static int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio, unsigned long flags; int ret = 0; sector_t lo = r1_bio->sector; - sector_t hi = lo + r1_bio->sectors; + sector_t hi = lo + r1_bio->sectors - 1; struct serial_in_rdev *serial = &rdev->serial[idx]; spin_lock_irqsave(&serial->serial_lock, flags); @@ -452,7 +452,7 @@ static void raid1_end_write_request(struct bio *bio) int mirror = find_bio_disk(r1_bio, bio); struct md_rdev *rdev = conf->mirrors[mirror].rdev; sector_t lo = r1_bio->sector; - sector_t hi = r1_bio->sector + r1_bio->sectors; + sector_t hi = r1_bio->sector + r1_bio->sectors - 1; bool ignore_error = !raid1_should_handle_error(bio) || (bio->bi_status && bio_op(bio) == REQ_OP_DISCARD); From af5c99b8ea371b8137e2a62be50adabd964a68c6 Mon Sep 17 00:00:00 2001 From: Chen Cheng Date: Wed, 4 Mar 2026 19:09:18 +0800 Subject: [PATCH 045/146] md/raid5: remove stale md_raid5_kick_device() declaration Remove the unused md_raid5_kick_device() declaration from raid5.h - no definition exists for this function. 
Signed-off-by: Chen Cheng Reviewed-by: Yu Kuai Link: https://lore.kernel.org/linux-raid/20260304110919.15071-1-chencheng@fnnas.com/ Signed-off-by: Yu Kuai --- drivers/md/raid5.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 110b1c2d0a86..1c7b710fc9c1 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -801,7 +801,6 @@ raid5_get_dev_page(struct stripe_head *sh, int disk_idx) } #endif -void md_raid5_kick_device(struct r5conf *conf); int raid5_set_cache_size(struct mddev *mddev, int size); sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous); void raid5_release_stripe(struct stripe_head *sh); From 81c041260a2b8b1533a2492071a0ab53074368a7 Mon Sep 17 00:00:00 2001 From: Chen Cheng Date: Wed, 4 Mar 2026 19:10:01 +0800 Subject: [PATCH 046/146] md/raid5: move handle_stripe() comment to correct location Move the handle_stripe() documentation comment from above analyse_stripe() to directly above handle_stripe() where it belongs. Signed-off-by: Chen Cheng Reviewed-by: Yu Kuai Link: https://lore.kernel.org/linux-raid/20260304111001.15767-1-chencheng@fnnas.com/ Signed-off-by: Yu Kuai --- drivers/md/raid5.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ddac1be2648f..1f8360d4cdb7 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4596,20 +4596,6 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) async_tx_quiesce(&tx); } -/* - * handle_stripe - do things to a stripe. - * - * We lock the stripe by setting STRIPE_ACTIVE and then examine the - * state of various bits to see what needs to be done. 
- * Possible results: - * return some read requests which now have data - * return some write requests which are safely on storage - * schedule a read on some buffers - * schedule a write of some buffers - * return confirmation of parity correctness - * - */ - static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) { struct r5conf *conf = sh->raid_conf; @@ -4903,6 +4889,18 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, set_bit(STRIPE_HANDLE, &head_sh->state); } +/* + * handle_stripe - do things to a stripe. + * + * We lock the stripe by setting STRIPE_ACTIVE and then examine the + * state of various bits to see what needs to be done. + * Possible results: + * return some read requests which now have data + * return some write requests which are safely on storage + * schedule a read on some buffers + * schedule a write of some buffers + * return confirmation of parity correctness + */ static void handle_stripe(struct stripe_head *sh) { struct stripe_head_state s; From 6f507eb2bb5491327fe634dc23558d4ca5d710b8 Mon Sep 17 00:00:00 2001 From: Chen Cheng Date: Wed, 4 Mar 2026 19:14:17 +0800 Subject: [PATCH 047/146] md: remove unused mddev argument from export_rdev The mddev argument in export_rdev() is never used. Remove it to simplify callers. 
Signed-off-by: Chen Cheng Reviewed-by: Paul Menzel Link: https://lore.kernel.org/linux-raid/20260304111417.20777-1-chencheng@fnnas.com/ Signed-off-by: Yu Kuai --- drivers/md/md.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index c2cc2302d727..521d9b34cd9e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -98,7 +98,7 @@ static struct workqueue_struct *md_misc_wq; static int remove_and_add_spares(struct mddev *mddev, struct md_rdev *this); static void mddev_detach(struct mddev *mddev); -static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); +static void export_rdev(struct md_rdev *rdev); static void md_wakeup_thread_directly(struct md_thread __rcu **thread); /* @@ -959,7 +959,7 @@ void mddev_unlock(struct mddev *mddev) list_for_each_entry_safe(rdev, tmp, &delete, same_set) { list_del_init(&rdev->same_set); kobject_del(&rdev->kobj); - export_rdev(rdev, mddev); + export_rdev(rdev); } if (!legacy_async_del_gendisk) { @@ -2632,7 +2632,7 @@ void md_autodetect_dev(dev_t dev); /* just for claiming the bdev */ static struct md_rdev claim_rdev; -static void export_rdev(struct md_rdev *rdev, struct mddev *mddev) +static void export_rdev(struct md_rdev *rdev) { pr_debug("md: export_rdev(%pg)\n", rdev->bdev); md_rdev_clear(rdev); @@ -4850,7 +4850,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len) err = bind_rdev_to_array(rdev, mddev); out: if (err) - export_rdev(rdev, mddev); + export_rdev(rdev); mddev_unlock_and_resume(mddev); if (!err) md_new_event(); @@ -7142,7 +7142,7 @@ static void autorun_devices(int part) rdev_for_each_list(rdev, tmp, &candidates) { list_del_init(&rdev->same_set); if (bind_rdev_to_array(rdev, mddev)) - export_rdev(rdev, mddev); + export_rdev(rdev); } autorun_array(mddev); mddev_unlock_and_resume(mddev); @@ -7152,7 +7152,7 @@ static void autorun_devices(int part) */ rdev_for_each_list(rdev, tmp, &candidates) { 
list_del_init(&rdev->same_set); - export_rdev(rdev, mddev); + export_rdev(rdev); } mddev_put(mddev); } @@ -7340,13 +7340,13 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) pr_warn("md: %pg has different UUID to %pg\n", rdev->bdev, rdev0->bdev); - export_rdev(rdev, mddev); + export_rdev(rdev); return -EINVAL; } } err = bind_rdev_to_array(rdev, mddev); if (err) - export_rdev(rdev, mddev); + export_rdev(rdev); return err; } @@ -7389,7 +7389,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) /* This was a hot-add request, but events doesn't * match, so reject it. */ - export_rdev(rdev, mddev); + export_rdev(rdev); return -EINVAL; } @@ -7415,7 +7415,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) } } if (has_journal || mddev->bitmap) { - export_rdev(rdev, mddev); + export_rdev(rdev); return -EBUSY; } set_bit(Journal, &rdev->flags); @@ -7430,7 +7430,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) /* --add initiated by this node */ err = mddev->cluster_ops->add_new_disk(mddev, rdev); if (err) { - export_rdev(rdev, mddev); + export_rdev(rdev); return err; } } @@ -7440,7 +7440,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) err = bind_rdev_to_array(rdev, mddev); if (err) - export_rdev(rdev, mddev); + export_rdev(rdev); if (mddev_is_clustered(mddev)) { if (info->state & (1 << MD_DISK_CANDIDATE)) { @@ -7503,7 +7503,7 @@ int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info) err = bind_rdev_to_array(rdev, mddev); if (err) { - export_rdev(rdev, mddev); + export_rdev(rdev); return err; } } @@ -7615,7 +7615,7 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) return 0; abort_export: - export_rdev(rdev, mddev); + export_rdev(rdev); return err; } From 24d4c90286b9a36a2b72d1e0ceeae237d427f975 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 18 Mar 2026 09:41:12 +0800 Subject: [PATCH 048/146] ublk: move cold paths out of 
__ublk_batch_dispatch() for icache efficiency Mark ublk_filter_unused_tags() as noinline since it is only called from the unlikely(needs_filter) branch. Extract the error-handling block from __ublk_batch_dispatch() into a new noinline ublk_batch_dispatch_fail() function to keep the hot path compact and icache-friendly. This also makes __ublk_batch_dispatch() more readable by separating the error recovery logic from the normal dispatch flow. Before: __ublk_batch_dispatch is ~1419 bytes After: __ublk_batch_dispatch is ~1090 bytes (-329 bytes, -23%) Signed-off-by: Ming Lei Link: https://patch.msgid.link/20260318014112.3125432-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 70 ++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 32 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 34ed4f6a02ef..71c7c56b38ca 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1789,7 +1789,7 @@ static bool ublk_batch_prep_dispatch(struct ublk_queue *ubq, * Filter out UBLK_BATCH_IO_UNUSED_TAG entries from tag_buf. * Returns the new length after filtering. */ -static unsigned int ublk_filter_unused_tags(unsigned short *tag_buf, +static noinline unsigned int ublk_filter_unused_tags(unsigned short *tag_buf, unsigned int len) { unsigned int i, j; @@ -1805,6 +1805,41 @@ static unsigned int ublk_filter_unused_tags(unsigned short *tag_buf, return j; } +static noinline void ublk_batch_dispatch_fail(struct ublk_queue *ubq, + const struct ublk_batch_io_data *data, + unsigned short *tag_buf, size_t len, int ret) +{ + int i, res; + + /* + * Undo prep state for all IOs since userspace never received them. + * This restores IOs to pre-prepared state so they can be cleanly + * re-prepared when tags are pulled from FIFO again. 
+ */ + for (i = 0; i < len; i++) { + struct ublk_io *io = &ubq->ios[tag_buf[i]]; + int index = -1; + + ublk_io_lock(io); + if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) + index = io->buf.auto_reg.index; + io->flags &= ~(UBLK_IO_FLAG_OWNED_BY_SRV | UBLK_IO_FLAG_AUTO_BUF_REG); + io->flags |= UBLK_IO_FLAG_ACTIVE; + ublk_io_unlock(io); + + if (index != -1) + io_buffer_unregister_bvec(data->cmd, index, + data->issue_flags); + } + + res = kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo, + tag_buf, len, &ubq->evts_lock); + + pr_warn_ratelimited("%s: copy tags or post CQE failure, move back " + "tags(%d %zu) ret %d\n", __func__, res, len, + ret); +} + #define MAX_NR_TAG 128 static int __ublk_batch_dispatch(struct ublk_queue *ubq, const struct ublk_batch_io_data *data, @@ -1848,37 +1883,8 @@ static int __ublk_batch_dispatch(struct ublk_queue *ubq, sel.val = ublk_batch_copy_io_tags(fcmd, sel.addr, tag_buf, len * tag_sz); ret = ublk_batch_fetch_post_cqe(fcmd, &sel, data->issue_flags); - if (unlikely(ret < 0)) { - int i, res; - - /* - * Undo prep state for all IOs since userspace never received them. - * This restores IOs to pre-prepared state so they can be cleanly - * re-prepared when tags are pulled from FIFO again. 
- */ - for (i = 0; i < len; i++) { - struct ublk_io *io = &ubq->ios[tag_buf[i]]; - int index = -1; - - ublk_io_lock(io); - if (io->flags & UBLK_IO_FLAG_AUTO_BUF_REG) - index = io->buf.auto_reg.index; - io->flags &= ~(UBLK_IO_FLAG_OWNED_BY_SRV | UBLK_IO_FLAG_AUTO_BUF_REG); - io->flags |= UBLK_IO_FLAG_ACTIVE; - ublk_io_unlock(io); - - if (index != -1) - io_buffer_unregister_bvec(data->cmd, index, - data->issue_flags); - } - - res = kfifo_in_spinlocked_noirqsave(&ubq->evts_fifo, - tag_buf, len, &ubq->evts_lock); - - pr_warn_ratelimited("%s: copy tags or post CQE failure, move back " - "tags(%d %zu) ret %d\n", __func__, res, len, - ret); - } + if (unlikely(ret < 0)) + ublk_batch_dispatch_fail(ubq, data, tag_buf, len, ret); return ret; } From 67807fbaf12719fca46a622d759484652b79c7c3 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Sun, 22 Mar 2026 03:35:10 +0100 Subject: [PATCH 049/146] block: fix bio_alloc_bioset slowpath GFP handling bio_alloc_bioset() first strips __GFP_DIRECT_RECLAIM from the optimistic fast allocation attempt with try_alloc_gfp(). If that fast path fails, the slowpath checks saved_gfp to decide whether blocking allocation is allowed, but then still calls mempool_alloc() with the stripped gfp mask. That can lead to a NULL bio pointer being passed into bio_init(). Fix the slowpath by using saved_gfp for the bio and bvec mempool allocations. 
Fixes: b520c4eef83d ("block: split bio_alloc_bioset more clearly into a fast and slowpath") Reported-by: syzbot+09ddb593eea76a158f42@syzkaller.appspotmail.com Signed-off-by: Vasily Gorbik Reviewed-by: Christoph Hellwig Link: https://patch.msgid.link/p01.gc6e9ad5845ad.ttca29g@ub.hpns Signed-off-by: Jens Axboe --- block/bio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/bio.c b/block/bio.c index 5057047194c4..77067fa346d3 100644 --- a/block/bio.c +++ b/block/bio.c @@ -581,11 +581,11 @@ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, */ opf &= ~REQ_ALLOC_CACHE; - p = mempool_alloc(&bs->bio_pool, gfp); + p = mempool_alloc(&bs->bio_pool, saved_gfp); bio = p + bs->front_pad; if (nr_vecs > BIO_INLINE_VECS) { nr_vecs = BIO_MAX_VECS; - bvecs = mempool_alloc(&bs->bvec_pool, gfp); + bvecs = mempool_alloc(&bs->bvec_pool, saved_gfp); } } From eff8d1656e83d186fdf9dd3ad0f229088440e4c6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Mar 2026 08:11:49 +0100 Subject: [PATCH 050/146] zloop: refactor zloop_rw Split out two helper functions to make the function more readable and to avoid conditional locking. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Link: https://patch.msgid.link/20260323071156.2940772-2-hch@lst.de Signed-off-by: Jens Axboe --- drivers/block/zloop.c | 240 ++++++++++++++++++++++-------------------- 1 file changed, 124 insertions(+), 116 deletions(-) diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c index 51c043342127..8ca37ca1935a 100644 --- a/drivers/block/zloop.c +++ b/drivers/block/zloop.c @@ -378,125 +378,22 @@ static void zloop_rw_complete(struct kiocb *iocb, long ret) zloop_put_cmd(cmd); } -static void zloop_rw(struct zloop_cmd *cmd) +static int zloop_do_rw(struct zloop_cmd *cmd) { struct request *rq = blk_mq_rq_from_pdu(cmd); + int rw = req_op(rq) == REQ_OP_READ ? 
ITER_DEST : ITER_SOURCE; + unsigned int nr_bvec = blk_rq_nr_bvec(rq); struct zloop_device *zlo = rq->q->queuedata; - unsigned int zone_no = rq_zone_no(rq); - sector_t sector = blk_rq_pos(rq); - sector_t nr_sectors = blk_rq_sectors(rq); - bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND; - bool is_write = req_op(rq) == REQ_OP_WRITE || is_append; - int rw = is_write ? ITER_SOURCE : ITER_DEST; + struct zloop_zone *zone = &zlo->zones[rq_zone_no(rq)]; struct req_iterator rq_iter; - struct zloop_zone *zone; struct iov_iter iter; - struct bio_vec tmp; - unsigned long flags; - sector_t zone_end; - unsigned int nr_bvec; - int ret; - - atomic_set(&cmd->ref, 2); - cmd->sector = sector; - cmd->nr_sectors = nr_sectors; - cmd->ret = 0; - - if (WARN_ON_ONCE(is_append && !zlo->zone_append)) { - ret = -EIO; - goto out; - } - - /* We should never get an I/O beyond the device capacity. */ - if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) { - ret = -EIO; - goto out; - } - zone = &zlo->zones[zone_no]; - zone_end = zone->start + zlo->zone_capacity; - - /* - * The block layer should never send requests that are not fully - * contained within the zone. - */ - if (WARN_ON_ONCE(sector + nr_sectors > zone->start + zlo->zone_size)) { - ret = -EIO; - goto out; - } - - if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) { - mutex_lock(&zone->lock); - ret = zloop_update_seq_zone(zlo, zone_no); - mutex_unlock(&zone->lock); - if (ret) - goto out; - } - - if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) { - mutex_lock(&zone->lock); - - spin_lock_irqsave(&zone->wp_lock, flags); - - /* - * Zone append operations always go at the current write - * pointer, but regular write operations must already be - * aligned to the write pointer when submitted. - */ - if (is_append) { - /* - * If ordered zone append is in use, we already checked - * and set the target sector in zloop_queue_rq(). 
- */ - if (!zlo->ordered_zone_append) { - if (zone->cond == BLK_ZONE_COND_FULL || - zone->wp + nr_sectors > zone_end) { - spin_unlock_irqrestore(&zone->wp_lock, - flags); - ret = -EIO; - goto unlock; - } - sector = zone->wp; - } - cmd->sector = sector; - } else if (sector != zone->wp) { - spin_unlock_irqrestore(&zone->wp_lock, flags); - pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n", - zone_no, sector, zone->wp); - ret = -EIO; - goto unlock; - } - - /* Implicitly open the target zone. */ - if (zone->cond == BLK_ZONE_COND_CLOSED || - zone->cond == BLK_ZONE_COND_EMPTY) - zone->cond = BLK_ZONE_COND_IMP_OPEN; - - /* - * Advance the write pointer, unless ordered zone append is in - * use. If the write fails, the write pointer position will be - * corrected when the next I/O starts execution. - */ - if (!is_append || !zlo->ordered_zone_append) { - zone->wp += nr_sectors; - if (zone->wp == zone_end) { - zone->cond = BLK_ZONE_COND_FULL; - zone->wp = ULLONG_MAX; - } - } - - spin_unlock_irqrestore(&zone->wp_lock, flags); - } - - nr_bvec = blk_rq_nr_bvec(rq); if (rq->bio != rq->biotail) { - struct bio_vec *bvec; + struct bio_vec tmp, *bvec; cmd->bvec = kmalloc_objs(*cmd->bvec, nr_bvec, GFP_NOIO); - if (!cmd->bvec) { - ret = -EIO; - goto unlock; - } + if (!cmd->bvec) + return -EIO; /* * The bios of the request may be started from the middle of @@ -522,7 +419,7 @@ static void zloop_rw(struct zloop_cmd *cmd) iter.iov_offset = rq->bio->bi_iter.bi_bvec_done; } - cmd->iocb.ki_pos = (sector - zone->start) << SECTOR_SHIFT; + cmd->iocb.ki_pos = (cmd->sector - zone->start) << SECTOR_SHIFT; cmd->iocb.ki_filp = zone->file; cmd->iocb.ki_complete = zloop_rw_complete; if (!zlo->buffered_io) @@ -530,12 +427,123 @@ static void zloop_rw(struct zloop_cmd *cmd) cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); if (rw == ITER_SOURCE) - ret = zone->file->f_op->write_iter(&cmd->iocb, &iter); - else - ret = zone->file->f_op->read_iter(&cmd->iocb, &iter); -unlock: - if 
(!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) + return zone->file->f_op->write_iter(&cmd->iocb, &iter); + return zone->file->f_op->read_iter(&cmd->iocb, &iter); +} + +static int zloop_seq_write_prep(struct zloop_cmd *cmd) +{ + struct request *rq = blk_mq_rq_from_pdu(cmd); + struct zloop_device *zlo = rq->q->queuedata; + unsigned int zone_no = rq_zone_no(rq); + sector_t nr_sectors = blk_rq_sectors(rq); + bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND; + struct zloop_zone *zone = &zlo->zones[zone_no]; + sector_t zone_end = zone->start + zlo->zone_capacity; + unsigned long flags; + int ret = 0; + + spin_lock_irqsave(&zone->wp_lock, flags); + + /* + * Zone append operations always go at the current write pointer, but + * regular write operations must already be aligned to the write pointer + * when submitted. + */ + if (is_append) { + /* + * If ordered zone append is in use, we already checked and set + * the target sector in zloop_queue_rq(). + */ + if (!zlo->ordered_zone_append) { + if (zone->cond == BLK_ZONE_COND_FULL || + zone->wp + nr_sectors > zone_end) { + ret = -EIO; + goto out_unlock; + } + cmd->sector = zone->wp; + } + } else { + if (cmd->sector != zone->wp) { + pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n", + zone_no, cmd->sector, zone->wp); + ret = -EIO; + goto out_unlock; + } + } + + /* Implicitly open the target zone. */ + if (zone->cond == BLK_ZONE_COND_CLOSED || + zone->cond == BLK_ZONE_COND_EMPTY) + zone->cond = BLK_ZONE_COND_IMP_OPEN; + + /* + * Advance the write pointer, unless ordered zone append is in use. If + * the write fails, the write pointer position will be corrected when + * the next I/O starts execution. 
+ */ + if (!is_append || !zlo->ordered_zone_append) { + zone->wp += nr_sectors; + if (zone->wp == zone_end) { + zone->cond = BLK_ZONE_COND_FULL; + zone->wp = ULLONG_MAX; + } + } +out_unlock: + spin_unlock_irqrestore(&zone->wp_lock, flags); + return ret; +} + +static void zloop_rw(struct zloop_cmd *cmd) +{ + struct request *rq = blk_mq_rq_from_pdu(cmd); + struct zloop_device *zlo = rq->q->queuedata; + unsigned int zone_no = rq_zone_no(rq); + sector_t nr_sectors = blk_rq_sectors(rq); + bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND; + bool is_write = req_op(rq) == REQ_OP_WRITE || is_append; + struct zloop_zone *zone; + int ret = -EIO; + + atomic_set(&cmd->ref, 2); + cmd->sector = blk_rq_pos(rq); + cmd->nr_sectors = nr_sectors; + cmd->ret = 0; + + if (WARN_ON_ONCE(is_append && !zlo->zone_append)) + goto out; + + /* We should never get an I/O beyond the device capacity. */ + if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) + goto out; + + zone = &zlo->zones[zone_no]; + + /* + * The block layer should never send requests that are not fully + * contained within the zone. 
+ */ + if (WARN_ON_ONCE(cmd->sector + nr_sectors > + zone->start + zlo->zone_size)) + goto out; + + if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) { + mutex_lock(&zone->lock); + ret = zloop_update_seq_zone(zlo, zone_no); mutex_unlock(&zone->lock); + if (ret) + goto out; + } + + if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) { + mutex_lock(&zone->lock); + ret = zloop_seq_write_prep(cmd); + if (!ret) + ret = zloop_do_rw(cmd); + mutex_unlock(&zone->lock); + } else { + ret = zloop_do_rw(cmd); + } out: if (ret != -EIOCBQUEUED) zloop_rw_complete(&cmd->iocb, ret); From 829def1e35ca3a6ef07d53d47089ef7cff0fd127 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Mar 2026 08:11:50 +0100 Subject: [PATCH 051/146] zloop: forget write cache on force removal Add a new options that causes zloop to truncate the zone files to the write pointer value recorded at the last cache flush to simulate unclean shutdowns. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Link: https://patch.msgid.link/20260323071156.2940772-3-hch@lst.de Signed-off-by: Jens Axboe --- .../admin-guide/blockdev/zoned_loop.rst | 5 + drivers/block/zloop.c | 97 +++++++++++++++++++ 2 files changed, 102 insertions(+) diff --git a/Documentation/admin-guide/blockdev/zoned_loop.rst b/Documentation/admin-guide/blockdev/zoned_loop.rst index 6aa865424ac3..a01f857b36ad 100644 --- a/Documentation/admin-guide/blockdev/zoned_loop.rst +++ b/Documentation/admin-guide/blockdev/zoned_loop.rst @@ -104,6 +104,11 @@ ordered_zone_append Enable zloop mitigation of zone append reordering. (extents), as when enabled, this can significantly reduce the number of data extents needed to for a file data mapping. +discard_write_cache Discard all data that was not explicitly persisted using a + flush operation when the device is removed by truncating + each zone file to the size recorded during the last flush + operation. 
This simulates power fail events where + uncommitted data is lost. =================== ========================================================= 3) Deleting a Zoned Device diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c index 8ca37ca1935a..86a1324c27b3 100644 --- a/drivers/block/zloop.c +++ b/drivers/block/zloop.c @@ -17,6 +17,7 @@ #include #include #include +#include /* * Options for adding (and removing) a device. @@ -34,6 +35,7 @@ enum { ZLOOP_OPT_BUFFERED_IO = (1 << 8), ZLOOP_OPT_ZONE_APPEND = (1 << 9), ZLOOP_OPT_ORDERED_ZONE_APPEND = (1 << 10), + ZLOOP_OPT_DISCARD_WRITE_CACHE = (1 << 11), }; static const match_table_t zloop_opt_tokens = { @@ -48,6 +50,7 @@ static const match_table_t zloop_opt_tokens = { { ZLOOP_OPT_BUFFERED_IO, "buffered_io" }, { ZLOOP_OPT_ZONE_APPEND, "zone_append=%u" }, { ZLOOP_OPT_ORDERED_ZONE_APPEND, "ordered_zone_append" }, + { ZLOOP_OPT_DISCARD_WRITE_CACHE, "discard_write_cache" }, { ZLOOP_OPT_ERR, NULL } }; @@ -79,6 +82,7 @@ struct zloop_options { bool buffered_io; bool zone_append; bool ordered_zone_append; + bool discard_write_cache; }; /* @@ -119,6 +123,7 @@ struct zloop_device { bool buffered_io; bool zone_append; bool ordered_zone_append; + bool discard_write_cache; const char *base_dir; struct file *data_dir; @@ -550,6 +555,41 @@ out: zloop_put_cmd(cmd); } +static inline bool zloop_zone_is_active(struct zloop_zone *zone) +{ + switch (zone->cond) { + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_CLOSED: + return true; + default: + return false; + } +} + +static int zloop_record_safe_wps(struct zloop_device *zlo) +{ + unsigned int i; + int ret; + + for (i = 0; i < zlo->nr_zones; i++) { + struct zloop_zone *zone = &zlo->zones[i]; + struct file *file = zone->file; + + if (!zloop_zone_is_active(zone)) + continue; + ret = vfs_setxattr(file_mnt_idmap(file), file_dentry(file), + "user.zloop.wp", &zone->wp, sizeof(zone->wp), 0); + if (ret) { + pr_err("%pg: failed to record write pointer 
(%d)\n", + zlo->disk->part0, ret); + return ret; + } + } + + return 0; +} + /* * Sync the entire FS containing the zone files instead of walking all files. */ @@ -558,6 +598,12 @@ static int zloop_flush(struct zloop_device *zlo) struct super_block *sb = file_inode(zlo->data_dir)->i_sb; int ret; + if (zlo->discard_write_cache) { + ret = zloop_record_safe_wps(zlo); + if (ret) + return ret; + } + down_read(&sb->s_umount); ret = sync_filesystem(sb); up_read(&sb->s_umount); @@ -1054,6 +1100,7 @@ static int zloop_ctl_add(struct zloop_options *opts) zlo->zone_append = opts->zone_append; if (zlo->zone_append) zlo->ordered_zone_append = opts->ordered_zone_append; + zlo->discard_write_cache = opts->discard_write_cache; zlo->workqueue = alloc_workqueue("zloop%d", WQ_UNBOUND | WQ_FREEZABLE, opts->nr_queues * opts->queue_depth, zlo->id); @@ -1176,6 +1223,49 @@ out: return ret; } +static void zloop_truncate(struct file *file, loff_t pos) +{ + struct mnt_idmap *idmap = file_mnt_idmap(file); + struct dentry *dentry = file_dentry(file); + struct iattr newattrs; + + newattrs.ia_size = pos; + newattrs.ia_valid = ATTR_SIZE; + + inode_lock(dentry->d_inode); + notify_change(idmap, dentry, &newattrs, NULL); + inode_unlock(dentry->d_inode); +} + +static void zloop_forget_cache(struct zloop_device *zlo) +{ + unsigned int i; + int ret; + + pr_info("%pg: discarding volatile write cache\n", zlo->disk->part0); + + for (i = 0; i < zlo->nr_zones; i++) { + struct zloop_zone *zone = &zlo->zones[i]; + struct file *file = zone->file; + sector_t old_wp; + + if (!zloop_zone_is_active(zone)) + continue; + + ret = vfs_getxattr(file_mnt_idmap(file), file_dentry(file), + "user.zloop.wp", &old_wp, sizeof(old_wp)); + if (ret == -ENODATA) { + old_wp = 0; + } else if (ret != sizeof(old_wp)) { + pr_err("%pg: failed to retrieve write pointer (%d)\n", + zlo->disk->part0, ret); + continue; + } + if (old_wp < zone->wp) + zloop_truncate(file, old_wp); + } +} + static int zloop_ctl_remove(struct zloop_options *opts) 
{ struct zloop_device *zlo; @@ -1210,6 +1300,10 @@ static int zloop_ctl_remove(struct zloop_options *opts) return ret; del_gendisk(zlo->disk); + + if (zlo->discard_write_cache) + zloop_forget_cache(zlo); + put_disk(zlo->disk); pr_info("Removed device %d\n", opts->id); @@ -1361,6 +1455,9 @@ static int zloop_parse_options(struct zloop_options *opts, const char *buf) case ZLOOP_OPT_ORDERED_ZONE_APPEND: opts->ordered_zone_append = true; break; + case ZLOOP_OPT_DISCARD_WRITE_CACHE: + opts->discard_write_cache = true; + break; case ZLOOP_OPT_ERR: default: pr_warn("unknown parameter or missing value '%s'\n", p); From 630bbba45cfd3e4f9247cefd3e2cdc03fe40421b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=B6hmwalder?= Date: Tue, 24 Mar 2026 16:29:07 +0100 Subject: [PATCH 052/146] drbd: use genl pre_doit/post_doit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every doit handler followed the same pattern: stack-allocate an adm_ctx, call drbd_adm_prepare() at the top, call drbd_adm_finish() at the bottom. This duplicated boilerplate across 25 handlers and made error paths inconsistent, since some handlers could miss sending the reply skb on early-exit paths. The generic netlink framework already provides pre_doit/post_doit hooks for exactly this purpose. An old comment even noted "this would be a good candidate for a pre_doit hook". Use them: - pre_doit heap-allocates adm_ctx, looks up per-command flags from a new drbd_genl_cmd_flags[] table, runs drbd_adm_prepare(), and stores the context in info->user_ptr[0]. - post_doit sends the reply, drops kref references for device/connection/resource, and frees the adm_ctx. - Handlers just receive adm_ctx from info->user_ptr[0], set reply_dh->ret_code, and return. All teardown is in post_doit. - drbd_adm_finish() is removed, superseded by post_doit. 
Signed-off-by: Christoph Böhmwalder Link: https://patch.msgid.link/20260324152907.2840984-1-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_nl.c | 573 ++++++++++++++++++-------------- include/linux/genl_magic_func.h | 4 + 2 files changed, 324 insertions(+), 253 deletions(-) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index e201f0087a0f..e18fa260a662 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -75,6 +75,15 @@ int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb) #include #include "drbd_nla.h" + +static int drbd_pre_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); +static void drbd_post_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); + +#define GENL_MAGIC_FAMILY_PRE_DOIT drbd_pre_doit +#define GENL_MAGIC_FAMILY_POST_DOIT drbd_post_doit + #include static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */ @@ -144,18 +153,46 @@ static int drbd_msg_sprintf_info(struct sk_buff *skb, const char *fmt, ...) return 0; } -/* This would be a good candidate for a "pre_doit" hook, - * and per-family private info->pointers. - * But we need to stay compatible with older kernels. - * If it returns successfully, adm_ctx members are valid. 
- * +/* Flags for drbd_adm_prepare() */ +#define DRBD_ADM_NEED_MINOR (1 << 0) +#define DRBD_ADM_NEED_RESOURCE (1 << 1) +#define DRBD_ADM_NEED_CONNECTION (1 << 2) + +/* Per-command flags for drbd_pre_doit() */ +static const unsigned int drbd_genl_cmd_flags[] = { + [DRBD_ADM_GET_STATUS] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_NEW_MINOR] = DRBD_ADM_NEED_RESOURCE, + [DRBD_ADM_DEL_MINOR] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_NEW_RESOURCE] = 0, + [DRBD_ADM_DEL_RESOURCE] = DRBD_ADM_NEED_RESOURCE, + [DRBD_ADM_RESOURCE_OPTS] = DRBD_ADM_NEED_RESOURCE, + [DRBD_ADM_CONNECT] = DRBD_ADM_NEED_RESOURCE, + [DRBD_ADM_CHG_NET_OPTS] = DRBD_ADM_NEED_CONNECTION, + [DRBD_ADM_DISCONNECT] = DRBD_ADM_NEED_CONNECTION, + [DRBD_ADM_ATTACH] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_CHG_DISK_OPTS] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_RESIZE] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_PRIMARY] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_SECONDARY] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_NEW_C_UUID] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_START_OV] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_DETACH] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_INVALIDATE] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_INVAL_PEER] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_PAUSE_SYNC] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_RESUME_SYNC] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_SUSPEND_IO] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_RESUME_IO] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_OUTDATE] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_GET_TIMEOUT_TYPE] = DRBD_ADM_NEED_MINOR, + [DRBD_ADM_DOWN] = DRBD_ADM_NEED_RESOURCE, +}; + +/* * At this point, we still rely on the global genl_lock(). * If we want to avoid that, and allow "genl_family.parallel_ops", we may need * to add additional synchronization against object destruction/modification. 
*/ -#define DRBD_ADM_NEED_MINOR 1 -#define DRBD_ADM_NEED_RESOURCE 2 -#define DRBD_ADM_NEED_CONNECTION 4 static int drbd_adm_prepare(struct drbd_config_context *adm_ctx, struct sk_buff *skb, struct genl_info *info, unsigned flags) { @@ -163,8 +200,6 @@ static int drbd_adm_prepare(struct drbd_config_context *adm_ctx, const u8 cmd = info->genlhdr->cmd; int err; - memset(adm_ctx, 0, sizeof(*adm_ctx)); - /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */ if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN)) return -EPERM; @@ -300,9 +335,45 @@ fail: return err; } -static int drbd_adm_finish(struct drbd_config_context *adm_ctx, - struct genl_info *info, int retcode) +static int drbd_pre_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) { + struct drbd_config_context *adm_ctx; + u8 cmd = info->genlhdr->cmd; + unsigned int flags; + int err; + + adm_ctx = kzalloc_obj(*adm_ctx); + if (!adm_ctx) + return -ENOMEM; + + flags = (cmd < ARRAY_SIZE(drbd_genl_cmd_flags)) + ? drbd_genl_cmd_flags[cmd] : 0; + + err = drbd_adm_prepare(adm_ctx, skb, info, flags); + if (err && !adm_ctx->reply_skb) { + /* Fatal error before reply_skb was allocated. 
*/ + kfree(adm_ctx); + return err; + } + if (err) + adm_ctx->reply_dh->ret_code = err; + + info->user_ptr[0] = adm_ctx; + return 0; +} + +static void drbd_post_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + struct drbd_config_context *adm_ctx = info->user_ptr[0]; + + if (!adm_ctx) + return; + + if (adm_ctx->reply_skb) + drbd_adm_send_reply(adm_ctx->reply_skb, info); + if (adm_ctx->device) { kref_put(&adm_ctx->device->kref, drbd_destroy_device); adm_ctx->device = NULL; @@ -316,12 +387,7 @@ static int drbd_adm_finish(struct drbd_config_context *adm_ctx, adm_ctx->resource = NULL; } - if (!adm_ctx->reply_skb) - return -ENOMEM; - - adm_ctx->reply_dh->ret_code = retcode; - drbd_adm_send_reply(adm_ctx->reply_skb, info); - return 0; + kfree(adm_ctx); } static void setup_khelper_env(struct drbd_connection *connection, char **envp) @@ -766,15 +832,15 @@ static const char *from_attrs_err_to_txt(int err) int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; struct set_role_parms parms; int err; enum drbd_ret_code retcode; enum drbd_state_rv rv; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; @@ -783,24 +849,24 @@ int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info) err = set_role_parms_from_attrs(&parms, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx->reply_skb, from_attrs_err_to_txt(err)); goto out; } } genl_unlock(); - mutex_lock(&adm_ctx.resource->adm_mutex); + mutex_lock(&adm_ctx->resource->adm_mutex); if (info->genlhdr->cmd == DRBD_ADM_PRIMARY) - rv = drbd_set_role(adm_ctx.device, R_PRIMARY, parms.assume_uptodate); + rv = 
drbd_set_role(adm_ctx->device, R_PRIMARY, parms.assume_uptodate); else - rv = drbd_set_role(adm_ctx.device, R_SECONDARY, 0); + rv = drbd_set_role(adm_ctx->device, R_SECONDARY, 0); - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); genl_lock(); - drbd_adm_finish(&adm_ctx, info, rv); + adm_ctx->reply_dh->ret_code = rv; return 0; out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -1512,7 +1578,7 @@ out: int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; enum drbd_ret_code retcode; struct drbd_device *device; struct disk_conf *new_disk_conf, *old_disk_conf; @@ -1520,14 +1586,14 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) int err; unsigned int fifo_size; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto finish; - device = adm_ctx.device; - mutex_lock(&adm_ctx.resource->adm_mutex); + device = adm_ctx->device; + mutex_lock(&adm_ctx->resource->adm_mutex); /* we also need a disk * to change the options on */ @@ -1551,7 +1617,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) err = disk_conf_from_attrs_for_change(new_disk_conf, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx->reply_skb, from_attrs_err_to_txt(err)); goto fail_unlock; } @@ -1577,7 +1643,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) if (err) { /* Could be just "busy". Ignore? * Introduce dedicated error code? 
*/ - drbd_msg_put_info(adm_ctx.reply_skb, + drbd_msg_put_info(adm_ctx->reply_skb, "Try again without changing current al-extents setting"); retcode = ERR_NOMEM; goto fail_unlock; @@ -1640,9 +1706,9 @@ fail_unlock: success: put_ldev(device); out: - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); finish: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -1734,7 +1800,7 @@ void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev * int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; struct drbd_device *device; struct drbd_peer_device *peer_device; struct drbd_connection *connection; @@ -1751,14 +1817,14 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) enum drbd_state_rv rv; struct net_conf *nc; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto finish; - device = adm_ctx.device; - mutex_lock(&adm_ctx.resource->adm_mutex); + device = adm_ctx->device; + mutex_lock(&adm_ctx->resource->adm_mutex); peer_device = first_peer_device(device); connection = peer_device->connection; conn_reconfig_start(connection); @@ -1803,7 +1869,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) err = disk_conf_from_attrs(new_disk_conf, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx->reply_skb, from_attrs_err_to_txt(err)); goto fail; } @@ -1954,7 +2020,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) drbd_warn(device, "truncating a consistent device during attach (%llu < %llu)\n", nsz, eff); } else { drbd_warn(device, "refusing to truncate a 
consistent device (%llu < %llu)\n", nsz, eff); - drbd_msg_sprintf_info(adm_ctx.reply_skb, + drbd_msg_sprintf_info(adm_ctx->reply_skb, "To-be-attached device has last effective > current size, and is consistent\n" "(%llu > %llu sectors). Refusing to attach.", eff, nsz); retcode = ERR_IMPLICIT_SHRINK; @@ -2130,8 +2196,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE); put_ldev(device); conn_reconfig_done(connection); - mutex_unlock(&adm_ctx.resource->adm_mutex); - drbd_adm_finish(&adm_ctx, info, retcode); + mutex_unlock(&adm_ctx->resource->adm_mutex); + adm_ctx->reply_dh->ret_code = retcode; return 0; force_diskless_dec: @@ -2150,9 +2216,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) kfree(new_disk_conf); lc_destroy(resync_lru); kfree(new_plan); - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); finish: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -2174,14 +2240,14 @@ static int adm_detach(struct drbd_device *device, int force) * Only then we have finally detached. 
*/ int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; enum drbd_ret_code retcode; struct detach_parms parms = { }; int err; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; @@ -2189,16 +2255,16 @@ int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info) err = detach_parms_from_attrs(&parms, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx->reply_skb, from_attrs_err_to_txt(err)); goto out; } } - mutex_lock(&adm_ctx.resource->adm_mutex); - retcode = adm_detach(adm_ctx.device, parms.force_detach); - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_lock(&adm_ctx->resource->adm_mutex); + retcode = adm_detach(adm_ctx->device, parms.force_detach); + mutex_unlock(&adm_ctx->resource->adm_mutex); out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -2372,7 +2438,7 @@ static void free_crypto(struct crypto *crypto) int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; enum drbd_ret_code retcode; struct drbd_connection *connection; struct net_conf *old_net_conf, *new_net_conf = NULL; @@ -2381,14 +2447,14 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) int rsr; /* re-sync running */ struct crypto crypto = { }; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto finish; - connection = adm_ctx.connection; - 
mutex_lock(&adm_ctx.resource->adm_mutex); + connection = adm_ctx->connection; + mutex_lock(&adm_ctx->resource->adm_mutex); new_net_conf = kzalloc_obj(struct net_conf); if (!new_net_conf) { @@ -2403,7 +2469,7 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) old_net_conf = connection->net_conf; if (!old_net_conf) { - drbd_msg_put_info(adm_ctx.reply_skb, "net conf missing, try connect"); + drbd_msg_put_info(adm_ctx->reply_skb, "net conf missing, try connect"); retcode = ERR_INVALID_REQUEST; goto fail; } @@ -2415,7 +2481,7 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) err = net_conf_from_attrs_for_change(new_net_conf, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx->reply_skb, from_attrs_err_to_txt(err)); goto fail; } @@ -2485,9 +2551,9 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) done: conn_reconfig_done(connection); out: - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); finish: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -2516,7 +2582,7 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) struct connection_info connection_info; enum drbd_notification_type flags; unsigned int peer_devices = 0; - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; struct drbd_peer_device *peer_device; struct net_conf *old_net_conf, *new_net_conf = NULL; struct crypto crypto = { }; @@ -2527,14 +2593,13 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) int i; int err; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); - - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; - if (!(adm_ctx.my_addr && 
adm_ctx.peer_addr)) { - drbd_msg_put_info(adm_ctx.reply_skb, "connection endpoint(s) missing"); + if (!(adm_ctx->my_addr && adm_ctx->peer_addr)) { + drbd_msg_put_info(adm_ctx->reply_skb, "connection endpoint(s) missing"); retcode = ERR_INVALID_REQUEST; goto out; } @@ -2544,15 +2609,15 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) * concurrent reconfiguration/addition/deletion */ for_each_resource(resource, &drbd_resources) { for_each_connection(connection, resource) { - if (nla_len(adm_ctx.my_addr) == connection->my_addr_len && - !memcmp(nla_data(adm_ctx.my_addr), &connection->my_addr, + if (nla_len(adm_ctx->my_addr) == connection->my_addr_len && + !memcmp(nla_data(adm_ctx->my_addr), &connection->my_addr, connection->my_addr_len)) { retcode = ERR_LOCAL_ADDR; goto out; } - if (nla_len(adm_ctx.peer_addr) == connection->peer_addr_len && - !memcmp(nla_data(adm_ctx.peer_addr), &connection->peer_addr, + if (nla_len(adm_ctx->peer_addr) == connection->peer_addr_len && + !memcmp(nla_data(adm_ctx->peer_addr), &connection->peer_addr, connection->peer_addr_len)) { retcode = ERR_PEER_ADDR; goto out; @@ -2560,8 +2625,8 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) } } - mutex_lock(&adm_ctx.resource->adm_mutex); - connection = first_connection(adm_ctx.resource); + mutex_lock(&adm_ctx->resource->adm_mutex); + connection = first_connection(adm_ctx->resource); conn_reconfig_start(connection); if (connection->cstate > C_STANDALONE) { @@ -2581,7 +2646,7 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) err = net_conf_from_attrs(new_net_conf, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx->reply_skb, from_attrs_err_to_txt(err)); goto fail; } @@ -2597,11 +2662,11 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) drbd_flush_workqueue(&connection->sender_work); - 
mutex_lock(&adm_ctx.resource->conf_update); + mutex_lock(&adm_ctx->resource->conf_update); old_net_conf = connection->net_conf; if (old_net_conf) { retcode = ERR_NET_CONFIGURED; - mutex_unlock(&adm_ctx.resource->conf_update); + mutex_unlock(&adm_ctx->resource->conf_update); goto fail; } rcu_assign_pointer(connection->net_conf, new_net_conf); @@ -2612,10 +2677,10 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) connection->csums_tfm = crypto.csums_tfm; connection->verify_tfm = crypto.verify_tfm; - connection->my_addr_len = nla_len(adm_ctx.my_addr); - memcpy(&connection->my_addr, nla_data(adm_ctx.my_addr), connection->my_addr_len); - connection->peer_addr_len = nla_len(adm_ctx.peer_addr); - memcpy(&connection->peer_addr, nla_data(adm_ctx.peer_addr), connection->peer_addr_len); + connection->my_addr_len = nla_len(adm_ctx->my_addr); + memcpy(&connection->my_addr, nla_data(adm_ctx->my_addr), connection->my_addr_len); + connection->peer_addr_len = nla_len(adm_ctx->peer_addr); + memcpy(&connection->peer_addr, nla_data(adm_ctx->peer_addr), connection->peer_addr_len); idr_for_each_entry(&connection->peer_devices, peer_device, i) { peer_devices++; @@ -2633,7 +2698,7 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) notify_peer_device_state(NULL, 0, peer_device, &peer_device_info, NOTIFY_CREATE | flags); } mutex_unlock(¬ification_mutex); - mutex_unlock(&adm_ctx.resource->conf_update); + mutex_unlock(&adm_ctx->resource->conf_update); rcu_read_lock(); idr_for_each_entry(&connection->peer_devices, peer_device, i) { @@ -2646,8 +2711,8 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) rv = conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE); conn_reconfig_done(connection); - mutex_unlock(&adm_ctx.resource->adm_mutex); - drbd_adm_finish(&adm_ctx, info, rv); + mutex_unlock(&adm_ctx->resource->adm_mutex); + adm_ctx->reply_dh->ret_code = rv; return 0; fail: @@ -2655,9 +2720,9 @@ fail: kfree(new_net_conf); 
conn_reconfig_done(connection); - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -2729,40 +2794,40 @@ repeat: int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; struct disconnect_parms parms; struct drbd_connection *connection; enum drbd_state_rv rv; enum drbd_ret_code retcode; int err; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto fail; - connection = adm_ctx.connection; + connection = adm_ctx->connection; memset(&parms, 0, sizeof(parms)); if (info->attrs[DRBD_NLA_DISCONNECT_PARMS]) { err = disconnect_parms_from_attrs(&parms, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx->reply_skb, from_attrs_err_to_txt(err)); goto fail; } } - mutex_lock(&adm_ctx.resource->adm_mutex); + mutex_lock(&adm_ctx->resource->adm_mutex); rv = conn_try_disconnect(connection, parms.force_disconnect); - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); if (rv < SS_SUCCESS) { - drbd_adm_finish(&adm_ctx, info, rv); + adm_ctx->reply_dh->ret_code = rv; return 0; } retcode = NO_ERROR; fail: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -2784,7 +2849,7 @@ void resync_after_online_grow(struct drbd_device *device) int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; struct disk_conf *old_disk_conf, *new_disk_conf = NULL; struct resize_parms rs; struct 
drbd_device *device; @@ -2795,14 +2860,14 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) sector_t u_size; int err; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto finish; - mutex_lock(&adm_ctx.resource->adm_mutex); - device = adm_ctx.device; + mutex_lock(&adm_ctx->resource->adm_mutex); + device = adm_ctx->device; if (!get_ldev(device)) { retcode = ERR_NO_DISK; goto fail; @@ -2815,7 +2880,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) err = resize_parms_from_attrs(&rs, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx->reply_skb, from_attrs_err_to_txt(err)); goto fail_ldev; } } @@ -2907,9 +2972,9 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) } fail: - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); finish: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; fail_ldev: @@ -2920,61 +2985,61 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; enum drbd_ret_code retcode; struct res_opts res_opts; int err; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto fail; - res_opts = adm_ctx.resource->res_opts; + res_opts = adm_ctx->resource->res_opts; if (should_set_defaults(info)) set_res_opts_defaults(&res_opts); err = res_opts_from_attrs(&res_opts, info); if (err && err != -ENOMSG) { retcode = 
ERR_MANDATORY_TAG; - drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx->reply_skb, from_attrs_err_to_txt(err)); goto fail; } - mutex_lock(&adm_ctx.resource->adm_mutex); - err = set_resource_options(adm_ctx.resource, &res_opts); + mutex_lock(&adm_ctx->resource->adm_mutex); + err = set_resource_options(adm_ctx->resource, &res_opts); if (err) { retcode = ERR_INVALID_REQUEST; if (err == -ENOMEM) retcode = ERR_NOMEM; } - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); fail: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; struct drbd_device *device; int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; - device = adm_ctx.device; + device = adm_ctx->device; if (!get_ldev(device)) { retcode = ERR_NO_DISK; goto out; } - mutex_lock(&adm_ctx.resource->adm_mutex); + mutex_lock(&adm_ctx->resource->adm_mutex); /* If there is still bitmap IO pending, probably because of a previous * resync just being finished, wait for it before requesting a new resync. 
@@ -2997,30 +3062,30 @@ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info) } else retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T)); drbd_resume_io(device); - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); put_ldev(device); out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info, union drbd_state mask, union drbd_state val) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; - mutex_lock(&adm_ctx.resource->adm_mutex); - retcode = drbd_request_state(adm_ctx.device, mask, val); - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_lock(&adm_ctx->resource->adm_mutex); + retcode = drbd_request_state(adm_ctx->device, mask, val); + mutex_unlock(&adm_ctx->resource->adm_mutex); out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -3036,23 +3101,23 @@ static int drbd_bmio_set_susp_al(struct drbd_device *device, int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; int retcode; /* drbd_ret_code, drbd_state_rv */ struct drbd_device *device; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; - device = adm_ctx.device; + device = adm_ctx->device; if (!get_ldev(device)) { retcode = ERR_NO_DISK; goto out; } - 
mutex_lock(&adm_ctx.resource->adm_mutex); + mutex_lock(&adm_ctx->resource->adm_mutex); /* If there is still bitmap IO pending, probably because of a previous * resync just being finished, wait for it before requesting a new resync. @@ -3078,48 +3143,48 @@ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info) } else retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S)); drbd_resume_io(device); - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); put_ldev(device); out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; - mutex_lock(&adm_ctx.resource->adm_mutex); - if (drbd_request_state(adm_ctx.device, NS(user_isp, 1)) == SS_NOTHING_TO_DO) + mutex_lock(&adm_ctx->resource->adm_mutex); + if (drbd_request_state(adm_ctx->device, NS(user_isp, 1)) == SS_NOTHING_TO_DO) retcode = ERR_PAUSE_IS_SET; - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; union drbd_dev_state s; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; - 
mutex_lock(&adm_ctx.resource->adm_mutex); - if (drbd_request_state(adm_ctx.device, NS(user_isp, 0)) == SS_NOTHING_TO_DO) { - s = adm_ctx.device->state; + mutex_lock(&adm_ctx->resource->adm_mutex); + if (drbd_request_state(adm_ctx->device, NS(user_isp, 0)) == SS_NOTHING_TO_DO) { + s = adm_ctx->device->state; if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) { retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP : s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR; @@ -3127,9 +3192,9 @@ int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info) retcode = ERR_PAUSE_IS_CLEAR; } } - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -3140,18 +3205,18 @@ int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info) int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; struct drbd_device *device; int retcode; /* enum drbd_ret_code rsp. 
enum drbd_state_rv */ - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; - mutex_lock(&adm_ctx.resource->adm_mutex); - device = adm_ctx.device; + mutex_lock(&adm_ctx->resource->adm_mutex); + device = adm_ctx->device; if (test_bit(NEW_CUR_UUID, &device->flags)) { if (get_ldev_if_state(device, D_ATTACHING)) { drbd_uuid_new_current(device); @@ -3188,9 +3253,9 @@ int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) tl_restart(first_peer_device(device)->connection, FAIL_FROZEN_DISK_IO); } drbd_resume_io(device); - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -3843,23 +3908,24 @@ nla_put_failure: int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; enum drbd_ret_code retcode; int err; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; - err = nla_put_status_info(adm_ctx.reply_skb, adm_ctx.device, NULL); + err = nla_put_status_info(adm_ctx->reply_skb, adm_ctx->device, NULL); if (err) { - nlmsg_free(adm_ctx.reply_skb); + nlmsg_free(adm_ctx->reply_skb); + adm_ctx->reply_skb = NULL; return err; } out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -4046,46 +4112,47 @@ dump: int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; enum drbd_ret_code retcode; struct timeout_parms 
tp; int err; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; tp.timeout_type = - adm_ctx.device->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED : - test_bit(USE_DEGR_WFC_T, &adm_ctx.device->flags) ? UT_DEGRADED : + adm_ctx->device->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED : + test_bit(USE_DEGR_WFC_T, &adm_ctx->device->flags) ? UT_DEGRADED : UT_DEFAULT; - err = timeout_parms_to_priv_skb(adm_ctx.reply_skb, &tp); + err = timeout_parms_to_priv_skb(adm_ctx->reply_skb, &tp); if (err) { - nlmsg_free(adm_ctx.reply_skb); + nlmsg_free(adm_ctx->reply_skb); + adm_ctx->reply_skb = NULL; return err; } out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; struct drbd_device *device; enum drbd_ret_code retcode; struct start_ov_parms parms; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; - device = adm_ctx.device; + device = adm_ctx->device; /* resume from last known position, if possible */ parms.ov_start_sector = device->ov_start_sector; @@ -4094,11 +4161,11 @@ int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) int err = start_ov_parms_from_attrs(&parms, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx->reply_skb, from_attrs_err_to_txt(err)); goto out; } } - mutex_lock(&adm_ctx.resource->adm_mutex); + mutex_lock(&adm_ctx->resource->adm_mutex); /* w_make_ov_request expects position to be aligned */ 
device->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1); @@ -4111,40 +4178,40 @@ int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info) retcode = drbd_request_state(device, NS(conn, C_VERIFY_S)); drbd_resume_io(device); - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; struct drbd_device *device; enum drbd_ret_code retcode; int skip_initial_sync = 0; int err; struct new_c_uuid_parms args; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out_nolock; - device = adm_ctx.device; + device = adm_ctx->device; memset(&args, 0, sizeof(args)); if (info->attrs[DRBD_NLA_NEW_C_UUID_PARMS]) { err = new_c_uuid_parms_from_attrs(&args, info); if (err) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx->reply_skb, from_attrs_err_to_txt(err)); goto out_nolock; } } - mutex_lock(&adm_ctx.resource->adm_mutex); + mutex_lock(&adm_ctx->resource->adm_mutex); mutex_lock(device->state_mutex); /* Protects us against serialized state changes. 
*/ if (!get_ldev(device)) { @@ -4189,9 +4256,9 @@ out_dec: put_ldev(device); out: mutex_unlock(device->state_mutex); - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); out_nolock: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -4224,14 +4291,14 @@ static void resource_to_info(struct resource_info *info, int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) { struct drbd_connection *connection; - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; enum drbd_ret_code retcode; struct res_opts res_opts; int err; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, 0); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; @@ -4239,18 +4306,18 @@ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) err = res_opts_from_attrs(&res_opts, info); if (err && err != -ENOMSG) { retcode = ERR_MANDATORY_TAG; - drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err)); + drbd_msg_put_info(adm_ctx->reply_skb, from_attrs_err_to_txt(err)); goto out; } - retcode = drbd_check_resource_name(&adm_ctx); + retcode = drbd_check_resource_name(adm_ctx); if (retcode != NO_ERROR) goto out; - if (adm_ctx.resource) { + if (adm_ctx->resource) { if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) { retcode = ERR_INVALID_REQUEST; - drbd_msg_put_info(adm_ctx.reply_skb, "resource exists"); + drbd_msg_put_info(adm_ctx->reply_skb, "resource exists"); } /* else: still NO_ERROR */ goto out; @@ -4258,7 +4325,7 @@ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) /* not yet safe for genl_family.parallel_ops */ mutex_lock(&resources_mutex); - connection = conn_create(adm_ctx.resource_name, &res_opts); + connection = conn_create(adm_ctx->resource_name, &res_opts); mutex_unlock(&resources_mutex); if (connection) { @@ 
-4273,7 +4340,7 @@ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) retcode = ERR_NOMEM; out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -4286,38 +4353,38 @@ static void device_to_info(struct device_info *info, int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; struct drbd_genlmsghdr *dh = genl_info_userhdr(info); enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; if (dh->minor > MINORMASK) { - drbd_msg_put_info(adm_ctx.reply_skb, "requested minor out of range"); + drbd_msg_put_info(adm_ctx->reply_skb, "requested minor out of range"); retcode = ERR_INVALID_REQUEST; goto out; } - if (adm_ctx.volume > DRBD_VOLUME_MAX) { - drbd_msg_put_info(adm_ctx.reply_skb, "requested volume id out of range"); + if (adm_ctx->volume > DRBD_VOLUME_MAX) { + drbd_msg_put_info(adm_ctx->reply_skb, "requested volume id out of range"); retcode = ERR_INVALID_REQUEST; goto out; } /* drbd_adm_prepare made sure already * that first_peer_device(device)->connection and device->vnr match the request. 
*/ - if (adm_ctx.device) { + if (adm_ctx->device) { if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) retcode = ERR_MINOR_OR_VOLUME_EXISTS; /* else: still NO_ERROR */ goto out; } - mutex_lock(&adm_ctx.resource->adm_mutex); - retcode = drbd_create_device(&adm_ctx, dh->minor); + mutex_lock(&adm_ctx->resource->adm_mutex); + retcode = drbd_create_device(adm_ctx, dh->minor); if (retcode == NO_ERROR) { struct drbd_device *device; struct drbd_peer_device *peer_device; @@ -4348,9 +4415,9 @@ int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) } mutex_unlock(&notification_mutex); } - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_unlock(&adm_ctx->resource->adm_mutex); out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -4393,20 +4460,20 @@ static enum drbd_ret_code adm_del_minor(struct drbd_device *device) int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto out; - mutex_lock(&adm_ctx.resource->adm_mutex); - retcode = adm_del_minor(adm_ctx.device); - mutex_unlock(&adm_ctx.resource->adm_mutex); + mutex_lock(&adm_ctx->resource->adm_mutex); + retcode = adm_del_minor(adm_ctx->device); + mutex_unlock(&adm_ctx->resource->adm_mutex); out: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } @@ -4442,20 +4509,20 @@ static int adm_del_resource(struct drbd_resource *resource) int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; struct drbd_resource *resource; struct drbd_connection *connection; struct 
drbd_device *device; int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */ unsigned i; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto finish; - resource = adm_ctx.resource; + resource = adm_ctx->resource; mutex_lock(&resource->adm_mutex); /* demote */ for_each_connection(connection, resource) { @@ -4464,14 +4531,14 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) idr_for_each_entry(&connection->peer_devices, peer_device, i) { retcode = drbd_set_role(peer_device->device, R_SECONDARY, 0); if (retcode < SS_SUCCESS) { - drbd_msg_put_info(adm_ctx.reply_skb, "failed to demote"); + drbd_msg_put_info(adm_ctx->reply_skb, "failed to demote"); goto out; } } retcode = conn_try_disconnect(connection, 0); if (retcode < SS_SUCCESS) { - drbd_msg_put_info(adm_ctx.reply_skb, "failed to disconnect"); + drbd_msg_put_info(adm_ctx->reply_skb, "failed to disconnect"); goto out; } } @@ -4480,7 +4547,7 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) idr_for_each_entry(&resource->devices, device, i) { retcode = adm_detach(device, 0); if (retcode < SS_SUCCESS || retcode > NO_ERROR) { - drbd_msg_put_info(adm_ctx.reply_skb, "failed to detach"); + drbd_msg_put_info(adm_ctx->reply_skb, "failed to detach"); goto out; } } @@ -4490,7 +4557,7 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) retcode = adm_del_minor(device); if (retcode != NO_ERROR) { /* "can not happen" */ - drbd_msg_put_info(adm_ctx.reply_skb, "failed to delete volume"); + drbd_msg_put_info(adm_ctx->reply_skb, "failed to delete volume"); goto out; } } @@ -4499,28 +4566,28 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) out: mutex_unlock(&resource->adm_mutex); finish: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } int 
drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) { - struct drbd_config_context adm_ctx; + struct drbd_config_context *adm_ctx = info->user_ptr[0]; struct drbd_resource *resource; enum drbd_ret_code retcode; - retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); - if (!adm_ctx.reply_skb) - return retcode; + if (!adm_ctx->reply_skb) + return 0; + retcode = adm_ctx->reply_dh->ret_code; if (retcode != NO_ERROR) goto finish; - resource = adm_ctx.resource; + resource = adm_ctx->resource; mutex_lock(&resource->adm_mutex); retcode = adm_del_resource(resource); mutex_unlock(&resource->adm_mutex); finish: - drbd_adm_finish(&adm_ctx, info, retcode); + adm_ctx->reply_dh->ret_code = retcode; return 0; } diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h index d4da060b7532..6edcac85155e 100644 --- a/include/linux/genl_magic_func.h +++ b/include/linux/genl_magic_func.h @@ -292,6 +292,10 @@ static struct genl_family ZZZ_genl_family __ro_after_init = { #endif .maxattr = ARRAY_SIZE(CONCATENATE(GENL_MAGIC_FAMILY, _tla_nl_policy))-1, .policy = CONCATENATE(GENL_MAGIC_FAMILY, _tla_nl_policy), +#ifdef GENL_MAGIC_FAMILY_PRE_DOIT + .pre_doit = GENL_MAGIC_FAMILY_PRE_DOIT, + .post_doit = GENL_MAGIC_FAMILY_POST_DOIT, +#endif .ops = ZZZ_genl_ops, .n_ops = ARRAY_SIZE(ZZZ_genl_ops), .mcgrps = ZZZ_genl_mcgrps, From 2b31e86387e60b3689339f0f0fbb4d3623d9d494 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 26 Mar 2026 14:40:54 -0700 Subject: [PATCH 053/146] drbd: Balance RCU calls in drbd_adm_dump_devices() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make drbd_adm_dump_devices() call rcu_read_lock() before rcu_read_unlock() is called. This has been detected by the Clang thread-safety analyzer. 
Tested-by: Christoph Böhmwalder Reviewed-by: Christoph Hellwig Cc: Andreas Gruenbacher Fixes: a55bbd375d18 ("drbd: Backport the "status" command") Signed-off-by: Bart Van Assche Link: https://patch.msgid.link/20260326214054.284593-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_nl.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index e18fa260a662..1f8ffdf9b24e 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -3443,8 +3443,10 @@ int drbd_adm_dump_devices(struct sk_buff *skb, struct netlink_callback *cb) if (resource_filter) { retcode = ERR_RES_NOT_KNOWN; resource = drbd_find_resource(nla_data(resource_filter)); - if (!resource) + if (!resource) { + rcu_read_lock(); goto put_result; + } cb->args[0] = (long)resource; } } @@ -3693,8 +3695,10 @@ int drbd_adm_dump_peer_devices(struct sk_buff *skb, struct netlink_callback *cb) if (resource_filter) { retcode = ERR_RES_NOT_KNOWN; resource = drbd_find_resource(nla_data(resource_filter)); - if (!resource) + if (!resource) { + rcu_read_lock(); goto put_result; + } } cb->args[0] = (long)resource; } From 9100a28c8bb4270744942cf834efcd80f1acda7d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Mar 2026 23:59:39 -0800 Subject: [PATCH 054/146] nvme-auth: add NVME_AUTH_MAX_DIGEST_SIZE constant Define a NVME_AUTH_MAX_DIGEST_SIZE constant and use it in the appropriate places. 
Acked-by: Ard Biesheuvel Acked-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Eric Biggers Signed-off-by: Keith Busch --- drivers/nvme/common/auth.c | 6 ++---- drivers/nvme/host/auth.c | 6 +++--- include/linux/nvme.h | 5 +++++ 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index e07e7d4bf8b6..78d751481fe3 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -15,8 +15,6 @@ #include #include -#define HKDF_MAX_HASHLEN 64 - static u32 nvme_dhchap_seqnum; static DEFINE_MUTEX(nvme_dhchap_mutex); @@ -769,7 +767,7 @@ int nvme_auth_derive_tls_psk(int hmac_id, u8 *psk, size_t psk_len, struct crypto_shash *hmac_tfm; const char *hmac_name; const char *label = "nvme-tls-psk"; - static const char default_salt[HKDF_MAX_HASHLEN]; + static const char default_salt[NVME_AUTH_MAX_DIGEST_SIZE]; size_t prk_len; const char *ctx; unsigned char *prk, *tls_key; @@ -798,7 +796,7 @@ int nvme_auth_derive_tls_psk(int hmac_id, u8 *psk, size_t psk_len, goto out_free_shash; } - if (WARN_ON(prk_len > HKDF_MAX_HASHLEN)) { + if (WARN_ON(prk_len > NVME_AUTH_MAX_DIGEST_SIZE)) { ret = -EINVAL; goto out_free_prk; } diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 405e7c03b1cf..301c858b7c57 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -38,9 +38,9 @@ struct nvme_dhchap_queue_context { u8 hash_id; u8 sc_c; size_t hash_len; - u8 c1[64]; - u8 c2[64]; - u8 response[64]; + u8 c1[NVME_AUTH_MAX_DIGEST_SIZE]; + u8 c2[NVME_AUTH_MAX_DIGEST_SIZE]; + u8 response[NVME_AUTH_MAX_DIGEST_SIZE]; u8 *ctrl_key; u8 *host_key; u8 *sess_key; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 655d194f8e72..edfebbce6745 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1837,6 +1837,11 @@ enum { NVME_AUTH_HASH_INVALID = 0xff, }; +/* Maximum digest size for any NVME_AUTH_HASH_* value */ +enum { + NVME_AUTH_MAX_DIGEST_SIZE = 64, +}; + /* Defined 
Diffie-Hellman group identifiers for DH-HMAC-CHAP authentication */ enum { NVME_AUTH_DHGROUP_NULL = 0x00, From e57406c07b790005feaccc9f2bd75b827566e141 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Mar 2026 23:59:40 -0800 Subject: [PATCH 055/146] nvme-auth: common: constify static data Fully constify the dhgroup_map and hash_map arrays. Remove 'const' from individual fields, as it is now redundant. Acked-by: Ard Biesheuvel Acked-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Eric Biggers Signed-off-by: Keith Busch --- drivers/nvme/common/auth.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index 78d751481fe3..9e5cee217ff5 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -36,9 +36,9 @@ u32 nvme_auth_get_seqnum(void) } EXPORT_SYMBOL_GPL(nvme_auth_get_seqnum); -static struct nvme_auth_dhgroup_map { - const char name[16]; - const char kpp[16]; +static const struct nvme_auth_dhgroup_map { + char name[16]; + char kpp[16]; } dhgroup_map[] = { [NVME_AUTH_DHGROUP_NULL] = { .name = "null", .kpp = "null" }, @@ -87,10 +87,10 @@ u8 nvme_auth_dhgroup_id(const char *dhgroup_name) } EXPORT_SYMBOL_GPL(nvme_auth_dhgroup_id); -static struct nvme_dhchap_hash_map { +static const struct nvme_dhchap_hash_map { int len; - const char hmac[15]; - const char digest[8]; + char hmac[15]; + char digest[8]; } hash_map[] = { [NVME_AUTH_HASH_SHA256] = { .len = 32, From bf0e2567a639c455110f9be5db8c92032175e222 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Mar 2026 23:59:41 -0800 Subject: [PATCH 056/146] nvme-auth: use proper argument types For input parameters, use pointer to const. This makes it easier to understand which parameters are inputs and which are outputs. In addition, consistently use char for strings and u8 for binary. This makes it easier to understand what is a string and what is binary data. 
Acked-by: Ard Biesheuvel Acked-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Eric Biggers Signed-off-by: Keith Busch --- drivers/nvme/common/auth.c | 47 ++++++++++++++++++++----------------- drivers/nvme/host/auth.c | 3 ++- drivers/nvme/target/auth.c | 5 ++-- drivers/nvme/target/nvmet.h | 2 +- include/linux/nvme-auth.h | 26 ++++++++++---------- 5 files changed, 44 insertions(+), 39 deletions(-) diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index 9e5cee217ff5..d35523d0a017 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -159,11 +159,10 @@ u32 nvme_auth_key_struct_size(u32 key_len) } EXPORT_SYMBOL_GPL(nvme_auth_key_struct_size); -struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret, - u8 key_hash) +struct nvme_dhchap_key *nvme_auth_extract_key(const char *secret, u8 key_hash) { struct nvme_dhchap_key *key; - unsigned char *p; + const char *p; u32 crc; int ret, key_len; size_t allocated_len = strlen(secret); @@ -181,14 +180,14 @@ struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret, pr_debug("base64 key decoding error %d\n", key_len); ret = key_len; - goto out_free_secret; + goto out_free_key; } if (key_len != 36 && key_len != 52 && key_len != 68) { pr_err("Invalid key len %d\n", key_len); ret = -EINVAL; - goto out_free_secret; + goto out_free_key; } /* The last four bytes is the CRC in little-endian format */ @@ -203,12 +202,12 @@ struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret, pr_err("key crc mismatch (key %08x, crc %08x)\n", get_unaligned_le32(key->key + key_len), crc); ret = -EKEYREJECTED; - goto out_free_secret; + goto out_free_key; } key->len = key_len; key->hash = key_hash; return key; -out_free_secret: +out_free_key: nvme_auth_free_key(key); return ERR_PTR(ret); } @@ -236,7 +235,7 @@ void nvme_auth_free_key(struct nvme_dhchap_key *key) EXPORT_SYMBOL_GPL(nvme_auth_free_key); struct nvme_dhchap_key *nvme_auth_transform_key( - struct 
nvme_dhchap_key *key, char *nqn) + const struct nvme_dhchap_key *key, const char *nqn) { const char *hmac_name; struct crypto_shash *key_tfm; @@ -302,7 +301,8 @@ out_free_key: } EXPORT_SYMBOL_GPL(nvme_auth_transform_key); -static int nvme_auth_hash_skey(int hmac_id, u8 *skey, size_t skey_len, u8 *hkey) +static int nvme_auth_hash_skey(int hmac_id, const u8 *skey, size_t skey_len, + u8 *hkey) { const char *digest_name; struct crypto_shash *tfm; @@ -327,8 +327,8 @@ static int nvme_auth_hash_skey(int hmac_id, u8 *skey, size_t skey_len, u8 *hkey) return ret; } -int nvme_auth_augmented_challenge(u8 hmac_id, u8 *skey, size_t skey_len, - u8 *challenge, u8 *aug, size_t hlen) +int nvme_auth_augmented_challenge(u8 hmac_id, const u8 *skey, size_t skey_len, + const u8 *challenge, u8 *aug, size_t hlen) { struct crypto_shash *tfm; u8 *hashed_key; @@ -409,7 +409,7 @@ int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm, EXPORT_SYMBOL_GPL(nvme_auth_gen_pubkey); int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm, - u8 *ctrl_key, size_t ctrl_key_len, + const u8 *ctrl_key, size_t ctrl_key_len, u8 *sess_key, size_t sess_key_len) { struct kpp_request *req; @@ -436,7 +436,7 @@ int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm, } EXPORT_SYMBOL_GPL(nvme_auth_gen_shared_secret); -int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key) +int nvme_auth_generate_key(const char *secret, struct nvme_dhchap_key **ret_key) { struct nvme_dhchap_key *key; u8 key_hash; @@ -484,8 +484,9 @@ EXPORT_SYMBOL_GPL(nvme_auth_generate_key); * Returns 0 on success with a valid generated PSK pointer in @ret_psk and * the length of @ret_psk in @ret_len, or a negative error number otherwise. 
*/ -int nvme_auth_generate_psk(u8 hmac_id, u8 *skey, size_t skey_len, - u8 *c1, u8 *c2, size_t hash_len, u8 **ret_psk, size_t *ret_len) +int nvme_auth_generate_psk(u8 hmac_id, const u8 *skey, size_t skey_len, + const u8 *c1, const u8 *c2, size_t hash_len, + u8 **ret_psk, size_t *ret_len) { struct crypto_shash *tfm; SHASH_DESC_ON_STACK(shash, tfm); @@ -582,12 +583,14 @@ EXPORT_SYMBOL_GPL(nvme_auth_generate_psk); * Returns 0 on success with a valid digest pointer in @ret_digest, or a * negative error number on failure. */ -int nvme_auth_generate_digest(u8 hmac_id, u8 *psk, size_t psk_len, - char *subsysnqn, char *hostnqn, u8 **ret_digest) +int nvme_auth_generate_digest(u8 hmac_id, const u8 *psk, size_t psk_len, + const char *subsysnqn, const char *hostnqn, + char **ret_digest) { struct crypto_shash *tfm; SHASH_DESC_ON_STACK(shash, tfm); - u8 *digest, *enc; + u8 *digest; + char *enc; const char *hmac_name; size_t digest_len, hmac_len; int ret; @@ -761,16 +764,16 @@ static int hkdf_expand_label(struct crypto_shash *hmac_tfm, * Returns 0 on success with a valid psk pointer in @ret_psk or a negative * error number otherwise. 
*/ -int nvme_auth_derive_tls_psk(int hmac_id, u8 *psk, size_t psk_len, - u8 *psk_digest, u8 **ret_psk) +int nvme_auth_derive_tls_psk(int hmac_id, const u8 *psk, size_t psk_len, + const char *psk_digest, u8 **ret_psk) { struct crypto_shash *hmac_tfm; const char *hmac_name; const char *label = "nvme-tls-psk"; - static const char default_salt[NVME_AUTH_MAX_DIGEST_SIZE]; + static const u8 default_salt[NVME_AUTH_MAX_DIGEST_SIZE]; size_t prk_len; const char *ctx; - unsigned char *prk, *tls_key; + u8 *prk, *tls_key; int ret; hmac_name = nvme_auth_hmac_name(hmac_id); diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 301c858b7c57..d0d0a9d5a871 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -708,7 +708,8 @@ EXPORT_SYMBOL_GPL(nvme_auth_revoke_tls_key); static int nvme_auth_secure_concat(struct nvme_ctrl *ctrl, struct nvme_dhchap_queue_context *chap) { - u8 *psk, *digest, *tls_psk; + u8 *psk, *tls_psk; + char *digest; struct key *tls_key; size_t psk_len; int ret = 0; diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c index 2eadeb7e06f2..f483e1fd48ac 100644 --- a/drivers/nvme/target/auth.c +++ b/drivers/nvme/target/auth.c @@ -531,7 +531,7 @@ int nvmet_auth_ctrl_exponential(struct nvmet_req *req, } int nvmet_auth_ctrl_sesskey(struct nvmet_req *req, - u8 *pkey, int pkey_size) + const u8 *pkey, int pkey_size) { struct nvmet_ctrl *ctrl = req->sq->ctrl; int ret; @@ -557,7 +557,8 @@ int nvmet_auth_ctrl_sesskey(struct nvmet_req *req, void nvmet_auth_insert_psk(struct nvmet_sq *sq) { int hash_len = nvme_auth_hmac_hash_len(sq->ctrl->shash_id); - u8 *psk, *digest, *tls_psk; + u8 *psk, *tls_psk; + char *digest; size_t psk_len; int ret; #ifdef CONFIG_NVME_TARGET_TCP_TLS diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index b664b584fdc8..986d4c7bd734 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -912,7 +912,7 @@ static inline bool nvmet_has_auth(struct nvmet_ctrl *ctrl, 
struct nvmet_sq *sq) int nvmet_auth_ctrl_exponential(struct nvmet_req *req, u8 *buf, int buf_size); int nvmet_auth_ctrl_sesskey(struct nvmet_req *req, - u8 *buf, int buf_size); + const u8 *pkey, int pkey_size); void nvmet_auth_insert_psk(struct nvmet_sq *sq); #else static inline u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h index 60e069a6757f..a4b248c24ccf 100644 --- a/include/linux/nvme-auth.h +++ b/include/linux/nvme-auth.h @@ -25,27 +25,27 @@ size_t nvme_auth_hmac_hash_len(u8 hmac_id); u8 nvme_auth_hmac_id(const char *hmac_name); u32 nvme_auth_key_struct_size(u32 key_len); -struct nvme_dhchap_key *nvme_auth_extract_key(unsigned char *secret, - u8 key_hash); +struct nvme_dhchap_key *nvme_auth_extract_key(const char *secret, u8 key_hash); void nvme_auth_free_key(struct nvme_dhchap_key *key); struct nvme_dhchap_key *nvme_auth_alloc_key(u32 len, u8 hash); struct nvme_dhchap_key *nvme_auth_transform_key( - struct nvme_dhchap_key *key, char *nqn); -int nvme_auth_generate_key(u8 *secret, struct nvme_dhchap_key **ret_key); -int nvme_auth_augmented_challenge(u8 hmac_id, u8 *skey, size_t skey_len, - u8 *challenge, u8 *aug, size_t hlen); + const struct nvme_dhchap_key *key, const char *nqn); +int nvme_auth_generate_key(const char *secret, struct nvme_dhchap_key **ret_key); +int nvme_auth_augmented_challenge(u8 hmac_id, const u8 *skey, size_t skey_len, + const u8 *challenge, u8 *aug, size_t hlen); int nvme_auth_gen_privkey(struct crypto_kpp *dh_tfm, u8 dh_gid); int nvme_auth_gen_pubkey(struct crypto_kpp *dh_tfm, u8 *host_key, size_t host_key_len); int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm, - u8 *ctrl_key, size_t ctrl_key_len, + const u8 *ctrl_key, size_t ctrl_key_len, u8 *sess_key, size_t sess_key_len); -int nvme_auth_generate_psk(u8 hmac_id, u8 *skey, size_t skey_len, - u8 *c1, u8 *c2, size_t hash_len, +int nvme_auth_generate_psk(u8 hmac_id, const u8 *skey, size_t skey_len, + const u8 *c1, 
const u8 *c2, size_t hash_len, u8 **ret_psk, size_t *ret_len); -int nvme_auth_generate_digest(u8 hmac_id, u8 *psk, size_t psk_len, - char *subsysnqn, char *hostnqn, u8 **ret_digest); -int nvme_auth_derive_tls_psk(int hmac_id, u8 *psk, size_t psk_len, - u8 *psk_digest, u8 **ret_psk); +int nvme_auth_generate_digest(u8 hmac_id, const u8 *psk, size_t psk_len, + const char *subsysnqn, const char *hostnqn, + char **ret_digest); +int nvme_auth_derive_tls_psk(int hmac_id, const u8 *psk, size_t psk_len, + const char *psk_digest, u8 **ret_psk); #endif /* _NVME_AUTH_H */ From f990ad67f0febc51274adb604d5bdeab0d06d024 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Mar 2026 23:59:42 -0800 Subject: [PATCH 057/146] nvme-auth: common: add KUnit tests for TLS key derivation Unit-test the sequence of function calls that derive tls_psk, so that we can be more confident that changes in the implementation don't break it. Since the NVMe specification doesn't seem to include any test vectors for this (nor does its description of the algorithm seem to match what was actually implemented, for that matter), I just set the expected values to the values that the code currently produces. In the case of SHA-512, nvme_auth_generate_digest() currently returns -EINVAL, so for now the test tests for that too. If it is later determined that some other behavior is needed, the test can be updated accordingly. 
Tested with: tools/testing/kunit/kunit.py run --kunitconfig drivers/nvme/common/ Acked-by: Ard Biesheuvel Acked-by: Christoph Hellwig Signed-off-by: Eric Biggers Signed-off-by: Keith Busch --- drivers/nvme/common/.kunitconfig | 6 + drivers/nvme/common/Kconfig | 8 ++ drivers/nvme/common/Makefile | 2 + drivers/nvme/common/tests/auth_kunit.c | 175 +++++++++++++++++++++++++ 4 files changed, 191 insertions(+) create mode 100644 drivers/nvme/common/.kunitconfig create mode 100644 drivers/nvme/common/tests/auth_kunit.c diff --git a/drivers/nvme/common/.kunitconfig b/drivers/nvme/common/.kunitconfig new file mode 100644 index 000000000000..60a038dc9423 --- /dev/null +++ b/drivers/nvme/common/.kunitconfig @@ -0,0 +1,6 @@ +CONFIG_KUNIT=y +CONFIG_PCI=y +CONFIG_BLOCK=y +CONFIG_BLK_DEV_NVME=y +CONFIG_NVME_HOST_AUTH=y +CONFIG_NVME_AUTH_KUNIT_TEST=y diff --git a/drivers/nvme/common/Kconfig b/drivers/nvme/common/Kconfig index da963e4f3f1f..d19988c13af5 100644 --- a/drivers/nvme/common/Kconfig +++ b/drivers/nvme/common/Kconfig @@ -13,3 +13,11 @@ config NVME_AUTH select CRYPTO_DH select CRYPTO_DH_RFC7919_GROUPS select CRYPTO_HKDF + +config NVME_AUTH_KUNIT_TEST + tristate "KUnit tests for NVMe authentication" if !KUNIT_ALL_TESTS + depends on KUNIT && NVME_AUTH + default KUNIT_ALL_TESTS + help + Enable KUnit tests for some of the common code for NVMe over Fabrics + In-Band Authentication. 
diff --git a/drivers/nvme/common/Makefile b/drivers/nvme/common/Makefile index 681514cf2e2f..fd9d01a60946 100644 --- a/drivers/nvme/common/Makefile +++ b/drivers/nvme/common/Makefile @@ -7,3 +7,5 @@ obj-$(CONFIG_NVME_KEYRING) += nvme-keyring.o nvme-auth-y += auth.o nvme-keyring-y += keyring.o + +obj-$(CONFIG_NVME_AUTH_KUNIT_TEST) += tests/auth_kunit.o diff --git a/drivers/nvme/common/tests/auth_kunit.c b/drivers/nvme/common/tests/auth_kunit.c new file mode 100644 index 000000000000..28b8dd1e3b18 --- /dev/null +++ b/drivers/nvme/common/tests/auth_kunit.c @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Unit tests for NVMe authentication functions + * + * Copyright 2026 Google LLC + */ + +#include +#include +#include +#include +#include + +struct nvme_auth_test_values { + u8 hmac_id; + size_t hash_len; + u8 expected_psk[NVME_AUTH_MAX_DIGEST_SIZE]; + char *expected_psk_digest; + u8 expected_tls_psk[NVME_AUTH_MAX_DIGEST_SIZE]; +}; + +static void kfree_action(void *ptr) +{ + kfree(ptr); +} + +static void kunit_add_kfree_action(struct kunit *test, void *ptr) +{ + KUNIT_ASSERT_EQ(test, 0, + kunit_add_action_or_reset(test, kfree_action, ptr)); +} + +/* + * Test the derivation of a TLS PSK from the initial skey. The vals parameter + * gives the expected value of tls_psk as well as the intermediate values psk + * and psk_digest. The inputs are implicitly the fixed values set below. 
+ */ +static void +test_nvme_auth_derive_tls_psk(struct kunit *test, + const struct nvme_auth_test_values *vals) +{ + const u8 hmac_id = vals->hmac_id; + const size_t hash_len = vals->hash_len; + const size_t skey_len = hash_len; + u8 skey[NVME_AUTH_MAX_DIGEST_SIZE]; + u8 c1[NVME_AUTH_MAX_DIGEST_SIZE]; + u8 c2[NVME_AUTH_MAX_DIGEST_SIZE]; + const char *subsysnqn = "subsysnqn"; + const char *hostnqn = "hostnqn"; + u8 *psk = NULL, *tls_psk = NULL; + char *psk_digest = NULL; + size_t psk_len; + int ret; + + for (int i = 0; i < NVME_AUTH_MAX_DIGEST_SIZE; i++) { + skey[i] = 'A' + i; + c1[i] = i; + c2[i] = 0xff - i; + } + + ret = nvme_auth_generate_psk(hmac_id, skey, skey_len, c1, c2, hash_len, + &psk, &psk_len); + kunit_add_kfree_action(test, psk); + KUNIT_ASSERT_EQ(test, 0, ret); + KUNIT_ASSERT_EQ(test, hash_len, psk_len); + KUNIT_ASSERT_MEMEQ(test, vals->expected_psk, psk, psk_len); + + ret = nvme_auth_generate_digest(hmac_id, psk, psk_len, subsysnqn, + hostnqn, &psk_digest); + kunit_add_kfree_action(test, psk_digest); + if (vals->expected_psk_digest == NULL) { + /* + * Algorithm has an ID assigned but is not supported by + * nvme_auth_generate_digest(). 
+ */ + KUNIT_ASSERT_EQ(test, -EINVAL, ret); + return; + } + KUNIT_ASSERT_EQ(test, 0, ret); + KUNIT_ASSERT_STREQ(test, vals->expected_psk_digest, psk_digest); + + ret = nvme_auth_derive_tls_psk(hmac_id, psk, psk_len, psk_digest, + &tls_psk); + kunit_add_kfree_action(test, tls_psk); + KUNIT_ASSERT_EQ(test, 0, ret); + KUNIT_ASSERT_MEMEQ(test, vals->expected_tls_psk, tls_psk, psk_len); +} + +static void test_nvme_auth_derive_tls_psk_hmac_sha256(struct kunit *test) +{ + static const struct nvme_auth_test_values vals = { + .hmac_id = NVME_AUTH_HASH_SHA256, + .hash_len = SHA256_DIGEST_SIZE, + .expected_psk = { + 0x17, 0x33, 0xc5, 0x9f, 0xa7, 0xf4, 0x8f, 0xcf, + 0x37, 0xf5, 0xf2, 0x6f, 0xc4, 0xff, 0x02, 0x68, + 0xad, 0x4f, 0x78, 0xe0, 0x30, 0xf4, 0xf3, 0xb0, + 0xbf, 0xd1, 0xd4, 0x7e, 0x7b, 0xb1, 0x44, 0x7a, + }, + .expected_psk_digest = "OldoKuTfKddMuyCznAZojkWD7P4D9/AtzDzLimtOxqI=", + .expected_tls_psk = { + 0x3c, 0x17, 0xda, 0x62, 0x84, 0x74, 0xa0, 0x4d, + 0x22, 0x47, 0xc4, 0xca, 0xb4, 0x79, 0x68, 0xc9, + 0x15, 0x38, 0x81, 0x93, 0xf7, 0xc0, 0x71, 0xbd, + 0x94, 0x89, 0xcc, 0x36, 0x66, 0xcd, 0x7c, 0xc8, + }, + }; + + test_nvme_auth_derive_tls_psk(test, &vals); +} + +static void test_nvme_auth_derive_tls_psk_hmac_sha384(struct kunit *test) +{ + static const struct nvme_auth_test_values vals = { + .hmac_id = NVME_AUTH_HASH_SHA384, + .hash_len = SHA384_DIGEST_SIZE, + .expected_psk = { + 0xf1, 0x4b, 0x2d, 0xd3, 0x23, 0x4c, 0x45, 0x96, + 0x94, 0xd3, 0xbc, 0x63, 0xf8, 0x96, 0x8b, 0xd6, + 0xb3, 0x7c, 0x2c, 0x6d, 0xe8, 0x49, 0xe2, 0x2e, + 0x11, 0x87, 0x49, 0x00, 0x1c, 0xe4, 0xbb, 0xe8, + 0x64, 0x0b, 0x9e, 0x3a, 0x74, 0x8c, 0xb1, 0x1c, + 0xe4, 0xb1, 0xd7, 0x1d, 0x35, 0x9c, 0xce, 0x39, + }, + .expected_psk_digest = "cffMWk8TSS7HOQebjgYEIkrPrjWPV4JE5cdPB8WhEvY4JBW5YynKyv66XscN4A9n", + .expected_tls_psk = { + 0x27, 0x74, 0x75, 0x32, 0x33, 0x53, 0x7b, 0x3f, + 0xa5, 0x0e, 0xb7, 0xd1, 0x6a, 0x8e, 0x43, 0x45, + 0x7d, 0x85, 0xf4, 0x90, 0x6c, 0x00, 0x5b, 0x22, + 0x36, 0x61, 0x6c, 0x5d, 
0x80, 0x93, 0x9d, 0x08, + 0x98, 0xff, 0xf1, 0x5b, 0xb8, 0xb7, 0x71, 0x19, + 0xd2, 0xbe, 0x0a, 0xac, 0x42, 0x3e, 0x75, 0x90, + }, + }; + + test_nvme_auth_derive_tls_psk(test, &vals); +} + +static void test_nvme_auth_derive_tls_psk_hmac_sha512(struct kunit *test) +{ + static const struct nvme_auth_test_values vals = { + .hmac_id = NVME_AUTH_HASH_SHA512, + .hash_len = SHA512_DIGEST_SIZE, + .expected_psk = { + 0x9c, 0x9f, 0x08, 0x9a, 0x61, 0x8b, 0x47, 0xd2, + 0xd7, 0x5f, 0x4b, 0x6c, 0x28, 0x07, 0x04, 0x24, + 0x48, 0x7b, 0x44, 0x5d, 0xd9, 0x6e, 0x70, 0xc4, + 0xc0, 0x9b, 0x55, 0xe8, 0xb6, 0x00, 0x01, 0x52, + 0xa3, 0x36, 0x3c, 0x34, 0x54, 0x04, 0x3f, 0x38, + 0xf0, 0xb8, 0x50, 0x36, 0xde, 0xd4, 0x06, 0x55, + 0x35, 0x0a, 0xa8, 0x7b, 0x8b, 0x6a, 0x28, 0x2b, + 0x5c, 0x1a, 0xca, 0xe1, 0x62, 0x33, 0xdd, 0x5b, + }, + /* nvme_auth_generate_digest() doesn't support SHA-512 yet. */ + .expected_psk_digest = NULL, + }; + + test_nvme_auth_derive_tls_psk(test, &vals); +} + +static struct kunit_case nvme_auth_test_cases[] = { + KUNIT_CASE(test_nvme_auth_derive_tls_psk_hmac_sha256), + KUNIT_CASE(test_nvme_auth_derive_tls_psk_hmac_sha384), + KUNIT_CASE(test_nvme_auth_derive_tls_psk_hmac_sha512), + {}, +}; + +static struct kunit_suite nvme_auth_test_suite = { + .name = "nvme-auth", + .test_cases = nvme_auth_test_cases, +}; +kunit_test_suite(nvme_auth_test_suite); + +MODULE_DESCRIPTION("Unit tests for NVMe authentication functions"); +MODULE_LICENSE("GPL"); From 0beeca72cf21c7c1d9d232148fdeef8e5e242f62 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Mar 2026 23:59:43 -0800 Subject: [PATCH 058/146] nvme-auth: rename nvme_auth_generate_key() to nvme_auth_parse_key() This function does not generate a key. It parses the key from the string that the caller passes in. 
Acked-by: Ard Biesheuvel Acked-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Eric Biggers Signed-off-by: Keith Busch --- drivers/nvme/common/auth.c | 4 ++-- drivers/nvme/host/auth.c | 7 +++---- drivers/nvme/host/sysfs.c | 4 ++-- include/linux/nvme-auth.h | 2 +- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index d35523d0a017..2f83c9ddea5e 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -436,7 +436,7 @@ int nvme_auth_gen_shared_secret(struct crypto_kpp *dh_tfm, } EXPORT_SYMBOL_GPL(nvme_auth_gen_shared_secret); -int nvme_auth_generate_key(const char *secret, struct nvme_dhchap_key **ret_key) +int nvme_auth_parse_key(const char *secret, struct nvme_dhchap_key **ret_key) { struct nvme_dhchap_key *key; u8 key_hash; @@ -459,7 +459,7 @@ int nvme_auth_generate_key(const char *secret, struct nvme_dhchap_key **ret_key) *ret_key = key; return 0; } -EXPORT_SYMBOL_GPL(nvme_auth_generate_key); +EXPORT_SYMBOL_GPL(nvme_auth_parse_key); /** * nvme_auth_generate_psk - Generate a PSK for TLS diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index d0d0a9d5a871..47a1525e876e 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -1072,12 +1072,11 @@ int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) INIT_WORK(&ctrl->dhchap_auth_work, nvme_ctrl_auth_work); if (!ctrl->opts) return 0; - ret = nvme_auth_generate_key(ctrl->opts->dhchap_secret, - &ctrl->host_key); + ret = nvme_auth_parse_key(ctrl->opts->dhchap_secret, &ctrl->host_key); if (ret) return ret; - ret = nvme_auth_generate_key(ctrl->opts->dhchap_ctrl_secret, - &ctrl->ctrl_key); + ret = nvme_auth_parse_key(ctrl->opts->dhchap_ctrl_secret, + &ctrl->ctrl_key); if (ret) goto err_free_dhchap_secret; diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index 16c6fea4b2db..45422d4274de 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -658,7 +658,7 @@ 
static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev, struct nvme_dhchap_key *key, *host_key; int ret; - ret = nvme_auth_generate_key(dhchap_secret, &key); + ret = nvme_auth_parse_key(dhchap_secret, &key); if (ret) { kfree(dhchap_secret); return ret; @@ -716,7 +716,7 @@ static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev, struct nvme_dhchap_key *key, *ctrl_key; int ret; - ret = nvme_auth_generate_key(dhchap_secret, &key); + ret = nvme_auth_parse_key(dhchap_secret, &key); if (ret) { kfree(dhchap_secret); return ret; diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h index a4b248c24ccf..02ca9a716256 100644 --- a/include/linux/nvme-auth.h +++ b/include/linux/nvme-auth.h @@ -30,7 +30,7 @@ void nvme_auth_free_key(struct nvme_dhchap_key *key); struct nvme_dhchap_key *nvme_auth_alloc_key(u32 len, u8 hash); struct nvme_dhchap_key *nvme_auth_transform_key( const struct nvme_dhchap_key *key, const char *nqn); -int nvme_auth_generate_key(const char *secret, struct nvme_dhchap_key **ret_key); +int nvme_auth_parse_key(const char *secret, struct nvme_dhchap_key **ret_key); int nvme_auth_augmented_challenge(u8 hmac_id, const u8 *skey, size_t skey_len, const u8 *challenge, u8 *aug, size_t hlen); int nvme_auth_gen_privkey(struct crypto_kpp *dh_tfm, u8 dh_gid); From 4454820b4ee59154d0c271722bbe48bb4f554e3e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Mar 2026 23:59:44 -0800 Subject: [PATCH 059/146] nvme-auth: common: explicitly verify psk_len == hash_len nvme_auth_derive_tls_psk() is always called with psk_len == hash_len. And based on the comments above nvme_auth_generate_psk() and nvme_auth_derive_tls_psk(), this isn't an implementation choice but rather just the length the spec uses. Add a check which makes this explicit, so that when cleaning up nvme_auth_derive_tls_psk() we don't have to retain support for arbitrary values of psk_len. 
Acked-by: Ard Biesheuvel Acked-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Eric Biggers Signed-off-by: Keith Busch --- drivers/nvme/common/auth.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index 2f83c9ddea5e..9e33fc02cf51 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -788,6 +788,11 @@ int nvme_auth_derive_tls_psk(int hmac_id, const u8 *psk, size_t psk_len, return -EINVAL; } + if (psk_len != nvme_auth_hmac_hash_len(hmac_id)) { + pr_warn("%s: unexpected psk_len %zu\n", __func__, psk_len); + return -EINVAL; + } + hmac_tfm = crypto_alloc_shash(hmac_name, 0, 0); if (IS_ERR(hmac_tfm)) return PTR_ERR(hmac_tfm); From 4263ca1cae5cebc09ba95375c4a8927bf4b39d49 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Mar 2026 23:59:45 -0800 Subject: [PATCH 060/146] nvme-auth: common: add HMAC helper functions Add some helper functions for computing HMAC-SHA256, HMAC-SHA384, or HMAC-SHA512 values using the crypto library instead of crypto_shash. These will enable some significant simplifications and performance improvements in nvme-auth. 
Acked-by: Ard Biesheuvel Acked-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Eric Biggers Signed-off-by: Keith Busch --- drivers/nvme/common/Kconfig | 2 ++ drivers/nvme/common/auth.c | 66 +++++++++++++++++++++++++++++++++++++ include/linux/nvme-auth.h | 14 ++++++++ 3 files changed, 82 insertions(+) diff --git a/drivers/nvme/common/Kconfig b/drivers/nvme/common/Kconfig index d19988c13af5..1ec507d1f9b5 100644 --- a/drivers/nvme/common/Kconfig +++ b/drivers/nvme/common/Kconfig @@ -13,6 +13,8 @@ config NVME_AUTH select CRYPTO_DH select CRYPTO_DH_RFC7919_GROUPS select CRYPTO_HKDF + select CRYPTO_LIB_SHA256 + select CRYPTO_LIB_SHA512 config NVME_AUTH_KUNIT_TEST tristate "KUnit tests for NVMe authentication" if !KUNIT_ALL_TESTS diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index 9e33fc02cf51..00f21176181f 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -234,6 +235,71 @@ void nvme_auth_free_key(struct nvme_dhchap_key *key) } EXPORT_SYMBOL_GPL(nvme_auth_free_key); +/* + * Start computing an HMAC value, given the algorithm ID and raw key. + * + * The context should be zeroized at the end of its lifetime. The caller can do + * that implicitly by calling nvme_auth_hmac_final(), or explicitly (needed when + * a context is abandoned without finalizing it) by calling memzero_explicit(). 
+ */ +int nvme_auth_hmac_init(struct nvme_auth_hmac_ctx *hmac, u8 hmac_id, + const u8 *key, size_t key_len) +{ + hmac->hmac_id = hmac_id; + switch (hmac_id) { + case NVME_AUTH_HASH_SHA256: + hmac_sha256_init_usingrawkey(&hmac->sha256, key, key_len); + return 0; + case NVME_AUTH_HASH_SHA384: + hmac_sha384_init_usingrawkey(&hmac->sha384, key, key_len); + return 0; + case NVME_AUTH_HASH_SHA512: + hmac_sha512_init_usingrawkey(&hmac->sha512, key, key_len); + return 0; + } + pr_warn("%s: invalid hash algorithm %d\n", __func__, hmac_id); + return -EINVAL; +} +EXPORT_SYMBOL_GPL(nvme_auth_hmac_init); + +void nvme_auth_hmac_update(struct nvme_auth_hmac_ctx *hmac, const u8 *data, + size_t data_len) +{ + switch (hmac->hmac_id) { + case NVME_AUTH_HASH_SHA256: + hmac_sha256_update(&hmac->sha256, data, data_len); + return; + case NVME_AUTH_HASH_SHA384: + hmac_sha384_update(&hmac->sha384, data, data_len); + return; + case NVME_AUTH_HASH_SHA512: + hmac_sha512_update(&hmac->sha512, data, data_len); + return; + } + /* Unreachable because nvme_auth_hmac_init() validated hmac_id */ + WARN_ON_ONCE(1); +} +EXPORT_SYMBOL_GPL(nvme_auth_hmac_update); + +/* Finish computing an HMAC value. Note that this zeroizes the HMAC context. 
*/ +void nvme_auth_hmac_final(struct nvme_auth_hmac_ctx *hmac, u8 *out) +{ + switch (hmac->hmac_id) { + case NVME_AUTH_HASH_SHA256: + hmac_sha256_final(&hmac->sha256, out); + return; + case NVME_AUTH_HASH_SHA384: + hmac_sha384_final(&hmac->sha384, out); + return; + case NVME_AUTH_HASH_SHA512: + hmac_sha512_final(&hmac->sha512, out); + return; + } + /* Unreachable because nvme_auth_hmac_init() validated hmac_id */ + WARN_ON_ONCE(1); +} +EXPORT_SYMBOL_GPL(nvme_auth_hmac_final); + struct nvme_dhchap_key *nvme_auth_transform_key( const struct nvme_dhchap_key *key, const char *nqn) { diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h index 02ca9a716256..940d0703eb1d 100644 --- a/include/linux/nvme-auth.h +++ b/include/linux/nvme-auth.h @@ -7,6 +7,7 @@ #define _NVME_AUTH_H #include +#include struct nvme_dhchap_key { size_t len; @@ -23,6 +24,19 @@ const char *nvme_auth_hmac_name(u8 hmac_id); const char *nvme_auth_digest_name(u8 hmac_id); size_t nvme_auth_hmac_hash_len(u8 hmac_id); u8 nvme_auth_hmac_id(const char *hmac_name); +struct nvme_auth_hmac_ctx { + u8 hmac_id; + union { + struct hmac_sha256_ctx sha256; + struct hmac_sha384_ctx sha384; + struct hmac_sha512_ctx sha512; + }; +}; +int nvme_auth_hmac_init(struct nvme_auth_hmac_ctx *hmac, u8 hmac_id, + const u8 *key, size_t key_len); +void nvme_auth_hmac_update(struct nvme_auth_hmac_ctx *hmac, const u8 *data, + size_t data_len); +void nvme_auth_hmac_final(struct nvme_auth_hmac_ctx *hmac, u8 *out); u32 nvme_auth_key_struct_size(u32 key_len); struct nvme_dhchap_key *nvme_auth_extract_key(const char *secret, u8 key_hash); From 092c05f8de3d2f83242d70f6f044f339b8ba5df1 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Mar 2026 23:59:46 -0800 Subject: [PATCH 061/146] nvme-auth: common: use crypto library in nvme_auth_transform_key() For the HMAC computation in nvme_auth_transform_key(), use the crypto library instead of crypto_shash. This is simpler, faster, and more reliable. 
Notably, this eliminates the transformation object allocation for every call, which was very slow. Acked-by: Ard Biesheuvel Acked-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Eric Biggers Signed-off-by: Keith Busch --- drivers/nvme/common/auth.c | 53 +++++++------------------------------- 1 file changed, 10 insertions(+), 43 deletions(-) diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index 00f21176181f..321d6e11c275 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -303,9 +303,7 @@ EXPORT_SYMBOL_GPL(nvme_auth_hmac_final); struct nvme_dhchap_key *nvme_auth_transform_key( const struct nvme_dhchap_key *key, const char *nqn) { - const char *hmac_name; - struct crypto_shash *key_tfm; - SHASH_DESC_ON_STACK(shash, key_tfm); + struct nvme_auth_hmac_ctx hmac; struct nvme_dhchap_key *transformed_key; int ret, key_len; @@ -320,50 +318,19 @@ struct nvme_dhchap_key *nvme_auth_transform_key( return ERR_PTR(-ENOMEM); return transformed_key; } - hmac_name = nvme_auth_hmac_name(key->hash); - if (!hmac_name) { - pr_warn("Invalid key hash id %d\n", key->hash); - return ERR_PTR(-EINVAL); - } - - key_tfm = crypto_alloc_shash(hmac_name, 0, 0); - if (IS_ERR(key_tfm)) - return ERR_CAST(key_tfm); - - key_len = crypto_shash_digestsize(key_tfm); + ret = nvme_auth_hmac_init(&hmac, key->hash, key->key, key->len); + if (ret) + return ERR_PTR(ret); + key_len = nvme_auth_hmac_hash_len(key->hash); transformed_key = nvme_auth_alloc_key(key_len, key->hash); if (!transformed_key) { - ret = -ENOMEM; - goto out_free_key; + memzero_explicit(&hmac, sizeof(hmac)); + return ERR_PTR(-ENOMEM); } - - shash->tfm = key_tfm; - ret = crypto_shash_setkey(key_tfm, key->key, key->len); - if (ret < 0) - goto out_free_transformed_key; - ret = crypto_shash_init(shash); - if (ret < 0) - goto out_free_transformed_key; - ret = crypto_shash_update(shash, nqn, strlen(nqn)); - if (ret < 0) - goto out_free_transformed_key; - ret = crypto_shash_update(shash, 
"NVMe-over-Fabrics", 17); - if (ret < 0) - goto out_free_transformed_key; - ret = crypto_shash_final(shash, transformed_key->key); - if (ret < 0) - goto out_free_transformed_key; - - crypto_free_shash(key_tfm); - + nvme_auth_hmac_update(&hmac, nqn, strlen(nqn)); + nvme_auth_hmac_update(&hmac, "NVMe-over-Fabrics", 17); + nvme_auth_hmac_final(&hmac, transformed_key->key); return transformed_key; - -out_free_transformed_key: - nvme_auth_free_key(transformed_key); -out_free_key: - crypto_free_shash(key_tfm); - - return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(nvme_auth_transform_key); From a67d096fe9761e3e503f40643228bca6d69c7c4e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Mar 2026 23:59:47 -0800 Subject: [PATCH 062/146] nvme-auth: common: use crypto library in nvme_auth_augmented_challenge() For the hash and HMAC computations in nvme_auth_augmented_challenge(), use the crypto library instead of crypto_shash. This is simpler, faster, and more reliable. Notably, this eliminates two crypto transformation object allocations for every call, which was very slow. 
Acked-by: Ard Biesheuvel Acked-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Eric Biggers Signed-off-by: Keith Busch --- drivers/nvme/common/auth.c | 96 ++++++++++++++------------------------ 1 file changed, 36 insertions(+), 60 deletions(-) diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index 321d6e11c275..be5bc5fcafc6 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -300,6 +300,37 @@ void nvme_auth_hmac_final(struct nvme_auth_hmac_ctx *hmac, u8 *out) } EXPORT_SYMBOL_GPL(nvme_auth_hmac_final); +static int nvme_auth_hmac(u8 hmac_id, const u8 *key, size_t key_len, + const u8 *data, size_t data_len, u8 *out) +{ + struct nvme_auth_hmac_ctx hmac; + int ret; + + ret = nvme_auth_hmac_init(&hmac, hmac_id, key, key_len); + if (ret == 0) { + nvme_auth_hmac_update(&hmac, data, data_len); + nvme_auth_hmac_final(&hmac, out); + } + return ret; +} + +static int nvme_auth_hash(u8 hmac_id, const u8 *data, size_t data_len, u8 *out) +{ + switch (hmac_id) { + case NVME_AUTH_HASH_SHA256: + sha256(data, data_len, out); + return 0; + case NVME_AUTH_HASH_SHA384: + sha384(data, data_len, out); + return 0; + case NVME_AUTH_HASH_SHA512: + sha512(data, data_len, out); + return 0; + } + pr_warn("%s: invalid hash algorithm %d\n", __func__, hmac_id); + return -EINVAL; +} + struct nvme_dhchap_key *nvme_auth_transform_key( const struct nvme_dhchap_key *key, const char *nqn) { @@ -334,72 +365,17 @@ struct nvme_dhchap_key *nvme_auth_transform_key( } EXPORT_SYMBOL_GPL(nvme_auth_transform_key); -static int nvme_auth_hash_skey(int hmac_id, const u8 *skey, size_t skey_len, - u8 *hkey) -{ - const char *digest_name; - struct crypto_shash *tfm; - int ret; - - digest_name = nvme_auth_digest_name(hmac_id); - if (!digest_name) { - pr_debug("%s: failed to get digest for %d\n", __func__, - hmac_id); - return -EINVAL; - } - tfm = crypto_alloc_shash(digest_name, 0, 0); - if (IS_ERR(tfm)) - return -ENOMEM; - - ret = crypto_shash_tfm_digest(tfm, 
skey, skey_len, hkey); - if (ret < 0) - pr_debug("%s: Failed to hash digest len %zu\n", __func__, - skey_len); - - crypto_free_shash(tfm); - return ret; -} - int nvme_auth_augmented_challenge(u8 hmac_id, const u8 *skey, size_t skey_len, const u8 *challenge, u8 *aug, size_t hlen) { - struct crypto_shash *tfm; - u8 *hashed_key; - const char *hmac_name; + u8 hashed_key[NVME_AUTH_MAX_DIGEST_SIZE]; int ret; - hashed_key = kmalloc(hlen, GFP_KERNEL); - if (!hashed_key) - return -ENOMEM; - - ret = nvme_auth_hash_skey(hmac_id, skey, - skey_len, hashed_key); - if (ret < 0) - goto out_free_key; - - hmac_name = nvme_auth_hmac_name(hmac_id); - if (!hmac_name) { - pr_warn("%s: invalid hash algorithm %d\n", - __func__, hmac_id); - ret = -EINVAL; - goto out_free_key; - } - - tfm = crypto_alloc_shash(hmac_name, 0, 0); - if (IS_ERR(tfm)) { - ret = PTR_ERR(tfm); - goto out_free_key; - } - - ret = crypto_shash_setkey(tfm, hashed_key, hlen); + ret = nvme_auth_hash(hmac_id, skey, skey_len, hashed_key); if (ret) - goto out_free_hash; - - ret = crypto_shash_tfm_digest(tfm, challenge, hlen, aug); -out_free_hash: - crypto_free_shash(tfm); -out_free_key: - kfree_sensitive(hashed_key); + return ret; + ret = nvme_auth_hmac(hmac_id, hashed_key, hlen, challenge, hlen, aug); + memzero_explicit(hashed_key, sizeof(hashed_key)); return ret; } EXPORT_SYMBOL_GPL(nvme_auth_augmented_challenge); From be01b841d3dd667d873cbcd984d9839b7e98ef4f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Mar 2026 23:59:48 -0800 Subject: [PATCH 063/146] nvme-auth: common: use crypto library in nvme_auth_generate_psk() For the HMAC computation in nvme_auth_generate_psk(), use the crypto library instead of crypto_shash. This is simpler, faster, and more reliable. Notably, this eliminates the crypto transformation object allocation for every call, which was very slow. 
Acked-by: Ard Biesheuvel Acked-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Eric Biggers Signed-off-by: Keith Busch --- drivers/nvme/common/auth.c | 63 +++++++++----------------------------- 1 file changed, 14 insertions(+), 49 deletions(-) diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index be5bc5fcafc6..781d1d5d46dd 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -497,63 +497,28 @@ int nvme_auth_generate_psk(u8 hmac_id, const u8 *skey, size_t skey_len, const u8 *c1, const u8 *c2, size_t hash_len, u8 **ret_psk, size_t *ret_len) { - struct crypto_shash *tfm; - SHASH_DESC_ON_STACK(shash, tfm); + size_t psk_len = nvme_auth_hmac_hash_len(hmac_id); + struct nvme_auth_hmac_ctx hmac; u8 *psk; - const char *hmac_name; - int ret, psk_len; + int ret; if (!c1 || !c2) return -EINVAL; - hmac_name = nvme_auth_hmac_name(hmac_id); - if (!hmac_name) { - pr_warn("%s: invalid hash algorithm %d\n", - __func__, hmac_id); - return -EINVAL; - } - - tfm = crypto_alloc_shash(hmac_name, 0, 0); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - psk_len = crypto_shash_digestsize(tfm); + ret = nvme_auth_hmac_init(&hmac, hmac_id, skey, skey_len); + if (ret) + return ret; psk = kzalloc(psk_len, GFP_KERNEL); if (!psk) { - ret = -ENOMEM; - goto out_free_tfm; + memzero_explicit(&hmac, sizeof(hmac)); + return -ENOMEM; } - - shash->tfm = tfm; - ret = crypto_shash_setkey(tfm, skey, skey_len); - if (ret) - goto out_free_psk; - - ret = crypto_shash_init(shash); - if (ret) - goto out_free_psk; - - ret = crypto_shash_update(shash, c1, hash_len); - if (ret) - goto out_free_psk; - - ret = crypto_shash_update(shash, c2, hash_len); - if (ret) - goto out_free_psk; - - ret = crypto_shash_final(shash, psk); - if (!ret) { - *ret_psk = psk; - *ret_len = psk_len; - } - -out_free_psk: - if (ret) - kfree_sensitive(psk); -out_free_tfm: - crypto_free_shash(tfm); - - return ret; + nvme_auth_hmac_update(&hmac, c1, hash_len); + 
nvme_auth_hmac_update(&hmac, c2, hash_len); + nvme_auth_hmac_final(&hmac, psk); + *ret_psk = psk; + *ret_len = psk_len; + return 0; } EXPORT_SYMBOL_GPL(nvme_auth_generate_psk); From 0002764c2faa769cd41b45e95af3dd5e1777df9f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Mar 2026 23:59:49 -0800 Subject: [PATCH 064/146] nvme-auth: common: use crypto library in nvme_auth_generate_digest() For the HMAC computation in nvme_auth_generate_digest(), use the crypto library instead of crypto_shash. This is simpler, faster, and more reliable. Notably, this eliminates the crypto transformation object allocation for every call, which was very slow. Acked-by: Ard Biesheuvel Acked-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Eric Biggers Signed-off-by: Keith Busch --- drivers/nvme/common/auth.c | 85 +++++++++++--------------------------- 1 file changed, 24 insertions(+), 61 deletions(-) diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index 781d1d5d46dd..f0b4e1c6ade7 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -561,99 +561,62 @@ int nvme_auth_generate_digest(u8 hmac_id, const u8 *psk, size_t psk_len, const char *subsysnqn, const char *hostnqn, char **ret_digest) { - struct crypto_shash *tfm; - SHASH_DESC_ON_STACK(shash, tfm); - u8 *digest; + struct nvme_auth_hmac_ctx hmac; + u8 digest[NVME_AUTH_MAX_DIGEST_SIZE]; + size_t hash_len = nvme_auth_hmac_hash_len(hmac_id); char *enc; - const char *hmac_name; - size_t digest_len, hmac_len; + size_t enc_len; int ret; if (WARN_ON(!subsysnqn || !hostnqn)) return -EINVAL; - hmac_name = nvme_auth_hmac_name(hmac_id); - if (!hmac_name) { + if (hash_len == 0) { pr_warn("%s: invalid hash algorithm %d\n", __func__, hmac_id); return -EINVAL; } - switch (nvme_auth_hmac_hash_len(hmac_id)) { + switch (hash_len) { case 32: - hmac_len = 44; + enc_len = 44; break; case 48: - hmac_len = 64; + enc_len = 64; break; default: pr_warn("%s: invalid hash algorithm '%s'\n", - 
__func__, hmac_name); + __func__, nvme_auth_hmac_name(hmac_id)); return -EINVAL; } - enc = kzalloc(hmac_len + 1, GFP_KERNEL); - if (!enc) - return -ENOMEM; - - tfm = crypto_alloc_shash(hmac_name, 0, 0); - if (IS_ERR(tfm)) { - ret = PTR_ERR(tfm); - goto out_free_enc; - } - - digest_len = crypto_shash_digestsize(tfm); - digest = kzalloc(digest_len, GFP_KERNEL); - if (!digest) { + enc = kzalloc(enc_len + 1, GFP_KERNEL); + if (!enc) { ret = -ENOMEM; - goto out_free_tfm; + goto out; } - shash->tfm = tfm; - ret = crypto_shash_setkey(tfm, psk, psk_len); + ret = nvme_auth_hmac_init(&hmac, hmac_id, psk, psk_len); if (ret) - goto out_free_digest; + goto out; + nvme_auth_hmac_update(&hmac, hostnqn, strlen(hostnqn)); + nvme_auth_hmac_update(&hmac, " ", 1); + nvme_auth_hmac_update(&hmac, subsysnqn, strlen(subsysnqn)); + nvme_auth_hmac_update(&hmac, " NVMe-over-Fabrics", 18); + nvme_auth_hmac_final(&hmac, digest); - ret = crypto_shash_init(shash); - if (ret) - goto out_free_digest; - - ret = crypto_shash_update(shash, hostnqn, strlen(hostnqn)); - if (ret) - goto out_free_digest; - - ret = crypto_shash_update(shash, " ", 1); - if (ret) - goto out_free_digest; - - ret = crypto_shash_update(shash, subsysnqn, strlen(subsysnqn)); - if (ret) - goto out_free_digest; - - ret = crypto_shash_update(shash, " NVMe-over-Fabrics", 18); - if (ret) - goto out_free_digest; - - ret = crypto_shash_final(shash, digest); - if (ret) - goto out_free_digest; - - ret = base64_encode(digest, digest_len, enc, true, BASE64_STD); - if (ret < hmac_len) { + ret = base64_encode(digest, hash_len, enc, true, BASE64_STD); + if (ret < enc_len) { ret = -ENOKEY; - goto out_free_digest; + goto out; } *ret_digest = enc; ret = 0; -out_free_digest: - kfree_sensitive(digest); -out_free_tfm: - crypto_free_shash(tfm); -out_free_enc: +out: if (ret) kfree_sensitive(enc); - + memzero_explicit(digest, sizeof(digest)); return ret; } EXPORT_SYMBOL_GPL(nvme_auth_generate_digest); From d126cbaa7d9a971dedc8535d4f2529c799de8f85 Mon 
Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Mar 2026 23:59:50 -0800 Subject: [PATCH 065/146] nvme-auth: common: use crypto library in nvme_auth_derive_tls_psk() For the HKDF-Expand-Label computation in nvme_auth_derive_tls_psk(), use the crypto library instead of crypto_shash and crypto/hkdf.c. While this means the HKDF "helper" functions are no longer utilized, they clearly weren't buying us much: it's simpler to just inline the HMAC computations directly, and this code needs to be tested anyway. (A similar result was seen in fs/crypto/. As a result, this eliminates the last user of crypto/hkdf.c, which we'll be able to remove as well.) As usual this is also a lot more efficient, eliminating the allocation of a transformation object and multiple other dynamic allocations. Acked-by: Ard Biesheuvel Acked-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Eric Biggers Signed-off-by: Keith Busch --- drivers/nvme/common/auth.c | 154 +++++++++++++------------------------ 1 file changed, 52 insertions(+), 102 deletions(-) diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index f0b4e1c6ade7..5be86629c2d4 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -9,9 +9,7 @@ #include #include #include -#include #include -#include #include #include #include @@ -621,59 +619,6 @@ out: } EXPORT_SYMBOL_GPL(nvme_auth_generate_digest); -/** - * hkdf_expand_label - HKDF-Expand-Label (RFC 8846 section 7.1) - * @hmac_tfm: hash context keyed with pseudorandom key - * @label: ASCII label without "tls13 " prefix - * @labellen: length of @label - * @context: context bytes - * @contextlen: length of @context - * @okm: output keying material - * @okmlen: length of @okm - * - * Build the TLS 1.3 HkdfLabel structure and invoke hkdf_expand(). - * - * Returns 0 on success with output keying material stored in @okm, - * or a negative errno value otherwise. 
- */ -static int hkdf_expand_label(struct crypto_shash *hmac_tfm, - const u8 *label, unsigned int labellen, - const u8 *context, unsigned int contextlen, - u8 *okm, unsigned int okmlen) -{ - int err; - u8 *info; - unsigned int infolen; - const char *tls13_prefix = "tls13 "; - unsigned int prefixlen = strlen(tls13_prefix); - - if (WARN_ON(labellen > (255 - prefixlen))) - return -EINVAL; - if (WARN_ON(contextlen > 255)) - return -EINVAL; - - infolen = 2 + (1 + prefixlen + labellen) + (1 + contextlen); - info = kzalloc(infolen, GFP_KERNEL); - if (!info) - return -ENOMEM; - - /* HkdfLabel.Length */ - put_unaligned_be16(okmlen, info); - - /* HkdfLabel.Label */ - info[2] = prefixlen + labellen; - memcpy(info + 3, tls13_prefix, prefixlen); - memcpy(info + 3 + prefixlen, label, labellen); - - /* HkdfLabel.Context */ - info[3 + prefixlen + labellen] = contextlen; - memcpy(info + 4 + prefixlen + labellen, context, contextlen); - - err = hkdf_expand(hmac_tfm, info, infolen, okm, okmlen); - kfree_sensitive(info); - return err; -} - /** * nvme_auth_derive_tls_psk - Derive TLS PSK * @hmac_id: Hash function identifier @@ -704,84 +649,89 @@ static int hkdf_expand_label(struct crypto_shash *hmac_tfm, int nvme_auth_derive_tls_psk(int hmac_id, const u8 *psk, size_t psk_len, const char *psk_digest, u8 **ret_psk) { - struct crypto_shash *hmac_tfm; - const char *hmac_name; - const char *label = "nvme-tls-psk"; static const u8 default_salt[NVME_AUTH_MAX_DIGEST_SIZE]; - size_t prk_len; - const char *ctx; - u8 *prk, *tls_key; + static const char label[] = "tls13 nvme-tls-psk"; + const size_t label_len = sizeof(label) - 1; + u8 prk[NVME_AUTH_MAX_DIGEST_SIZE]; + size_t hash_len, ctx_len; + u8 *hmac_data = NULL, *tls_key; + size_t i; int ret; - hmac_name = nvme_auth_hmac_name(hmac_id); - if (!hmac_name) { + hash_len = nvme_auth_hmac_hash_len(hmac_id); + if (hash_len == 0) { pr_warn("%s: invalid hash algorithm %d\n", __func__, hmac_id); return -EINVAL; } if (hmac_id == NVME_AUTH_HASH_SHA512) { 
pr_warn("%s: unsupported hash algorithm %s\n", - __func__, hmac_name); + __func__, nvme_auth_hmac_name(hmac_id)); return -EINVAL; } - if (psk_len != nvme_auth_hmac_hash_len(hmac_id)) { + if (psk_len != hash_len) { pr_warn("%s: unexpected psk_len %zu\n", __func__, psk_len); return -EINVAL; } - hmac_tfm = crypto_alloc_shash(hmac_name, 0, 0); - if (IS_ERR(hmac_tfm)) - return PTR_ERR(hmac_tfm); + /* HKDF-Extract */ + ret = nvme_auth_hmac(hmac_id, default_salt, hash_len, psk, psk_len, + prk); + if (ret) + goto out; - prk_len = crypto_shash_digestsize(hmac_tfm); - prk = kzalloc(prk_len, GFP_KERNEL); - if (!prk) { + /* + * HKDF-Expand-Label (RFC 8446 section 7.1), with output length equal to + * the hash length (so only a single HMAC operation is needed) + */ + + hmac_data = kmalloc(/* output length */ 2 + + /* label */ 1 + label_len + + /* context (max) */ 1 + 3 + 1 + strlen(psk_digest) + + /* counter */ 1, + GFP_KERNEL); + if (!hmac_data) { ret = -ENOMEM; - goto out_free_shash; + goto out; } + /* output length */ + i = 0; + hmac_data[i++] = hash_len >> 8; + hmac_data[i++] = hash_len; - if (WARN_ON(prk_len > NVME_AUTH_MAX_DIGEST_SIZE)) { + /* label */ + static_assert(label_len <= 255); + hmac_data[i] = label_len; + memcpy(&hmac_data[i + 1], label, label_len); + i += 1 + label_len; + + /* context */ + ctx_len = sprintf(&hmac_data[i + 1], "%02d %s", hmac_id, psk_digest); + if (ctx_len > 255) { ret = -EINVAL; - goto out_free_prk; + goto out; } - ret = hkdf_extract(hmac_tfm, psk, psk_len, - default_salt, prk_len, prk); - if (ret) - goto out_free_prk; + hmac_data[i] = ctx_len; + i += 1 + ctx_len; - ret = crypto_shash_setkey(hmac_tfm, prk, prk_len); - if (ret) - goto out_free_prk; - - ctx = kasprintf(GFP_KERNEL, "%02d %s", hmac_id, psk_digest); - if (!ctx) { - ret = -ENOMEM; - goto out_free_prk; - } + /* counter (this overwrites the NUL terminator written by sprintf) */ + hmac_data[i++] = 1; tls_key = kzalloc(psk_len, GFP_KERNEL); if (!tls_key) { ret = -ENOMEM; - goto 
out_free_ctx; + goto out; } - ret = hkdf_expand_label(hmac_tfm, - label, strlen(label), - ctx, strlen(ctx), - tls_key, psk_len); + ret = nvme_auth_hmac(hmac_id, prk, hash_len, hmac_data, i, tls_key); if (ret) { - kfree(tls_key); - goto out_free_ctx; + kfree_sensitive(tls_key); + goto out; } *ret_psk = tls_key; - -out_free_ctx: - kfree(ctx); -out_free_prk: - kfree(prk); -out_free_shash: - crypto_free_shash(hmac_tfm); - +out: + kfree_sensitive(hmac_data); + memzero_explicit(prk, sizeof(prk)); return ret; } EXPORT_SYMBOL_GPL(nvme_auth_derive_tls_psk); From 6be8d3f043a12d208d8c5c08fc9c5f54082c87b2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Mar 2026 23:59:51 -0800 Subject: [PATCH 066/146] nvme-auth: host: use crypto library in nvme_auth_dhchap_setup_host_response() For the HMAC computation in nvme_auth_dhchap_setup_host_response(), use the crypto library instead of crypto_shash. This is simpler, faster, and more reliable. Acked-by: Ard Biesheuvel Acked-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Eric Biggers Signed-off-by: Keith Busch --- drivers/nvme/host/auth.c | 59 ++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 38 deletions(-) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 47a1525e876e..f22f17ad7e2f 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -434,7 +434,7 @@ static int nvme_auth_set_dhchap_failure2_data(struct nvme_ctrl *ctrl, static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl, struct nvme_dhchap_queue_context *chap) { - SHASH_DESC_ON_STACK(shash, chap->shash_tfm); + struct nvme_auth_hmac_ctx hmac; u8 buf[4], *challenge = chap->c1; int ret; @@ -454,13 +454,11 @@ static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl, __func__, chap->qid); } - ret = crypto_shash_setkey(chap->shash_tfm, - chap->transformed_key->key, chap->transformed_key->len); - if (ret) { - dev_warn(ctrl->device, "qid %d: failed to set key, error 
%d\n", - chap->qid, ret); + ret = nvme_auth_hmac_init(&hmac, chap->hash_id, + chap->transformed_key->key, + chap->transformed_key->len); + if (ret) goto out; - } if (chap->dh_tfm) { challenge = kmalloc(chap->hash_len, GFP_KERNEL); @@ -477,44 +475,29 @@ static int nvme_auth_dhchap_setup_host_response(struct nvme_ctrl *ctrl, goto out; } - shash->tfm = chap->shash_tfm; - ret = crypto_shash_init(shash); - if (ret) - goto out; - ret = crypto_shash_update(shash, challenge, chap->hash_len); - if (ret) - goto out; + nvme_auth_hmac_update(&hmac, challenge, chap->hash_len); + put_unaligned_le32(chap->s1, buf); - ret = crypto_shash_update(shash, buf, 4); - if (ret) - goto out; + nvme_auth_hmac_update(&hmac, buf, 4); + put_unaligned_le16(chap->transaction, buf); - ret = crypto_shash_update(shash, buf, 2); - if (ret) - goto out; + nvme_auth_hmac_update(&hmac, buf, 2); + *buf = chap->sc_c; - ret = crypto_shash_update(shash, buf, 1); - if (ret) - goto out; - ret = crypto_shash_update(shash, "HostHost", 8); - if (ret) - goto out; - ret = crypto_shash_update(shash, ctrl->opts->host->nqn, - strlen(ctrl->opts->host->nqn)); - if (ret) - goto out; + nvme_auth_hmac_update(&hmac, buf, 1); + nvme_auth_hmac_update(&hmac, "HostHost", 8); + nvme_auth_hmac_update(&hmac, ctrl->opts->host->nqn, + strlen(ctrl->opts->host->nqn)); memset(buf, 0, sizeof(buf)); - ret = crypto_shash_update(shash, buf, 1); - if (ret) - goto out; - ret = crypto_shash_update(shash, ctrl->opts->subsysnqn, - strlen(ctrl->opts->subsysnqn)); - if (ret) - goto out; - ret = crypto_shash_final(shash, chap->response); + nvme_auth_hmac_update(&hmac, buf, 1); + nvme_auth_hmac_update(&hmac, ctrl->opts->subsysnqn, + strlen(ctrl->opts->subsysnqn)); + nvme_auth_hmac_final(&hmac, chap->response); + ret = 0; out: if (challenge != chap->c1) kfree(challenge); + memzero_explicit(&hmac, sizeof(hmac)); return ret; } From c4f216c2a95c16ad2cd61eeb91229103002a0f6d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Mar 2026 23:59:52 -0800 
Subject: [PATCH 067/146] nvme-auth: host: use crypto library in nvme_auth_dhchap_setup_ctrl_response() For the HMAC computation in nvme_auth_dhchap_setup_ctrl_response(), use the crypto library instead of crypto_shash. This is simpler, faster, and more reliable. Acked-by: Ard Biesheuvel Acked-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Eric Biggers Signed-off-by: Keith Busch --- drivers/nvme/host/auth.c | 56 +++++++++++++++------------------------- 1 file changed, 21 insertions(+), 35 deletions(-) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index f22f17ad7e2f..2f27f550a744 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -504,7 +504,7 @@ out: static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl, struct nvme_dhchap_queue_context *chap) { - SHASH_DESC_ON_STACK(shash, chap->shash_tfm); + struct nvme_auth_hmac_ctx hmac; struct nvme_dhchap_key *transformed_key; u8 buf[4], *challenge = chap->c2; int ret; @@ -516,10 +516,10 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl, return ret; } - ret = crypto_shash_setkey(chap->shash_tfm, - transformed_key->key, transformed_key->len); + ret = nvme_auth_hmac_init(&hmac, chap->hash_id, transformed_key->key, + transformed_key->len); if (ret) { - dev_warn(ctrl->device, "qid %d: failed to set key, error %d\n", + dev_warn(ctrl->device, "qid %d: failed to init hmac, error %d\n", chap->qid, ret); goto out; } @@ -546,43 +546,29 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl, __func__, chap->qid, ctrl->opts->subsysnqn); dev_dbg(ctrl->device, "%s: qid %d hostnqn %s\n", __func__, chap->qid, ctrl->opts->host->nqn); - shash->tfm = chap->shash_tfm; - ret = crypto_shash_init(shash); - if (ret) - goto out; - ret = crypto_shash_update(shash, challenge, chap->hash_len); - if (ret) - goto out; + + nvme_auth_hmac_update(&hmac, challenge, chap->hash_len); + put_unaligned_le32(chap->s2, buf); - ret = 
crypto_shash_update(shash, buf, 4); - if (ret) - goto out; + nvme_auth_hmac_update(&hmac, buf, 4); + put_unaligned_le16(chap->transaction, buf); - ret = crypto_shash_update(shash, buf, 2); - if (ret) - goto out; + nvme_auth_hmac_update(&hmac, buf, 2); + memset(buf, 0, 4); - ret = crypto_shash_update(shash, buf, 1); - if (ret) - goto out; - ret = crypto_shash_update(shash, "Controller", 10); - if (ret) - goto out; - ret = crypto_shash_update(shash, ctrl->opts->subsysnqn, - strlen(ctrl->opts->subsysnqn)); - if (ret) - goto out; - ret = crypto_shash_update(shash, buf, 1); - if (ret) - goto out; - ret = crypto_shash_update(shash, ctrl->opts->host->nqn, - strlen(ctrl->opts->host->nqn)); - if (ret) - goto out; - ret = crypto_shash_final(shash, chap->response); + nvme_auth_hmac_update(&hmac, buf, 1); + nvme_auth_hmac_update(&hmac, "Controller", 10); + nvme_auth_hmac_update(&hmac, ctrl->opts->subsysnqn, + strlen(ctrl->opts->subsysnqn)); + nvme_auth_hmac_update(&hmac, buf, 1); + nvme_auth_hmac_update(&hmac, ctrl->opts->host->nqn, + strlen(ctrl->opts->host->nqn)); + nvme_auth_hmac_final(&hmac, chap->response); + ret = 0; out: if (challenge != chap->c2) kfree(challenge); + memzero_explicit(&hmac, sizeof(hmac)); nvme_auth_free_key(transformed_key); return ret; } From ac9a49cf6e0c230e81de1c91b59e4ad912ee98c1 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Mar 2026 23:59:53 -0800 Subject: [PATCH 068/146] nvme-auth: host: remove allocation of crypto_shash Now that the crypto_shash that is being allocated in nvme_auth_process_dhchap_challenge() and stored in the struct nvme_dhchap_queue_context is no longer used, remove it. 
Acked-by: Ard Biesheuvel Acked-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Eric Biggers Signed-off-by: Keith Busch --- drivers/nvme/host/auth.c | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 2f27f550a744..c8cd633cb0ea 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #include "nvme.h" #include "fabrics.h" @@ -22,7 +21,6 @@ struct nvme_dhchap_queue_context { struct list_head entry; struct work_struct auth_work; struct nvme_ctrl *ctrl; - struct crypto_shash *shash_tfm; struct crypto_kpp *dh_tfm; struct nvme_dhchap_key *transformed_key; void *buf; @@ -183,38 +181,17 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl, return -EPROTO; } - if (chap->hash_id == data->hashid && chap->shash_tfm && - !strcmp(crypto_shash_alg_name(chap->shash_tfm), hmac_name) && - crypto_shash_digestsize(chap->shash_tfm) == data->hl) { + if (chap->hash_id == data->hashid && chap->hash_len == data->hl) { dev_dbg(ctrl->device, "qid %d: reuse existing hash %s\n", chap->qid, hmac_name); goto select_kpp; } - /* Reset if hash cannot be reused */ - if (chap->shash_tfm) { - crypto_free_shash(chap->shash_tfm); - chap->hash_id = 0; - chap->hash_len = 0; - } - chap->shash_tfm = crypto_alloc_shash(hmac_name, 0, - CRYPTO_ALG_ALLOCATES_MEMORY); - if (IS_ERR(chap->shash_tfm)) { - dev_warn(ctrl->device, - "qid %d: failed to allocate hash %s, error %ld\n", - chap->qid, hmac_name, PTR_ERR(chap->shash_tfm)); - chap->shash_tfm = NULL; - chap->status = NVME_AUTH_DHCHAP_FAILURE_FAILED; - return -ENOMEM; - } - - if (crypto_shash_digestsize(chap->shash_tfm) != data->hl) { + if (nvme_auth_hmac_hash_len(data->hashid) != data->hl) { dev_warn(ctrl->device, "qid %d: invalid hash length %d\n", chap->qid, data->hl); - crypto_free_shash(chap->shash_tfm); - chap->shash_tfm = NULL; chap->status = 
NVME_AUTH_DHCHAP_FAILURE_HASH_UNUSABLE; return -EPROTO; } @@ -658,8 +635,6 @@ static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap) { nvme_auth_reset_dhchap(chap); chap->authenticated = false; - if (chap->shash_tfm) - crypto_free_shash(chap->shash_tfm); if (chap->dh_tfm) crypto_free_kpp(chap->dh_tfm); } From efe8df9f9ce12903244e42038346de6afec473de Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Mar 2026 23:59:54 -0800 Subject: [PATCH 069/146] nvme-auth: target: remove obsolete crypto_has_shash() checks Since nvme-auth is now doing its HMAC computations using the crypto library, it's guaranteed that all the algorithms actually work. Therefore, remove the crypto_has_shash() checks which are now obsolete. However, the caller in nvmet_auth_negotiate() seems to have also been relying on crypto_has_shash(nvme_auth_hmac_name(host_hmac_id)) to validate the host_hmac_id. Therefore, make it validate the ID more directly by checking whether nvme_auth_hmac_hash_len() returns 0 or not. 
Acked-by: Ard Biesheuvel Acked-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Eric Biggers Signed-off-by: Keith Busch --- drivers/nvme/target/auth.c | 9 --------- drivers/nvme/target/configfs.c | 3 --- drivers/nvme/target/fabrics-cmd-auth.c | 4 +--- 3 files changed, 1 insertion(+), 15 deletions(-) diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c index f483e1fd48ac..08c1783d70fc 100644 --- a/drivers/nvme/target/auth.c +++ b/drivers/nvme/target/auth.c @@ -45,15 +45,6 @@ int nvmet_auth_set_key(struct nvmet_host *host, const char *secret, key_hash); return -EINVAL; } - if (key_hash > 0) { - /* Validate selected hash algorithm */ - const char *hmac = nvme_auth_hmac_name(key_hash); - - if (!crypto_has_shash(hmac, 0, 0)) { - pr_err("DH-HMAC-CHAP hash %s unsupported\n", hmac); - return -ENOTSUPP; - } - } dhchap_secret = kstrdup(secret, GFP_KERNEL); if (!dhchap_secret) return -ENOMEM; diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 3088e044dbcb..463348c7f097 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -17,7 +17,6 @@ #include #endif #include -#include #include #include @@ -2181,8 +2180,6 @@ static ssize_t nvmet_host_dhchap_hash_store(struct config_item *item, hmac_id = nvme_auth_hmac_id(page); if (hmac_id == NVME_AUTH_HASH_INVALID) return -EINVAL; - if (!crypto_has_shash(nvme_auth_hmac_name(hmac_id), 0, 0)) - return -ENOTSUPP; host->dhchap_hash_id = hmac_id; return count; } diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c index 5946681cb0e3..b703e3bebae4 100644 --- a/drivers/nvme/target/fabrics-cmd-auth.c +++ b/drivers/nvme/target/fabrics-cmd-auth.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include "nvmet.h" @@ -75,8 +74,7 @@ static u8 nvmet_auth_negotiate(struct nvmet_req *req, void *d) for (i = 0; i < data->auth_protocol[0].dhchap.halen; i++) { u8 host_hmac_id = 
data->auth_protocol[0].dhchap.idlist[i]; - if (!fallback_hash_id && - crypto_has_shash(nvme_auth_hmac_name(host_hmac_id), 0, 0)) + if (!fallback_hash_id && nvme_auth_hmac_hash_len(host_hmac_id)) fallback_hash_id = host_hmac_id; if (ctrl->shash_id != host_hmac_id) continue; From e501533f671f6576c032fa40376e413cba4bfb25 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Mar 2026 23:59:55 -0800 Subject: [PATCH 070/146] nvme-auth: target: use crypto library in nvmet_auth_host_hash() For the HMAC computation in nvmet_auth_host_hash(), use the crypto library instead of crypto_shash. This is simpler, faster, and more reliable. Notably, this eliminates the crypto transformation object allocation for every call, which was very slow. Acked-by: Ard Biesheuvel Acked-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Eric Biggers Signed-off-by: Keith Busch --- drivers/nvme/target/auth.c | 90 ++++++++++++-------------------------- 1 file changed, 28 insertions(+), 62 deletions(-) diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c index 08c1783d70fc..fc56ce74d20f 100644 --- a/drivers/nvme/target/auth.c +++ b/drivers/nvme/target/auth.c @@ -283,47 +283,30 @@ bool nvmet_check_auth_status(struct nvmet_req *req) int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, unsigned int shash_len) { - struct crypto_shash *shash_tfm; - SHASH_DESC_ON_STACK(shash, shash_tfm); + struct nvme_auth_hmac_ctx hmac; struct nvmet_ctrl *ctrl = req->sq->ctrl; - const char *hash_name; u8 *challenge = req->sq->dhchap_c1; struct nvme_dhchap_key *transformed_key; u8 buf[4]; int ret; - hash_name = nvme_auth_hmac_name(ctrl->shash_id); - if (!hash_name) { - pr_warn("Hash ID %d invalid\n", ctrl->shash_id); - return -EINVAL; - } - - shash_tfm = crypto_alloc_shash(hash_name, 0, 0); - if (IS_ERR(shash_tfm)) { - pr_err("failed to allocate shash %s\n", hash_name); - return PTR_ERR(shash_tfm); - } - - if (shash_len != crypto_shash_digestsize(shash_tfm)) { - pr_err("%s: hash 
len mismatch (len %d digest %d)\n", - __func__, shash_len, - crypto_shash_digestsize(shash_tfm)); - ret = -EINVAL; - goto out_free_tfm; - } - transformed_key = nvme_auth_transform_key(ctrl->host_key, ctrl->hostnqn); - if (IS_ERR(transformed_key)) { - ret = PTR_ERR(transformed_key); - goto out_free_tfm; - } + if (IS_ERR(transformed_key)) + return PTR_ERR(transformed_key); - ret = crypto_shash_setkey(shash_tfm, transformed_key->key, + ret = nvme_auth_hmac_init(&hmac, ctrl->shash_id, transformed_key->key, transformed_key->len); if (ret) goto out_free_response; + if (shash_len != nvme_auth_hmac_hash_len(ctrl->shash_id)) { + pr_err("%s: hash len mismatch (len %u digest %zu)\n", __func__, + shash_len, nvme_auth_hmac_hash_len(ctrl->shash_id)); + ret = -EINVAL; + goto out_free_response; + } + if (ctrl->dh_gid != NVME_AUTH_DHGROUP_NULL) { challenge = kmalloc(shash_len, GFP_KERNEL); if (!challenge) { @@ -336,54 +319,37 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response, req->sq->dhchap_c1, challenge, shash_len); if (ret) - goto out; + goto out_free_challenge; } pr_debug("ctrl %d qid %d host response seq %u transaction %d\n", ctrl->cntlid, req->sq->qid, req->sq->dhchap_s1, req->sq->dhchap_tid); - shash->tfm = shash_tfm; - ret = crypto_shash_init(shash); - if (ret) - goto out; - ret = crypto_shash_update(shash, challenge, shash_len); - if (ret) - goto out; + nvme_auth_hmac_update(&hmac, challenge, shash_len); + put_unaligned_le32(req->sq->dhchap_s1, buf); - ret = crypto_shash_update(shash, buf, 4); - if (ret) - goto out; + nvme_auth_hmac_update(&hmac, buf, 4); + put_unaligned_le16(req->sq->dhchap_tid, buf); - ret = crypto_shash_update(shash, buf, 2); - if (ret) - goto out; + nvme_auth_hmac_update(&hmac, buf, 2); + *buf = req->sq->sc_c; - ret = crypto_shash_update(shash, buf, 1); - if (ret) - goto out; - ret = crypto_shash_update(shash, "HostHost", 8); - if (ret) - goto out; + nvme_auth_hmac_update(&hmac, buf, 1); + nvme_auth_hmac_update(&hmac, "HostHost", 8); 
memset(buf, 0, 4); - ret = crypto_shash_update(shash, ctrl->hostnqn, strlen(ctrl->hostnqn)); - if (ret) - goto out; - ret = crypto_shash_update(shash, buf, 1); - if (ret) - goto out; - ret = crypto_shash_update(shash, ctrl->subsys->subsysnqn, - strlen(ctrl->subsys->subsysnqn)); - if (ret) - goto out; - ret = crypto_shash_final(shash, response); -out: + nvme_auth_hmac_update(&hmac, ctrl->hostnqn, strlen(ctrl->hostnqn)); + nvme_auth_hmac_update(&hmac, buf, 1); + nvme_auth_hmac_update(&hmac, ctrl->subsys->subsysnqn, + strlen(ctrl->subsys->subsysnqn)); + nvme_auth_hmac_final(&hmac, response); + ret = 0; +out_free_challenge: if (challenge != req->sq->dhchap_c1) kfree(challenge); out_free_response: + memzero_explicit(&hmac, sizeof(hmac)); nvme_auth_free_key(transformed_key); -out_free_tfm: - crypto_free_shash(shash_tfm); return ret; } From 16977e77554b203cbc1d29ca64cd53c5166e7c56 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Mar 2026 23:59:56 -0800 Subject: [PATCH 071/146] nvme-auth: target: use crypto library in nvmet_auth_ctrl_hash() For the HMAC computation in nvmet_auth_ctrl_hash(), use the crypto library instead of crypto_shash. This is simpler, faster, and more reliable. Notably, this eliminates the crypto transformation object allocation for every call, which was very slow. 
Acked-by: Ard Biesheuvel Acked-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Eric Biggers Signed-off-by: Keith Busch --- drivers/nvme/target/auth.c | 94 ++++++++++---------------------------- 1 file changed, 25 insertions(+), 69 deletions(-) diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c index fc56ce74d20f..b7417ab6b035 100644 --- a/drivers/nvme/target/auth.c +++ b/drivers/nvme/target/auth.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -356,47 +355,30 @@ out_free_response: int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response, unsigned int shash_len) { - struct crypto_shash *shash_tfm; - struct shash_desc *shash; + struct nvme_auth_hmac_ctx hmac; struct nvmet_ctrl *ctrl = req->sq->ctrl; - const char *hash_name; u8 *challenge = req->sq->dhchap_c2; struct nvme_dhchap_key *transformed_key; u8 buf[4]; int ret; - hash_name = nvme_auth_hmac_name(ctrl->shash_id); - if (!hash_name) { - pr_warn("Hash ID %d invalid\n", ctrl->shash_id); - return -EINVAL; - } - - shash_tfm = crypto_alloc_shash(hash_name, 0, 0); - if (IS_ERR(shash_tfm)) { - pr_err("failed to allocate shash %s\n", hash_name); - return PTR_ERR(shash_tfm); - } - - if (shash_len != crypto_shash_digestsize(shash_tfm)) { - pr_debug("%s: hash len mismatch (len %d digest %d)\n", - __func__, shash_len, - crypto_shash_digestsize(shash_tfm)); - ret = -EINVAL; - goto out_free_tfm; - } - transformed_key = nvme_auth_transform_key(ctrl->ctrl_key, ctrl->subsys->subsysnqn); - if (IS_ERR(transformed_key)) { - ret = PTR_ERR(transformed_key); - goto out_free_tfm; - } + if (IS_ERR(transformed_key)) + return PTR_ERR(transformed_key); - ret = crypto_shash_setkey(shash_tfm, transformed_key->key, + ret = nvme_auth_hmac_init(&hmac, ctrl->shash_id, transformed_key->key, transformed_key->len); if (ret) goto out_free_response; + if (shash_len != nvme_auth_hmac_hash_len(ctrl->shash_id)) { + pr_err("%s: hash len mismatch (len %u digest %zu)\n", __func__, + 
shash_len, nvme_auth_hmac_hash_len(ctrl->shash_id)); + ret = -EINVAL; + goto out_free_response; + } + if (ctrl->dh_gid != NVME_AUTH_DHGROUP_NULL) { challenge = kmalloc(shash_len, GFP_KERNEL); if (!challenge) { @@ -412,55 +394,29 @@ int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response, goto out_free_challenge; } - shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(shash_tfm), - GFP_KERNEL); - if (!shash) { - ret = -ENOMEM; - goto out_free_challenge; - } - shash->tfm = shash_tfm; + nvme_auth_hmac_update(&hmac, challenge, shash_len); - ret = crypto_shash_init(shash); - if (ret) - goto out; - ret = crypto_shash_update(shash, challenge, shash_len); - if (ret) - goto out; put_unaligned_le32(req->sq->dhchap_s2, buf); - ret = crypto_shash_update(shash, buf, 4); - if (ret) - goto out; + nvme_auth_hmac_update(&hmac, buf, 4); + put_unaligned_le16(req->sq->dhchap_tid, buf); - ret = crypto_shash_update(shash, buf, 2); - if (ret) - goto out; + nvme_auth_hmac_update(&hmac, buf, 2); + memset(buf, 0, 4); - ret = crypto_shash_update(shash, buf, 1); - if (ret) - goto out; - ret = crypto_shash_update(shash, "Controller", 10); - if (ret) - goto out; - ret = crypto_shash_update(shash, ctrl->subsys->subsysnqn, - strlen(ctrl->subsys->subsysnqn)); - if (ret) - goto out; - ret = crypto_shash_update(shash, buf, 1); - if (ret) - goto out; - ret = crypto_shash_update(shash, ctrl->hostnqn, strlen(ctrl->hostnqn)); - if (ret) - goto out; - ret = crypto_shash_final(shash, response); -out: - kfree(shash); + nvme_auth_hmac_update(&hmac, buf, 1); + nvme_auth_hmac_update(&hmac, "Controller", 10); + nvme_auth_hmac_update(&hmac, ctrl->subsys->subsysnqn, + strlen(ctrl->subsys->subsysnqn)); + nvme_auth_hmac_update(&hmac, buf, 1); + nvme_auth_hmac_update(&hmac, ctrl->hostnqn, strlen(ctrl->hostnqn)); + nvme_auth_hmac_final(&hmac, response); + ret = 0; out_free_challenge: if (challenge != req->sq->dhchap_c2) kfree(challenge); out_free_response: + memzero_explicit(&hmac, sizeof(hmac)); 
nvme_auth_free_key(transformed_key); -out_free_tfm: - crypto_free_shash(shash_tfm); return ret; } From 844d950bb2cb1fc5b8973369de59cbfb7eecd94d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Mar 2026 23:59:57 -0800 Subject: [PATCH 072/146] nvme-auth: common: remove nvme_auth_digest_name() Since nvme_auth_digest_name() is no longer used, remove it and the associated data from the hash_map array. Acked-by: Ard Biesheuvel Acked-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Eric Biggers Signed-off-by: Keith Busch --- drivers/nvme/common/auth.c | 12 ------------ include/linux/nvme-auth.h | 1 - 2 files changed, 13 deletions(-) diff --git a/drivers/nvme/common/auth.c b/drivers/nvme/common/auth.c index 5be86629c2d4..2d325fb93083 100644 --- a/drivers/nvme/common/auth.c +++ b/drivers/nvme/common/auth.c @@ -89,22 +89,18 @@ EXPORT_SYMBOL_GPL(nvme_auth_dhgroup_id); static const struct nvme_dhchap_hash_map { int len; char hmac[15]; - char digest[8]; } hash_map[] = { [NVME_AUTH_HASH_SHA256] = { .len = 32, .hmac = "hmac(sha256)", - .digest = "sha256", }, [NVME_AUTH_HASH_SHA384] = { .len = 48, .hmac = "hmac(sha384)", - .digest = "sha384", }, [NVME_AUTH_HASH_SHA512] = { .len = 64, .hmac = "hmac(sha512)", - .digest = "sha512", }, }; @@ -116,14 +112,6 @@ const char *nvme_auth_hmac_name(u8 hmac_id) } EXPORT_SYMBOL_GPL(nvme_auth_hmac_name); -const char *nvme_auth_digest_name(u8 hmac_id) -{ - if (hmac_id >= ARRAY_SIZE(hash_map)) - return NULL; - return hash_map[hmac_id].digest; -} -EXPORT_SYMBOL_GPL(nvme_auth_digest_name); - u8 nvme_auth_hmac_id(const char *hmac_name) { int i; diff --git a/include/linux/nvme-auth.h b/include/linux/nvme-auth.h index 940d0703eb1d..184a1f9510fa 100644 --- a/include/linux/nvme-auth.h +++ b/include/linux/nvme-auth.h @@ -21,7 +21,6 @@ const char *nvme_auth_dhgroup_kpp(u8 dhgroup_id); u8 nvme_auth_dhgroup_id(const char *dhgroup_name); const char *nvme_auth_hmac_name(u8 hmac_id); -const char *nvme_auth_digest_name(u8 hmac_id); 
size_t nvme_auth_hmac_hash_len(u8 hmac_id); u8 nvme_auth_hmac_id(const char *hmac_name); struct nvme_auth_hmac_ctx { From 26c8c2dddecb016a6115b30cf2cee633b311222b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Mar 2026 23:59:58 -0800 Subject: [PATCH 073/146] nvme-auth: common: remove selections of no-longer used crypto modules Now that nvme-auth uses the crypto library instead of crypto_shash, remove obsolete selections from the NVME_AUTH kconfig option. Acked-by: Ard Biesheuvel Acked-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Eric Biggers Signed-off-by: Keith Busch --- drivers/nvme/common/Kconfig | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/nvme/common/Kconfig b/drivers/nvme/common/Kconfig index 1ec507d1f9b5..f1639db65fd3 100644 --- a/drivers/nvme/common/Kconfig +++ b/drivers/nvme/common/Kconfig @@ -7,12 +7,8 @@ config NVME_KEYRING config NVME_AUTH tristate select CRYPTO - select CRYPTO_HMAC - select CRYPTO_SHA256 - select CRYPTO_SHA512 select CRYPTO_DH select CRYPTO_DH_RFC7919_GROUPS - select CRYPTO_HKDF select CRYPTO_LIB_SHA256 select CRYPTO_LIB_SHA512 From 6d888db2cfd910ad5b2070659e9b2598bbe4081f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 1 Mar 2026 23:59:59 -0800 Subject: [PATCH 074/146] crypto: remove HKDF library Remove crypto/hkdf.c, since it's no longer used. Originally it had two users, but now both of them just inline the needed HMAC computations using the HMAC library APIs. That ends up being better, since it eliminates all the complexity and performance issues associated with the crypto_shash abstraction and multi-step HMAC input formatting. 
Acked-by: Ard Biesheuvel Acked-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Eric Biggers Signed-off-by: Keith Busch --- crypto/Kconfig | 6 - crypto/Makefile | 1 - crypto/hkdf.c | 573 ------------------------------------------ include/crypto/hkdf.h | 20 -- 4 files changed, 600 deletions(-) delete mode 100644 crypto/hkdf.c delete mode 100644 include/crypto/hkdf.h diff --git a/crypto/Kconfig b/crypto/Kconfig index b4bb85e8e226..7e5ea61168c3 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -141,12 +141,6 @@ config CRYPTO_ACOMP select CRYPTO_ALGAPI select CRYPTO_ACOMP2 -config CRYPTO_HKDF - tristate - select CRYPTO_SHA256 if CRYPTO_SELFTESTS - select CRYPTO_SHA512 if CRYPTO_SELFTESTS - select CRYPTO_HASH2 - config CRYPTO_MANAGER tristate default CRYPTO_ALGAPI if CRYPTO_SELFTESTS diff --git a/crypto/Makefile b/crypto/Makefile index 04e269117589..8eb3f9a629d8 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -36,7 +36,6 @@ obj-$(CONFIG_CRYPTO_HASH2) += crypto_hash.o obj-$(CONFIG_CRYPTO_AKCIPHER2) += akcipher.o obj-$(CONFIG_CRYPTO_SIG2) += sig.o obj-$(CONFIG_CRYPTO_KPP2) += kpp.o -obj-$(CONFIG_CRYPTO_HKDF) += hkdf.o dh_generic-y := dh.o dh_generic-y += dh_helper.o diff --git a/crypto/hkdf.c b/crypto/hkdf.c deleted file mode 100644 index 82d1b32ca6ce..000000000000 --- a/crypto/hkdf.c +++ /dev/null @@ -1,573 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation - * Function"), aka RFC 5869. See also the original paper (Krawczyk 2010): - * "Cryptographic Extraction and Key Derivation: The HKDF Scheme". - * - * Copyright 2019 Google LLC - */ - -#include -#include -#include -#include - -/* - * HKDF consists of two steps: - * - * 1. HKDF-Extract: extract a pseudorandom key from the input keying material - * and optional salt. - * 2. HKDF-Expand: expand the pseudorandom key into output keying material of - * any length, parameterized by an application-specific info string. 
- * - */ - -/** - * hkdf_extract - HKDF-Extract (RFC 5869 section 2.2) - * @hmac_tfm: an HMAC transform using the hash function desired for HKDF. The - * caller is responsible for setting the @prk afterwards. - * @ikm: input keying material - * @ikmlen: length of @ikm - * @salt: input salt value - * @saltlen: length of @salt - * @prk: resulting pseudorandom key - * - * Extracts a pseudorandom key @prk from the input keying material - * @ikm with length @ikmlen and salt @salt with length @saltlen. - * The length of @prk is given by the digest size of @hmac_tfm. - * For an 'unsalted' version of HKDF-Extract @salt must be set - * to all zeroes and @saltlen must be set to the length of @prk. - * - * Returns 0 on success with the pseudorandom key stored in @prk, - * or a negative errno value otherwise. - */ -int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm, - unsigned int ikmlen, const u8 *salt, unsigned int saltlen, - u8 *prk) -{ - int err; - - err = crypto_shash_setkey(hmac_tfm, salt, saltlen); - if (!err) - err = crypto_shash_tfm_digest(hmac_tfm, ikm, ikmlen, prk); - - return err; -} -EXPORT_SYMBOL_GPL(hkdf_extract); - -/** - * hkdf_expand - HKDF-Expand (RFC 5869 section 2.3) - * @hmac_tfm: hash context keyed with pseudorandom key - * @info: application-specific information - * @infolen: length of @info - * @okm: output keying material - * @okmlen: length of @okm - * - * This expands the pseudorandom key, which was already keyed into @hmac_tfm, - * into @okmlen bytes of output keying material parameterized by the - * application-specific @info of length @infolen bytes. - * This is thread-safe and may be called by multiple threads in parallel. - * - * Returns 0 on success with output keying material stored in @okm, - * or a negative errno value otherwise. 
- */ -int hkdf_expand(struct crypto_shash *hmac_tfm, - const u8 *info, unsigned int infolen, - u8 *okm, unsigned int okmlen) -{ - SHASH_DESC_ON_STACK(desc, hmac_tfm); - unsigned int i, hashlen = crypto_shash_digestsize(hmac_tfm); - int err; - const u8 *prev = NULL; - u8 counter = 1; - u8 tmp[HASH_MAX_DIGESTSIZE] = {}; - - if (WARN_ON(okmlen > 255 * hashlen)) - return -EINVAL; - - desc->tfm = hmac_tfm; - - for (i = 0; i < okmlen; i += hashlen) { - err = crypto_shash_init(desc); - if (err) - goto out; - - if (prev) { - err = crypto_shash_update(desc, prev, hashlen); - if (err) - goto out; - } - - if (infolen) { - err = crypto_shash_update(desc, info, infolen); - if (err) - goto out; - } - - BUILD_BUG_ON(sizeof(counter) != 1); - if (okmlen - i < hashlen) { - err = crypto_shash_finup(desc, &counter, 1, tmp); - if (err) - goto out; - memcpy(&okm[i], tmp, okmlen - i); - memzero_explicit(tmp, sizeof(tmp)); - } else { - err = crypto_shash_finup(desc, &counter, 1, &okm[i]); - if (err) - goto out; - } - counter++; - prev = &okm[i]; - } - err = 0; -out: - if (unlikely(err)) - memzero_explicit(okm, okmlen); /* so caller doesn't need to */ - shash_desc_zero(desc); - memzero_explicit(tmp, HASH_MAX_DIGESTSIZE); - return err; -} -EXPORT_SYMBOL_GPL(hkdf_expand); - -struct hkdf_testvec { - const char *test; - const u8 *ikm; - const u8 *salt; - const u8 *info; - const u8 *prk; - const u8 *okm; - u16 ikm_size; - u16 salt_size; - u16 info_size; - u16 prk_size; - u16 okm_size; -}; - -/* - * HKDF test vectors from RFC5869 - * - * Additional HKDF test vectors from - * https://github.com/brycx/Test-Vector-Generation/blob/master/HKDF/hkdf-hmac-sha2-test-vectors.md - */ -static const struct hkdf_testvec hkdf_sha256_tv[] = { - { - .test = "basic hdkf test", - .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b" - "\x0b\x0b\x0b\x0b\x0b\x0b", - .ikm_size = 22, - .salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c", - .salt_size = 13, - .info = 
"\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9", - .info_size = 10, - .prk = "\x07\x77\x09\x36\x2c\x2e\x32\xdf\x0d\xdc\x3f\x0d\xc4\x7b\xba\x63" - "\x90\xb6\xc7\x3b\xb5\x0f\x9c\x31\x22\xec\x84\x4a\xd7\xc2\xb3\xe5", - .prk_size = 32, - .okm = "\x3c\xb2\x5f\x25\xfa\xac\xd5\x7a\x90\x43\x4f\x64\xd0\x36\x2f\x2a" - "\x2d\x2d\x0a\x90\xcf\x1a\x5a\x4c\x5d\xb0\x2d\x56\xec\xc4\xc5\xbf" - "\x34\x00\x72\x08\xd5\xb8\x87\x18\x58\x65", - .okm_size = 42, - }, { - .test = "hkdf test with long input", - .ikm = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" - "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" - "\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f" - "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f" - "\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f", - .ikm_size = 80, - .salt = "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f" - "\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f" - "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" - "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" - "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf", - .salt_size = 80, - .info = "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" - "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" - "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf" - "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef" - "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff", - .info_size = 80, - .prk = "\x06\xa6\xb8\x8c\x58\x53\x36\x1a\x06\x10\x4c\x9c\xeb\x35\xb4\x5c" - "\xef\x76\x00\x14\x90\x46\x71\x01\x4a\x19\x3f\x40\xc1\x5f\xc2\x44", - .prk_size = 32, - .okm = "\xb1\x1e\x39\x8d\xc8\x03\x27\xa1\xc8\xe7\xf7\x8c\x59\x6a\x49\x34" - "\x4f\x01\x2e\xda\x2d\x4e\xfa\xd8\xa0\x50\xcc\x4c\x19\xaf\xa9\x7c" - "\x59\x04\x5a\x99\xca\xc7\x82\x72\x71\xcb\x41\xc6\x5e\x59\x0e\x09" - 
"\xda\x32\x75\x60\x0c\x2f\x09\xb8\x36\x77\x93\xa9\xac\xa3\xdb\x71" - "\xcc\x30\xc5\x81\x79\xec\x3e\x87\xc1\x4c\x01\xd5\xc1\xf3\x43\x4f" - "\x1d\x87", - .okm_size = 82, - }, { - .test = "hkdf test with zero salt and info", - .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b" - "\x0b\x0b\x0b\x0b\x0b\x0b", - .ikm_size = 22, - .salt = NULL, - .salt_size = 0, - .info = NULL, - .info_size = 0, - .prk = "\x19\xef\x24\xa3\x2c\x71\x7b\x16\x7f\x33\xa9\x1d\x6f\x64\x8b\xdf" - "\x96\x59\x67\x76\xaf\xdb\x63\x77\xac\x43\x4c\x1c\x29\x3c\xcb\x04", - .prk_size = 32, - .okm = "\x8d\xa4\xe7\x75\xa5\x63\xc1\x8f\x71\x5f\x80\x2a\x06\x3c\x5a\x31" - "\xb8\xa1\x1f\x5c\x5e\xe1\x87\x9e\xc3\x45\x4e\x5f\x3c\x73\x8d\x2d" - "\x9d\x20\x13\x95\xfa\xa4\xb6\x1a\x96\xc8", - .okm_size = 42, - }, { - .test = "hkdf test with short input", - .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b", - .ikm_size = 11, - .salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c", - .salt_size = 13, - .info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9", - .info_size = 10, - .prk = "\x82\x65\xf6\x9d\x7f\xf7\xe5\x01\x37\x93\x01\x5c\xa0\xef\x92\x0c" - "\xb1\x68\x21\x99\xc8\xbc\x3a\x00\xda\x0c\xab\x47\xb7\xb0\x0f\xdf", - .prk_size = 32, - .okm = "\x58\xdc\xe1\x0d\x58\x01\xcd\xfd\xa8\x31\x72\x6b\xfe\xbc\xb7\x43" - "\xd1\x4a\x7e\xe8\x3a\xa0\x57\xa9\x3d\x59\xb0\xa1\x31\x7f\xf0\x9d" - "\x10\x5c\xce\xcf\x53\x56\x92\xb1\x4d\xd5", - .okm_size = 42, - }, { - .test = "unsalted hkdf test with zero info", - .ikm = "\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c" - "\x0c\x0c\x0c\x0c\x0c\x0c", - .ikm_size = 22, - .salt = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", - .salt_size = 32, - .info = NULL, - .info_size = 0, - .prk = "\xaa\x84\x1e\x1f\x35\x74\xf3\x2d\x13\xfb\xa8\x00\x5f\xcd\x9b\x8d" - "\x77\x67\x82\xa5\xdf\xa1\x92\x38\x92\xfd\x8b\x63\x5d\x3a\x89\xdf", - .prk_size = 32, - .okm = 
"\x59\x68\x99\x17\x9a\xb1\xbc\x00\xa7\xc0\x37\x86\xff\x43\xee\x53" - "\x50\x04\xbe\x2b\xb9\xbe\x68\xbc\x14\x06\x63\x6f\x54\xbd\x33\x8a" - "\x66\xa2\x37\xba\x2a\xcb\xce\xe3\xc9\xa7", - .okm_size = 42, - } -}; - -static const struct hkdf_testvec hkdf_sha384_tv[] = { - { - .test = "basic hkdf test", - .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b" - "\x0b\x0b\x0b\x0b\x0b\x0b", - .ikm_size = 22, - .salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c", - .salt_size = 13, - .info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9", - .info_size = 10, - .prk = "\x70\x4b\x39\x99\x07\x79\xce\x1d\xc5\x48\x05\x2c\x7d\xc3\x9f\x30" - "\x35\x70\xdd\x13\xfb\x39\xf7\xac\xc5\x64\x68\x0b\xef\x80\xe8\xde" - "\xc7\x0e\xe9\xa7\xe1\xf3\xe2\x93\xef\x68\xec\xeb\x07\x2a\x5a\xde", - .prk_size = 48, - .okm = "\x9b\x50\x97\xa8\x60\x38\xb8\x05\x30\x90\x76\xa4\x4b\x3a\x9f\x38" - "\x06\x3e\x25\xb5\x16\xdc\xbf\x36\x9f\x39\x4c\xfa\xb4\x36\x85\xf7" - "\x48\xb6\x45\x77\x63\xe4\xf0\x20\x4f\xc5", - .okm_size = 42, - }, { - .test = "hkdf test with long input", - .ikm = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" - "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" - "\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f" - "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f" - "\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f", - .ikm_size = 80, - .salt = "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f" - "\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f" - "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" - "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" - "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf", - .salt_size = 80, - .info = "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" - "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" - 
"\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf" - "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef" - "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff", - .info_size = 80, - .prk = "\xb3\x19\xf6\x83\x1d\xff\x93\x14\xef\xb6\x43\xba\xa2\x92\x63\xb3" - "\x0e\x4a\x8d\x77\x9f\xe3\x1e\x9c\x90\x1e\xfd\x7d\xe7\x37\xc8\x5b" - "\x62\xe6\x76\xd4\xdc\x87\xb0\x89\x5c\x6a\x7d\xc9\x7b\x52\xce\xbb", - .prk_size = 48, - .okm = "\x48\x4c\xa0\x52\xb8\xcc\x72\x4f\xd1\xc4\xec\x64\xd5\x7b\x4e\x81" - "\x8c\x7e\x25\xa8\xe0\xf4\x56\x9e\xd7\x2a\x6a\x05\xfe\x06\x49\xee" - "\xbf\x69\xf8\xd5\xc8\x32\x85\x6b\xf4\xe4\xfb\xc1\x79\x67\xd5\x49" - "\x75\x32\x4a\x94\x98\x7f\x7f\x41\x83\x58\x17\xd8\x99\x4f\xdb\xd6" - "\xf4\xc0\x9c\x55\x00\xdc\xa2\x4a\x56\x22\x2f\xea\x53\xd8\x96\x7a" - "\x8b\x2e", - .okm_size = 82, - }, { - .test = "hkdf test with zero salt and info", - .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b" - "\x0b\x0b\x0b\x0b\x0b\x0b", - .ikm_size = 22, - .salt = NULL, - .salt_size = 0, - .info = NULL, - .info_size = 0, - .prk = "\x10\xe4\x0c\xf0\x72\xa4\xc5\x62\x6e\x43\xdd\x22\xc1\xcf\x72\x7d" - "\x4b\xb1\x40\x97\x5c\x9a\xd0\xcb\xc8\xe4\x5b\x40\x06\x8f\x8f\x0b" - "\xa5\x7c\xdb\x59\x8a\xf9\xdf\xa6\x96\x3a\x96\x89\x9a\xf0\x47\xe5", - .prk_size = 48, - .okm = "\xc8\xc9\x6e\x71\x0f\x89\xb0\xd7\x99\x0b\xca\x68\xbc\xde\xc8\xcf" - "\x85\x40\x62\xe5\x4c\x73\xa7\xab\xc7\x43\xfa\xde\x9b\x24\x2d\xaa" - "\xcc\x1c\xea\x56\x70\x41\x5b\x52\x84\x9c", - .okm_size = 42, - }, { - .test = "hkdf test with short input", - .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b", - .ikm_size = 11, - .salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c", - .salt_size = 13, - .info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9", - .info_size = 10, - .prk = "\x6d\x31\x69\x98\x28\x79\x80\x88\xb3\x59\xda\xd5\x0b\x8f\x01\xb0" - "\x15\xf1\x7a\xa3\xbd\x4e\x27\xa6\xe9\xf8\x73\xb7\x15\x85\xca\x6a" - 
"\x00\xd1\xf0\x82\x12\x8a\xdb\x3c\xf0\x53\x0b\x57\xc0\xf9\xac\x72", - .prk_size = 48, - .okm = "\xfb\x7e\x67\x43\xeb\x42\xcd\xe9\x6f\x1b\x70\x77\x89\x52\xab\x75" - "\x48\xca\xfe\x53\x24\x9f\x7f\xfe\x14\x97\xa1\x63\x5b\x20\x1f\xf1" - "\x85\xb9\x3e\x95\x19\x92\xd8\x58\xf1\x1a", - .okm_size = 42, - }, { - .test = "unsalted hkdf test with zero info", - .ikm = "\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c" - "\x0c\x0c\x0c\x0c\x0c\x0c", - .ikm_size = 22, - .salt = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", - .salt_size = 48, - .info = NULL, - .info_size = 0, - .prk = "\x9d\x2d\xa5\x06\x6f\x05\xd1\x6c\x59\xfe\xdf\x6c\x5f\x32\xc7\x5e" - "\xda\x9a\x47\xa7\x9c\x93\x6a\xa4\x4c\xb7\x63\xa8\xe2\x2f\xfb\xfc" - "\xd8\xfe\x55\x43\x58\x53\x47\x21\x90\x39\xd1\x68\x28\x36\x33\xf5", - .prk_size = 48, - .okm = "\x6a\xd7\xc7\x26\xc8\x40\x09\x54\x6a\x76\xe0\x54\x5d\xf2\x66\x78" - "\x7e\x2b\x2c\xd6\xca\x43\x73\xa1\xf3\x14\x50\xa7\xbd\xf9\x48\x2b" - "\xfa\xb8\x11\xf5\x54\x20\x0e\xad\x8f\x53", - .okm_size = 42, - } -}; - -static const struct hkdf_testvec hkdf_sha512_tv[] = { - { - .test = "basic hkdf test", - .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b" - "\x0b\x0b\x0b\x0b\x0b\x0b", - .ikm_size = 22, - .salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c", - .salt_size = 13, - .info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9", - .info_size = 10, - .prk = "\x66\x57\x99\x82\x37\x37\xde\xd0\x4a\x88\xe4\x7e\x54\xa5\x89\x0b" - "\xb2\xc3\xd2\x47\xc7\xa4\x25\x4a\x8e\x61\x35\x07\x23\x59\x0a\x26" - "\xc3\x62\x38\x12\x7d\x86\x61\xb8\x8c\xf8\x0e\xf8\x02\xd5\x7e\x2f" - "\x7c\xeb\xcf\x1e\x00\xe0\x83\x84\x8b\xe1\x99\x29\xc6\x1b\x42\x37", - .prk_size = 64, - .okm = "\x83\x23\x90\x08\x6c\xda\x71\xfb\x47\x62\x5b\xb5\xce\xb1\x68\xe4" - 
"\xc8\xe2\x6a\x1a\x16\xed\x34\xd9\xfc\x7f\xe9\x2c\x14\x81\x57\x93" - "\x38\xda\x36\x2c\xb8\xd9\xf9\x25\xd7\xcb", - .okm_size = 42, - }, { - .test = "hkdf test with long input", - .ikm = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" - "\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" - "\x20\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f" - "\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f" - "\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f", - .ikm_size = 80, - .salt = "\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f" - "\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f" - "\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" - "\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" - "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf", - .salt_size = 80, - .info = "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" - "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" - "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf" - "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef" - "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff", - .info_size = 80, - .prk = "\x35\x67\x25\x42\x90\x7d\x4e\x14\x2c\x00\xe8\x44\x99\xe7\x4e\x1d" - "\xe0\x8b\xe8\x65\x35\xf9\x24\xe0\x22\x80\x4a\xd7\x75\xdd\xe2\x7e" - "\xc8\x6c\xd1\xe5\xb7\xd1\x78\xc7\x44\x89\xbd\xbe\xb3\x07\x12\xbe" - "\xb8\x2d\x4f\x97\x41\x6c\x5a\x94\xea\x81\xeb\xdf\x3e\x62\x9e\x4a", - .prk_size = 64, - .okm = "\xce\x6c\x97\x19\x28\x05\xb3\x46\xe6\x16\x1e\x82\x1e\xd1\x65\x67" - "\x3b\x84\xf4\x00\xa2\xb5\x14\xb2\xfe\x23\xd8\x4c\xd1\x89\xdd\xf1" - "\xb6\x95\xb4\x8c\xbd\x1c\x83\x88\x44\x11\x37\xb3\xce\x28\xf1\x6a" - "\xa6\x4b\xa3\x3b\xa4\x66\xb2\x4d\xf6\xcf\xcb\x02\x1e\xcf\xf2\x35" - "\xf6\xa2\x05\x6c\xe3\xaf\x1d\xe4\x4d\x57\x20\x97\xa8\x50\x5d\x9e" - "\x7a\x93", - .okm_size = 82, - }, { - .test = 
"hkdf test with zero salt and info", - .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b" - "\x0b\x0b\x0b\x0b\x0b\x0b", - .ikm_size = 22, - .salt = NULL, - .salt_size = 0, - .info = NULL, - .info_size = 0, - .prk = "\xfd\x20\x0c\x49\x87\xac\x49\x13\x13\xbd\x4a\x2a\x13\x28\x71\x21" - "\x24\x72\x39\xe1\x1c\x9e\xf8\x28\x02\x04\x4b\x66\xef\x35\x7e\x5b" - "\x19\x44\x98\xd0\x68\x26\x11\x38\x23\x48\x57\x2a\x7b\x16\x11\xde" - "\x54\x76\x40\x94\x28\x63\x20\x57\x8a\x86\x3f\x36\x56\x2b\x0d\xf6", - .prk_size = 64, - .okm = "\xf5\xfa\x02\xb1\x82\x98\xa7\x2a\x8c\x23\x89\x8a\x87\x03\x47\x2c" - "\x6e\xb1\x79\xdc\x20\x4c\x03\x42\x5c\x97\x0e\x3b\x16\x4b\xf9\x0f" - "\xff\x22\xd0\x48\x36\xd0\xe2\x34\x3b\xac", - .okm_size = 42, - }, { - .test = "hkdf test with short input", - .ikm = "\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b\x0b", - .ikm_size = 11, - .salt = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c", - .salt_size = 13, - .info = "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9", - .info_size = 10, - .prk = "\x67\x40\x9c\x9c\xac\x28\xb5\x2e\xe9\xfa\xd9\x1c\x2f\xda\x99\x9f" - "\x7c\xa2\x2e\x34\x34\xf0\xae\x77\x28\x63\x83\x65\x68\xad\x6a\x7f" - "\x10\xcf\x11\x3b\xfd\xdd\x56\x01\x29\xa5\x94\xa8\xf5\x23\x85\xc2" - "\xd6\x61\xd7\x85\xd2\x9c\xe9\x3a\x11\x40\x0c\x92\x06\x83\x18\x1d", - .prk_size = 64, - .okm = "\x74\x13\xe8\x99\x7e\x02\x06\x10\xfb\xf6\x82\x3f\x2c\xe1\x4b\xff" - "\x01\x87\x5d\xb1\xca\x55\xf6\x8c\xfc\xf3\x95\x4d\xc8\xaf\xf5\x35" - "\x59\xbd\x5e\x30\x28\xb0\x80\xf7\xc0\x68", - .okm_size = 42, - }, { - .test = "unsalted hkdf test with zero info", - .ikm = "\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c\x0c" - "\x0c\x0c\x0c\x0c\x0c\x0c", - .ikm_size = 22, - .salt = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" - "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", - 
.salt_size = 64, - .info = NULL, - .info_size = 0, - .prk = "\x53\x46\xb3\x76\xbf\x3a\xa9\xf8\x4f\x8f\x6e\xd5\xb1\xc4\xf4\x89" - "\x17\x2e\x24\x4d\xac\x30\x3d\x12\xf6\x8e\xcc\x76\x6e\xa6\x00\xaa" - "\x88\x49\x5e\x7f\xb6\x05\x80\x31\x22\xfa\x13\x69\x24\xa8\x40\xb1" - "\xf0\x71\x9d\x2d\x5f\x68\xe2\x9b\x24\x22\x99\xd7\x58\xed\x68\x0c", - .prk_size = 64, - .okm = "\x14\x07\xd4\x60\x13\xd9\x8b\xc6\xde\xce\xfc\xfe\xe5\x5f\x0f\x90" - "\xb0\xc7\xf6\x3d\x68\xeb\x1a\x80\xea\xf0\x7e\x95\x3c\xfc\x0a\x3a" - "\x52\x40\xa1\x55\xd6\xe4\xda\xa9\x65\xbb", - .okm_size = 42, - } -}; - -static int hkdf_test(const char *shash, const struct hkdf_testvec *tv) -{ struct crypto_shash *tfm = NULL; - u8 *prk = NULL, *okm = NULL; - unsigned int prk_size; - const char *driver; - int err; - - tfm = crypto_alloc_shash(shash, 0, 0); - if (IS_ERR(tfm)) { - pr_err("%s(%s): failed to allocate transform: %ld\n", - tv->test, shash, PTR_ERR(tfm)); - return PTR_ERR(tfm); - } - driver = crypto_shash_driver_name(tfm); - - prk_size = crypto_shash_digestsize(tfm); - prk = kzalloc(prk_size, GFP_KERNEL); - if (!prk) { - err = -ENOMEM; - goto out_free; - } - - if (tv->prk_size != prk_size) { - pr_err("%s(%s): prk size mismatch (vec %u, digest %u\n", - tv->test, driver, tv->prk_size, prk_size); - err = -EINVAL; - goto out_free; - } - - err = hkdf_extract(tfm, tv->ikm, tv->ikm_size, - tv->salt, tv->salt_size, prk); - if (err) { - pr_err("%s(%s): hkdf_extract failed with %d\n", - tv->test, driver, err); - goto out_free; - } - - if (memcmp(prk, tv->prk, tv->prk_size)) { - pr_err("%s(%s): hkdf_extract prk mismatch\n", - tv->test, driver); - print_hex_dump(KERN_ERR, "prk: ", DUMP_PREFIX_NONE, - 16, 1, prk, tv->prk_size, false); - err = -EINVAL; - goto out_free; - } - - okm = kzalloc(tv->okm_size, GFP_KERNEL); - if (!okm) { - err = -ENOMEM; - goto out_free; - } - - err = crypto_shash_setkey(tfm, tv->prk, tv->prk_size); - if (err) { - pr_err("%s(%s): failed to set prk, error %d\n", - tv->test, driver, err); - goto 
out_free; - } - - err = hkdf_expand(tfm, tv->info, tv->info_size, - okm, tv->okm_size); - if (err) { - pr_err("%s(%s): hkdf_expand() failed with %d\n", - tv->test, driver, err); - } else if (memcmp(okm, tv->okm, tv->okm_size)) { - pr_err("%s(%s): hkdf_expand() okm mismatch\n", - tv->test, driver); - print_hex_dump(KERN_ERR, "okm: ", DUMP_PREFIX_NONE, - 16, 1, okm, tv->okm_size, false); - err = -EINVAL; - } -out_free: - kfree(okm); - kfree(prk); - crypto_free_shash(tfm); - return err; -} - -static int __init crypto_hkdf_module_init(void) -{ - int ret = 0, i; - - if (!IS_ENABLED(CONFIG_CRYPTO_SELFTESTS)) - return 0; - - for (i = 0; i < ARRAY_SIZE(hkdf_sha256_tv); i++) { - ret = hkdf_test("hmac(sha256)", &hkdf_sha256_tv[i]); - if (ret) - return ret; - } - for (i = 0; i < ARRAY_SIZE(hkdf_sha384_tv); i++) { - ret = hkdf_test("hmac(sha384)", &hkdf_sha384_tv[i]); - if (ret) - return ret; - } - for (i = 0; i < ARRAY_SIZE(hkdf_sha512_tv); i++) { - ret = hkdf_test("hmac(sha512)", &hkdf_sha512_tv[i]); - if (ret) - return ret; - } - return 0; -} - -static void __exit crypto_hkdf_module_exit(void) {} - -late_initcall(crypto_hkdf_module_init); -module_exit(crypto_hkdf_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("HMAC-based Key Derivation Function (HKDF)"); diff --git a/include/crypto/hkdf.h b/include/crypto/hkdf.h deleted file mode 100644 index 6a9678f508f5..000000000000 --- a/include/crypto/hkdf.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * HKDF: HMAC-based Key Derivation Function (HKDF), RFC 5869 - * - * Extracted from fs/crypto/hkdf.c, which has - * Copyright 2019 Google LLC - */ - -#ifndef _CRYPTO_HKDF_H -#define _CRYPTO_HKDF_H - -#include - -int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm, - unsigned int ikmlen, const u8 *salt, unsigned int saltlen, - u8 *prk); -int hkdf_expand(struct crypto_shash *hmac_tfm, - const u8 *info, unsigned int infolen, - u8 *okm, unsigned int okmlen); -#endif From 
ecf4d2d883515850ba838df2537ff1c32d0c4217 Mon Sep 17 00:00:00 2001 From: Alistair Francis Date: Tue, 2 Dec 2025 15:17:52 +1000 Subject: [PATCH 075/146] nvmet-tcp: Don't error if TLS is enabled on a reset If the host sends an AUTH_Negotiate Message on the admin queue with REPLACETLSPSK set then we expect and require a TLS connection and shouldn't report an error if TLS is enabled. This change only enforces the nvmet_queue_tls_keyid() check if we aren't resetting the negotiation. Signed-off-by: Alistair Francis Reviewed-by: Wilfred Mallawa Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/auth.c | 4 ++-- drivers/nvme/target/core.c | 2 +- drivers/nvme/target/fabrics-cmd-auth.c | 3 ++- drivers/nvme/target/nvmet.h | 4 ++-- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c index b7417ab6b035..b34610e2f19d 100644 --- a/drivers/nvme/target/auth.c +++ b/drivers/nvme/target/auth.c @@ -130,7 +130,7 @@ int nvmet_setup_dhgroup(struct nvmet_ctrl *ctrl, u8 dhgroup_id) return ret; } -u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq) +u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, bool reset) { int ret = 0; struct nvmet_host_link *p; @@ -156,7 +156,7 @@ u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq) goto out_unlock; } - if (nvmet_queue_tls_keyid(sq)) { + if (!reset && nvmet_queue_tls_keyid(sq)) { pr_debug("host %s tls enabled\n", ctrl->hostnqn); goto out_unlock; } diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 5e43d0acc86e..d49f41790e4e 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1686,7 +1686,7 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args) if (args->hostid) uuid_copy(&ctrl->hostid, args->hostid); - dhchap_status = nvmet_setup_auth(ctrl, args->sq); + dhchap_status = nvmet_setup_auth(ctrl, 
args->sq, false); if (dhchap_status) { pr_err("Failed to setup authentication, dhchap status %u\n", dhchap_status); diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c index b703e3bebae4..a56b9fc35719 100644 --- a/drivers/nvme/target/fabrics-cmd-auth.c +++ b/drivers/nvme/target/fabrics-cmd-auth.c @@ -291,7 +291,8 @@ void nvmet_execute_auth_send(struct nvmet_req *req) pr_debug("%s: ctrl %d qid %d reset negotiation\n", __func__, ctrl->cntlid, req->sq->qid); if (!req->sq->qid) { - dhchap_status = nvmet_setup_auth(ctrl, req->sq); + dhchap_status = nvmet_setup_auth(ctrl, req->sq, + true); if (dhchap_status) { pr_err("ctrl %d qid 0 failed to setup re-authentication\n", ctrl->cntlid); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 986d4c7bd734..f5d22267df6a 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -895,7 +895,7 @@ void nvmet_execute_auth_receive(struct nvmet_req *req); int nvmet_auth_set_key(struct nvmet_host *host, const char *secret, bool set_ctrl); int nvmet_auth_set_host_hash(struct nvmet_host *host, const char *hash); -u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq); +u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, bool reset); void nvmet_auth_sq_init(struct nvmet_sq *sq); void nvmet_destroy_auth(struct nvmet_ctrl *ctrl); void nvmet_auth_sq_free(struct nvmet_sq *sq); @@ -916,7 +916,7 @@ int nvmet_auth_ctrl_sesskey(struct nvmet_req *req, void nvmet_auth_insert_psk(struct nvmet_sq *sq); #else static inline u8 nvmet_setup_auth(struct nvmet_ctrl *ctrl, - struct nvmet_sq *sq) + struct nvmet_sq *sq, bool reset) { return 0; } From 2e6eb6b277f593b98f151ea8eff1beb558bbea3b Mon Sep 17 00:00:00 2001 From: Alistair Francis Date: Tue, 2 Dec 2025 15:17:53 +1000 Subject: [PATCH 076/146] nvmet-tcp: Don't free SQ on authentication success Currently after the host sends a REPLACETLSPSK we free the TLS keys as part of calling 
nvmet_auth_sq_free() on success. This means when the host sends a follow up REPLACETLSPSK we return CONCAT_MISMATCH as the check for !nvmet_queue_tls_keyid(req->sq) fails. This patch ensures we don't free the TLS key on success as we might need it again in the future. Signed-off-by: Alistair Francis Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Wilfred Mallawa Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/target/fabrics-cmd-auth.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c index a56b9fc35719..35f411fad8f9 100644 --- a/drivers/nvme/target/fabrics-cmd-auth.c +++ b/drivers/nvme/target/fabrics-cmd-auth.c @@ -395,9 +395,10 @@ done: goto complete; } /* Final states, clear up variables */ - nvmet_auth_sq_free(req->sq); - if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE2) + if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE2) { + nvmet_auth_sq_free(req->sq); nvmet_ctrl_fatal_error(ctrl); + } complete: nvmet_req_complete(req, status); @@ -573,9 +574,7 @@ void nvmet_execute_auth_receive(struct nvmet_req *req) status = nvmet_copy_to_sgl(req, 0, d, al); kfree(d); done: - if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_SUCCESS2) - nvmet_auth_sq_free(req->sq); - else if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE1) { + if (req->sq->dhchap_step == NVME_AUTH_DHCHAP_MESSAGE_FAILURE1) { nvmet_auth_sq_free(req->sq); nvmet_ctrl_fatal_error(ctrl); } From 56d25f1a6e312e36ee07c605c4102e0848245381 Mon Sep 17 00:00:00 2001 From: Alistair Francis Date: Tue, 2 Dec 2025 15:17:54 +1000 Subject: [PATCH 077/146] nvme: Expose the tls_configured sysfs for secure concat connections Signed-off-by: Alistair Francis Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Wilfred Mallawa Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/sysfs.c | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index 45422d4274de..7a4b0924c3a2 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -861,7 +861,7 @@ static umode_t nvme_tls_attrs_are_visible(struct kobject *kobj, !ctrl->opts->tls && !ctrl->opts->concat) return 0; if (a == &dev_attr_tls_configured_key.attr && - (!ctrl->opts->tls_key || ctrl->opts->concat)) + !ctrl->opts->concat) return 0; if (a == &dev_attr_tls_keyring.attr && !ctrl->opts->keyring) From ed6a9f7dabf84a9f2bed418e66eda6f8239b7f60 Mon Sep 17 00:00:00 2001 From: Alistair Francis Date: Tue, 2 Dec 2025 15:17:55 +1000 Subject: [PATCH 078/146] nvme: Allow reauth from sysfs Allow userspace to trigger a reauth (REPLACETLSPSK) from sysfs. This can be done by writing a zero to the sysfs file. echo 0 > /sys/devices/virtual/nvme-fabrics/ctl/nvme0/tls_configured_key In order to use the new keys for the admin queue we call controller reset. This isn't ideal, but I can't find a simpler way to reset the admin queue TLS connection. Signed-off-by: Alistair Francis Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Wilfred Mallawa Signed-off-by: Keith Busch --- Documentation/ABI/testing/sysfs-nvme | 13 ++++++++ drivers/nvme/host/sysfs.c | 44 +++++++++++++++++++++++++++- 2 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 Documentation/ABI/testing/sysfs-nvme diff --git a/Documentation/ABI/testing/sysfs-nvme b/Documentation/ABI/testing/sysfs-nvme new file mode 100644 index 000000000000..499d5f843cd4 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-nvme @@ -0,0 +1,13 @@ +What: /sys/devices/virtual/nvme-fabrics/ctl/.../tls_configured_key +Date: November 2025 +KernelVersion: 6.19 +Contact: Linux NVMe mailing list +Description: + The file is available when using a secure concatenation + connection to an NVMe target. 
Reading the file will return + the serial of the currently negotiated key. + + Writing 0 to the file will trigger a PSK reauthentication + (REPLACETLSPSK) with the target. After a reauthentication + the value returned by tls_configured_key will be the new + serial. diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index 7a4b0924c3a2..7bf2e972126b 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -829,7 +829,49 @@ static ssize_t tls_configured_key_show(struct device *dev, return sysfs_emit(buf, "%08x\n", key_serial(key)); } -static DEVICE_ATTR_RO(tls_configured_key); + +static ssize_t tls_configured_key_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + int error, qid; + + error = kstrtoint(buf, 10, &qid); + if (error) + return error; + + /* + * We currently only allow userspace to write a `0` indicating + * generate a new key. + */ + if (qid) + return -EINVAL; + + if (!ctrl->opts || !ctrl->opts->concat) + return -EOPNOTSUPP; + + error = nvme_auth_negotiate(ctrl, 0); + if (error < 0) { + nvme_reset_ctrl(ctrl); + return error; + } + + error = nvme_auth_wait(ctrl, 0); + if (error < 0) { + nvme_reset_ctrl(ctrl); + return error; + } + + /* + * We need to reset the TLS connection, so let's just + * reset the controller. + */ + nvme_reset_ctrl(ctrl); + + return count; +} +static DEVICE_ATTR_RW(tls_configured_key); static ssize_t tls_keyring_show(struct device *dev, struct device_attribute *attr, char *buf) From ac61e869bef13a43d624893559acbac3a4e2a341 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 27 Feb 2026 13:23:46 -0700 Subject: [PATCH 079/146] nvme: add preferred I/O size fields to struct nvme_id_ns_nvm A subsequent change will use the NPDGL and NPDAL fields of the NVM Command Set Specific Identify Namespace structure, so add them (and the handful of intervening fields) to struct nvme_id_ns_nvm. 
Add an assertion that the size is still 4 KB. Signed-off-by: Caleb Sander Mateos Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index edfebbce6745..ec011dce4a97 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -513,9 +513,16 @@ struct nvme_id_ns_nvm { __u8 pic; __u8 rsvd9[3]; __le32 elbaf[64]; - __u8 rsvd268[3828]; + __le32 npdgl; + __le32 nprg; + __le32 npra; + __le32 nors; + __le32 npdal; + __u8 rsvd288[3808]; }; +static_assert(sizeof(struct nvme_id_ns_nvm) == 4096); + enum { NVME_ID_NS_NVM_STS_MASK = 0x7f, NVME_ID_NS_NVM_GUARD_SHIFT = 7, From 9110b85244f142ca4bcaea27be408c778d3c48d0 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 27 Feb 2026 13:23:47 -0700 Subject: [PATCH 080/146] nvme: fold nvme_config_discard() into nvme_update_disk_info() The choice of what queue limits are set in nvme_update_disk_info() vs. nvme_config_discard() seems a bit arbitrary. A subsequent commit will compute the discard_granularity limit using struct nvme_id_ns, which is only passed to nvme_update_disk_info() currently. So move the logic in nvme_config_discard() to nvme_update_disk_info(). Replace several instances of ns->ctrl in nvme_update_disk_info() with the ctrl variable brought from nvme_config_discard(). 
Signed-off-by: Caleb Sander Mateos Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 43 ++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3de52f1d2723..da477b502762 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1884,26 +1884,6 @@ static bool nvme_init_integrity(struct nvme_ns_head *head, return true; } -static void nvme_config_discard(struct nvme_ns *ns, struct queue_limits *lim) -{ - struct nvme_ctrl *ctrl = ns->ctrl; - - if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns->head, UINT_MAX)) - lim->max_hw_discard_sectors = - nvme_lba_to_sect(ns->head, ctrl->dmrsl); - else if (ctrl->oncs & NVME_CTRL_ONCS_DSM) - lim->max_hw_discard_sectors = UINT_MAX; - else - lim->max_hw_discard_sectors = 0; - - lim->discard_granularity = lim->logical_block_size; - - if (ctrl->dmrl) - lim->max_discard_segments = ctrl->dmrl; - else - lim->max_discard_segments = NVME_DSM_MAX_RANGES; -} - static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) { return uuid_equal(&a->uuid, &b->uuid) && @@ -2082,6 +2062,7 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id, struct queue_limits *lim) { struct nvme_ns_head *head = ns->head; + struct nvme_ctrl *ctrl = ns->ctrl; u32 bs = 1U << head->lba_shift; u32 atomic_bs, phys_bs, io_opt = 0; bool valid = true; @@ -2116,11 +2097,26 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id, lim->physical_block_size = min(phys_bs, atomic_bs); lim->io_min = phys_bs; lim->io_opt = io_opt; - if ((ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) && - (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)) + if ((ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) && + (ctrl->oncs & NVME_CTRL_ONCS_DSM)) lim->max_write_zeroes_sectors = UINT_MAX; else - lim->max_write_zeroes_sectors = ns->ctrl->max_zeroes_sectors; + 
lim->max_write_zeroes_sectors = ctrl->max_zeroes_sectors; + + if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns->head, UINT_MAX)) + lim->max_hw_discard_sectors = + nvme_lba_to_sect(ns->head, ctrl->dmrsl); + else if (ctrl->oncs & NVME_CTRL_ONCS_DSM) + lim->max_hw_discard_sectors = UINT_MAX; + else + lim->max_hw_discard_sectors = 0; + + lim->discard_granularity = lim->logical_block_size; + + if (ctrl->dmrl) + lim->max_discard_segments = ctrl->dmrl; + else + lim->max_discard_segments = NVME_DSM_MAX_RANGES; return valid; } @@ -2385,7 +2381,6 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, if (!nvme_update_disk_info(ns, id, &lim)) capacity = 0; - nvme_config_discard(ns, &lim); if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && ns->head->ids.csi == NVME_CSI_ZNS) nvme_update_zone_info(ns, &lim, &zi); From d3c04a6ea5fd7a3d81f7c80880125108df9a4cbd Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 27 Feb 2026 13:23:48 -0700 Subject: [PATCH 081/146] nvme: update nvme_id_ns OPTPERF constants In NVMe version 2.0 and below, OPTPERF comprises only bit 4 of NSFEAT in the Identify Namespace structure. Since version 2.1, OPTPERF includes both bits 4 and 5 of NSFEAT. Replace the NVME_NS_FEAT_IO_OPT constant with NVME_NS_FEAT_OPTPERF_SHIFT, NVME_NS_FEAT_OPTPERF_MASK, and NVME_NS_FEAT_OPTPERF_MASK_2_1, representing the first bit, pre-2.1 bit width, and post-2.1 bit width of OPTPERF. Update nvme_update_disk_info() to check both OPTPERF bits for controllers that report version 2.1 or newer, as NPWG and NOWS are supported even if only bit 5 is set. 
Signed-off-by: Caleb Sander Mateos Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 8 +++++++- include/linux/nvme.h | 6 +++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index da477b502762..04a8dae12333 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2066,6 +2066,7 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id, u32 bs = 1U << head->lba_shift; u32 atomic_bs, phys_bs, io_opt = 0; bool valid = true; + u8 optperf; /* * The block layer can't support LBA sizes larger than the page size @@ -2080,7 +2081,12 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id, phys_bs = bs; atomic_bs = nvme_configure_atomic_write(ns, id, lim, bs); - if (id->nsfeat & NVME_NS_FEAT_IO_OPT) { + optperf = id->nsfeat >> NVME_NS_FEAT_OPTPERF_SHIFT; + if (ctrl->vs >= NVME_VS(2, 1, 0)) + optperf &= NVME_NS_FEAT_OPTPERF_MASK_2_1; + else + optperf &= NVME_NS_FEAT_OPTPERF_MASK; + if (optperf) { /* NPWG = Namespace Preferred Write Granularity */ phys_bs = bs * (1 + le16_to_cpu(id->npwg)); /* NOWS = Namespace Optimal Write Size */ diff --git a/include/linux/nvme.h b/include/linux/nvme.h index ec011dce4a97..2b66a86d7da6 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -597,7 +597,11 @@ enum { enum { NVME_NS_FEAT_THIN = 1 << 0, NVME_NS_FEAT_ATOMICS = 1 << 1, - NVME_NS_FEAT_IO_OPT = 1 << 4, + NVME_NS_FEAT_OPTPERF_SHIFT = 4, + /* In NVMe version 2.0 and below, OPTPERF is only bit 4 of NSFEAT */ + NVME_NS_FEAT_OPTPERF_MASK = 0x1, + /* Since version 2.1, OPTPERF is bits 4 and 5 of NSFEAT */ + NVME_NS_FEAT_OPTPERF_MASK_2_1 = 0x3, NVME_NS_ATTR_RO = 1 << 0, NVME_NS_FLBAS_LBA_MASK = 0xf, NVME_NS_FLBAS_LBA_UMASK = 0x60, From 823340b7e877b410a814177360df34810878e916 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 27 Feb 2026 13:23:49 -0700 Subject: [PATCH 082/146] nvme: always issue I/O Command Set specific Identify 
Namespace Currently, the I/O Command Set specific Identify Namespace structure is only fetched for controllers that support extended LBA formats. This is because struct nvme_id_ns_nvm is only used by nvme_configure_pi_elbas(), which is only called when the ELBAS bit is set in the CTRATT field of the Identify Controller structure. However, the I/O Command Set specific Identify Namespace structure will soon be used in nvme_update_disk_info(), so always try to obtain it in nvme_update_ns_info_block(). This Identify structure is first defined in NVMe spec version 2.0, but controllers reporting older versions could still implement it. Signed-off-by: Caleb Sander Mateos Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 04a8dae12333..6e108086fb76 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2356,7 +2356,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, } lbaf = nvme_lbaf_index(id->flbas); - if (ns->ctrl->ctratt & NVME_CTRL_ATTR_ELBAS) { + if (nvme_id_cns_ok(ns->ctrl, NVME_ID_CNS_CS_NS)) { ret = nvme_identify_ns_nvm(ns->ctrl, info->nsid, &nvm); if (ret < 0) goto out; From b465046c8cca9e418c7b39cfb41bb8e3b62f22f6 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 27 Feb 2026 13:23:50 -0700 Subject: [PATCH 083/146] nvme: add from0based() helper The NVMe specifications are big fans of "0's based"/"0-based" fields for encoding values that must be positive. The encoded value is 1 less than the value it represents. nvmet already provides a helper to0based() for encoding 0's based values, so add a corresponding helper to decode these fields on the host side. 
Suggested-by: Christoph Hellwig Signed-off-by: Caleb Sander Mateos Signed-off-by: Keith Busch --- drivers/nvme/host/nvme.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 9971045dbc05..ccd5e05dac98 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -762,6 +762,12 @@ static inline u32 nvme_bytes_to_numd(size_t len) return (len >> 2) - 1; } +/* Decode a 2-byte "0's based"/"0-based" field */ +static inline u32 from0based(__le16 value) +{ + return (u32)le16_to_cpu(value) + 1; +} + static inline bool nvme_is_ana_error(u16 status) { switch (status & NVME_SCT_SC_MASK) { From 1029298da36559866ecb5ddaa3d78deb02f2ab9b Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 27 Feb 2026 13:23:51 -0700 Subject: [PATCH 084/146] nvme: set discard_granularity from NPDG/NPDA Currently, nvme_config_discard() always sets the discard_granularity queue limit to the logical block size. However, NVMe namespaces can advertise a larger preferred discard granularity in the NPDG or NPDA field of the Identify Namespace structure or the NPDGL or NPDAL fields of the I/O Command Set Specific Identify Namespace structure. Use these fields to compute the discard_granularity limit. The logic is somewhat involved. First, the fields are optional. NPDG is only reported if the low bit of OPTPERF is set in NSFEAT. NPDA is reported if any bit of OPTPERF is set. And NPDGL and NPDAL are reported if the high bit of OPTPERF is set. NPDGL and NPDAL can also each be set to 0 to opt out of reporting a limit. I/O Command Set Specific Identify Namespace may also not be supported by older NVMe controllers. Another complication is that multiple values may be reported among NPDG, NPDGL, NPDA, and NPDAL. The spec says to prefer the values reported in the L variants. 
The spec says NPDG should be a multiple of NPDA and NPDGL should be a multiple of NPDAL, but it doesn't specify a relationship between NPDG and NPDAL or NPDGL and NPDA. So use the maximum of the reported NPDG(L) and NPDA(L) values as the discard_granularity. Signed-off-by: Caleb Sander Mateos Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6e108086fb76..d2256fa95685 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2059,12 +2059,13 @@ static void nvme_set_ctrl_limits(struct nvme_ctrl *ctrl, } static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id, - struct queue_limits *lim) + struct nvme_id_ns_nvm *nvm, struct queue_limits *lim) { struct nvme_ns_head *head = ns->head; struct nvme_ctrl *ctrl = ns->ctrl; u32 bs = 1U << head->lba_shift; u32 atomic_bs, phys_bs, io_opt = 0; + u32 npdg = 1, npda = 1; bool valid = true; u8 optperf; @@ -2117,7 +2118,35 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id, else lim->max_hw_discard_sectors = 0; - lim->discard_granularity = lim->logical_block_size; + /* + * NVMe namespaces advertise both a preferred deallocate granularity + * (for a discard length) and alignment (for a discard starting offset). + * However, Linux block devices advertise a single discard_granularity. + * From NVM Command Set specification 1.1 section 5.2.2, the NPDGL/NPDAL + * fields in the NVM Command Set Specific Identify Namespace structure + * are preferred to NPDG/NPDA in the Identify Namespace structure since + * they can represent larger values. However, NPDGL or NPDAL may be 0 if + * unsupported. NPDG and NPDA are 0's based. + * From Figure 115 of NVM Command Set specification 1.1, NPDGL and NPDAL + * are supported if the high bit of OPTPERF is set. NPDG is supported if + * the low bit of OPTPERF is set. 
NPDA is supported if either is set. + * NPDG should be a multiple of NPDA, and likewise NPDGL should be a + * multiple of NPDAL, but the spec doesn't say anything about NPDG vs. + * NPDAL or NPDGL vs. NPDA. So compute the maximum instead of assuming + * NPDG(L) is the larger. If neither NPDG, NPDGL, NPDA, nor NPDAL are + * supported, default the discard_granularity to the logical block size. + */ + if (optperf & 0x2 && nvm && nvm->npdgl) + npdg = le32_to_cpu(nvm->npdgl); + else if (optperf & 0x1) + npdg = from0based(id->npdg); + if (optperf & 0x2 && nvm && nvm->npdal) + npda = le32_to_cpu(nvm->npdal); + else if (optperf) + npda = from0based(id->npda); + if (check_mul_overflow(max(npdg, npda), lim->logical_block_size, + &lim->discard_granularity)) + lim->discard_granularity = lim->logical_block_size; if (ctrl->dmrl) lim->max_discard_segments = ctrl->dmrl; @@ -2384,7 +2413,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, nvme_set_ctrl_limits(ns->ctrl, &lim, false); nvme_configure_metadata(ns->ctrl, ns->head, id, nvm, info); nvme_set_chunk_sectors(ns, id, &lim); - if (!nvme_update_disk_info(ns, id, &lim)) + if (!nvme_update_disk_info(ns, id, nvm, &lim)) capacity = 0; if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && From e0d56e7055d3762732504eddc059a4a142227e0f Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 27 Feb 2026 13:23:52 -0700 Subject: [PATCH 085/146] nvmet: use NVME_NS_FEAT_OPTPERF_SHIFT Use the NVME_NS_FEAT_OPTPERF_SHIFT constant in nvmet_bdev_set_limits() to set the OPTPERF bits of the nvme_id_ns NSFEAT field instead of the magic number 4. 
Signed-off-by: Caleb Sander Mateos Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/io-cmd-bdev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index f15d1c213bc6..a8a7d3a88ef2 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -30,11 +30,11 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id) id->nacwu = lpp0b; /* - * Bit 4 indicates that the fields NPWG, NPWA, NPDG, NPDA, and + * OPTPERF = 01b indicates that the fields NPWG, NPWA, NPDG, NPDA, and * NOWS are defined for this namespace and should be used by * the host for I/O optimization. */ - id->nsfeat |= 1 << 4; + id->nsfeat |= 0x1 << NVME_NS_FEAT_OPTPERF_SHIFT; /* NPWG = Namespace Preferred Write Granularity. 0's based */ id->npwg = to0based(bdev_io_min(bdev) / bdev_logical_block_size(bdev)); /* NPWA = Namespace Preferred Write Alignment. 0's based */ From c4cfe8c328aee9e3519a04810480ce8e1fcaeeb7 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 27 Feb 2026 13:23:53 -0700 Subject: [PATCH 086/146] nvmet: report NPDGL and NPDAL A block device with a very large discard_granularity queue limit may not be able to report it in the 16-bit NPDG and NPDA fields in the Identify Namespace data structure. For this reason, version 2.1 of the NVMe specs added 32-bit fields NPDGL and NPDAL to the NVM Command Set Specific Identify Namespace structure. So report the discard_granularity there too and set OPTPERF to 11b to indicate those fields are supported. 
Signed-off-by: Caleb Sander Mateos Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/admin-cmd.c | 2 ++ drivers/nvme/target/io-cmd-bdev.c | 19 +++++++++++++++---- drivers/nvme/target/nvmet.h | 2 ++ 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 9de93f65d7d7..c0d38480bb62 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -1057,6 +1057,8 @@ static void nvme_execute_identify_ns_nvm(struct nvmet_req *req) status = NVME_SC_INTERNAL; goto out; } + if (req->ns->bdev) + nvmet_bdev_set_nvm_limits(req->ns->bdev, id); status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); kfree(id); out: diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index a8a7d3a88ef2..f2d9e8901df4 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -30,11 +30,11 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id) id->nacwu = lpp0b; /* - * OPTPERF = 01b indicates that the fields NPWG, NPWA, NPDG, NPDA, and - * NOWS are defined for this namespace and should be used by - * the host for I/O optimization. + * OPTPERF = 11b indicates that the fields NPWG, NPWA, NPDG, NPDA, + * NPDGL, NPDAL, and NOWS are defined for this namespace and should be + * used by the host for I/O optimization. */ - id->nsfeat |= 0x1 << NVME_NS_FEAT_OPTPERF_SHIFT; + id->nsfeat |= 0x3 << NVME_NS_FEAT_OPTPERF_SHIFT; /* NPWG = Namespace Preferred Write Granularity. 0's based */ id->npwg = to0based(bdev_io_min(bdev) / bdev_logical_block_size(bdev)); /* NPWA = Namespace Preferred Write Alignment. 
0's based */ @@ -52,6 +52,17 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id) id->dlfeat = (1 << 3) | 0x1; } +void nvmet_bdev_set_nvm_limits(struct block_device *bdev, + struct nvme_id_ns_nvm *id) +{ + /* + * NPDGL = Namespace Preferred Deallocate Granularity Large + * NPDAL = Namespace Preferred Deallocate Alignment Large + */ + id->npdgl = id->npdal = cpu_to_le32(bdev_discard_granularity(bdev) / + bdev_logical_block_size(bdev)); +} + void nvmet_bdev_ns_disable(struct nvmet_ns *ns) { if (ns->bdev_file) { diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index f5d22267df6a..5db8f0d6e3f2 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -549,6 +549,8 @@ void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl); u16 nvmet_parse_connect_cmd(struct nvmet_req *req); u32 nvmet_connect_cmd_data_len(struct nvmet_req *req); void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id); +void nvmet_bdev_set_nvm_limits(struct block_device *bdev, + struct nvme_id_ns_nvm *id); u16 nvmet_bdev_parse_io_cmd(struct nvmet_req *req); u16 nvmet_file_parse_io_cmd(struct nvmet_req *req); u16 nvmet_bdev_zns_parse_io_cmd(struct nvmet_req *req); From 40f0496b617b431f8d2dd94d7f785c1121f8a68a Mon Sep 17 00:00:00 2001 From: Robert Beckett Date: Fri, 20 Mar 2026 19:22:08 +0000 Subject: [PATCH 087/146] nvme: respect NVME_QUIRK_DISABLE_WRITE_ZEROES when wzsl is set The NVM Command Set Identify Controller data may report a non-zero Write Zeroes Size Limit (wzsl). When present, nvme_init_non_mdts_limits() unconditionally overrides max_zeroes_sectors from wzsl, even if NVME_QUIRK_DISABLE_WRITE_ZEROES previously set it to zero. This effectively re-enables write zeroes for devices that need it disabled, defeating the quirk. Several Kingston OM* drives rely on this quirk to avoid firmware issues with write zeroes commands. Check for the quirk before applying the wzsl override. 
Fixes: 5befc7c26e5a ("nvme: implement non-mdts command limits") Cc: stable@vger.kernel.org Signed-off-by: Robert Beckett Assisted-by: claude-opus-4-6-v1 Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d2256fa95685..b42d8768d297 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3419,7 +3419,7 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl) ctrl->dmrl = id->dmrl; ctrl->dmrsl = le32_to_cpu(id->dmrsl); - if (id->wzsl) + if (id->wzsl && !(ctrl->quirks & NVME_QUIRK_DISABLE_WRITE_ZEROES)) ctrl->max_zeroes_sectors = nvme_mps_to_sectors(ctrl, id->wzsl); free_data: From a8eebf9699d69987cc49cec4e4fdb4111ab32423 Mon Sep 17 00:00:00 2001 From: Robert Beckett Date: Fri, 20 Mar 2026 19:22:09 +0000 Subject: [PATCH 088/146] nvme-pci: add NVME_QUIRK_DISABLE_WRITE_ZEROES for Kingston OM3SGP4 The Kingston OM3SGP42048K2-A00 (PCI ID 2646:502f) firmware has a race condition when processing concurrent write zeroes and DSM (discard) commands, causing spurious "LBA Out of Range" errors and IOMMU page faults at address 0x0. The issue is reliably triggered by running two concurrent mkfs commands on different partitions of the same drive, which generates interleaved write zeroes and discard operations. Disable write zeroes for this device, matching the pattern used for other Kingston OM* drives that have similar firmware issues. 
Cc: stable@vger.kernel.org Signed-off-by: Robert Beckett Assisted-by: claude-opus-4-6-v1 Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 8d09fa7d7ff9..9aa19255b041 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -4176,6 +4176,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, { PCI_DEVICE(0x2646, 0x501E), /* KINGSTON OM3PGP4xxxxQ OS21011 NVMe SSD */ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + { PCI_DEVICE(0x2646, 0x502F), /* KINGSTON OM3SGP4xxxxK NVMe SSD */ + .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, { PCI_DEVICE(0x1f40, 0x1202), /* Netac Technologies Co. NV3000 NVMe SSD */ .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1f40, 0x5236), /* Netac Technologies Co. NV7000 NVMe SSD */ From 09e8f0f93491c6be867f32d4edc0b16fb5da785e Mon Sep 17 00:00:00 2001 From: Alistair Francis Date: Fri, 20 Mar 2026 10:20:44 +1000 Subject: [PATCH 089/146] nvme: Add the DHCHAP maximum HD IDs In preparation for using DHCHAP length in upcoming host and target patches let's add the hash and Diffie-Hellman ID length macros. 
Reviewed-by: Christoph Hellwig Reviewed-by: Yunje Shin Reviewed-by: Hannes Reinecke Reviewed-by: Chris Leech Signed-off-by: Alistair Francis Signed-off-by: Keith Busch --- include/linux/nvme.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 2b66a86d7da6..041f30931a90 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -2348,4 +2348,8 @@ enum nvme_pr_change_ptpl { #define NVME_PR_IGNORE_KEY (1 << 3) +/* Section 8.3.4.5.2 of the NVMe 2.1 */ +#define NVME_AUTH_DHCHAP_MAX_HASH_IDS 30 +#define NVME_AUTH_DHCHAP_MAX_DH_IDS 30 + #endif /* _LINUX_NVME_H */ From 33eb451044498098babb93b4161e896e0a3e9291 Mon Sep 17 00:00:00 2001 From: Alistair Francis Date: Fri, 20 Mar 2026 10:20:45 +1000 Subject: [PATCH 090/146] nvme-auth: Don't propose NVME_AUTH_DHGROUP_NULL with SC_C Section 8.3.4.5.2 of the NVMe 2.1 base spec states that """ The 00h identifier shall not be proposed in an AUTH_Negotiate message that requests secure channel concatenation (i.e., with the SC_C field set to a non-zero value). """ We need to ensure that we don't set the NVME_AUTH_DHGROUP_NULL idlist if SC_C is set. 
Reviewed-by: Hannes Reinecke Reviewed-by: Chris Leech Reviewed-by: Christoph Hellwig Signed-off-by: Kamaljit Singh Signed-off-by: Alistair Francis Signed-off-by: Keith Busch --- drivers/nvme/host/auth.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index c8cd633cb0ea..bbedbe181c8a 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -123,6 +123,8 @@ static int nvme_auth_set_dhchap_negotiate_data(struct nvme_ctrl *ctrl, { struct nvmf_auth_dhchap_negotiate_data *data = chap->buf; size_t size = sizeof(*data) + sizeof(union nvmf_auth_protocol); + u8 dh_list_offset = NVME_AUTH_DHCHAP_MAX_DH_IDS; + u8 *idlist = data->auth_protocol[0].dhchap.idlist; if (size > CHAP_BUF_SIZE) { chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD; @@ -139,21 +141,22 @@ static int nvme_auth_set_dhchap_negotiate_data(struct nvme_ctrl *ctrl, data->sc_c = NVME_AUTH_SECP_NEWTLSPSK; } else data->sc_c = NVME_AUTH_SECP_NOSC; + chap->sc_c = data->sc_c; data->napd = 1; data->auth_protocol[0].dhchap.authid = NVME_AUTH_DHCHAP_AUTH_ID; data->auth_protocol[0].dhchap.halen = 3; - data->auth_protocol[0].dhchap.dhlen = 6; - data->auth_protocol[0].dhchap.idlist[0] = NVME_AUTH_HASH_SHA256; - data->auth_protocol[0].dhchap.idlist[1] = NVME_AUTH_HASH_SHA384; - data->auth_protocol[0].dhchap.idlist[2] = NVME_AUTH_HASH_SHA512; - data->auth_protocol[0].dhchap.idlist[30] = NVME_AUTH_DHGROUP_NULL; - data->auth_protocol[0].dhchap.idlist[31] = NVME_AUTH_DHGROUP_2048; - data->auth_protocol[0].dhchap.idlist[32] = NVME_AUTH_DHGROUP_3072; - data->auth_protocol[0].dhchap.idlist[33] = NVME_AUTH_DHGROUP_4096; - data->auth_protocol[0].dhchap.idlist[34] = NVME_AUTH_DHGROUP_6144; - data->auth_protocol[0].dhchap.idlist[35] = NVME_AUTH_DHGROUP_8192; - - chap->sc_c = data->sc_c; + idlist[0] = NVME_AUTH_HASH_SHA256; + idlist[1] = NVME_AUTH_HASH_SHA384; + idlist[2] = NVME_AUTH_HASH_SHA512; + if (chap->sc_c == 
NVME_AUTH_SECP_NOSC) + idlist[dh_list_offset++] = NVME_AUTH_DHGROUP_NULL; + idlist[dh_list_offset++] = NVME_AUTH_DHGROUP_2048; + idlist[dh_list_offset++] = NVME_AUTH_DHGROUP_3072; + idlist[dh_list_offset++] = NVME_AUTH_DHGROUP_4096; + idlist[dh_list_offset++] = NVME_AUTH_DHGROUP_6144; + idlist[dh_list_offset++] = NVME_AUTH_DHGROUP_8192; + data->auth_protocol[0].dhchap.dhlen = + dh_list_offset - NVME_AUTH_DHCHAP_MAX_DH_IDS; return size; } From 3d553be6d295b6cca8fd35cb673fd13934ac4cbc Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Mon, 23 Feb 2026 11:23:27 +0100 Subject: [PATCH 091/146] nvmet: replace use of system_wq with system_percpu_wq This patch continues the effort to refactor workqueue APIs, which has begun with the changes introducing new workqueues and a new alloc_workqueue flag: commit 128ea9f6ccfb ("workqueue: Add system_percpu_wq and system_dfl_wq") commit 930c2ea566af ("workqueue: Add new WQ_PERCPU flag") The point of the refactoring is to eventually alter the default behavior of workqueues to become unbound by default so that their workload placement is optimized by the scheduler. Before that to happen, workqueue users must be converted to the better named new workqueues with no intended behaviour changes: system_wq -> system_percpu_wq system_unbound_wq -> system_dfl_wq This way the old obsolete workqueues (system_wq, system_unbound_wq) can be removed in the future. 
Link: https://lore.kernel.org/all/20250221112003.1dSuoGyc@linutronix.de/ Suggested-by: Tejun Heo Reviewed-by: Christoph Hellwig Signed-off-by: Marco Crivellari Signed-off-by: Keith Busch --- drivers/nvme/target/admin-cmd.c | 2 +- drivers/nvme/target/fabrics-cmd-auth.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index c0d38480bb62..3794ef258556 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -1605,7 +1605,7 @@ void nvmet_execute_keep_alive(struct nvmet_req *req) pr_debug("ctrl %d update keep-alive timer for %d secs\n", ctrl->cntlid, ctrl->kato); - mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ); + mod_delayed_work(system_percpu_wq, &ctrl->ka_work, ctrl->kato * HZ); out: nvmet_req_complete(req, status); } diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c index 35f411fad8f9..b9ab80c7a694 100644 --- a/drivers/nvme/target/fabrics-cmd-auth.c +++ b/drivers/nvme/target/fabrics-cmd-auth.c @@ -390,7 +390,7 @@ done: req->sq->dhchap_step != NVME_AUTH_DHCHAP_MESSAGE_FAILURE2) { unsigned long auth_expire_secs = ctrl->kato ? 
ctrl->kato : 120; - mod_delayed_work(system_wq, &req->sq->auth_expired_work, + mod_delayed_work(system_percpu_wq, &req->sq->auth_expired_work, auth_expire_secs * HZ); goto complete; } From 12f5fb5ee124da68220ed9646ee1ebe3a88f0c89 Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Mon, 23 Feb 2026 11:23:29 +0100 Subject: [PATCH 092/146] nvmet-fc: add WQ_PERCPU to alloc_workqueue users MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This continues the effort to refactor workqueue APIs, which began with the introduction of new workqueues and a new alloc_workqueue flag in: commit 128ea9f6ccfb ("workqueue: Add system_percpu_wq and system_dfl_wq") commit 930c2ea566af ("workqueue: Add new WQ_PERCPU flag") The refactoring is going to alter the default behavior of alloc_workqueue() to be unbound by default. With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND), any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND must now use WQ_PERCPU. For more details see the Link tag below. In order to keep alloc_workqueue() behavior identical, explicitly request WQ_PERCPU. 
Cc: Justin Tee Cc: Naresh Gottumukkala CC: Paul Ely Link: https://lore.kernel.org/all/20250221112003.1dSuoGyc@linutronix.de/ Suggested-by: Tejun Heo Reviewed-by: Christoph Hellwig Signed-off-by: Marco Crivellari Signed-off-by: Keith Busch --- drivers/nvme/target/fc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 4eaadc711c99..d161707559ce 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -792,9 +792,9 @@ nvmet_fc_alloc_target_queue(struct nvmet_fc_tgt_assoc *assoc, if (!queue) return NULL; - queue->work_q = alloc_workqueue("ntfc%d.%d.%d", 0, 0, - assoc->tgtport->fc_target_port.port_num, - assoc->a_id, qid); + queue->work_q = alloc_workqueue("ntfc%d.%d.%d", WQ_PERCPU, 0, + assoc->tgtport->fc_target_port.port_num, + assoc->a_id, qid); if (!queue->work_q) goto out_free_queue; From e8e1a4c0fb2571e19277edc0292bee6102f3652a Mon Sep 17 00:00:00 2001 From: Marco Crivellari Date: Mon, 23 Feb 2026 11:23:28 +0100 Subject: [PATCH 093/146] nvme: add WQ_PERCPU to alloc_workqueue users MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This continues the effort to refactor workqueue APIs, which began with the introduction of new workqueues and a new alloc_workqueue flag in: commit 128ea9f6ccfb ("workqueue: Add system_percpu_wq and system_dfl_wq") commit 930c2ea566af ("workqueue: Add new WQ_PERCPU flag") The refactoring is going to alter the default behavior of alloc_workqueue() to be unbound by default. With the introduction of the WQ_PERCPU flag (equivalent to !WQ_UNBOUND), any alloc_workqueue() caller that doesn’t explicitly specify WQ_UNBOUND must now use WQ_PERCPU. For more details see the Link tag below. In order to keep alloc_workqueue() behavior identical, explicitly request WQ_PERCPU. 
Link: https://lore.kernel.org/all/20250221112003.1dSuoGyc@linutronix.de/ Suggested-by: Tejun Heo Reviewed-by: Christoph Hellwig Signed-off-by: Marco Crivellari Signed-off-by: Keith Busch --- drivers/nvme/target/core.c | 5 +++-- drivers/nvme/target/tcp.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index d49f41790e4e..03cc7d5f9683 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -1942,12 +1942,13 @@ static int __init nvmet_init(void) if (!nvmet_bvec_cache) return -ENOMEM; - zbd_wq = alloc_workqueue("nvmet-zbd-wq", WQ_MEM_RECLAIM, 0); + zbd_wq = alloc_workqueue("nvmet-zbd-wq", WQ_MEM_RECLAIM | WQ_PERCPU, + 0); if (!zbd_wq) goto out_destroy_bvec_cache; buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq", - WQ_MEM_RECLAIM, 0); + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!buffered_io_wq) goto out_free_zbd_work_queue; diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index acc71a26733f..4b8b02341ddc 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -2225,7 +2225,7 @@ static int __init nvmet_tcp_init(void) int ret; nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", - WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); + WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_PERCPU, 0); if (!nvmet_tcp_wq) return -ENOMEM; From 886f35201591ded7958e16fe3750871d3ca0bcdf Mon Sep 17 00:00:00 2001 From: Nilay Shroff Date: Fri, 13 Mar 2026 17:08:48 +0530 Subject: [PATCH 094/146] nvme-loop: do not cancel I/O and admin tagset during ctrl reset/shutdown Cancelling the I/O and admin tagsets during nvme-loop controller reset or shutdown is unnecessary. The subsequent destruction of the I/O and admin queues already waits for all in-flight target operations to complete. Cancelling the tagsets first also opens a race window. After a request tag has been cancelled, a late completion from the target may still arrive before the queues are destroyed. 
In that case the completion path may access a request whose tag has already been cancelled or freed, which can lead to a kernel crash. Please see below the kernel crash encountered while running blktests nvme/040: run blktests nvme/040 at 2026-03-08 06:34:27 loop0: detected capacity change from 0 to 2097152 nvmet: adding nsid 1 to subsystem blktests-subsystem-1 nvmet: Created nvm controller 1 for subsystem blktests-subsystem-1 for NQN nqn.2014-08.org.nvmexpress:uuid:0f01fb42-9f7f-4856-b0b3-51e60b8de349. nvme nvme6: creating 96 I/O queues. nvme nvme6: new ctrl: "blktests-subsystem-1" nvme_log_error: 1 callbacks suppressed block nvme6n1: no usable path - requeuing I/O nvme6c6n1: Read(0x2) @ LBA 2096384, 128 blocks, Host Aborted Command (sct 0x3 / sc 0x71) blk_print_req_error: 1 callbacks suppressed I/O error, dev nvme6c6n1, sector 2096384 op 0x0:(READ) flags 0x2880700 phys_seg 1 prio class 2 block nvme6n1: no usable path - requeuing I/O Kernel attempted to read user page (236) - exploit attempt? 
(uid: 0) BUG: Kernel NULL pointer dereference on read at 0x00000236 Faulting instruction address: 0xc000000000961274 Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries Modules linked in: nvme_loop nvme_fabrics loop nvmet null_blk rpadlpar_io rpaphp xsk_diag bonding rfkill nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 nf_tables nfnetlink pseries_rng dax_pmem vmx_crypto drm drm_panel_orientation_quirks xfs mlx5_core nvme bnx2x sd_mod nd_pmem nd_btt nvme_core sg papr_scm tls libnvdimm ibmvscsi ibmveth scsi_transport_srp nvme_keyring nvme_auth mdio hkdf pseries_wdt dm_mirror dm_region_hash dm_log dm_mod fuse [last unloaded: loop] CPU: 25 UID: 0 PID: 0 Comm: swapper/25 Kdump: loaded Not tainted 7.0.0-rc3+ #14 PREEMPT Hardware name: IBM,9043-MRX Power11 (architected) 0x820200 0xf000007 of:IBM,FW1120.00 (RF1120_128) hv:phyp pSeries NIP: c000000000961274 LR: c008000009af1808 CTR: c00000000096124c REGS: c0000007ffc0f910 TRAP: 0300 Not tainted (7.0.0-rc3+) MSR: 8000000000009033 CR: 22222222 XER: 00000000 CFAR: c008000009af232c DAR: 0000000000000236 DSISR: 40000000 IRQMASK: 0 GPR00: c008000009af17fc c0000007ffc0fbb0 c000000001c78100 c0000000be05cc00 GPR04: 0000000000000001 0000000000000000 0000000000000007 0000000000000000 GPR08: 0000000000000000 0000000000000000 0000000000000002 c008000009af2318 GPR12: c00000000096124c c0000007ffdab880 0000000000000000 0000000000000000 GPR16: 0000000000000010 0000000000000000 0000000000000004 0000000000000000 GPR20: 0000000000000001 c000000002ca2b00 0000000100043bb2 000000000000000a GPR24: 000000000000000a 0000000000000000 0000000000000000 0000000000000000 GPR28: c000000084021d40 c000000084021d50 c0000000be05cd60 c0000000be05cc00 NIP [c000000000961274] blk_mq_complete_request_remote+0x28/0x2d4 LR [c008000009af1808] nvme_loop_queue_response+0x110/0x290 [nvme_loop] 
Call Trace: 0xc00000000502c640 (unreliable) nvme_loop_queue_response+0x104/0x290 [nvme_loop] __nvmet_req_complete+0x80/0x498 [nvmet] nvmet_req_complete+0x24/0xf8 [nvmet] nvmet_bio_done+0x58/0xcc [nvmet] bio_endio+0x250/0x390 blk_update_request+0x2e8/0x68c blk_mq_end_request+0x30/0x5c lo_complete_rq+0x94/0x110 [loop] blk_complete_reqs+0x78/0x98 handle_softirqs+0x148/0x454 do_softirq_own_stack+0x3c/0x50 __irq_exit_rcu+0x18c/0x1b4 irq_exit+0x1c/0x34 do_IRQ+0x114/0x278 hardware_interrupt_common_virt+0x28c/0x290 Since the queue teardown path already guarantees that all target-side operations have completed, cancelling the tagsets is redundant and unsafe. So avoid cancelling the I/O and admin tagsets during controller reset and shutdown. Reviewed-by: Christoph Hellwig Signed-off-by: Nilay Shroff Signed-off-by: Keith Busch --- drivers/nvme/target/loop.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 4b3f4f11928d..d98d0cdc5d6f 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -419,7 +419,6 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) { if (ctrl->ctrl.queue_count > 1) { nvme_quiesce_io_queues(&ctrl->ctrl); - nvme_cancel_tagset(&ctrl->ctrl); nvme_loop_destroy_io_queues(ctrl); } @@ -427,7 +426,6 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) if (nvme_ctrl_state(&ctrl->ctrl) == NVME_CTRL_LIVE) nvme_disable_ctrl(&ctrl->ctrl, true); - nvme_cancel_admin_tagset(&ctrl->ctrl); nvme_loop_destroy_admin_queue(ctrl); } From 499d2d2f4cf9f16634db47b06dee9676611b897f Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Tue, 10 Mar 2026 10:53:49 +0100 Subject: [PATCH 095/146] sed-opal: Add STACK_RESET command The TCG Opal device could enter a state where no new session can be created, blocking even Discovery or PSID reset. While a power cycle or waiting for the timeout should work, there is another possibility for recovery: using the Stack Reset command. 
The Stack Reset command is defined in the TCG Storage Architecture Core Specification and is mandatory for all Opal devices (see Section 3.3.6 of the Opal SSC specification). This patch implements the Stack Reset command. Sending it should clear all active sessions immediately, allowing subsequent commands to run successfully. While it is a TCG transport layer command, the Linux kernel implements only Opal ioctls, so it makes sense to use the IOC_OPAL ioctl interface. The Stack Reset takes no arguments; the response can be success or pending. If the command reports a pending state, userspace can try to repeat it; in this case, the code returns -EBUSY. Signed-off-by: Milan Broz Reviewed-by: Ondrej Kozina Link: https://patch.msgid.link/20260310095349.411287-1-gmazyland@gmail.com Signed-off-by: Jens Axboe --- block/opal_proto.h | 20 +++++++++++++++ block/sed-opal.c | 47 +++++++++++++++++++++++++++++++++++ include/linux/sed-opal.h | 1 + include/uapi/linux/sed-opal.h | 1 + 4 files changed, 69 insertions(+) diff --git a/block/opal_proto.h b/block/opal_proto.h index d138785b8198..7c24247aa186 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -19,6 +19,7 @@ enum { TCG_SECP_00 = 0, TCG_SECP_01, + TCG_SECP_02, }; /* @@ -273,6 +274,25 @@ struct opal_header { struct opal_data_subpacket subpkt; }; +/* + * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * Section: 3.3.4.7.5 STACK_RESET + */ +#define OPAL_STACK_RESET 0x0002 + +struct opal_stack_reset { + u8 extendedComID[4]; + __be32 request_code; +}; + +struct opal_stack_reset_response { + u8 extendedComID[4]; + __be32 request_code; + u8 reserved0[2]; + __be16 data_length; + __be32 response; +}; + #define FC_TPER 0x0001 #define FC_LOCKING 0x0002 #define FC_GEOMETRY 0x0003 diff --git a/block/sed-opal.c b/block/sed-opal.c index c34d19e91201..79b290d9458a 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -3545,6 +3545,50 @@ static int opal_get_sum_ranges(struct opal_dev *dev, struct opal_sum_ranges *opa return ret; 
} +static int opal_stack_reset(struct opal_dev *dev) +{ + struct opal_stack_reset *req; + struct opal_stack_reset_response *resp; + int ret; + + mutex_lock(&dev->dev_lock); + + memset(dev->cmd, 0, IO_BUFFER_LENGTH); + req = (struct opal_stack_reset *)dev->cmd; + req->extendedComID[0] = dev->comid >> 8; + req->extendedComID[1] = dev->comid & 0xFF; + req->request_code = cpu_to_be32(OPAL_STACK_RESET); + + ret = dev->send_recv(dev->data, dev->comid, TCG_SECP_02, + dev->cmd, IO_BUFFER_LENGTH, true); + if (ret) { + pr_debug("Error sending stack reset: %d\n", ret); + goto out; + } + + memset(dev->resp, 0, IO_BUFFER_LENGTH); + ret = dev->send_recv(dev->data, dev->comid, TCG_SECP_02, + dev->resp, IO_BUFFER_LENGTH, false); + if (ret) { + pr_debug("Error receiving stack reset response: %d\n", ret); + goto out; + } + + resp = (struct opal_stack_reset_response *)dev->resp; + if (be16_to_cpu(resp->data_length) != 4) { + pr_debug("Stack reset pending\n"); + ret = -EBUSY; + goto out; + } + if (be32_to_cpu(resp->response) != 0) { + pr_debug("Stack reset failed: %u\n", be32_to_cpu(resp->response)); + ret = -EIO; + } +out: + mutex_unlock(&dev->dev_lock); + return ret; +} + int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) { void *p; @@ -3642,6 +3686,9 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_GET_SUM_STATUS: ret = opal_get_sum_ranges(dev, p, arg); break; + case IOC_OPAL_STACK_RESET: + ret = opal_stack_reset(dev); + break; default: break; diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h index aa006edb612b..0630430cc01a 100644 --- a/include/linux/sed-opal.h +++ b/include/linux/sed-opal.h @@ -57,6 +57,7 @@ static inline bool is_sed_ioctl(unsigned int cmd) case IOC_OPAL_LR_SET_START_LEN: case IOC_OPAL_ENABLE_DISABLE_LR: case IOC_OPAL_GET_SUM_STATUS: + case IOC_OPAL_STACK_RESET: return true; } return false; diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 
9830298ec51c..ef4d3be6ca7f 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -245,5 +245,6 @@ struct opal_revert_lsp { #define IOC_OPAL_LR_SET_START_LEN _IOW('p', 243, struct opal_user_lr_setup) #define IOC_OPAL_ENABLE_DISABLE_LR _IOW('p', 244, struct opal_user_lr_setup) #define IOC_OPAL_GET_SUM_STATUS _IOW('p', 245, struct opal_sum_ranges) +#define IOC_OPAL_STACK_RESET _IO('p', 246) #endif /* _UAPI_SED_OPAL_H */ From 267ec4d7223a783f029a980f41b93c39b17996da Mon Sep 17 00:00:00 2001 From: Daan De Meyer Date: Tue, 31 Mar 2026 10:51:28 +0000 Subject: [PATCH 096/146] loop: fix partition scan race between udev and loop_reread_partitions() When LOOP_CONFIGURE is called with LO_FLAGS_PARTSCAN, the following sequence occurs: 1. disk_force_media_change() sets GD_NEED_PART_SCAN 2. Uevent suppression is lifted and a KOBJ_CHANGE uevent is sent 3. loop_global_unlock() releases the lock 4. loop_reread_partitions() calls bdev_disk_changed() to scan There is a race between steps 2 and 4: when udev receives the uevent and opens the device before loop_reread_partitions() runs, blkdev_get_whole() in bdev.c sees GD_NEED_PART_SCAN set and calls bdev_disk_changed() for a first scan. Then loop_reread_partitions() does a second scan. The open_mutex serializes these two scans, but does not prevent both from running. The second scan in bdev_disk_changed() drops all partition devices from the first scan (via blk_drop_partitions()) before re-adding them, causing partition block devices to briefly disappear. This breaks any systemd unit with BindsTo= on the partition device: systemd observes the device going dead, fails the dependent units, and does not retry them when the device reappears. Fix this by removing the GD_NEED_PART_SCAN set from disk_force_media_change() entirely. 
None of the current callers need the lazy on-open partition scan triggered by this flag: - floppy: sets GENHD_FL_NO_PART, so disk_has_partscan() is always false and GD_NEED_PART_SCAN has no effect. - loop (loop_configure, loop_change_fd): when LO_FLAGS_PARTSCAN is set, loop_reread_partitions() performs an explicit scan. When not set, GD_SUPPRESS_PART_SCAN prevents the lazy scan path. - loop (__loop_clr_fd): calls bdev_disk_changed() explicitly if LO_FLAGS_PARTSCAN is set. - nbd (nbd_clear_sock_ioctl): capacity is set to zero immediately after; nbd manages GD_NEED_PART_SCAN explicitly elsewhere. With GD_NEED_PART_SCAN no longer set by disk_force_media_change(), udev opening the loop device after the uevent no longer triggers a redundant scan in blkdev_get_whole(), and only the single explicit scan from loop_reread_partitions() runs. A regression test for this bug has been submitted to blktests: https://github.com/linux-blktests/blktests/pull/240. Fixes: 9f65c489b68d ("loop: raise media_change event") Signed-off-by: Daan De Meyer Acked-by: Christian Brauner Link: https://patch.msgid.link/20260331105130.1077599-1-daan@amutable.com Signed-off-by: Jens Axboe --- block/disk-events.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/disk-events.c b/block/disk-events.c index 9f9f9f8a2d6b..074731ecc3d2 100644 --- a/block/disk-events.c +++ b/block/disk-events.c @@ -290,13 +290,14 @@ EXPORT_SYMBOL(disk_check_media_change); * Should be called when the media changes for @disk. Generates a uevent * and attempts to free all dentries and inodes and invalidates all block * device page cache entries in that case. + * + * Callers that need a partition re-scan should arrange for one explicitly. 
*/ void disk_force_media_change(struct gendisk *disk) { disk_event_uevent(disk, DISK_EVENT_MEDIA_CHANGE); inc_diskseq(disk); bdev_mark_dead(disk->part0, true); - set_bit(GD_NEED_PART_SCAN, &disk->state); } EXPORT_SYMBOL_GPL(disk_force_media_change); From 2a2f520fda824b5a25c93f2249578ea150c24e06 Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Tue, 31 Mar 2026 19:12:16 +0800 Subject: [PATCH 097/146] block: fix zones_cond memory leak on zone revalidation error paths When blk_revalidate_disk_zones() fails after disk_revalidate_zone_resources() has allocated args.zones_cond, the memory is leaked because no error path frees it. Fixes: 6e945ffb6555 ("block: use zone condition to determine conventional zones") Suggested-by: Damien Le Moal Signed-off-by: Jackie Liu Link: https://patch.msgid.link/20260331111216.24242-1-liu.yun@linux.dev Signed-off-by: Jens Axboe --- block/blk-zoned.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index e1a23c8b676d..bfd9733ebd31 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -2016,6 +2016,7 @@ static int disk_revalidate_zone_resources(struct gendisk *disk, { struct queue_limits *lim = &disk->queue->limits; unsigned int pool_size; + int ret = 0; args->disk = disk; args->nr_zones = @@ -2038,10 +2039,13 @@ static int disk_revalidate_zone_resources(struct gendisk *disk, pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, args->nr_zones); - if (!disk->zone_wplugs_hash) - return disk_alloc_zone_resources(disk, pool_size); + if (!disk->zone_wplugs_hash) { + ret = disk_alloc_zone_resources(disk, pool_size); + if (ret) + kfree(args->zones_cond); + } - return 0; + return ret; } /* @@ -2073,6 +2077,7 @@ static int disk_update_zone_resources(struct gendisk *disk, disk->zone_capacity = args->zone_capacity; disk->last_zone_capacity = args->last_zone_capacity; disk_set_zones_cond_array(disk, args->zones_cond); + args->zones_cond = NULL; /* * Some devices can 
advertise zone resource limits that are larger than @@ -2353,21 +2358,30 @@ int blk_revalidate_disk_zones(struct gendisk *disk) } memalloc_noio_restore(noio_flag); + if (ret <= 0) + goto free_resources; + /* * If zones where reported, make sure that the entire disk capacity * has been checked. */ - if (ret > 0 && args.sector != capacity) { + if (args.sector != capacity) { pr_warn("%s: Missing zones from sector %llu\n", disk->disk_name, args.sector); ret = -ENODEV; + goto free_resources; } - if (ret > 0) - return disk_update_zone_resources(disk, &args); + ret = disk_update_zone_resources(disk, &args); + if (ret) + goto free_resources; + return 0; + +free_resources: pr_warn("%s: failed to revalidate zones\n", disk->disk_name); + kfree(args.zones_cond); memflags = blk_mq_freeze_queue(q); disk_free_zone_resources(disk); blk_mq_unfreeze_queue(q, memflags); From b2a78fec344ead9ffca63ee13018f482392bf09d Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 27 Mar 2026 05:32:45 +0900 Subject: [PATCH 098/146] zloop: add max_open_zones option Introduce the new max_open_zones option to allow specifying a limit on the maximum number of open zones of a zloop device. This change allows creating a zloop device that can more closely mimic the characteristics of a physical SMR drive. When set to a non-zero value, only up to max_open_zones zones can be in the implicit open (BLK_ZONE_COND_IMP_OPEN) and explicit open (BLK_ZONE_COND_EXP_OPEN) conditions at any time. The transition to the implicit open condition of a zone on a write operation can result in an implicit close of an already implicitly open zone. This is handled in the function zloop_do_open_zone(). This function also handles transitions to the explicit open condition. Implicit close transitions are handled using an LRU ordered list of open zones which is managed using the helper functions zloop_lru_rotate_open_zone() and zloop_lru_remove_open_zone().
Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Link: https://patch.msgid.link/20260326203245.946830-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- .../admin-guide/blockdev/zoned_loop.rst | 5 +- drivers/block/zloop.c | 180 ++++++++++++++++-- 2 files changed, 168 insertions(+), 17 deletions(-) diff --git a/Documentation/admin-guide/blockdev/zoned_loop.rst b/Documentation/admin-guide/blockdev/zoned_loop.rst index a01f857b36ad..f4f1f3121bf9 100644 --- a/Documentation/admin-guide/blockdev/zoned_loop.rst +++ b/Documentation/admin-guide/blockdev/zoned_loop.rst @@ -62,7 +62,7 @@ The options available for the add command can be listed by reading the /dev/zloop-control device:: $ cat /dev/zloop-control - add id=%d,capacity_mb=%u,zone_size_mb=%u,zone_capacity_mb=%u,conv_zones=%u,base_dir=%s,nr_queues=%u,queue_depth=%u,buffered_io + add id=%d,capacity_mb=%u,zone_size_mb=%u,zone_capacity_mb=%u,conv_zones=%u,max_open_zones=%u,base_dir=%s,nr_queues=%u,queue_depth=%u,buffered_io,zone_append=%u,ordered_zone_append,discard_write_cache remove id=%d In more details, the options that can be used with the "add" command are as @@ -80,6 +80,9 @@ zone_capacity_mb Device zone capacity (must always be equal to or lower conv_zones Total number of conventioanl zones starting from sector 0 Default: 8 +max_open_zones Maximum number of open sequential write required zones + (0 for no limit). + Default: 0 base_dir Path to the base directory where to create the directory containing the zone files of the device. Default=/var/local/zloop. 
diff --git a/drivers/block/zloop.c b/drivers/block/zloop.c index 86a1324c27b3..8baf642037fd 100644 --- a/drivers/block/zloop.c +++ b/drivers/block/zloop.c @@ -36,6 +36,7 @@ enum { ZLOOP_OPT_ZONE_APPEND = (1 << 9), ZLOOP_OPT_ORDERED_ZONE_APPEND = (1 << 10), ZLOOP_OPT_DISCARD_WRITE_CACHE = (1 << 11), + ZLOOP_OPT_MAX_OPEN_ZONES = (1 << 12), }; static const match_table_t zloop_opt_tokens = { @@ -51,6 +52,7 @@ static const match_table_t zloop_opt_tokens = { { ZLOOP_OPT_ZONE_APPEND, "zone_append=%u" }, { ZLOOP_OPT_ORDERED_ZONE_APPEND, "ordered_zone_append" }, { ZLOOP_OPT_DISCARD_WRITE_CACHE, "discard_write_cache" }, + { ZLOOP_OPT_MAX_OPEN_ZONES, "max_open_zones=%u" }, { ZLOOP_OPT_ERR, NULL } }; @@ -59,6 +61,7 @@ static const match_table_t zloop_opt_tokens = { #define ZLOOP_DEF_ZONE_SIZE ((256ULL * SZ_1M) >> SECTOR_SHIFT) #define ZLOOP_DEF_NR_ZONES 64 #define ZLOOP_DEF_NR_CONV_ZONES 8 +#define ZLOOP_DEF_MAX_OPEN_ZONES 0 #define ZLOOP_DEF_BASE_DIR "/var/local/zloop" #define ZLOOP_DEF_NR_QUEUES 1 #define ZLOOP_DEF_QUEUE_DEPTH 128 @@ -76,6 +79,7 @@ struct zloop_options { sector_t zone_size; sector_t zone_capacity; unsigned int nr_conv_zones; + unsigned int max_open_zones; char *base_dir; unsigned int nr_queues; unsigned int queue_depth; @@ -99,7 +103,12 @@ enum zloop_zone_flags { ZLOOP_ZONE_SEQ_ERROR, }; +/* + * Zone descriptor. + * Locking order: z.lock -> z.wp_lock -> zlo.open_zones_lock + */ struct zloop_zone { + struct list_head open_zone_entry; struct file *file; unsigned long flags; @@ -133,8 +142,13 @@ struct zloop_device { sector_t zone_capacity; unsigned int nr_zones; unsigned int nr_conv_zones; + unsigned int max_open_zones; unsigned int block_size; + spinlock_t open_zones_lock; + struct list_head open_zones_lru_list; + unsigned int nr_open_zones; + struct zloop_zone zones[] __counted_by(nr_zones); }; @@ -158,6 +172,122 @@ static unsigned int rq_zone_no(struct request *rq) return blk_rq_pos(rq) >> zlo->zone_shift; } +/* + * Open an already open zone. 
This is mostly a no-op, except for the imp open -> + * exp open condition change that may happen. We also move a zone at the tail of + * the list of open zones so that if we need to + * implicitly close one open zone, we can do so in LRU order. + */ +static inline void zloop_lru_rotate_open_zone(struct zloop_device *zlo, + struct zloop_zone *zone) +{ + if (zlo->max_open_zones) { + spin_lock(&zlo->open_zones_lock); + list_move_tail(&zone->open_zone_entry, + &zlo->open_zones_lru_list); + spin_unlock(&zlo->open_zones_lock); + } +} + +static inline void zloop_lru_remove_open_zone(struct zloop_device *zlo, + struct zloop_zone *zone) +{ + if (zone->cond == BLK_ZONE_COND_IMP_OPEN || + zone->cond == BLK_ZONE_COND_EXP_OPEN) { + spin_lock(&zlo->open_zones_lock); + list_del_init(&zone->open_zone_entry); + zlo->nr_open_zones--; + spin_unlock(&zlo->open_zones_lock); + } +} + +static inline bool zloop_can_open_zone(struct zloop_device *zlo) +{ + return !zlo->max_open_zones || zlo->nr_open_zones < zlo->max_open_zones; +} + +/* + * If we have reached the maximum open zones limit, attempt to close an + * implicitly open zone (if we have any) so that we can implicitly open another + * zone without exceeding the maximum number of open zones. + */ +static bool zloop_close_imp_open_zone(struct zloop_device *zlo) +{ + struct zloop_zone *zone; + + lockdep_assert_held(&zlo->open_zones_lock); + + if (zloop_can_open_zone(zlo)) + return true; + + list_for_each_entry(zone, &zlo->open_zones_lru_list, open_zone_entry) { + if (zone->cond == BLK_ZONE_COND_IMP_OPEN) { + zone->cond = BLK_ZONE_COND_CLOSED; + list_del_init(&zone->open_zone_entry); + zlo->nr_open_zones--; + return true; + } + } + + return false; +} + +static bool zloop_open_closed_or_empty_zone(struct zloop_device *zlo, + struct zloop_zone *zone, + bool explicit) +{ + spin_lock(&zlo->open_zones_lock); + + if (explicit) { + /* + * Explicit open: we cannot allow this if we have reached the + * maximum open zones limit. 
+ */ + if (!zloop_can_open_zone(zlo)) + goto fail; + zone->cond = BLK_ZONE_COND_EXP_OPEN; + } else { + /* + * Implicit open case: if we have reached the maximum open zones + * limit, try to close an implicitly open zone first. + */ + if (!zloop_close_imp_open_zone(zlo)) + goto fail; + zone->cond = BLK_ZONE_COND_IMP_OPEN; + } + + zlo->nr_open_zones++; + list_add_tail(&zone->open_zone_entry, + &zlo->open_zones_lru_list); + + spin_unlock(&zlo->open_zones_lock); + + return true; + +fail: + spin_unlock(&zlo->open_zones_lock); + + return false; +} + +static bool zloop_do_open_zone(struct zloop_device *zlo, + struct zloop_zone *zone, bool explicit) +{ + switch (zone->cond) { + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + if (explicit) + zone->cond = BLK_ZONE_COND_EXP_OPEN; + zloop_lru_rotate_open_zone(zlo, zone); + return true; + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_CLOSED: + return zloop_open_closed_or_empty_zone(zlo, zone, explicit); + default: + return false; + } +} + static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no) { struct zloop_zone *zone = &zlo->zones[zone_no]; @@ -191,13 +321,17 @@ static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no) spin_lock_irqsave(&zone->wp_lock, flags); if (!file_sectors) { + zloop_lru_remove_open_zone(zlo, zone); zone->cond = BLK_ZONE_COND_EMPTY; zone->wp = zone->start; } else if (file_sectors == zlo->zone_capacity) { + zloop_lru_remove_open_zone(zlo, zone); zone->cond = BLK_ZONE_COND_FULL; zone->wp = ULLONG_MAX; } else { - zone->cond = BLK_ZONE_COND_CLOSED; + if (zone->cond != BLK_ZONE_COND_IMP_OPEN && + zone->cond != BLK_ZONE_COND_EXP_OPEN) + zone->cond = BLK_ZONE_COND_CLOSED; zone->wp = zone->start + file_sectors; } spin_unlock_irqrestore(&zone->wp_lock, flags); @@ -221,19 +355,8 @@ static int zloop_open_zone(struct zloop_device *zlo, unsigned int zone_no) goto unlock; } - switch (zone->cond) { - case BLK_ZONE_COND_EXP_OPEN: - break; - case 
BLK_ZONE_COND_EMPTY: - case BLK_ZONE_COND_CLOSED: - case BLK_ZONE_COND_IMP_OPEN: - zone->cond = BLK_ZONE_COND_EXP_OPEN; - break; - case BLK_ZONE_COND_FULL: - default: + if (!zloop_do_open_zone(zlo, zone, true)) ret = -EIO; - break; - } unlock: mutex_unlock(&zone->lock); @@ -264,6 +387,7 @@ static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no) case BLK_ZONE_COND_IMP_OPEN: case BLK_ZONE_COND_EXP_OPEN: spin_lock_irqsave(&zone->wp_lock, flags); + zloop_lru_remove_open_zone(zlo, zone); if (zone->wp == zone->start) zone->cond = BLK_ZONE_COND_EMPTY; else @@ -305,6 +429,7 @@ static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no) } spin_lock_irqsave(&zone->wp_lock, flags); + zloop_lru_remove_open_zone(zlo, zone); zone->cond = BLK_ZONE_COND_EMPTY; zone->wp = zone->start; clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags); @@ -352,6 +477,7 @@ static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no) } spin_lock_irqsave(&zone->wp_lock, flags); + zloop_lru_remove_open_zone(zlo, zone); zone->cond = BLK_ZONE_COND_FULL; zone->wp = ULLONG_MAX; clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags); @@ -478,9 +604,10 @@ static int zloop_seq_write_prep(struct zloop_cmd *cmd) } /* Implicitly open the target zone. */ - if (zone->cond == BLK_ZONE_COND_CLOSED || - zone->cond == BLK_ZONE_COND_EMPTY) - zone->cond = BLK_ZONE_COND_IMP_OPEN; + if (!zloop_do_open_zone(zlo, zone, false)) { + ret = -EIO; + goto out_unlock; + } /* * Advance the write pointer, unless ordered zone append is in use. 
If @@ -490,6 +617,7 @@ static int zloop_seq_write_prep(struct zloop_cmd *cmd) if (!is_append || !zlo->ordered_zone_append) { zone->wp += nr_sectors; if (zone->wp == zone_end) { + zloop_lru_remove_open_zone(zlo, zone); zone->cond = BLK_ZONE_COND_FULL; zone->wp = ULLONG_MAX; } @@ -746,6 +874,7 @@ static bool zloop_set_zone_append_sector(struct request *rq) rq->__sector = zone->wp; zone->wp += blk_rq_sectors(rq); if (zone->wp >= zone_end) { + zloop_lru_remove_open_zone(zlo, zone); zone->cond = BLK_ZONE_COND_FULL; zone->wp = ULLONG_MAX; } @@ -943,6 +1072,7 @@ static int zloop_init_zone(struct zloop_device *zlo, struct zloop_options *opts, int ret; mutex_init(&zone->lock); + INIT_LIST_HEAD(&zone->open_zone_entry); spin_lock_init(&zone->wp_lock); zone->start = (sector_t)zone_no << zlo->zone_shift; @@ -1063,12 +1193,20 @@ static int zloop_ctl_add(struct zloop_options *opts) goto out; } + if (opts->max_open_zones > nr_zones - opts->nr_conv_zones) { + pr_err("Invalid maximum number of open zones %u\n", + opts->max_open_zones); + goto out; + } + zlo = kvzalloc_flex(*zlo, zones, nr_zones); if (!zlo) { ret = -ENOMEM; goto out; } WRITE_ONCE(zlo->state, Zlo_creating); + spin_lock_init(&zlo->open_zones_lock); + INIT_LIST_HEAD(&zlo->open_zones_lru_list); ret = mutex_lock_killable(&zloop_ctl_mutex); if (ret) @@ -1096,6 +1234,7 @@ static int zloop_ctl_add(struct zloop_options *opts) zlo->zone_capacity = zlo->zone_size; zlo->nr_zones = nr_zones; zlo->nr_conv_zones = opts->nr_conv_zones; + zlo->max_open_zones = opts->max_open_zones; zlo->buffered_io = opts->buffered_io; zlo->zone_append = opts->zone_append; if (zlo->zone_append) @@ -1143,6 +1282,7 @@ static int zloop_ctl_add(struct zloop_options *opts) lim.logical_block_size = zlo->block_size; if (zlo->zone_append) lim.max_hw_zone_append_sectors = lim.max_hw_sectors; + lim.max_open_zones = zlo->max_open_zones; zlo->tag_set.ops = &zloop_mq_ops; zlo->tag_set.nr_hw_queues = opts->nr_queues; @@ -1326,6 +1466,7 @@ static int 
zloop_parse_options(struct zloop_options *opts, const char *buf) opts->capacity = ZLOOP_DEF_ZONE_SIZE * ZLOOP_DEF_NR_ZONES; opts->zone_size = ZLOOP_DEF_ZONE_SIZE; opts->nr_conv_zones = ZLOOP_DEF_NR_CONV_ZONES; + opts->max_open_zones = ZLOOP_DEF_MAX_OPEN_ZONES; opts->nr_queues = ZLOOP_DEF_NR_QUEUES; opts->queue_depth = ZLOOP_DEF_QUEUE_DEPTH; opts->buffered_io = ZLOOP_DEF_BUFFERED_IO; @@ -1404,6 +1545,13 @@ static int zloop_parse_options(struct zloop_options *opts, const char *buf) } opts->nr_conv_zones = token; break; + case ZLOOP_OPT_MAX_OPEN_ZONES: + if (match_uint(args, &token)) { + ret = -EINVAL; + goto out; + } + opts->max_open_zones = token; + break; case ZLOOP_OPT_BASE_DIR: p = match_strdup(args); if (!p) { From 23308af722fefed00af5f238024c11710938fba3 Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Tue, 31 Mar 2026 16:50:54 +0800 Subject: [PATCH 099/146] blk-cgroup: fix disk reference leak in blkcg_maybe_throttle_current() Add the missing put_disk() on the error path in blkcg_maybe_throttle_current(). When blkcg lookup, blkg lookup, or blkg_tryget() fails, the function jumps to the out label which only calls rcu_read_unlock() but does not release the disk reference acquired by blkcg_schedule_throttle() via get_device(). Since current->throttle_disk is already set to NULL before the lookup, blkcg_exit() cannot release this reference either, causing the disk to never be freed. Restore the reference release that was present as blk_put_queue() in the original code but was inadvertently dropped during the conversion from request_queue to gendisk. 
Fixes: f05837ed73d0 ("blk-cgroup: store a gendisk to throttle in struct task_struct") Signed-off-by: Jackie Liu Acked-by: Tejun Heo Reviewed-by: Christoph Hellwig Link: https://patch.msgid.link/20260331085054.46857-1-liu.yun@linux.dev Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 2d7b18eb7291..554c87bb4a86 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -2037,6 +2037,7 @@ void blkcg_maybe_throttle_current(void) return; out: rcu_read_unlock(); + put_disk(disk); } /** From f91ffe89b2016d280995a9c28d73288b02d83615 Mon Sep 17 00:00:00 2001 From: Jialin Wang Date: Tue, 31 Mar 2026 10:05:09 +0000 Subject: [PATCH 100/146] blk-iocost: fix busy_level reset when no IOs complete When a disk is saturated, it is common for no IOs to complete within a timer period. Currently, in this case, rq_wait_pct and missed_ppm are calculated as 0, the iocost incorrectly interprets this as meeting QoS targets and resets busy_level to 0. This reset prevents busy_level from reaching the threshold (4) needed to reduce vrate. On certain cloud storage, such as Azure Premium SSD, we observed that iocost may fail to reduce vrate for tens of seconds during saturation, failing to mitigate noisy neighbor issues. Fix this by tracking the number of IO completions (nr_done) in a period. If nr_done is 0 and there are lagging IOs, the saturation status is unknown, so we keep busy_level unchanged. The issue is consistently reproducible on Azure Standard_D8as_v5 (Dasv5) VMs with 512GB Premium SSD (P20) using the script below. It was not observed on GCP n2d VMs (with 100G pd-ssd and 1.5T local-ssd), and no regressions were found with this patch. In this script, cgA performs large IOs with iodepth=128, while cgB performs small IOs with iodepth=1 rate_iops=100 rw=randrw. 
With iocost enabled, we expect it to throttle cgA, the submission latency (slat) of cgA should be significantly higher, cgB can reach 200 IOPS and the completion latency (clat) should below. BLK_DEVID="8:0" MODEL="rbps=173471131 rseqiops=3566 rrandiops=3566 wbps=173333269 wseqiops=3566 wrandiops=3566" QOS="rpct=90 rlat=3500 wpct=90 wlat=3500 min=80 max=10000" echo "$BLK_DEVID ctrl=user model=linear $MODEL" > /sys/fs/cgroup/io.cost.model echo "$BLK_DEVID enable=1 ctrl=user $QOS" > /sys/fs/cgroup/io.cost.qos CG_A="/sys/fs/cgroup/cgA" CG_B="/sys/fs/cgroup/cgB" FILE_A="/path/to/sda/A.fio.testfile" FILE_B="/path/to/sda/B.fio.testfile" RESULT_DIR="./iocost_results_$(date +%Y%m%d_%H%M%S)" mkdir -p "$CG_A" "$CG_B" "$RESULT_DIR" get_result() { local file=$1 local label=$2 local results=$(jq -r ' .jobs[0].mixed | ( .iops | tonumber | round ) as $iops | ( .bw_bytes / 1024 / 1024 ) as $bps | ( .slat_ns.mean / 1000000 ) as $slat | ( .clat_ns.mean / 1000000 ) as $avg | ( .clat_ns.max / 1000000 ) as $max | ( .clat_ns.percentile["90.000000"] / 1000000 ) as $p90 | ( .clat_ns.percentile["99.000000"] / 1000000 ) as $p99 | ( .clat_ns.percentile["99.900000"] / 1000000 ) as $p999 | ( .clat_ns.percentile["99.990000"] / 1000000 ) as $p9999 | "\($iops)|\($bps)|\($slat)|\($avg)|\($max)|\($p90)|\($p99)|\($p999)|\($p9999)" ' "$file") IFS='|' read -r iops bps slat avg max p90 p99 p999 p9999 <<<"$results" printf "%-8s %-6s %-7.2f %-8.2f %-8.2f %-8.2f %-8.2f %-8.2f %-8.2f %-8.2f\n" \ "$label" "$iops" "$bps" "$slat" "$avg" "$max" "$p90" "$p99" "$p999" "$p9999" } run_fio() { local cg_path=$1 local filename=$2 local name=$3 local bs=$4 local qd=$5 local out=$6 shift 6 local extra=$@ ( pid=$(sh -c 'echo $PPID') echo $pid >"${cg_path}/cgroup.procs" fio --name="$name" --filename="$filename" --direct=1 --rw=randrw --rwmixread=50 \ --ioengine=libaio --bs="$bs" --iodepth="$qd" --size=4G --runtime=10 \ --time_based --group_reporting --unified_rw_reporting=mixed \ --output-format=json --output="$out" 
$extra >/dev/null 2>&1 ) & } echo "Starting Test ..." for bs_b in "4k" "32k" "256k"; do echo "Running iteration: BS=$bs_b" out_a="${RESULT_DIR}/cgA_1m.json" out_b="${RESULT_DIR}/cgB_${bs_b}.json" # cgA: Heavy background (BS 1MB, QD 128) run_fio "$CG_A" "$FILE_A" "cgA" "1m" 128 "$out_a" # cgB: Latency sensitive (Variable BS, QD 1, Read/Write IOPS limit 100) run_fio "$CG_B" "$FILE_B" "cgB" "$bs_b" 1 "$out_b" "--rate_iops=100" wait SUMMARY_DATA+="$(get_result "$out_a" "cgA-1m")"$'\n' SUMMARY_DATA+="$(get_result "$out_b" "cgB-$bs_b")"$'\n\n' done echo -e "\nFinal Results Summary:\n" printf "%-8s %-6s %-7s %-8s %-8s %-8s %-8s %-8s %-8s %-8s\n" \ "" "" "" "slat" "clat" "clat" "clat" "clat" "clat" "clat" printf "%-8s %-6s %-7s %-8s %-8s %-8s %-8s %-8s %-8s %-8s\n\n" \ "CGROUP" "IOPS" "MB/s" "avg(ms)" "avg(ms)" "max(ms)" "P90(ms)" "P99" "P99.9" "P99.99" echo "$SUMMARY_DATA" echo "Results saved in $RESULT_DIR" Before: slat clat clat clat clat clat clat CGROUP IOPS MB/s avg(ms) avg(ms) max(ms) P90(ms) P99 P99.9 P99.99 cgA-1m 166 166.37 3.44 748.95 1298.29 977.27 1233.13 1300.23 1300.23 cgB-4k 5 0.02 0.02 181.74 761.32 742.39 759.17 759.17 759.17 cgA-1m 167 166.51 1.98 748.68 1549.41 809.50 1451.23 1551.89 1551.89 cgB-32k 6 0.18 0.02 169.98 761.76 742.39 759.17 759.17 759.17 cgA-1m 166 165.55 2.89 750.89 1540.37 851.44 1451.23 1535.12 1535.12 cgB-256k 5 1.30 0.02 191.35 759.51 750.78 759.17 759.17 759.17 After: slat clat clat clat clat clat clat CGROUP IOPS MB/s avg(ms) avg(ms) max(ms) P90(ms) P99 P99.9 P99.99 cgA-1m 162 162.48 6.14 749.69 850.02 826.28 834.67 843.06 851.44 cgB-4k 199 0.78 0.01 1.95 42.12 2.57 7.50 34.87 42.21 cgA-1m 146 146.20 6.83 833.04 908.68 893.39 901.78 910.16 910.16 cgB-32k 200 6.25 0.01 2.32 31.40 3.06 7.50 16.58 31.33 cgA-1m 110 110.46 9.04 1082.67 1197.91 1182.79 1199.57 1199.57 1199.57 cgB-256k 200 49.98 0.02 3.69 22.20 4.88 9.11 20.05 22.15 Signed-off-by: Jialin Wang Acked-by: Tejun Heo Link: 
https://patch.msgid.link/20260331100509.182882-1-wjl.linux@gmail.com Signed-off-by: Jens Axboe --- block/blk-iocost.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index d145db61e5c3..0cca88a366dc 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1596,7 +1596,8 @@ static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p) +static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p, + u32 *nr_done) { u32 nr_met[2] = { }; u32 nr_missed[2] = { }; @@ -1633,6 +1634,8 @@ static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p *rq_wait_pct_p = div64_u64(rq_wait_ns * 100, ioc->period_us * NSEC_PER_USEC); + + *nr_done = nr_met[READ] + nr_met[WRITE] + nr_missed[READ] + nr_missed[WRITE]; } /* was iocg idle this period? */ @@ -2250,12 +2253,12 @@ static void ioc_timer_fn(struct timer_list *timer) u64 usage_us_sum = 0; u32 ppm_rthr; u32 ppm_wthr; - u32 missed_ppm[2], rq_wait_pct; + u32 missed_ppm[2], rq_wait_pct, nr_done; u64 period_vtime; int prev_busy_level; /* how were the latencies during the period? */ - ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); + ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct, &nr_done); /* take care of active iocgs */ spin_lock_irq(&ioc->lock); @@ -2397,9 +2400,17 @@ static void ioc_timer_fn(struct timer_list *timer) * and should increase vtime rate. */ prev_busy_level = ioc->busy_level; - if (rq_wait_pct > RQ_WAIT_BUSY_PCT || - missed_ppm[READ] > ppm_rthr || - missed_ppm[WRITE] > ppm_wthr) { + if (!nr_done && nr_lagging) { + /* + * When there are lagging IOs but no completions, we don't + * know if the IO latency will meet the QoS targets. The + * disk might be saturated or not. 
We should not reset + * busy_level to 0 (which would prevent vrate from scaling + * up or down), but rather to keep it unchanged. + */ + } else if (rq_wait_pct > RQ_WAIT_BUSY_PCT || + missed_ppm[READ] > ppm_rthr || + missed_ppm[WRITE] > ppm_wthr) { /* clearly missing QoS targets, slow down vrate */ ioc->busy_level = max(ioc->busy_level, 0); ioc->busy_level++; From c691e4b0d80be423f0a7443b53898eafe9c8754b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 26 Mar 2026 22:40:58 +0800 Subject: [PATCH 101/146] bio: fix kmemleak false positives from percpu bio alloc cache When a bio is allocated from the mempool with REQ_ALLOC_CACHE set and later completed, bio_put() places it into the per-cpu bio_alloc_cache via bio_put_percpu_cache() instead of freeing it back to the mempool/slab. The slab allocation remains tracked by kmemleak, but the only reference to the bio is through the percpu cache's free_list, which kmemleak fails to trace through percpu memory. This causes kmemleak to report the cached bios as unreferenced objects. Use symmetric kmemleak_free()/kmemleak_alloc() calls to properly track bios across percpu cache transitions: - bio_put_percpu_cache: call kmemleak_free() when a bio enters the cache, unregistering it from kmemleak tracking. - bio_alloc_percpu_cache: call kmemleak_alloc() when a bio is taken from the cache for reuse, re-registering it so that genuine leaks of reused bios remain detectable. - __bio_alloc_cache_prune: call kmemleak_alloc() before bio_free() so that kmem_cache_free()'s internal kmemleak_free() has a matching allocation to pair with. 
Tested-by: Yi Zhang Signed-off-by: Ming Lei Link: https://patch.msgid.link/20260326144058.2392319-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/bio.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/block/bio.c b/block/bio.c index 77067fa346d3..c8234d347fc5 100644 --- a/block/bio.c +++ b/block/bio.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "blk.h" @@ -116,6 +117,11 @@ static inline unsigned int bs_bio_slab_size(struct bio_set *bs) return bs->front_pad + sizeof(struct bio) + bs->back_pad; } +static inline void *bio_slab_addr(struct bio *bio) +{ + return (void *)bio - bio->bi_pool->front_pad; +} + static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs) { unsigned int size = bs_bio_slab_size(bs); @@ -486,6 +492,9 @@ static struct bio *bio_alloc_percpu_cache(struct bio_set *bs) cache->nr--; put_cpu(); bio->bi_pool = bs; + + kmemleak_alloc(bio_slab_addr(bio), + kmem_cache_size(bs->bio_slab), 1, GFP_NOIO); return bio; } @@ -728,6 +737,9 @@ static int __bio_alloc_cache_prune(struct bio_alloc_cache *cache, while ((bio = cache->free_list) != NULL) { cache->free_list = bio->bi_next; cache->nr--; + kmemleak_alloc(bio_slab_addr(bio), + kmem_cache_size(bio->bi_pool->bio_slab), + 1, GFP_KERNEL); bio_free(bio); if (++i == nr) break; @@ -791,6 +803,7 @@ static inline void bio_put_percpu_cache(struct bio *bio) bio->bi_bdev = NULL; cache->free_list = bio; cache->nr++; + kmemleak_free(bio_slab_addr(bio)); } else if (in_hardirq()) { lockdep_assert_irqs_disabled(); @@ -798,6 +811,7 @@ static inline void bio_put_percpu_cache(struct bio *bio) bio->bi_next = cache->free_list_irq; cache->free_list_irq = bio; cache->nr_irq++; + kmemleak_free(bio_slab_addr(bio)); } else { goto out_free; } From 4e56428ed4782e9e1356875af8e714b24c5a8783 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 1 Apr 2026 15:58:51 +0200 Subject: [PATCH 102/146] blk-crypto: fix name of the bio completion callback Fix a simple naming 
issue in the documentation: the completion routine is called bi_end_io and not bi_complete. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Biggers Link: https://patch.msgid.link/20260401135854.125109-1-hch@lst.de Signed-off-by: Jens Axboe --- Documentation/block/inline-encryption.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/block/inline-encryption.rst b/Documentation/block/inline-encryption.rst index 7e0703a12dfb..cae23949a626 100644 --- a/Documentation/block/inline-encryption.rst +++ b/Documentation/block/inline-encryption.rst @@ -153,7 +153,7 @@ blk-crypto-fallback completes the original bio. If the original bio is too large, multiple bounce bios may be required; see the code for details. For decryption, blk-crypto-fallback "wraps" the bio's completion callback -(``bi_complete``) and private data (``bi_private``) with its own, unsets the +(``bi_end_io``) and private data (``bi_private``) with its own, unsets the bio's encryption context, then submits the bio. If the read completes successfully, blk-crypto-fallback restores the bio's original completion callback and private data, then decrypts the bio's data in-place using the From a175ee8273319547a4be7584da03831a2fb2f835 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Thu, 2 Apr 2026 18:50:00 +0200 Subject: [PATCH 103/146] block: use sysfs_emit in sysfs show functions Replace sprintf() with sysfs_emit() in sysfs show functions. sysfs_emit() is preferred for formatting sysfs output because it provides safer bounds checking. 
Signed-off-by: Thorsten Blum Reviewed-by: Damien Le Moal Link: https://patch.msgid.link/20260402164958.894879-4-thorsten.blum@linux.dev Signed-off-by: Jens Axboe --- block/partitions/core.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/block/partitions/core.c b/block/partitions/core.c index 3b5928836c69..5d5332ce586b 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -177,31 +178,31 @@ static struct parsed_partitions *check_partition(struct gendisk *hd) static ssize_t part_partition_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", bdev_partno(dev_to_bdev(dev))); + return sysfs_emit(buf, "%d\n", bdev_partno(dev_to_bdev(dev))); } static ssize_t part_start_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%llu\n", dev_to_bdev(dev)->bd_start_sect); + return sysfs_emit(buf, "%llu\n", dev_to_bdev(dev)->bd_start_sect); } static ssize_t part_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", bdev_read_only(dev_to_bdev(dev))); + return sysfs_emit(buf, "%d\n", bdev_read_only(dev_to_bdev(dev))); } static ssize_t part_alignment_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", bdev_alignment_offset(dev_to_bdev(dev))); + return sysfs_emit(buf, "%u\n", bdev_alignment_offset(dev_to_bdev(dev))); } static ssize_t part_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", bdev_discard_alignment(dev_to_bdev(dev))); + return sysfs_emit(buf, "%u\n", bdev_discard_alignment(dev_to_bdev(dev))); } static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); From fec114a98b8735ee89c75216c45a78e28be0f128 Mon Sep 17 00:00:00 2001 From: Mingzhe Zou Date: Sun, 22 Mar 2026 21:41:02 +0800 Subject: [PATCH 
104/146] bcache: fix cached_dev.sb_bio use-after-free and crash In our production environment, we have received multiple crash reports regarding libceph, which have caught our attention: ``` [6888366.280350] Call Trace: [6888366.280452] blk_update_request+0x14e/0x370 [6888366.280561] blk_mq_end_request+0x1a/0x130 [6888366.280671] rbd_img_handle_request+0x1a0/0x1b0 [rbd] [6888366.280792] rbd_obj_handle_request+0x32/0x40 [rbd] [6888366.280903] __complete_request+0x22/0x70 [libceph] [6888366.281032] osd_dispatch+0x15e/0xb40 [libceph] [6888366.281164] ? inet_recvmsg+0x5b/0xd0 [6888366.281272] ? ceph_tcp_recvmsg+0x6f/0xa0 [libceph] [6888366.281405] ceph_con_process_message+0x79/0x140 [libceph] [6888366.281534] ceph_con_v1_try_read+0x5d7/0xf30 [libceph] [6888366.281661] ceph_con_workfn+0x329/0x680 [libceph] ``` After analyzing the coredump file, we found that the address of dc->sb_bio has been freed. We know that cached_dev is only freed when it is stopped. Since sb_bio is a part of struct cached_dev, rather than an alloc every time. If the device is stopped while writing to the superblock, the released address will be accessed at endio. This patch hopes to wait for sb_write to complete in cached_dev_free. It should be noted that we analyzed the cause of the problem, then tell all details to the QWEN and adopted the modifications it made. Signed-off-by: Mingzhe Zou Fixes: cafe563591446 ("bcache: A block layer cache") Cc: stable@vger.kernel.org # 3.10+ Signed-off-by: Coly Li Link: https://patch.msgid.link/20260322134102.480107-1-colyli@fnnas.com Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 64bb38c95895..6627a381f65a 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1373,6 +1373,13 @@ static CLOSURE_CALLBACK(cached_dev_free) mutex_unlock(&bch_register_lock); + /* + * Wait for any pending sb_write to complete before free. 
+ * The sb_bio is embedded in struct cached_dev, so we must + * ensure no I/O is in progress. + */ + closure_sync(&dc->sb_write); + if (dc->sb_disk) folio_put(virt_to_folio(dc->sb_disk)); From 20a8e451ec1c7e99060b1bbaaad03ce88c39ddb8 Mon Sep 17 00:00:00 2001 From: Mingzhe Zou Date: Fri, 3 Apr 2026 12:21:35 +0800 Subject: [PATCH 105/146] bcache: fix uninitialized closure object In the previous patch ("bcache: fix cached_dev.sb_bio use-after-free and crash"), we adopted a simple modification suggestion from AI to fix the use-after-free. But in actual testing, we found an extreme case where the device is stopped before calling bch_write_bdev_super(). At this point, struct closure sb_write has not been initialized yet. For this patch, we ensure that sb_bio has been completed via sb_write_mutex. Signed-off-by: Mingzhe Zou Signed-off-by: Coly Li Link: https://patch.msgid.link/20260403042135.2221247-1-colyli@fnnas.com Fixes: fec114a98b87 ("bcache: fix cached_dev.sb_bio use-after-free and crash") Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 6627a381f65a..97d9adb0bf96 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1378,7 +1378,8 @@ static CLOSURE_CALLBACK(cached_dev_free) * The sb_bio is embedded in struct cached_dev, so we must * ensure no I/O is in progress. */ - closure_sync(&dc->sb_write); + down(&dc->sb_write_mutex); + up(&dc->sb_write_mutex); if (dc->sb_disk) folio_put(virt_to_folio(dc->sb_disk)); From 8b155f2e4a91f3507951e6ace4b413688ac28b96 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 3 Apr 2026 12:48:51 -0600 Subject: [PATCH 106/146] block: remove unused BVEC_ITER_ALL_INIT This macro no longer has any users, so remove it. 
Signed-off-by: Caleb Sander Mateos Link: https://patch.msgid.link/20260403184852.2140919-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/bvec.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 06fb60471aaf..d36dd476feda 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h @@ -203,15 +203,6 @@ static inline void bvec_iter_advance_single(const struct bio_vec *bv, ((bvl = mp_bvec_iter_bvec((bio_vec), (iter))), 1); \ bvec_iter_advance_single((bio_vec), &(iter), (bvl).bv_len)) -/* for iterating one bio from start to end */ -#define BVEC_ITER_ALL_INIT (struct bvec_iter) \ -{ \ - .bi_sector = 0, \ - .bi_size = UINT_MAX, \ - .bi_idx = 0, \ - .bi_bvec_done = 0, \ -} - static inline struct bio_vec *bvec_init_iter_all(struct bvec_iter_all *iter_all) { iter_all->done = 0; From 2aa72276fab9851dbd59c2daeb4b590c5a113908 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 30 Mar 2026 13:52:13 +0800 Subject: [PATCH 107/146] md: fix array_state=clear sysfs deadlock When "clear" is written to array_state, md_attr_store() breaks sysfs active protection so the array can delete itself from its own sysfs store method. However, md_attr_store() currently drops the mddev reference before calling sysfs_unbreak_active_protection(). Once do_md_stop(..., 0) has made the mddev eligible for delayed deletion, the temporary kobject reference taken by sysfs_break_active_protection() can become the last kobject reference protecting the md kobject. That allows sysfs_unbreak_active_protection() to drop the last kobject reference from the current sysfs writer context. kobject teardown then recurses into kernfs removal while the current sysfs node is still being unwound, and lockdep reports recursive locking on kn->active with kernfs_drain() in the call chain. Reproducer on an existing level: 1. 
Create an md0 linear array and activate it: mknod /dev/md0 b 9 0 echo none > /sys/block/md0/md/metadata_version echo linear > /sys/block/md0/md/level echo 1 > /sys/block/md0/md/raid_disks echo "$(cat /sys/class/block/sdb/dev)" > /sys/block/md0/md/new_dev echo "$(($(cat /sys/class/block/sdb/size) / 2))" > \ /sys/block/md0/md/dev-sdb/size echo 0 > /sys/block/md0/md/dev-sdb/slot echo active > /sys/block/md0/md/array_state 2. Wait briefly for the array to settle, then clear it: sleep 2 echo clear > /sys/block/md0/md/array_state The warning looks like: WARNING: possible recursive locking detected bash/588 is trying to acquire lock: (kn->active#65) at __kernfs_remove+0x157/0x1d0 but task is already holding lock: (kn->active#65) at sysfs_unbreak_active_protection+0x1f/0x40 ... Call Trace: kernfs_drain __kernfs_remove kernfs_remove_by_name_ns sysfs_remove_group sysfs_remove_groups __kobject_del kobject_put md_attr_store kernfs_fop_write_iter vfs_write ksys_write Restore active protection before mddev_put() so the extra sysfs kobject reference is dropped while the mddev is still held alive. The actual md kobject deletion is then deferred until after the sysfs write path has fully returned. Fixes: 9e59d609763f ("md: call del_gendisk in control path") Reviewed-by: Xiao Ni Link: https://lore.kernel.org/linux-raid/20260330055213.3976052-1-yukuai@fnnas.com/ Signed-off-by: Yu Kuai --- drivers/md/md.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 521d9b34cd9e..02efe9700256 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6130,10 +6130,16 @@ md_attr_store(struct kobject *kobj, struct attribute *attr, } spin_unlock(&all_mddevs_lock); rv = entry->store(mddev, page, length); - mddev_put(mddev); + /* + * For "array_state=clear", dropping the extra kobject reference from + * sysfs_break_active_protection() can trigger md kobject deletion. 
+ * Restore active protection before mddev_put() so deletion happens + * after the sysfs write path fully unwinds. + */ if (kn) sysfs_unbreak_active_protection(kn); + mddev_put(mddev); return rv; } From 0842186d2c4e67d2f8c8c2d1d779e8acffd41b5b Mon Sep 17 00:00:00 2001 From: Uday Shankar Date: Sun, 5 Apr 2026 22:25:30 -0600 Subject: [PATCH 108/146] ublk: reset per-IO canceled flag on each fetch If a ublk server starts recovering devices but dies before issuing fetch commands for all IOs, cancellation of the fetch commands that were successfully issued may never complete. This is because the per-IO canceled flag can remain set even after the fetch for that IO has been submitted - the per-IO canceled flags for all IOs in a queue are reset together only once all IOs for that queue have been fetched. So if a nonempty proper subset of the IOs for a queue are fetched when the ublk server dies, the IOs in that subset will never successfully be canceled, as their canceled flags remain set, and this prevents ublk_cancel_cmd from actually calling io_uring_cmd_done on the commands, despite the fact that they are outstanding. Fix this by resetting the per-IO cancel flags immediately when each IO is fetched instead of waiting for all IOs for the queue (which may never happen). 
Signed-off-by: Uday Shankar Fixes: 728cbac5fe21 ("ublk: move device reset into ublk_ch_release()") Reviewed-by: Ming Lei Reviewed-by: zhang, the-essence-of-life Link: https://patch.msgid.link/20260405-cancel-v2-1-02d711e643c2@purestorage.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 71c7c56b38ca..eb96010625e5 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2916,22 +2916,26 @@ static void ublk_stop_dev(struct ublk_device *ub) ublk_cancel_dev(ub); } +static void ublk_reset_io_flags(struct ublk_queue *ubq, struct ublk_io *io) +{ + /* UBLK_IO_FLAG_CANCELED can be cleared now */ + spin_lock(&ubq->cancel_lock); + io->flags &= ~UBLK_IO_FLAG_CANCELED; + spin_unlock(&ubq->cancel_lock); +} + /* reset per-queue io flags */ static void ublk_queue_reset_io_flags(struct ublk_queue *ubq) { - int j; - - /* UBLK_IO_FLAG_CANCELED can be cleared now */ spin_lock(&ubq->cancel_lock); - for (j = 0; j < ubq->q_depth; j++) - ubq->ios[j].flags &= ~UBLK_IO_FLAG_CANCELED; ubq->canceling = false; spin_unlock(&ubq->cancel_lock); ubq->fail_io = false; } /* device can only be started after all IOs are ready */ -static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id) +static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id, + struct ublk_io *io) __must_hold(&ub->mutex) { struct ublk_queue *ubq = ublk_get_queue(ub, q_id); @@ -2940,6 +2944,7 @@ static void ublk_mark_io_ready(struct ublk_device *ub, u16 q_id) ub->unprivileged_daemons = true; ubq->nr_io_ready++; + ublk_reset_io_flags(ubq, io); /* Check if this specific queue is now fully ready */ if (ublk_queue_ready(ubq)) { @@ -3202,7 +3207,7 @@ static int ublk_fetch(struct io_uring_cmd *cmd, struct ublk_device *ub, if (!ret) ret = ublk_config_io_buf(ub, io, cmd, buf_addr, NULL); if (!ret) - ublk_mark_io_ready(ub, q_id); + ublk_mark_io_ready(ub, q_id, 
io); mutex_unlock(&ub->mutex); return ret; } @@ -3610,7 +3615,7 @@ static int ublk_batch_prep_io(struct ublk_queue *ubq, ublk_io_unlock(io); if (!ret) - ublk_mark_io_ready(data->ub, ubq->q_id); + ublk_mark_io_ready(data->ub, ubq->q_id, io); return ret; } From 320f9b1c6a942a73c26be56742ee1da04f893a4f Mon Sep 17 00:00:00 2001 From: Uday Shankar Date: Sun, 5 Apr 2026 22:25:31 -0600 Subject: [PATCH 109/146] selftests: ublk: test that teardown after incomplete recovery completes Before the fix, teardown of a ublk server that was attempting to recover a device, but died when it had submitted a nonempty proper subset of the fetch commands to any queue would loop forever. Add a test to verify that, after the fix, teardown completes. This is done by: - Adding a new argument to the fault_inject target that causes it die after fetching a nonempty proper subset of the IOs to a queue - Using that argument in a new test while trying to recover an already-created device - Attempting to delete the ublk device at the end of the test; this hangs forever if teardown from the fault-injected ublk server never completed. It was manually verified that the test passes with the fix and hangs without it. 
Signed-off-by: Uday Shankar Reviewed-by: Ming Lei Link: https://patch.msgid.link/20260405-cancel-v2-2-02d711e643c2@purestorage.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 1 + tools/testing/selftests/ublk/fault_inject.c | 52 +++++++++++++++++-- tools/testing/selftests/ublk/kublk.c | 7 +++ tools/testing/selftests/ublk/kublk.h | 3 ++ .../testing/selftests/ublk/test_generic_17.sh | 35 +++++++++++++ 5 files changed, 95 insertions(+), 3 deletions(-) create mode 100755 tools/testing/selftests/ublk/test_generic_17.sh diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 8ac2d4a682a1..d338668c5a5f 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -18,6 +18,7 @@ TEST_PROGS += test_generic_10.sh TEST_PROGS += test_generic_12.sh TEST_PROGS += test_generic_13.sh TEST_PROGS += test_generic_16.sh +TEST_PROGS += test_generic_17.sh TEST_PROGS += test_batch_01.sh TEST_PROGS += test_batch_02.sh diff --git a/tools/testing/selftests/ublk/fault_inject.c b/tools/testing/selftests/ublk/fault_inject.c index 3b897f69c014..150896e02ff8 100644 --- a/tools/testing/selftests/ublk/fault_inject.c +++ b/tools/testing/selftests/ublk/fault_inject.c @@ -10,11 +10,17 @@ #include "kublk.h" +struct fi_opts { + long long delay_ns; + bool die_during_fetch; +}; + static int ublk_fault_inject_tgt_init(const struct dev_ctx *ctx, struct ublk_dev *dev) { const struct ublksrv_ctrl_dev_info *info = &dev->dev_info; unsigned long dev_size = 250UL << 30; + struct fi_opts *opts = NULL; if (ctx->auto_zc_fallback) { ublk_err("%s: not support auto_zc_fallback\n", __func__); @@ -35,17 +41,52 @@ static int ublk_fault_inject_tgt_init(const struct dev_ctx *ctx, }; ublk_set_integrity_params(ctx, &dev->tgt.params); - dev->private_data = (void *)(unsigned long)(ctx->fault_inject.delay_us * 1000); + opts = calloc(1, sizeof(*opts)); + if (!opts) { + ublk_err("%s: couldn't allocate memory for opts\n", 
__func__); + return -ENOMEM; + } + + opts->delay_ns = ctx->fault_inject.delay_us * 1000; + opts->die_during_fetch = ctx->fault_inject.die_during_fetch; + dev->private_data = opts; + return 0; } +static void ublk_fault_inject_pre_fetch_io(struct ublk_thread *t, + struct ublk_queue *q, int tag, + bool batch) +{ + struct fi_opts *opts = q->dev->private_data; + + if (!opts->die_during_fetch) + return; + + /* + * Each queue fetches its IOs in increasing order of tags, so + * dying just before we're about to fetch tag 1 (regardless of + * what queue we're on) guarantees that we've fetched a nonempty + * proper subset of the tags on that queue. + */ + if (tag == 1) { + /* + * Ensure our commands are actually live in the kernel + * before we die. + */ + io_uring_submit(&t->ring); + raise(SIGKILL); + } +} + static int ublk_fault_inject_queue_io(struct ublk_thread *t, struct ublk_queue *q, int tag) { const struct ublksrv_io_desc *iod = ublk_get_iod(q, tag); struct io_uring_sqe *sqe; + struct fi_opts *opts = q->dev->private_data; struct __kernel_timespec ts = { - .tv_nsec = (long long)q->dev->private_data, + .tv_nsec = opts->delay_ns, }; ublk_io_alloc_sqes(t, &sqe, 1); @@ -77,29 +118,34 @@ static void ublk_fault_inject_cmd_line(struct dev_ctx *ctx, int argc, char *argv { static const struct option longopts[] = { { "delay_us", 1, NULL, 0 }, + { "die_during_fetch", 1, NULL, 0 }, { 0, 0, 0, 0 } }; int option_idx, opt; ctx->fault_inject.delay_us = 0; + ctx->fault_inject.die_during_fetch = false; while ((opt = getopt_long(argc, argv, "", longopts, &option_idx)) != -1) { switch (opt) { case 0: if (!strcmp(longopts[option_idx].name, "delay_us")) ctx->fault_inject.delay_us = strtoll(optarg, NULL, 10); + if (!strcmp(longopts[option_idx].name, "die_during_fetch")) + ctx->fault_inject.die_during_fetch = strtoll(optarg, NULL, 10); } } } static void ublk_fault_inject_usage(const struct ublk_tgt_ops *ops) { - printf("\tfault_inject: [--delay_us us (default 0)]\n"); + 
printf("\tfault_inject: [--delay_us us (default 0)] [--die_during_fetch 1]\n"); } const struct ublk_tgt_ops fault_inject_tgt_ops = { .name = "fault_inject", .init_tgt = ublk_fault_inject_tgt_init, + .pre_fetch_io = ublk_fault_inject_pre_fetch_io, .queue_io = ublk_fault_inject_queue_io, .tgt_io_done = ublk_fault_inject_tgt_io_done, .parse_cmd_line = ublk_fault_inject_cmd_line, diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index e1c3b3c55e56..e5b787ba2175 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -796,6 +796,8 @@ static void ublk_submit_fetch_commands(struct ublk_thread *t) q = &t->dev->q[q_id]; io = &q->ios[tag]; io->buf_index = j++; + if (q->tgt_ops->pre_fetch_io) + q->tgt_ops->pre_fetch_io(t, q, tag, false); ublk_queue_io_cmd(t, io); } } else { @@ -807,6 +809,8 @@ static void ublk_submit_fetch_commands(struct ublk_thread *t) for (i = 0; i < q->q_depth; i++) { io = &q->ios[i]; io->buf_index = i; + if (q->tgt_ops->pre_fetch_io) + q->tgt_ops->pre_fetch_io(t, q, i, false); ublk_queue_io_cmd(t, io); } } @@ -983,6 +987,9 @@ static void ublk_batch_setup_queues(struct ublk_thread *t) if (t->q_map[i] == 0) continue; + if (q->tgt_ops->pre_fetch_io) + q->tgt_ops->pre_fetch_io(t, q, 0, true); + ret = ublk_batch_queue_prep_io_cmds(t, q); ublk_assert(ret >= 0); } diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 02f0c55d006b..6d1762aa30df 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -60,6 +60,7 @@ struct stripe_ctx { struct fault_inject_ctx { /* fault_inject */ unsigned long delay_us; + bool die_during_fetch; }; struct dev_ctx { @@ -138,6 +139,8 @@ struct ublk_tgt_ops { int (*init_tgt)(const struct dev_ctx *ctx, struct ublk_dev *); void (*deinit_tgt)(struct ublk_dev *); + void (*pre_fetch_io)(struct ublk_thread *t, struct ublk_queue *q, + int tag, bool batch); int (*queue_io)(struct 
ublk_thread *, struct ublk_queue *, int tag); void (*tgt_io_done)(struct ublk_thread *, struct ublk_queue *, const struct io_uring_cqe *); diff --git a/tools/testing/selftests/ublk/test_generic_17.sh b/tools/testing/selftests/ublk/test_generic_17.sh new file mode 100755 index 000000000000..2278b5fc9dba --- /dev/null +++ b/tools/testing/selftests/ublk/test_generic_17.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +_prep_test "fault_inject" "teardown after incomplete recovery" + +# First start and stop a ublk server with device configured for recovery +dev_id=$(_add_ublk_dev -t fault_inject -r 1) +_check_add_dev $TID $? +state=$(__ublk_kill_daemon "${dev_id}" "QUIESCED") +if [ "$state" != "QUIESCED" ]; then + echo "device isn't quiesced($state) after $action" + ERR_CODE=255 +fi + +# Then recover the device, but use --die_during_fetch to have the ublk +# server die while a queue has some (but not all) I/Os fetched +${UBLK_PROG} recover -n "${dev_id}" --foreground -t fault_inject --die_during_fetch 1 +RECOVER_RES=$? +# 137 is the result when dying of SIGKILL +if (( RECOVER_RES != 137 )); then + echo "recover command exited with unexpected code ${RECOVER_RES}!" + ERR_CODE=255 +fi + +# Clean up the device. This can only succeed once teardown of the above +# exited ublk server completes. So if teardown never completes, we will +# time out here +_ublk_del_dev "${dev_id}" + +_cleanup_test "fault_inject" +_show_result $TID $ERR_CODE From e9b004ff83067cdf96774b45aea4b239ace99a2f Mon Sep 17 00:00:00 2001 From: Yuto Ohnuki Date: Mon, 16 Mar 2026 07:03:59 +0000 Subject: [PATCH 110/146] blk-wbt: remove WARN_ON_ONCE from wbt_init_enable_default() wbt_init_enable_default() uses WARN_ON_ONCE to check for failures from wbt_alloc() and wbt_init(). 
However, both are expected failure paths: - wbt_alloc() can return NULL under memory pressure (-ENOMEM) - wbt_init() can fail with -EBUSY if wbt is already registered syzbot triggers this by injecting memory allocation failures during MTD partition creation via ioctl(BLKPG), causing a spurious warning. wbt_init_enable_default() is a best-effort initialization called from blk_register_queue() with a void return type. Failure simply means the disk operates without writeback throttling, which is harmless. Replace WARN_ON_ONCE with plain if-checks, consistent with how wbt_set_lat() in the same file already handles these failures. Add a pr_warn() for the wbt_init() failure to retain diagnostic information without triggering a full stack trace. Reported-by: syzbot+71fcf20f7c1e5043d78c@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=71fcf20f7c1e5043d78c Fixes: 41afaeeda509 ("blk-wbt: fix possible deadlock to nest pcpu_alloc_mutex under q_usage_counter") Signed-off-by: Yuto Ohnuki Reviewed-by: Yu Kuai Reviewed-by: Nilay Shroff Link: https://patch.msgid.link/20260316070358.65225-2-ytohnuki@amazon.com Signed-off-by: Jens Axboe --- block/blk-wbt.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 33006edfccd4..dcc2438ca16d 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -782,10 +782,11 @@ void wbt_init_enable_default(struct gendisk *disk) return; rwb = wbt_alloc(); - if (WARN_ON_ONCE(!rwb)) + if (!rwb) return; - if (WARN_ON_ONCE(wbt_init(disk, rwb))) { + if (wbt_init(disk, rwb)) { + pr_warn("%s: failed to enable wbt\n", disk->disk_name); wbt_free(rwb); return; } From a9c4b1d37622ed01b75f94a4f68cf55f33153a31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=B6hmwalder?= Date: Fri, 3 Apr 2026 15:29:53 +0200 Subject: [PATCH 111/146] drbd: remove DRBD_GENLA_F_MANDATORY flag handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DRBD used 
a custom mechanism to mark netlink attributes as "mandatory": bit 14 of nla_type was repurposed as DRBD_GENLA_F_MANDATORY. Attributes sent from userspace that had this bit present and that were unknown to the kernel would lead to an error. Since commit ef6243acb478 ("genetlink: optionally validate strictly/dumps"), the generic netlink layer rejects unknown top-level attributes when strict validation is enabled. DRBD never opted out of strict validation, so unknown top-level attributes are already rejected by the netlink core. The mandatory flag mechanism was required for nested attributes, because these are parsed liberally, silently dropping attributes unknown to the kernel. This prepares for the move to a new YNL-based family, which will use the now-default strict parsing. The current family is not expected to gain any new attributes, which makes this change safe. Old userspace that still sets bit 14 is unaffected: nla_type() strips it before __nla_validate_parse() performs attribute validation, so the bit never reaches DRBD. Remove all references to the mandatory flag in DRBD. 
Cc: Johannes Berg Cc: Jakub Kicinski Signed-off-by: Christoph Böhmwalder Link: https://patch.msgid.link/20260403132953.2248751-1-christoph.boehmwalder@linbit.com Signed-off-by: Jens Axboe --- drivers/block/drbd/Makefile | 1 - drivers/block/drbd/drbd_nl.c | 19 +-- drivers/block/drbd/drbd_nla.c | 56 -------- drivers/block/drbd/drbd_nla.h | 9 -- include/linux/drbd_genl.h | 204 +++++++++++++++--------------- include/linux/genl_magic_func.h | 3 +- include/linux/genl_magic_struct.h | 15 +-- 7 files changed, 112 insertions(+), 195 deletions(-) delete mode 100644 drivers/block/drbd/drbd_nla.c delete mode 100644 drivers/block/drbd/drbd_nla.h diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile index 67a8b352a1d5..187eaf81f0f8 100644 --- a/drivers/block/drbd/Makefile +++ b/drivers/block/drbd/Makefile @@ -3,7 +3,6 @@ drbd-y := drbd_buildtag.o drbd_bitmap.o drbd_proc.o drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o drbd-y += drbd_main.o drbd_strings.o drbd_nl.o drbd-y += drbd_interval.o drbd_state.o -drbd-y += drbd_nla.o drbd-$(CONFIG_DEBUG_FS) += drbd_debugfs.o obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 1f8ffdf9b24e..d997d274092c 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -74,7 +74,6 @@ int drbd_adm_dump_peer_devices_done(struct netlink_callback *cb); int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb); #include -#include "drbd_nla.h" static int drbd_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); @@ -239,14 +238,14 @@ static int drbd_adm_prepare(struct drbd_config_context *adm_ctx, goto fail; /* and assign stuff to the adm_ctx */ - nla = nested_attr_tb[__nla_type(T_ctx_volume)]; + nla = nested_attr_tb[T_ctx_volume]; if (nla) adm_ctx->volume = nla_get_u32(nla); - nla = nested_attr_tb[__nla_type(T_ctx_resource_name)]; + nla = nested_attr_tb[T_ctx_resource_name]; if 
(nla) adm_ctx->resource_name = nla_data(nla); - adm_ctx->my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)]; - adm_ctx->peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)]; + adm_ctx->my_addr = nested_attr_tb[T_ctx_my_addr]; + adm_ctx->peer_addr = nested_attr_tb[T_ctx_peer_addr]; if ((adm_ctx->my_addr && nla_len(adm_ctx->my_addr) > sizeof(adm_ctx->connection->my_addr)) || (adm_ctx->peer_addr && @@ -825,7 +824,6 @@ out: static const char *from_attrs_err_to_txt(int err) { return err == -ENOMSG ? "required attribute missing" : - err == -EOPNOTSUPP ? "unknown mandatory attribute" : err == -EEXIST ? "can not change invariant setting" : "invalid attribute value"; } @@ -3303,14 +3301,13 @@ nla_put_failure: static struct nlattr *find_cfg_context_attr(const struct nlmsghdr *nlh, int attr) { const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ; - const int maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1; struct nlattr *nla; nla = nla_find(nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), DRBD_NLA_CFG_CONTEXT); if (!nla) return NULL; - return drbd_nla_find_nested(maxtype, nla, __nla_type(attr)); + return nla_find_nested(nla, attr); } static void resource_to_info(struct resource_info *, struct drbd_resource *); @@ -4068,7 +4065,6 @@ int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb) struct nlattr *nla; const char *resource_name; struct drbd_resource *resource; - int maxtype; /* Is this a followup call? */ if (cb->args[0]) { @@ -4088,10 +4084,7 @@ int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb) /* No explicit context given. Dump all. */ if (!nla) goto dump; - maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1; - nla = drbd_nla_find_nested(maxtype, nla, __nla_type(T_ctx_resource_name)); - if (IS_ERR(nla)) - return PTR_ERR(nla); + nla = nla_find_nested(nla, T_ctx_resource_name); /* context given, but no name present? 
*/ if (!nla) return -EINVAL; diff --git a/drivers/block/drbd/drbd_nla.c b/drivers/block/drbd/drbd_nla.c deleted file mode 100644 index df0d241d3f6a..000000000000 --- a/drivers/block/drbd/drbd_nla.c +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include -#include -#include -#include "drbd_nla.h" - -static int drbd_nla_check_mandatory(int maxtype, struct nlattr *nla) -{ - struct nlattr *head = nla_data(nla); - int len = nla_len(nla); - int rem; - - /* - * validate_nla (called from nla_parse_nested) ignores attributes - * beyond maxtype, and does not understand the DRBD_GENLA_F_MANDATORY flag. - * In order to have it validate attributes with the DRBD_GENLA_F_MANDATORY - * flag set also, check and remove that flag before calling - * nla_parse_nested. - */ - - nla_for_each_attr(nla, head, len, rem) { - if (nla->nla_type & DRBD_GENLA_F_MANDATORY) { - nla->nla_type &= ~DRBD_GENLA_F_MANDATORY; - if (nla_type(nla) > maxtype) - return -EOPNOTSUPP; - } - } - return 0; -} - -int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla, - const struct nla_policy *policy) -{ - int err; - - err = drbd_nla_check_mandatory(maxtype, nla); - if (!err) - err = nla_parse_nested_deprecated(tb, maxtype, nla, policy, - NULL); - - return err; -} - -struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype) -{ - int err; - /* - * If any nested attribute has the DRBD_GENLA_F_MANDATORY flag set and - * we don't know about that attribute, reject all the nested - * attributes. 
- */ - err = drbd_nla_check_mandatory(maxtype, nla); - if (err) - return ERR_PTR(err); - return nla_find_nested(nla, attrtype); -} diff --git a/drivers/block/drbd/drbd_nla.h b/drivers/block/drbd/drbd_nla.h deleted file mode 100644 index d3555df0d353..000000000000 --- a/drivers/block/drbd/drbd_nla.h +++ /dev/null @@ -1,9 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef __DRBD_NLA_H -#define __DRBD_NLA_H - -extern int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla, - const struct nla_policy *policy); -extern struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype); - -#endif /* __DRBD_NLA_H */ diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h index 53f44b8cd75f..f53c534aba0c 100644 --- a/include/linux/drbd_genl.h +++ b/include/linux/drbd_genl.h @@ -87,7 +87,7 @@ */ GENL_struct(DRBD_NLA_CFG_REPLY, 1, drbd_cfg_reply, /* "arbitrary" size strings, nla_policy.len = 0 */ - __str_field(1, DRBD_GENLA_F_MANDATORY, info_text, 0) + __str_field(1, 0, info_text, 0) ) /* Configuration requests typically need a context to operate on. @@ -96,10 +96,10 @@ GENL_struct(DRBD_NLA_CFG_REPLY, 1, drbd_cfg_reply, * and/or the replication group (aka resource) name, * and the volume id within the resource. 
*/ GENL_struct(DRBD_NLA_CFG_CONTEXT, 2, drbd_cfg_context, - __u32_field(1, DRBD_GENLA_F_MANDATORY, ctx_volume) - __str_field(2, DRBD_GENLA_F_MANDATORY, ctx_resource_name, 128) - __bin_field(3, DRBD_GENLA_F_MANDATORY, ctx_my_addr, 128) - __bin_field(4, DRBD_GENLA_F_MANDATORY, ctx_peer_addr, 128) + __u32_field(1, 0, ctx_volume) + __str_field(2, 0, ctx_resource_name, 128) + __bin_field(3, 0, ctx_my_addr, 128) + __bin_field(4, 0, ctx_peer_addr, 128) ) GENL_struct(DRBD_NLA_DISK_CONF, 3, disk_conf, @@ -108,86 +108,86 @@ GENL_struct(DRBD_NLA_DISK_CONF, 3, disk_conf, __s32_field(3, DRBD_F_REQUIRED | DRBD_F_INVARIANT, meta_dev_idx) /* use the resize command to try and change the disk_size */ - __u64_field(4, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, disk_size) + __u64_field(4, DRBD_F_INVARIANT, disk_size) /* we could change the max_bio_bvecs, * but it won't propagate through the stack */ - __u32_field(5, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, max_bio_bvecs) + __u32_field(5, DRBD_F_INVARIANT, max_bio_bvecs) - __u32_field_def(6, DRBD_GENLA_F_MANDATORY, on_io_error, DRBD_ON_IO_ERROR_DEF) - __u32_field_def(7, DRBD_GENLA_F_MANDATORY, fencing, DRBD_FENCING_DEF) + __u32_field_def(6, 0, on_io_error, DRBD_ON_IO_ERROR_DEF) + __u32_field_def(7, 0, fencing, DRBD_FENCING_DEF) - __u32_field_def(8, DRBD_GENLA_F_MANDATORY, resync_rate, DRBD_RESYNC_RATE_DEF) - __s32_field_def(9, DRBD_GENLA_F_MANDATORY, resync_after, DRBD_MINOR_NUMBER_DEF) - __u32_field_def(10, DRBD_GENLA_F_MANDATORY, al_extents, DRBD_AL_EXTENTS_DEF) - __u32_field_def(11, DRBD_GENLA_F_MANDATORY, c_plan_ahead, DRBD_C_PLAN_AHEAD_DEF) - __u32_field_def(12, DRBD_GENLA_F_MANDATORY, c_delay_target, DRBD_C_DELAY_TARGET_DEF) - __u32_field_def(13, DRBD_GENLA_F_MANDATORY, c_fill_target, DRBD_C_FILL_TARGET_DEF) - __u32_field_def(14, DRBD_GENLA_F_MANDATORY, c_max_rate, DRBD_C_MAX_RATE_DEF) - __u32_field_def(15, DRBD_GENLA_F_MANDATORY, c_min_rate, DRBD_C_MIN_RATE_DEF) - __u32_field_def(20, DRBD_GENLA_F_MANDATORY, disk_timeout, 
DRBD_DISK_TIMEOUT_DEF) + __u32_field_def(8, 0, resync_rate, DRBD_RESYNC_RATE_DEF) + __s32_field_def(9, 0, resync_after, DRBD_MINOR_NUMBER_DEF) + __u32_field_def(10, 0, al_extents, DRBD_AL_EXTENTS_DEF) + __u32_field_def(11, 0, c_plan_ahead, DRBD_C_PLAN_AHEAD_DEF) + __u32_field_def(12, 0, c_delay_target, DRBD_C_DELAY_TARGET_DEF) + __u32_field_def(13, 0, c_fill_target, DRBD_C_FILL_TARGET_DEF) + __u32_field_def(14, 0, c_max_rate, DRBD_C_MAX_RATE_DEF) + __u32_field_def(15, 0, c_min_rate, DRBD_C_MIN_RATE_DEF) + __u32_field_def(20, 0, disk_timeout, DRBD_DISK_TIMEOUT_DEF) __u32_field_def(21, 0 /* OPTIONAL */, read_balancing, DRBD_READ_BALANCING_DEF) __u32_field_def(25, 0 /* OPTIONAL */, rs_discard_granularity, DRBD_RS_DISCARD_GRANULARITY_DEF) - __flg_field_def(16, DRBD_GENLA_F_MANDATORY, disk_barrier, DRBD_DISK_BARRIER_DEF) - __flg_field_def(17, DRBD_GENLA_F_MANDATORY, disk_flushes, DRBD_DISK_FLUSHES_DEF) - __flg_field_def(18, DRBD_GENLA_F_MANDATORY, disk_drain, DRBD_DISK_DRAIN_DEF) - __flg_field_def(19, DRBD_GENLA_F_MANDATORY, md_flushes, DRBD_MD_FLUSHES_DEF) + __flg_field_def(16, 0, disk_barrier, DRBD_DISK_BARRIER_DEF) + __flg_field_def(17, 0, disk_flushes, DRBD_DISK_FLUSHES_DEF) + __flg_field_def(18, 0, disk_drain, DRBD_DISK_DRAIN_DEF) + __flg_field_def(19, 0, md_flushes, DRBD_MD_FLUSHES_DEF) __flg_field_def(23, 0 /* OPTIONAL */, al_updates, DRBD_AL_UPDATES_DEF) __flg_field_def(24, 0 /* OPTIONAL */, discard_zeroes_if_aligned, DRBD_DISCARD_ZEROES_IF_ALIGNED_DEF) __flg_field_def(26, 0 /* OPTIONAL */, disable_write_same, DRBD_DISABLE_WRITE_SAME_DEF) ) GENL_struct(DRBD_NLA_RESOURCE_OPTS, 4, res_opts, - __str_field_def(1, DRBD_GENLA_F_MANDATORY, cpu_mask, DRBD_CPU_MASK_SIZE) - __u32_field_def(2, DRBD_GENLA_F_MANDATORY, on_no_data, DRBD_ON_NO_DATA_DEF) + __str_field_def(1, 0, cpu_mask, DRBD_CPU_MASK_SIZE) + __u32_field_def(2, 0, on_no_data, DRBD_ON_NO_DATA_DEF) ) GENL_struct(DRBD_NLA_NET_CONF, 5, net_conf, - __str_field_def(1, DRBD_GENLA_F_MANDATORY | DRBD_F_SENSITIVE, + 
__str_field_def(1, DRBD_F_SENSITIVE, shared_secret, SHARED_SECRET_MAX) - __str_field_def(2, DRBD_GENLA_F_MANDATORY, cram_hmac_alg, SHARED_SECRET_MAX) - __str_field_def(3, DRBD_GENLA_F_MANDATORY, integrity_alg, SHARED_SECRET_MAX) - __str_field_def(4, DRBD_GENLA_F_MANDATORY, verify_alg, SHARED_SECRET_MAX) - __str_field_def(5, DRBD_GENLA_F_MANDATORY, csums_alg, SHARED_SECRET_MAX) - __u32_field_def(6, DRBD_GENLA_F_MANDATORY, wire_protocol, DRBD_PROTOCOL_DEF) - __u32_field_def(7, DRBD_GENLA_F_MANDATORY, connect_int, DRBD_CONNECT_INT_DEF) - __u32_field_def(8, DRBD_GENLA_F_MANDATORY, timeout, DRBD_TIMEOUT_DEF) - __u32_field_def(9, DRBD_GENLA_F_MANDATORY, ping_int, DRBD_PING_INT_DEF) - __u32_field_def(10, DRBD_GENLA_F_MANDATORY, ping_timeo, DRBD_PING_TIMEO_DEF) - __u32_field_def(11, DRBD_GENLA_F_MANDATORY, sndbuf_size, DRBD_SNDBUF_SIZE_DEF) - __u32_field_def(12, DRBD_GENLA_F_MANDATORY, rcvbuf_size, DRBD_RCVBUF_SIZE_DEF) - __u32_field_def(13, DRBD_GENLA_F_MANDATORY, ko_count, DRBD_KO_COUNT_DEF) - __u32_field_def(14, DRBD_GENLA_F_MANDATORY, max_buffers, DRBD_MAX_BUFFERS_DEF) - __u32_field_def(15, DRBD_GENLA_F_MANDATORY, max_epoch_size, DRBD_MAX_EPOCH_SIZE_DEF) - __u32_field_def(16, DRBD_GENLA_F_MANDATORY, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF) - __u32_field_def(17, DRBD_GENLA_F_MANDATORY, after_sb_0p, DRBD_AFTER_SB_0P_DEF) - __u32_field_def(18, DRBD_GENLA_F_MANDATORY, after_sb_1p, DRBD_AFTER_SB_1P_DEF) - __u32_field_def(19, DRBD_GENLA_F_MANDATORY, after_sb_2p, DRBD_AFTER_SB_2P_DEF) - __u32_field_def(20, DRBD_GENLA_F_MANDATORY, rr_conflict, DRBD_RR_CONFLICT_DEF) - __u32_field_def(21, DRBD_GENLA_F_MANDATORY, on_congestion, DRBD_ON_CONGESTION_DEF) - __u32_field_def(22, DRBD_GENLA_F_MANDATORY, cong_fill, DRBD_CONG_FILL_DEF) - __u32_field_def(23, DRBD_GENLA_F_MANDATORY, cong_extents, DRBD_CONG_EXTENTS_DEF) - __flg_field_def(24, DRBD_GENLA_F_MANDATORY, two_primaries, DRBD_ALLOW_TWO_PRIMARIES_DEF) - __flg_field(25, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, discard_my_data) - 
__flg_field_def(26, DRBD_GENLA_F_MANDATORY, tcp_cork, DRBD_TCP_CORK_DEF) - __flg_field_def(27, DRBD_GENLA_F_MANDATORY, always_asbp, DRBD_ALWAYS_ASBP_DEF) - __flg_field(28, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, tentative) - __flg_field_def(29, DRBD_GENLA_F_MANDATORY, use_rle, DRBD_USE_RLE_DEF) - /* 9: __u32_field_def(30, DRBD_GENLA_F_MANDATORY, fencing_policy, DRBD_FENCING_DEF) */ - /* 9: __str_field_def(31, DRBD_GENLA_F_MANDATORY, name, SHARED_SECRET_MAX) */ + __str_field_def(2, 0, cram_hmac_alg, SHARED_SECRET_MAX) + __str_field_def(3, 0, integrity_alg, SHARED_SECRET_MAX) + __str_field_def(4, 0, verify_alg, SHARED_SECRET_MAX) + __str_field_def(5, 0, csums_alg, SHARED_SECRET_MAX) + __u32_field_def(6, 0, wire_protocol, DRBD_PROTOCOL_DEF) + __u32_field_def(7, 0, connect_int, DRBD_CONNECT_INT_DEF) + __u32_field_def(8, 0, timeout, DRBD_TIMEOUT_DEF) + __u32_field_def(9, 0, ping_int, DRBD_PING_INT_DEF) + __u32_field_def(10, 0, ping_timeo, DRBD_PING_TIMEO_DEF) + __u32_field_def(11, 0, sndbuf_size, DRBD_SNDBUF_SIZE_DEF) + __u32_field_def(12, 0, rcvbuf_size, DRBD_RCVBUF_SIZE_DEF) + __u32_field_def(13, 0, ko_count, DRBD_KO_COUNT_DEF) + __u32_field_def(14, 0, max_buffers, DRBD_MAX_BUFFERS_DEF) + __u32_field_def(15, 0, max_epoch_size, DRBD_MAX_EPOCH_SIZE_DEF) + __u32_field_def(16, 0, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF) + __u32_field_def(17, 0, after_sb_0p, DRBD_AFTER_SB_0P_DEF) + __u32_field_def(18, 0, after_sb_1p, DRBD_AFTER_SB_1P_DEF) + __u32_field_def(19, 0, after_sb_2p, DRBD_AFTER_SB_2P_DEF) + __u32_field_def(20, 0, rr_conflict, DRBD_RR_CONFLICT_DEF) + __u32_field_def(21, 0, on_congestion, DRBD_ON_CONGESTION_DEF) + __u32_field_def(22, 0, cong_fill, DRBD_CONG_FILL_DEF) + __u32_field_def(23, 0, cong_extents, DRBD_CONG_EXTENTS_DEF) + __flg_field_def(24, 0, two_primaries, DRBD_ALLOW_TWO_PRIMARIES_DEF) + __flg_field(25, DRBD_F_INVARIANT, discard_my_data) + __flg_field_def(26, 0, tcp_cork, DRBD_TCP_CORK_DEF) + __flg_field_def(27, 0, always_asbp, 
DRBD_ALWAYS_ASBP_DEF) + __flg_field(28, DRBD_F_INVARIANT, tentative) + __flg_field_def(29, 0, use_rle, DRBD_USE_RLE_DEF) + /* 9: __u32_field_def(30, 0, fencing_policy, DRBD_FENCING_DEF) */ + /* 9: __str_field_def(31, 0, name, SHARED_SECRET_MAX) */ /* 9: __u32_field(32, DRBD_F_REQUIRED | DRBD_F_INVARIANT, peer_node_id) */ __flg_field_def(33, 0 /* OPTIONAL */, csums_after_crash_only, DRBD_CSUMS_AFTER_CRASH_ONLY_DEF) __u32_field_def(34, 0 /* OPTIONAL */, sock_check_timeo, DRBD_SOCKET_CHECK_TIMEO_DEF) ) GENL_struct(DRBD_NLA_SET_ROLE_PARMS, 6, set_role_parms, - __flg_field(1, DRBD_GENLA_F_MANDATORY, assume_uptodate) + __flg_field(1, 0, assume_uptodate) ) GENL_struct(DRBD_NLA_RESIZE_PARMS, 7, resize_parms, - __u64_field(1, DRBD_GENLA_F_MANDATORY, resize_size) - __flg_field(2, DRBD_GENLA_F_MANDATORY, resize_force) - __flg_field(3, DRBD_GENLA_F_MANDATORY, no_resync) + __u64_field(1, 0, resize_size) + __flg_field(2, 0, resize_force) + __flg_field(3, 0, no_resync) __u32_field_def(4, 0 /* OPTIONAL */, al_stripes, DRBD_AL_STRIPES_DEF) __u32_field_def(5, 0 /* OPTIONAL */, al_stripe_size, DRBD_AL_STRIPE_SIZE_DEF) ) @@ -195,31 +195,31 @@ GENL_struct(DRBD_NLA_RESIZE_PARMS, 7, resize_parms, GENL_struct(DRBD_NLA_STATE_INFO, 8, state_info, /* the reason of the broadcast, * if this is an event triggered broadcast. */ - __u32_field(1, DRBD_GENLA_F_MANDATORY, sib_reason) + __u32_field(1, 0, sib_reason) __u32_field(2, DRBD_F_REQUIRED, current_state) - __u64_field(3, DRBD_GENLA_F_MANDATORY, capacity) - __u64_field(4, DRBD_GENLA_F_MANDATORY, ed_uuid) + __u64_field(3, 0, capacity) + __u64_field(4, 0, ed_uuid) /* These are for broadcast from after state change work. * prev_state and new_state are from the moment the state change took * place, new_state is not neccessarily the same as current_state, * there may have been more state changes since. Which will be * broadcasted soon, in their respective after state change work. 
*/ - __u32_field(5, DRBD_GENLA_F_MANDATORY, prev_state) - __u32_field(6, DRBD_GENLA_F_MANDATORY, new_state) + __u32_field(5, 0, prev_state) + __u32_field(6, 0, new_state) /* if we have a local disk: */ - __bin_field(7, DRBD_GENLA_F_MANDATORY, uuids, (UI_SIZE*sizeof(__u64))) - __u32_field(8, DRBD_GENLA_F_MANDATORY, disk_flags) - __u64_field(9, DRBD_GENLA_F_MANDATORY, bits_total) - __u64_field(10, DRBD_GENLA_F_MANDATORY, bits_oos) + __bin_field(7, 0, uuids, (UI_SIZE*sizeof(__u64))) + __u32_field(8, 0, disk_flags) + __u64_field(9, 0, bits_total) + __u64_field(10, 0, bits_oos) /* and in case resync or online verify is active */ - __u64_field(11, DRBD_GENLA_F_MANDATORY, bits_rs_total) - __u64_field(12, DRBD_GENLA_F_MANDATORY, bits_rs_failed) + __u64_field(11, 0, bits_rs_total) + __u64_field(12, 0, bits_rs_failed) /* for pre and post notifications of helper execution */ - __str_field(13, DRBD_GENLA_F_MANDATORY, helper, 32) - __u32_field(14, DRBD_GENLA_F_MANDATORY, helper_exit_code) + __str_field(13, 0, helper, 32) + __u32_field(14, 0, helper_exit_code) __u64_field(15, 0, send_cnt) __u64_field(16, 0, recv_cnt) @@ -233,12 +233,12 @@ GENL_struct(DRBD_NLA_STATE_INFO, 8, state_info, ) GENL_struct(DRBD_NLA_START_OV_PARMS, 9, start_ov_parms, - __u64_field(1, DRBD_GENLA_F_MANDATORY, ov_start_sector) - __u64_field(2, DRBD_GENLA_F_MANDATORY, ov_stop_sector) + __u64_field(1, 0, ov_start_sector) + __u64_field(2, 0, ov_stop_sector) ) GENL_struct(DRBD_NLA_NEW_C_UUID_PARMS, 10, new_c_uuid_parms, - __flg_field(1, DRBD_GENLA_F_MANDATORY, clear_bm) + __flg_field(1, 0, clear_bm) ) GENL_struct(DRBD_NLA_TIMEOUT_PARMS, 11, timeout_parms, @@ -246,11 +246,11 @@ GENL_struct(DRBD_NLA_TIMEOUT_PARMS, 11, timeout_parms, ) GENL_struct(DRBD_NLA_DISCONNECT_PARMS, 12, disconnect_parms, - __flg_field(1, DRBD_GENLA_F_MANDATORY, force_disconnect) + __flg_field(1, 0, force_disconnect) ) GENL_struct(DRBD_NLA_DETACH_PARMS, 13, detach_parms, - __flg_field(1, DRBD_GENLA_F_MANDATORY, force_detach) + 
__flg_field(1, 0, force_detach) ) GENL_struct(DRBD_NLA_RESOURCE_INFO, 15, resource_info, @@ -315,12 +315,12 @@ GENL_struct(DRBD_NLA_PEER_DEVICE_STATISTICS, 22, peer_device_statistics, ) GENL_struct(DRBD_NLA_NOTIFICATION_HEADER, 23, drbd_notification_header, - __u32_field(1, DRBD_GENLA_F_MANDATORY, nh_type) + __u32_field(1, 0, nh_type) ) GENL_struct(DRBD_NLA_HELPER, 24, drbd_helper_info, - __str_field(1, DRBD_GENLA_F_MANDATORY, helper_name, 32) - __u32_field(2, DRBD_GENLA_F_MANDATORY, helper_status) + __str_field(1, 0, helper_name, 32) + __u32_field(2, 0, helper_status) ) /* @@ -333,9 +333,9 @@ GENL_notification( DRBD_EVENT, 1, events, GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) GENL_tla_expected(DRBD_NLA_STATE_INFO, DRBD_F_REQUIRED) - GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_DISK_CONF, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_SYNCER_CONF, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_NET_CONF, 0) + GENL_tla_expected(DRBD_NLA_DISK_CONF, 0) + GENL_tla_expected(DRBD_NLA_SYNCER_CONF, 0) ) /* query kernel for specific or all info */ @@ -349,7 +349,7 @@ GENL_op( ), /* To select the object .doit. * Or a subset of objects in .dumpit. 
*/ - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, 0) ) /* add DRBD minor devices as volumes to resources */ @@ -367,7 +367,7 @@ GENL_op(DRBD_ADM_DEL_RESOURCE, 8, GENL_doit(drbd_adm_del_resource), GENL_op(DRBD_ADM_RESOURCE_OPTS, 9, GENL_doit(drbd_adm_resource_opts), GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) - GENL_tla_expected(DRBD_NLA_RESOURCE_OPTS, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_RESOURCE_OPTS, 0) ) GENL_op( @@ -403,7 +403,7 @@ GENL_op( DRBD_ADM_RESIZE, 13, GENL_doit(drbd_adm_resize), GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) - GENL_tla_expected(DRBD_NLA_RESIZE_PARMS, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_RESIZE_PARMS, 0) ) GENL_op( @@ -424,18 +424,18 @@ GENL_op( DRBD_ADM_NEW_C_UUID, 16, GENL_doit(drbd_adm_new_c_uuid), GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) - GENL_tla_expected(DRBD_NLA_NEW_C_UUID_PARMS, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_NEW_C_UUID_PARMS, 0) ) GENL_op( DRBD_ADM_START_OV, 17, GENL_doit(drbd_adm_start_ov), - GENL_tla_expected(DRBD_NLA_START_OV_PARMS, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_START_OV_PARMS, 0) ) GENL_op(DRBD_ADM_DETACH, 18, GENL_doit(drbd_adm_detach), GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) - GENL_tla_expected(DRBD_NLA_DETACH_PARMS, DRBD_GENLA_F_MANDATORY)) + GENL_tla_expected(DRBD_NLA_DETACH_PARMS, 0)) GENL_op(DRBD_ADM_INVALIDATE, 19, GENL_doit(drbd_adm_invalidate), GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) @@ -460,36 +460,36 @@ GENL_op(DRBD_ADM_GET_RESOURCES, 30, GENL_op_init( .dumpit = drbd_adm_dump_resources, ), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_RESOURCE_INFO, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_RESOURCE_STATISTICS, DRBD_GENLA_F_MANDATORY)) + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, 0) + GENL_tla_expected(DRBD_NLA_RESOURCE_INFO, 0) + 
GENL_tla_expected(DRBD_NLA_RESOURCE_STATISTICS, 0)) GENL_op(DRBD_ADM_GET_DEVICES, 31, GENL_op_init( .dumpit = drbd_adm_dump_devices, .done = drbd_adm_dump_devices_done, ), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_DEVICE_INFO, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_DEVICE_STATISTICS, DRBD_GENLA_F_MANDATORY)) + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, 0) + GENL_tla_expected(DRBD_NLA_DEVICE_INFO, 0) + GENL_tla_expected(DRBD_NLA_DEVICE_STATISTICS, 0)) GENL_op(DRBD_ADM_GET_CONNECTIONS, 32, GENL_op_init( .dumpit = drbd_adm_dump_connections, .done = drbd_adm_dump_connections_done, ), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_CONNECTION_INFO, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_CONNECTION_STATISTICS, DRBD_GENLA_F_MANDATORY)) + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, 0) + GENL_tla_expected(DRBD_NLA_CONNECTION_INFO, 0) + GENL_tla_expected(DRBD_NLA_CONNECTION_STATISTICS, 0)) GENL_op(DRBD_ADM_GET_PEER_DEVICES, 33, GENL_op_init( .dumpit = drbd_adm_dump_peer_devices, .done = drbd_adm_dump_peer_devices_done, ), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_PEER_DEVICE_INFO, DRBD_GENLA_F_MANDATORY) - GENL_tla_expected(DRBD_NLA_PEER_DEVICE_STATISTICS, DRBD_GENLA_F_MANDATORY)) + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, 0) + GENL_tla_expected(DRBD_NLA_PEER_DEVICE_INFO, 0) + GENL_tla_expected(DRBD_NLA_PEER_DEVICE_STATISTICS, 0)) GENL_notification( DRBD_RESOURCE_STATE, 34, events, @@ -524,7 +524,7 @@ GENL_op( GENL_op_init( .dumpit = drbd_adm_get_initial_state, ), - GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)) + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, 0)) GENL_notification( DRBD_HELPER, 40, events, diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h index 6edcac85155e..a7d36c9ea924 100644 --- a/include/linux/genl_magic_func.h +++ 
b/include/linux/genl_magic_func.h @@ -149,7 +149,8 @@ static int __ ## s_name ## _from_attrs(struct s_name *s, \ if (!tla) \ return -ENOMSG; \ DPRINT_TLA(#s_name, "<=-", #tag_name); \ - err = drbd_nla_parse_nested(ntb, maxtype, tla, s_name ## _nl_policy); \ + err = nla_parse_nested_deprecated(ntb, maxtype, tla, \ + s_name ## _nl_policy, NULL); \ if (err) \ return err; \ \ diff --git a/include/linux/genl_magic_struct.h b/include/linux/genl_magic_struct.h index 621b87a87d74..2200cedd160a 100644 --- a/include/linux/genl_magic_struct.h +++ b/include/linux/genl_magic_struct.h @@ -25,16 +25,6 @@ extern void CONCATENATE(GENL_MAGIC_FAMILY, _genl_unregister)(void); * Extension of genl attribute validation policies {{{2 */ -/* - * @DRBD_GENLA_F_MANDATORY: By default, netlink ignores attributes it does not - * know about. This flag can be set in nlattr->nla_type to indicate that this - * attribute must not be ignored. - * - * We check and remove this flag in drbd_nla_check_mandatory() before - * validating the attribute types and lengths via nla_parse_nested(). 
- */ -#define DRBD_GENLA_F_MANDATORY (1 << 14) - /* * Flags specific to drbd and not visible at the netlink layer, used in * _from_attrs and _to_skb: @@ -52,7 +42,6 @@ extern void CONCATENATE(GENL_MAGIC_FAMILY, _genl_unregister)(void); #define DRBD_F_SENSITIVE (1 << 1) #define DRBD_F_INVARIANT (1 << 2) -#define __nla_type(x) ((__u16)((x) & NLA_TYPE_MASK & ~DRBD_GENLA_F_MANDATORY)) /* }}}1 * MAGIC @@ -158,12 +147,12 @@ enum { \ #undef __field #define __field(attr_nr, attr_flag, name, nla_type, type, \ __get, __put, __is_signed) \ - T_ ## name = (__u16)(attr_nr | ((attr_flag) & DRBD_GENLA_F_MANDATORY)), + T_ ## name = (__u16)(attr_nr), #undef __array #define __array(attr_nr, attr_flag, name, nla_type, type, \ maxlen, __get, __put, __is_signed) \ - T_ ## name = (__u16)(attr_nr | ((attr_flag) & DRBD_GENLA_F_MANDATORY)), + T_ ## name = (__u16)(attr_nr), #include GENL_MAGIC_INCLUDE_FILE From 078d1d8e688d75419abfedcae47eab8e42b991bb Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Sun, 8 Mar 2026 19:42:02 -0400 Subject: [PATCH 112/146] md/raid0: use kvzalloc/kvfree for strip_zone and devlist allocations syzbot reported a WARNING at mm/page_alloc.c:__alloc_frozen_pages_noprof() triggered by create_strip_zones() in the RAID0 driver. When raid_disks is large, the allocation size exceeds MAX_PAGE_ORDER (4MB on x86), causing WARN_ON_ONCE_GFP(order > MAX_PAGE_ORDER). Convert the strip_zone and devlist allocations from kzalloc/kzalloc_objs to kvzalloc/kvzalloc_objs, which first attempts a contiguous allocation with __GFP_NOWARN and then falls back to vmalloc for large sizes. Convert the corresponding kfree calls to kvfree. Both arrays are pure metadata lookup tables (arrays of pointers and zone descriptors) accessed only via indexing, so they do not require physically contiguous memory. 
Reported-by: syzbot+924649752adf0d3ac9dd@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/69adaba8.a00a0220.b130.0005.GAE@google.com/ Signed-off-by: Gregory Price Reviewed-by: Yu Kuai Reviewed-by: Li Nan Link: https://lore.kernel.org/linux-raid/20260308234202.3118119-1-gourry@gourry.net/ Signed-off-by: Yu Kuai --- drivers/md/raid0.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index ef0045db409f..5e38a51e349a 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -143,13 +143,13 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) } err = -ENOMEM; - conf->strip_zone = kzalloc_objs(struct strip_zone, conf->nr_strip_zones); + conf->strip_zone = kvzalloc_objs(struct strip_zone, conf->nr_strip_zones); if (!conf->strip_zone) goto abort; - conf->devlist = kzalloc(array3_size(sizeof(struct md_rdev *), - conf->nr_strip_zones, - mddev->raid_disks), - GFP_KERNEL); + conf->devlist = kvzalloc(array3_size(sizeof(struct md_rdev *), + conf->nr_strip_zones, + mddev->raid_disks), + GFP_KERNEL); if (!conf->devlist) goto abort; @@ -291,8 +291,8 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) return 0; abort: - kfree(conf->strip_zone); - kfree(conf->devlist); + kvfree(conf->strip_zone); + kvfree(conf->devlist); kfree(conf); *private_conf = ERR_PTR(err); return err; @@ -373,8 +373,8 @@ static void raid0_free(struct mddev *mddev, void *priv) { struct r0conf *conf = priv; - kfree(conf->strip_zone); - kfree(conf->devlist); + kvfree(conf->strip_zone); + kvfree(conf->devlist); kfree(conf); } From e4979f4fac4d6bbe757be50441b45e28e6bf7360 Mon Sep 17 00:00:00 2001 From: Abd-Alrhman Masalkhi Date: Sat, 28 Mar 2026 22:35:22 +0300 Subject: [PATCH 113/146] md: remove unused static md_wq workqueue The md_wq workqueue is defined as static and initialized in md_init(), but it is not used anywhere within md.c. 
All asynchronous and deferred work in this file is handled via md_misc_wq or dedicated md threads. Fixes: b75197e86e6d3 ("md: Remove flush handling") Signed-off-by: Abd-Alrhman Masalkhi Link: https://lore.kernel.org/linux-raid/20260328193522.3624-1-abd.masalkhi@gmail.com/ Signed-off-by: Yu Kuai --- drivers/md/md.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 02efe9700256..e0a935f5a3e9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -84,7 +84,6 @@ static DEFINE_XARRAY(md_submodule); static const struct kobj_type md_ktype; static DECLARE_WAIT_QUEUE_HEAD(resync_wait); -static struct workqueue_struct *md_wq; /* * This workqueue is used for sync_work to register new sync_thread, and for @@ -10511,10 +10510,6 @@ static int __init md_init(void) goto err_bitmap; ret = -ENOMEM; - md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM | WQ_PERCPU, 0); - if (!md_wq) - goto err_wq; - md_misc_wq = alloc_workqueue("md_misc", WQ_PERCPU, 0); if (!md_misc_wq) goto err_misc_wq; @@ -10539,8 +10534,6 @@ err_mdp: err_md: destroy_workqueue(md_misc_wq); err_misc_wq: - destroy_workqueue(md_wq); -err_wq: md_llbitmap_exit(); err_bitmap: md_bitmap_exit(); @@ -10849,7 +10842,6 @@ static __exit void md_exit(void) spin_unlock(&all_mddevs_lock); destroy_workqueue(md_misc_wq); - destroy_workqueue(md_wq); md_bitmap_exit(); } From b0cc3ae97e893bf54bbce447f4e9fd2e0b88bff9 Mon Sep 17 00:00:00 2001 From: Junrui Luo Date: Sat, 4 Apr 2026 15:44:35 +0800 Subject: [PATCH 114/146] md/raid5: validate payload size before accessing journal metadata r5c_recovery_analyze_meta_block() and r5l_recovery_verify_data_checksum_for_mb() iterate over payloads in a journal metadata block using on-disk payload size fields without validating them against the remaining space in the metadata block. A corrupted journal containing payload sizes extending beyond the PAGE_SIZE boundary can cause out-of-bounds reads when accessing payload fields or computing offsets. 
Add bounds validation for each payload type to ensure the full payload fits within meta_size before processing. Fixes: b4c625c67362 ("md/r5cache: r5cache recovery: part 1") Cc: stable@vger.kernel.org Signed-off-by: Junrui Luo Link: https://lore.kernel.org/linux-raid/SYBPR01MB78815E78D829BB86CD7C8015AF5FA@SYBPR01MB7881.ausprd01.prod.outlook.com/ Signed-off-by: Yu Kuai --- drivers/md/raid5-cache.c | 48 +++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 66b10cbda96d..7b7546bfa21f 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -2002,15 +2002,27 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, return -ENOMEM; while (mb_offset < le32_to_cpu(mb->meta_size)) { + sector_t payload_len; + payload = (void *)mb + mb_offset; payload_flush = (void *)mb + mb_offset; if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) { + payload_len = sizeof(struct r5l_payload_data_parity) + + (sector_t)sizeof(__le32) * + (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); + if (mb_offset + payload_len > le32_to_cpu(mb->meta_size)) + goto mismatch; if (r5l_recovery_verify_data_checksum( log, ctx, page, log_offset, payload->checksum[0]) < 0) goto mismatch; } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) { + payload_len = sizeof(struct r5l_payload_data_parity) + + (sector_t)sizeof(__le32) * + (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); + if (mb_offset + payload_len > le32_to_cpu(mb->meta_size)) + goto mismatch; if (r5l_recovery_verify_data_checksum( log, ctx, page, log_offset, payload->checksum[0]) < 0) @@ -2023,22 +2035,18 @@ r5l_recovery_verify_data_checksum_for_mb(struct r5l_log *log, payload->checksum[1]) < 0) goto mismatch; } else if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) { - /* nothing to do for R5LOG_PAYLOAD_FLUSH here */ + payload_len = sizeof(struct r5l_payload_flush) + + 
(sector_t)le32_to_cpu(payload_flush->size); + if (mb_offset + payload_len > le32_to_cpu(mb->meta_size)) + goto mismatch; } else /* not R5LOG_PAYLOAD_DATA/PARITY/FLUSH */ goto mismatch; - if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) { - mb_offset += sizeof(struct r5l_payload_flush) + - le32_to_cpu(payload_flush->size); - } else { - /* DATA or PARITY payload */ + if (le16_to_cpu(payload->header.type) != R5LOG_PAYLOAD_FLUSH) { log_offset = r5l_ring_add(log, log_offset, le32_to_cpu(payload->size)); - mb_offset += sizeof(struct r5l_payload_data_parity) + - sizeof(__le32) * - (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); } - + mb_offset += payload_len; } put_page(page); @@ -2089,6 +2097,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); while (mb_offset < le32_to_cpu(mb->meta_size)) { + sector_t payload_len; int dd; payload = (void *)mb + mb_offset; @@ -2097,6 +2106,12 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_FLUSH) { int i, count; + payload_len = sizeof(struct r5l_payload_flush) + + (sector_t)le32_to_cpu(payload_flush->size); + if (mb_offset + payload_len > + le32_to_cpu(mb->meta_size)) + return -EINVAL; + count = le32_to_cpu(payload_flush->size) / sizeof(__le64); for (i = 0; i < count; ++i) { stripe_sect = le64_to_cpu(payload_flush->flush_stripes[i]); @@ -2110,12 +2125,17 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, } } - mb_offset += sizeof(struct r5l_payload_flush) + - le32_to_cpu(payload_flush->size); + mb_offset += payload_len; continue; } /* DATA or PARITY payload */ + payload_len = sizeof(struct r5l_payload_data_parity) + + (sector_t)sizeof(__le32) * + (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); + if (mb_offset + payload_len > le32_to_cpu(mb->meta_size)) + return -EINVAL; + stripe_sect = (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) ? 
raid5_compute_sector( conf, le64_to_cpu(payload->location), 0, &dd, @@ -2180,9 +2200,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log, log_offset = r5l_ring_add(log, log_offset, le32_to_cpu(payload->size)); - mb_offset += sizeof(struct r5l_payload_data_parity) + - sizeof(__le32) * - (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); + mb_offset += payload_len; } return 0; From 09af773650024279a60348e7319d599e6571b15c Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 23 Mar 2026 13:46:42 +0800 Subject: [PATCH 115/146] md: add fallback to correct bitmap_ops on version mismatch If default bitmap version and on-disk version don't match, and mdadm is not the latest version to set bitmap_type, set bitmap_ops based on the disk version. Link: https://lore.kernel.org/linux-raid/20260323054644.3351791-2-yukuai@fnnas.com/ Signed-off-by: Yu Kuai --- drivers/md/md.c | 111 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 110 insertions(+), 1 deletion(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index e0a935f5a3e9..ee01e050ee12 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6454,15 +6454,124 @@ static void md_safemode_timeout(struct timer_list *t) static int start_dirty_degraded; +/* + * Read bitmap superblock and return the bitmap_id based on disk version. + * This is used as fallback when default bitmap version and on-disk version + * don't match, and mdadm is not the latest version to set bitmap_type. 
+ */ +static enum md_submodule_id md_bitmap_get_id_from_sb(struct mddev *mddev) +{ + struct md_rdev *rdev; + struct page *sb_page; + bitmap_super_t *sb; + enum md_submodule_id id = ID_BITMAP_NONE; + sector_t sector; + u32 version; + + if (!mddev->bitmap_info.offset) + return ID_BITMAP_NONE; + + sb_page = alloc_page(GFP_KERNEL); + if (!sb_page) { + pr_warn("md: %s: failed to allocate memory for bitmap\n", + mdname(mddev)); + return ID_BITMAP_NONE; + } + + sector = mddev->bitmap_info.offset; + + rdev_for_each(rdev, mddev) { + u32 iosize; + + if (!test_bit(In_sync, &rdev->flags) || + test_bit(Faulty, &rdev->flags) || + test_bit(Bitmap_sync, &rdev->flags)) + continue; + + iosize = roundup(sizeof(bitmap_super_t), + bdev_logical_block_size(rdev->bdev)); + if (sync_page_io(rdev, sector, iosize, sb_page, REQ_OP_READ, + true)) + goto read_ok; + } + pr_warn("md: %s: failed to read bitmap from any device\n", + mdname(mddev)); + goto out; + +read_ok: + sb = kmap_local_page(sb_page); + if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) { + pr_warn("md: %s: invalid bitmap magic 0x%x\n", + mdname(mddev), le32_to_cpu(sb->magic)); + goto out_unmap; + } + + version = le32_to_cpu(sb->version); + switch (version) { + case BITMAP_MAJOR_LO: + case BITMAP_MAJOR_HI: + case BITMAP_MAJOR_CLUSTERED: + id = ID_BITMAP; + break; + case BITMAP_MAJOR_LOCKLESS: + id = ID_LLBITMAP; + break; + default: + pr_warn("md: %s: unknown bitmap version %u\n", + mdname(mddev), version); + break; + } + +out_unmap: + kunmap_local(sb); +out: + __free_page(sb_page); + return id; +} + static int md_bitmap_create(struct mddev *mddev) { + enum md_submodule_id orig_id = mddev->bitmap_id; + enum md_submodule_id sb_id; + int err; + if (mddev->bitmap_id == ID_BITMAP_NONE) return -EINVAL; if (!mddev_set_bitmap_ops(mddev)) return -ENOENT; - return mddev->bitmap_ops->create(mddev); + err = mddev->bitmap_ops->create(mddev); + if (!err) + return 0; + + /* + * Create failed, if default bitmap version and on-disk version + * doesn't 
match, and mdadm is not the latest version to set + * bitmap_type, set bitmap_ops based on the disk version. + */ + mddev_clear_bitmap_ops(mddev); + + sb_id = md_bitmap_get_id_from_sb(mddev); + if (sb_id == ID_BITMAP_NONE || sb_id == orig_id) + return err; + + pr_info("md: %s: bitmap version mismatch, switching from %d to %d\n", + mdname(mddev), orig_id, sb_id); + + mddev->bitmap_id = sb_id; + if (!mddev_set_bitmap_ops(mddev)) { + mddev->bitmap_id = orig_id; + return -ENOENT; + } + + err = mddev->bitmap_ops->create(mddev); + if (err) { + mddev_clear_bitmap_ops(mddev); + mddev->bitmap_id = orig_id; + } + + return err; } static void md_bitmap_destroy(struct mddev *mddev) From 4403023e2aa7bab0193121d2ec543bea862d7304 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 23 Mar 2026 13:46:43 +0800 Subject: [PATCH 116/146] md/md-llbitmap: add CleanUnwritten state for RAID-5 proactive parity building Add new states to the llbitmap state machine to support proactive XOR parity building for RAID-5 arrays. This allows users to pre-build parity data for unwritten regions before any user data is written. New states added: - BitNeedSyncUnwritten: Transitional state when proactive sync is triggered via sysfs on Unwritten regions. - BitSyncingUnwritten: Proactive sync in progress for unwritten region. - BitCleanUnwritten: XOR parity has been pre-built, but no user data written yet. When user writes to this region, it transitions to BitDirty. New actions added: - BitmapActionProactiveSync: Trigger for proactive XOR parity building. - BitmapActionClearUnwritten: Convert CleanUnwritten/NeedSyncUnwritten/ SyncingUnwritten states back to Unwritten before recovery starts. 
State flows: - Current (lazy): Unwritten -> (write) -> NeedSync -> (sync) -> Dirty -> Clean - New (proactive): Unwritten -> (sysfs) -> NeedSyncUnwritten -> (sync) -> CleanUnwritten - On write to CleanUnwritten: CleanUnwritten -> (write) -> Dirty -> Clean - On disk replacement: CleanUnwritten regions are converted to Unwritten before recovery starts, so recovery only rebuilds regions with user data A new sysfs interface is added at /sys/block/mdX/md/llbitmap/proactive_sync (write-only) to trigger proactive sync. This only works for RAID-456 arrays. Link: https://lore.kernel.org/linux-raid/20260323054644.3351791-3-yukuai@fnnas.com/ Signed-off-by: Yu Kuai --- drivers/md/md-llbitmap.c | 140 +++++++++++++++++++++++++++++++++++---- 1 file changed, 128 insertions(+), 12 deletions(-) diff --git a/drivers/md/md-llbitmap.c b/drivers/md/md-llbitmap.c index cdfecaca216b..f10374242c9a 100644 --- a/drivers/md/md-llbitmap.c +++ b/drivers/md/md-llbitmap.c @@ -208,6 +208,20 @@ enum llbitmap_state { BitNeedSync, /* data is synchronizing */ BitSyncing, + /* + * Proactive sync requested for unwritten region (raid456 only). + * Triggered via sysfs when user wants to pre-build XOR parity + * for regions that have never been written. + */ + BitNeedSyncUnwritten, + /* Proactive sync in progress for unwritten region */ + BitSyncingUnwritten, + /* + * XOR parity has been pre-built for a region that has never had + * user data written. When user writes to this region, it transitions + * to BitDirty. + */ + BitCleanUnwritten, BitStateCount, BitNone = 0xff, }; @@ -232,6 +246,12 @@ enum llbitmap_action { * BitNeedSync. */ BitmapActionStale, + /* + * Proactive sync trigger for raid456 - builds XOR parity for + * Unwritten regions without requiring user data write first. 
+ */ + BitmapActionProactiveSync, + BitmapActionClearUnwritten, BitmapActionCount, /* Init state is BitUnwritten */ BitmapActionInit, @@ -304,6 +324,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = { [BitmapActionDaemon] = BitNone, [BitmapActionDiscard] = BitNone, [BitmapActionStale] = BitNone, + [BitmapActionProactiveSync] = BitNeedSyncUnwritten, + [BitmapActionClearUnwritten] = BitNone, }, [BitClean] = { [BitmapActionStartwrite] = BitDirty, @@ -314,6 +336,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = { [BitmapActionDaemon] = BitNone, [BitmapActionDiscard] = BitUnwritten, [BitmapActionStale] = BitNeedSync, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitNone, }, [BitDirty] = { [BitmapActionStartwrite] = BitNone, @@ -324,6 +348,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = { [BitmapActionDaemon] = BitClean, [BitmapActionDiscard] = BitUnwritten, [BitmapActionStale] = BitNeedSync, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitNone, }, [BitNeedSync] = { [BitmapActionStartwrite] = BitNone, @@ -334,6 +360,8 @@ static char state_machine[BitStateCount][BitmapActionCount] = { [BitmapActionDaemon] = BitNone, [BitmapActionDiscard] = BitUnwritten, [BitmapActionStale] = BitNone, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitNone, }, [BitSyncing] = { [BitmapActionStartwrite] = BitNone, @@ -344,6 +372,44 @@ static char state_machine[BitStateCount][BitmapActionCount] = { [BitmapActionDaemon] = BitNone, [BitmapActionDiscard] = BitUnwritten, [BitmapActionStale] = BitNeedSync, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitNone, + }, + [BitNeedSyncUnwritten] = { + [BitmapActionStartwrite] = BitNeedSync, + [BitmapActionStartsync] = BitSyncingUnwritten, + [BitmapActionEndsync] = BitNone, + [BitmapActionAbortsync] = BitUnwritten, + [BitmapActionReload] = BitUnwritten, + [BitmapActionDaemon] = BitNone, + 
[BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitUnwritten, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitUnwritten, + }, + [BitSyncingUnwritten] = { + [BitmapActionStartwrite] = BitSyncing, + [BitmapActionStartsync] = BitSyncingUnwritten, + [BitmapActionEndsync] = BitCleanUnwritten, + [BitmapActionAbortsync] = BitUnwritten, + [BitmapActionReload] = BitUnwritten, + [BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitUnwritten, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitUnwritten, + }, + [BitCleanUnwritten] = { + [BitmapActionStartwrite] = BitDirty, + [BitmapActionStartsync] = BitNone, + [BitmapActionEndsync] = BitNone, + [BitmapActionAbortsync] = BitNone, + [BitmapActionReload] = BitNone, + [BitmapActionDaemon] = BitNone, + [BitmapActionDiscard] = BitUnwritten, + [BitmapActionStale] = BitUnwritten, + [BitmapActionProactiveSync] = BitNone, + [BitmapActionClearUnwritten] = BitUnwritten, }, }; @@ -376,6 +442,7 @@ static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap, pctl->state[pos] = level_456 ? BitNeedSync : BitDirty; break; case BitClean: + case BitCleanUnwritten: pctl->state[pos] = BitDirty; break; } @@ -383,7 +450,7 @@ static void llbitmap_infect_dirty_bits(struct llbitmap *llbitmap, } static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx, - int offset) + int offset, bool infect) { struct llbitmap_page_ctl *pctl = llbitmap->pctl[idx]; unsigned int io_size = llbitmap->io_size; @@ -398,7 +465,7 @@ static void llbitmap_set_page_dirty(struct llbitmap *llbitmap, int idx, * resync all the dirty bits, hence skip infect new dirty bits to * prevent resync unnecessary data. 
*/ - if (llbitmap->mddev->degraded) { + if (llbitmap->mddev->degraded || !infect) { set_bit(block, pctl->dirty); return; } @@ -438,7 +505,9 @@ static void llbitmap_write(struct llbitmap *llbitmap, enum llbitmap_state state, llbitmap->pctl[idx]->state[bit] = state; if (state == BitDirty || state == BitNeedSync) - llbitmap_set_page_dirty(llbitmap, idx, bit); + llbitmap_set_page_dirty(llbitmap, idx, bit, true); + else if (state == BitNeedSyncUnwritten) + llbitmap_set_page_dirty(llbitmap, idx, bit, false); } static struct page *llbitmap_read_page(struct llbitmap *llbitmap, int idx) @@ -627,11 +696,10 @@ static enum llbitmap_state llbitmap_state_machine(struct llbitmap *llbitmap, goto write_bitmap; } - if (c == BitNeedSync) + if (c == BitNeedSync || c == BitNeedSyncUnwritten) need_resync = !mddev->degraded; state = state_machine[c][action]; - write_bitmap: if (unlikely(mddev->degraded)) { /* For degraded array, mark new data as need sync. */ @@ -658,8 +726,7 @@ write_bitmap: } llbitmap_write(llbitmap, state, start); - - if (state == BitNeedSync) + if (state == BitNeedSync || state == BitNeedSyncUnwritten) need_resync = !mddev->degraded; else if (state == BitDirty && !timer_pending(&llbitmap->pending_timer)) @@ -1229,7 +1296,7 @@ static bool llbitmap_blocks_synced(struct mddev *mddev, sector_t offset) unsigned long p = offset >> llbitmap->chunkshift; enum llbitmap_state c = llbitmap_read(llbitmap, p); - return c == BitClean || c == BitDirty; + return c == BitClean || c == BitDirty || c == BitCleanUnwritten; } static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset) @@ -1243,6 +1310,10 @@ static sector_t llbitmap_skip_sync_blocks(struct mddev *mddev, sector_t offset) if (c == BitUnwritten) return blocks; + /* Skip CleanUnwritten - no user data, will be reset after recovery */ + if (c == BitCleanUnwritten) + return blocks; + /* For degraded array, don't skip */ if (mddev->degraded) return 0; @@ -1261,14 +1332,25 @@ static bool 
llbitmap_start_sync(struct mddev *mddev, sector_t offset, { struct llbitmap *llbitmap = mddev->bitmap; unsigned long p = offset >> llbitmap->chunkshift; + enum llbitmap_state state; + + /* + * Before recovery starts, convert CleanUnwritten to Unwritten. + * This ensures the new disk won't have stale parity data. + */ + if (offset == 0 && test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && + !test_bit(MD_RECOVERY_LAZY_RECOVER, &mddev->recovery)) + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, + BitmapActionClearUnwritten); + /* * Handle one bit at a time, this is much simpler. And it doesn't matter * if md_do_sync() loop more times. */ *blocks = llbitmap->chunksize - (offset & (llbitmap->chunksize - 1)); - return llbitmap_state_machine(llbitmap, p, p, - BitmapActionStartsync) == BitSyncing; + state = llbitmap_state_machine(llbitmap, p, p, BitmapActionStartsync); + return state == BitSyncing || state == BitSyncingUnwritten; } /* Something is wrong, sync_thread stop at @offset */ @@ -1474,9 +1556,15 @@ static ssize_t bits_show(struct mddev *mddev, char *page) } mutex_unlock(&mddev->bitmap_info.mutex); - return sprintf(page, "unwritten %d\nclean %d\ndirty %d\nneed sync %d\nsyncing %d\n", + return sprintf(page, + "unwritten %d\nclean %d\ndirty %d\n" + "need sync %d\nsyncing %d\n" + "need sync unwritten %d\nsyncing unwritten %d\n" + "clean unwritten %d\n", bits[BitUnwritten], bits[BitClean], bits[BitDirty], - bits[BitNeedSync], bits[BitSyncing]); + bits[BitNeedSync], bits[BitSyncing], + bits[BitNeedSyncUnwritten], bits[BitSyncingUnwritten], + bits[BitCleanUnwritten]); } static struct md_sysfs_entry llbitmap_bits = __ATTR_RO(bits); @@ -1549,11 +1637,39 @@ barrier_idle_store(struct mddev *mddev, const char *buf, size_t len) static struct md_sysfs_entry llbitmap_barrier_idle = __ATTR_RW(barrier_idle); +static ssize_t +proactive_sync_store(struct mddev *mddev, const char *buf, size_t len) +{ + struct llbitmap *llbitmap; + + /* Only for RAID-456 */ + if 
(!raid_is_456(mddev)) + return -EINVAL; + + mutex_lock(&mddev->bitmap_info.mutex); + llbitmap = mddev->bitmap; + if (!llbitmap || !llbitmap->pctl) { + mutex_unlock(&mddev->bitmap_info.mutex); + return -ENODEV; + } + + /* Trigger proactive sync on all Unwritten regions */ + llbitmap_state_machine(llbitmap, 0, llbitmap->chunks - 1, + BitmapActionProactiveSync); + + mutex_unlock(&mddev->bitmap_info.mutex); + return len; +} + +static struct md_sysfs_entry llbitmap_proactive_sync = + __ATTR(proactive_sync, 0200, NULL, proactive_sync_store); + static struct attribute *md_llbitmap_attrs[] = { &llbitmap_bits.attr, &llbitmap_metadata.attr, &llbitmap_daemon_sleep.attr, &llbitmap_barrier_idle.attr, + &llbitmap_proactive_sync.attr, NULL }; From e92a5325b5d3bc30730b4842249ba8990a0a92b8 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 23 Mar 2026 13:46:44 +0800 Subject: [PATCH 117/146] md/md-llbitmap: optimize initial sync with write_zeroes_unmap support For RAID-456 arrays with llbitmap, if all underlying disks support write_zeroes with unmap, issue write_zeroes to zero all disk data regions and initialize the bitmap to BitCleanUnwritten instead of BitUnwritten. This optimization skips the initial XOR parity building because: 1. write_zeroes with unmap guarantees zeroed reads after the operation 2. For RAID-456, when all data is zero, parity is automatically consistent (0 XOR 0 XOR ... = 0) 3. BitCleanUnwritten indicates parity is valid but no user data has been written The implementation adds two helper functions: - llbitmap_all_disks_support_wzeroes_unmap(): Checks if all active disks support write_zeroes with unmap - llbitmap_zero_all_disks(): Issues blkdev_issue_zeroout() to each rdev's data region to zero all disks The zeroing and bitmap state setting happens in llbitmap_init_state() during bitmap initialization. If any disk fails to zero, we fall back to BitUnwritten and normal lazy recovery. 
This significantly reduces array initialization time for RAID-456 arrays built on modern NVMe SSDs or other devices that support write_zeroes with unmap. Reviewed-by: Xiao Ni Link: https://lore.kernel.org/linux-raid/20260323054644.3351791-4-yukuai@fnnas.com/ Signed-off-by: Yu Kuai --- drivers/md/md-llbitmap.c | 62 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/drivers/md/md-llbitmap.c b/drivers/md/md-llbitmap.c index f10374242c9a..9e7e6b1a6f15 100644 --- a/drivers/md/md-llbitmap.c +++ b/drivers/md/md-llbitmap.c @@ -654,13 +654,73 @@ static int llbitmap_cache_pages(struct llbitmap *llbitmap) return 0; } +/* + * Check if all underlying disks support write_zeroes with unmap. + */ +static bool llbitmap_all_disks_support_wzeroes_unmap(struct llbitmap *llbitmap) +{ + struct mddev *mddev = llbitmap->mddev; + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + continue; + + if (bdev_write_zeroes_unmap_sectors(rdev->bdev) == 0) + return false; + } + + return true; +} + +/* + * Issue write_zeroes to all underlying disks to zero their data regions. + * This ensures parity consistency for RAID-456 (0 XOR 0 = 0). + * Returns true if all disks were successfully zeroed. 
+ */ +static bool llbitmap_zero_all_disks(struct llbitmap *llbitmap) +{ + struct mddev *mddev = llbitmap->mddev; + struct md_rdev *rdev; + sector_t dev_sectors = mddev->dev_sectors; + int ret; + + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk < 0 || test_bit(Faulty, &rdev->flags)) + continue; + + ret = blkdev_issue_zeroout(rdev->bdev, + rdev->data_offset, + dev_sectors, + GFP_KERNEL, 0); + if (ret) { + pr_warn("md/llbitmap: failed to zero disk %pg: %d\n", + rdev->bdev, ret); + return false; + } + } + + return true; +} + static void llbitmap_init_state(struct llbitmap *llbitmap) { + struct mddev *mddev = llbitmap->mddev; enum llbitmap_state state = BitUnwritten; unsigned long i; - if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags)) + if (test_and_clear_bit(BITMAP_CLEAN, &llbitmap->flags)) { state = BitClean; + } else if (raid_is_456(mddev) && + llbitmap_all_disks_support_wzeroes_unmap(llbitmap)) { + /* + * All disks support write_zeroes with unmap. Zero all disks + * to ensure parity consistency, then set BitCleanUnwritten + * to skip initial sync. + */ + if (llbitmap_zero_all_disks(llbitmap)) + state = BitCleanUnwritten; + } for (i = 0; i < llbitmap->chunks; i++) llbitmap_write(llbitmap, state, i); From 808cec74601cfddea87b6970134febfdc7f574b9 Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Tue, 24 Mar 2026 15:24:54 +0800 Subject: [PATCH 118/146] md/raid1: serialize overlap io for writemostly disk Previously, using wait_event() would wake up all waiters simultaneously, and they would compete for the tree lock. The bio which gets the lock first will be handled, so the write sequence cannot be guaranteed. For example: bio1(100,200) bio2(150,200) bio3(150,300) The write sequence of fast device is bio1,bio2,bio3. But the write sequence of slow device could be bio1,bio3,bio2 due to lock competition. This causes data corruption. Replace waitqueue with a fifo list to guarantee the write sequence. And it also needs to iterate the list when removing one entry. 
If not, it may miss the opportunity to wake up the waiting io. For example: bio1(1,3), bio2(2,4) bio3(5,7), bio4(6,8) These four bios are in the same bucket. bio1 and bio3 are inserted into the rbtree. bio2 and bio4 are added to the waiting list and bio2 is the first one. bio3 returns from slow disk and tries to wake up the waiting bios. bio2 is removed from the list and will be handled. But bio1 hasn't finished. So bio2 will be added into waiting list again. Then bio1 returns from slow disk and wakes up waiting bios. bio4 is removed from the list and will be handled. Now bio1, bio3 and bio4 all finish and bio2 is left on the waiting list. So it needs to iterate the waiting list to wake up the right bio. Signed-off-by: Xiao Ni Link: https://lore.kernel.org/linux-raid/20260324072501.59865-1-xni@redhat.com/ Signed-off-by: Yu Kuai --- drivers/md/md.c | 1 - drivers/md/md.h | 5 ++++- drivers/md/raid1.c | 49 ++++++++++++++++++++++++++++++++++------------ 3 files changed, 40 insertions(+), 15 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index ee01e050ee12..67e2b501d94f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -187,7 +187,6 @@ static int rdev_init_serial(struct md_rdev *rdev) spin_lock_init(&serial_tmp->serial_lock); serial_tmp->serial_rb = RB_ROOT_CACHED; - init_waitqueue_head(&serial_tmp->serial_io_wait); } rdev->serial = serial; diff --git a/drivers/md/md.h b/drivers/md/md.h index ac84289664cd..d6f5482e2479 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -126,7 +126,6 @@ enum sync_action { struct serial_in_rdev { struct rb_root_cached serial_rb; spinlock_t serial_lock; - wait_queue_head_t serial_io_wait; }; /* @@ -381,7 +380,11 @@ struct serial_info { struct rb_node node; sector_t start; /* start sector of rb node */ sector_t last; /* end sector of rb node */ + sector_t wnode_start; /* address of waiting nodes on the same list */ sector_t _subtree_last; /* highest sector in subtree of rb node */ + struct list_head list_node; + struct 
list_head waiters; + struct completion ready; }; /* diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 16f671ab12c0..ba91f7e61920 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -57,21 +57,29 @@ INTERVAL_TREE_DEFINE(struct serial_info, node, sector_t, _subtree_last, START, LAST, static inline, raid1_rb); static int check_and_add_serial(struct md_rdev *rdev, struct r1bio *r1_bio, - struct serial_info *si, int idx) + struct serial_info *si) { unsigned long flags; int ret = 0; sector_t lo = r1_bio->sector; sector_t hi = lo + r1_bio->sectors - 1; + int idx = sector_to_idx(r1_bio->sector); struct serial_in_rdev *serial = &rdev->serial[idx]; + struct serial_info *head_si; spin_lock_irqsave(&serial->serial_lock, flags); /* collision happened */ - if (raid1_rb_iter_first(&serial->serial_rb, lo, hi)) - ret = -EBUSY; - else { + head_si = raid1_rb_iter_first(&serial->serial_rb, lo, hi); + if (head_si && head_si != si) { si->start = lo; si->last = hi; + si->wnode_start = head_si->wnode_start; + list_add_tail(&si->list_node, &head_si->waiters); + ret = -EBUSY; + } else if (!head_si) { + si->start = lo; + si->last = hi; + si->wnode_start = si->start; raid1_rb_insert(si, &serial->serial_rb); } spin_unlock_irqrestore(&serial->serial_lock, flags); @@ -83,19 +91,22 @@ static void wait_for_serialization(struct md_rdev *rdev, struct r1bio *r1_bio) { struct mddev *mddev = rdev->mddev; struct serial_info *si; - int idx = sector_to_idx(r1_bio->sector); - struct serial_in_rdev *serial = &rdev->serial[idx]; if (WARN_ON(!mddev->serial_info_pool)) return; si = mempool_alloc(mddev->serial_info_pool, GFP_NOIO); - wait_event(serial->serial_io_wait, - check_and_add_serial(rdev, r1_bio, si, idx) == 0); + INIT_LIST_HEAD(&si->waiters); + INIT_LIST_HEAD(&si->list_node); + init_completion(&si->ready); + while (check_and_add_serial(rdev, r1_bio, si)) { + wait_for_completion(&si->ready); + reinit_completion(&si->ready); + } } static void remove_serial(struct md_rdev *rdev, sector_t 
lo, sector_t hi) { - struct serial_info *si; + struct serial_info *si, *iter_si; unsigned long flags; int found = 0; struct mddev *mddev = rdev->mddev; @@ -106,16 +117,28 @@ static void remove_serial(struct md_rdev *rdev, sector_t lo, sector_t hi) for (si = raid1_rb_iter_first(&serial->serial_rb, lo, hi); si; si = raid1_rb_iter_next(si, lo, hi)) { if (si->start == lo && si->last == hi) { - raid1_rb_remove(si, &serial->serial_rb); - mempool_free(si, mddev->serial_info_pool); found = 1; break; } } - if (!found) + if (found) { + raid1_rb_remove(si, &serial->serial_rb); + if (!list_empty(&si->waiters)) { + list_for_each_entry(iter_si, &si->waiters, list_node) { + if (iter_si->wnode_start == si->wnode_start) { + list_del_init(&iter_si->list_node); + list_splice_init(&si->waiters, &iter_si->waiters); + raid1_rb_insert(iter_si, &serial->serial_rb); + complete(&iter_si->ready); + break; + } + } + } + mempool_free(si, mddev->serial_info_pool); + } else { WARN(1, "The write IO is not recorded for serialization\n"); + } spin_unlock_irqrestore(&serial->serial_lock, flags); - wake_up(&serial->serial_io_wait); } /* From cf86bb53b9c92354904a328e947a05ffbfdd1840 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 27 Mar 2026 22:07:29 +0800 Subject: [PATCH 119/146] md: wake raid456 reshape waiters before suspend During raid456 reshape, direct IO across the reshape position can sleep in raid5_make_request() waiting for reshape progress while still holding an active_io reference. If userspace then freezes reshape and writes md/suspend_lo or md/suspend_hi, mddev_suspend() kills active_io and waits for all in-flight IO to drain. This can deadlock: the IO needs reshape progress to continue, but the reshape thread is already frozen, so the active_io reference is never dropped and suspend never completes. raid5_prepare_suspend() already wakes wait_for_reshape for dm-raid. 
Do the same for normal md suspend when reshape is already interrupted, so waiting raid456 IO can abort, drop its reference, and let suspend finish. The mdadm test tests/25raid456-reshape-deadlock reproduces the hang. Fixes: 714d20150ed8 ("md: add new helpers to suspend/resume array") Link: https://lore.kernel.org/linux-raid/20260327140729.2030564-1-yukuai@fnnas.com/ Signed-off-by: Yu Kuai --- drivers/md/md.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 67e2b501d94f..5fb5ae8368ba 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -487,6 +487,17 @@ int mddev_suspend(struct mddev *mddev, bool interruptible) } percpu_ref_kill(&mddev->active_io); + + /* + * RAID456 IO can sleep in wait_for_reshape while still holding an + * active_io reference. If reshape is already interrupted or frozen, + * wake those waiters so they can abort and drop the reference instead + * of deadlocking suspend. + */ + if (mddev->pers && mddev->pers->prepare_suspend && + reshape_interrupted(mddev)) + mddev->pers->prepare_suspend(mddev); + if (interruptible) err = wait_event_interruptible(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io)); From 7f9f7c697474268d9ef9479df3ddfe7cdcfbbffc Mon Sep 17 00:00:00 2001 From: Chia-Ming Chang Date: Thu, 2 Apr 2026 14:14:06 +0800 Subject: [PATCH 120/146] md/raid5: fix soft lockup in retry_aligned_read() When retry_aligned_read() encounters an overlapped stripe, it releases the stripe via raid5_release_stripe() which puts it on the lockless released_stripes llist. In the next raid5d loop iteration, release_stripe_list() drains the stripe onto handle_list (since STRIPE_HANDLE is set by the original IO), but retry_aligned_read() runs before handle_active_stripes() and removes the stripe from handle_list via find_get_stripe() -> list_del_init(). This prevents handle_stripe() from ever processing the stripe to resolve the overlap, causing an infinite loop and soft lockup. 
Fix this by using __release_stripe() with temp_inactive_list instead of raid5_release_stripe() in the failure path, so the stripe does not go through the released_stripes llist. This allows raid5d to break out of its loop, and the overlap will be resolved when the stripe is eventually processed by handle_stripe(). Fixes: 773ca82fa1ee ("raid5: make release_stripe lockless") Cc: stable@vger.kernel.org Signed-off-by: FengWei Shih Signed-off-by: Chia-Ming Chang Link: https://lore.kernel.org/linux-raid/20260402061406.455755-1-chiamingc@synology.com/ Signed-off-by: Yu Kuai --- drivers/md/raid5.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 1f8360d4cdb7..6e79829c5acb 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6641,7 +6641,13 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, } if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { - raid5_release_stripe(sh); + int hash; + + spin_lock_irq(&conf->device_lock); + hash = sh->hash_lock_index; + __release_stripe(conf, sh, + &conf->temp_inactive_list[hash]); + spin_unlock_irq(&conf->device_lock); conf->retry_read_aligned = raid_bio; conf->retry_read_offset = scnt; return handled; From fa0cac9a515877fad856c860ad51107b86ed6c4f Mon Sep 17 00:00:00 2001 From: David Carlier Date: Sun, 5 Apr 2026 16:47:04 +0100 Subject: [PATCH 121/146] drbd: use get_random_u64() where appropriate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the typed random integer helpers instead of get_random_bytes() when filling a single integer variable. The helpers return the value directly, require no pointer or size argument, and better express intent. 
Signed-off-by: David Carlier Reviewed-by: Christoph Böhmwalder Link: https://patch.msgid.link/20260405154704.4610-1-devnexen@gmail.com Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_main.c | 4 ++-- drivers/block/drbd/drbd_nl.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 200d464e984b..b1a721dd0496 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -874,7 +874,7 @@ void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device) if (uuid && uuid != UUID_JUST_CREATED) uuid = uuid + UUID_NEW_BM_OFFSET; else - get_random_bytes(&uuid, sizeof(u64)); + uuid = get_random_u64(); drbd_uuid_set(device, UI_BITMAP, uuid); drbd_print_uuids(device, "updated sync UUID"); drbd_md_sync(device); @@ -3337,7 +3337,7 @@ void drbd_uuid_new_current(struct drbd_device *device) __must_hold(local) u64 val; unsigned long long bm_uuid; - get_random_bytes(&val, sizeof(u64)); + val = get_random_u64(); spin_lock_irq(&device->ldev->md.uuid_lock); bm_uuid = device->ldev->md.uuid[UI_BITMAP]; diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index d997d274092c..c2ac555473e7 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -3236,7 +3236,7 @@ int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) * matching real data uuid exists). 
*/ u64 val; - get_random_bytes(&val, sizeof(u64)); + val = get_random_u64(); drbd_set_ed_uuid(device, val); drbd_warn(device, "Resumed without access to data; please tear down before attempting to re-configure.\n"); } From 2fb0ded237bb55dae45bc076666b348fc948ac9e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 31 Mar 2026 23:31:52 +0800 Subject: [PATCH 122/146] ublk: add UBLK_U_CMD_REG_BUF/UNREG_BUF control commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add control commands for registering and unregistering shared memory buffers for zero-copy I/O: - UBLK_U_CMD_REG_BUF (0x18): pins pages from userspace, inserts PFN ranges into a per-device maple tree for O(log n) lookup during I/O. Buffer pointers are tracked in a per-device xarray. Returns the assigned buffer index. - UBLK_U_CMD_UNREG_BUF (0x19): removes PFN entries and unpins pages. Queue freeze/unfreeze is handled internally so userspace need not quiesce the device during registration. 
Also adds: - UBLK_IO_F_SHMEM_ZC flag and addr encoding helpers in UAPI header (16-bit buffer index supporting up to 65536 buffers) - Data structures (ublk_buf, ublk_buf_range) and xarray/maple tree - __ublk_ctrl_reg_buf() helper for PFN insertion with error unwinding - __ublk_ctrl_unreg_buf() helper for cleanup reuse - ublk_support_shmem_zc() / ublk_dev_support_shmem_zc() stubs (returning false — feature not enabled yet) Signed-off-by: Ming Lei Link: https://patch.msgid.link/20260331153207.3635125-2-ming.lei@redhat.com [axboe: fixup ublk_buf_reg -> ublk_shmem_buf_reg errors, comments] Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 295 ++++++++++++++++++++++++++++++++++ include/uapi/linux/ublk_cmd.h | 72 +++++++++ 2 files changed, 367 insertions(+) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index eb96010625e5..e6a10a1c8cdb 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -46,6 +46,8 @@ #include #include #include +#include +#include #include #include @@ -58,6 +60,8 @@ #define UBLK_CMD_UPDATE_SIZE _IOC_NR(UBLK_U_CMD_UPDATE_SIZE) #define UBLK_CMD_QUIESCE_DEV _IOC_NR(UBLK_U_CMD_QUIESCE_DEV) #define UBLK_CMD_TRY_STOP_DEV _IOC_NR(UBLK_U_CMD_TRY_STOP_DEV) +#define UBLK_CMD_REG_BUF _IOC_NR(UBLK_U_CMD_REG_BUF) +#define UBLK_CMD_UNREG_BUF _IOC_NR(UBLK_U_CMD_UNREG_BUF) #define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF) #define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF) @@ -289,6 +293,20 @@ struct ublk_queue { struct ublk_io ios[] __counted_by(q_depth); }; +/* Per-registered shared memory buffer */ +struct ublk_buf { + struct page **pages; + unsigned int nr_pages; +}; + +/* Maple tree value: maps a PFN range to buffer location */ +struct ublk_buf_range { + unsigned long base_pfn; + unsigned short buf_index; + unsigned short flags; + unsigned int base_offset; /* byte offset within buffer */ +}; + struct ublk_device { struct gendisk *ub_disk; @@ -323,6 +341,10 @@ struct ublk_device { bool 
block_open; /* protected by open_mutex */ + /* shared memory zero copy */ + struct maple_tree buf_tree; + struct xarray bufs_xa; + struct ublk_queue *queues[]; }; @@ -334,6 +356,7 @@ struct ublk_params_header { static void ublk_io_release(void *priv); static void ublk_stop_dev_unlocked(struct ublk_device *ub); +static void ublk_buf_cleanup(struct ublk_device *ub); static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq); static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, u16 q_id, u16 tag, struct ublk_io *io); @@ -398,6 +421,16 @@ static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub) return ub->dev_info.flags & UBLK_F_SUPPORT_ZERO_COPY; } +static inline bool ublk_support_shmem_zc(const struct ublk_queue *ubq) +{ + return false; +} + +static inline bool ublk_dev_support_shmem_zc(const struct ublk_device *ub) +{ + return false; +} + static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq) { return ubq->flags & UBLK_F_AUTO_BUF_REG; @@ -1460,6 +1493,7 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) iod->op_flags = ublk_op | ublk_req_build_flags(req); iod->nr_sectors = blk_rq_sectors(req); iod->start_sector = blk_rq_pos(req); + iod->addr = io->buf.addr; return BLK_STS_OK; @@ -1665,6 +1699,7 @@ static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req, { unsigned mapped_bytes = ublk_map_io(ubq, req, io); + /* partially mapped, update io descriptor */ if (unlikely(mapped_bytes != blk_rq_bytes(req))) { /* @@ -4211,6 +4246,7 @@ static void ublk_cdev_rel(struct device *dev) { struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev); + ublk_buf_cleanup(ub); blk_mq_free_tag_set(&ub->tag_set); ublk_deinit_queues(ub); ublk_free_dev_number(ub); @@ -4630,6 +4666,8 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) mutex_init(&ub->mutex); spin_lock_init(&ub->lock); mutex_init(&ub->cancel_mutex); + 
mt_init(&ub->buf_tree); + xa_init_flags(&ub->bufs_xa, XA_FLAGS_ALLOC); INIT_WORK(&ub->partition_scan_work, ublk_partition_scan_work); ret = ublk_alloc_dev_number(ub, header->dev_id); @@ -5173,6 +5211,255 @@ exit: return err; } +/* + * Drain inflight I/O and quiesce the queue. Freeze drains all inflight + * requests, quiesce_nowait marks the queue so no new requests dispatch, + * then unfreeze allows new submissions (which won't dispatch due to + * quiesce). This keeps freeze and ub->mutex non-nested. + */ +static void ublk_quiesce_and_release(struct gendisk *disk) +{ + unsigned int memflags; + + memflags = blk_mq_freeze_queue(disk->queue); + blk_mq_quiesce_queue_nowait(disk->queue); + blk_mq_unfreeze_queue(disk->queue, memflags); +} + +static void ublk_unquiesce_and_resume(struct gendisk *disk) +{ + blk_mq_unquiesce_queue(disk->queue); +} + +/* Erase coalesced PFN ranges from the maple tree for pages [0, nr_pages) */ +static void ublk_buf_erase_ranges(struct ublk_device *ub, + struct ublk_buf *ubuf, + unsigned long nr_pages) +{ + unsigned long i; + + for (i = 0; i < nr_pages; ) { + unsigned long pfn = page_to_pfn(ubuf->pages[i]); + unsigned long start = i; + + while (i + 1 < nr_pages && + page_to_pfn(ubuf->pages[i + 1]) == pfn + (i - start) + 1) + i++; + i++; + kfree(mtree_erase(&ub->buf_tree, pfn)); + } +} + +static int __ublk_ctrl_reg_buf(struct ublk_device *ub, + struct ublk_buf *ubuf, int index, + unsigned short flags) +{ + unsigned long nr_pages = ubuf->nr_pages; + unsigned long i; + int ret; + + for (i = 0; i < nr_pages; ) { + unsigned long pfn = page_to_pfn(ubuf->pages[i]); + unsigned long start = i; + struct ublk_buf_range *range; + + /* Find run of consecutive PFNs */ + while (i + 1 < nr_pages && + page_to_pfn(ubuf->pages[i + 1]) == pfn + (i - start) + 1) + i++; + i++; /* past the last page in this run */ + + range = kzalloc(sizeof(*range), GFP_KERNEL); + if (!range) { + ret = -ENOMEM; + goto unwind; + } + range->buf_index = index; + range->flags = flags; 
+ range->base_pfn = pfn; + range->base_offset = start << PAGE_SHIFT; + + ret = mtree_insert_range(&ub->buf_tree, pfn, + pfn + (i - start) - 1, + range, GFP_KERNEL); + if (ret) { + kfree(range); + goto unwind; + } + } + return 0; + +unwind: + ublk_buf_erase_ranges(ub, ubuf, i); + return ret; +} + +/* + * Register a shared memory buffer for zero-copy I/O. + * Pins pages, builds PFN maple tree, freezes/unfreezes the queue + * internally. Returns buffer index (>= 0) on success. + */ +static int ublk_ctrl_reg_buf(struct ublk_device *ub, + struct ublksrv_ctrl_cmd *header) +{ + void __user *argp = (void __user *)(unsigned long)header->addr; + struct ublk_shmem_buf_reg buf_reg; + unsigned long addr, size, nr_pages; + unsigned int gup_flags; + struct gendisk *disk; + struct ublk_buf *ubuf; + long pinned; + u32 index; + int ret; + + if (!ublk_dev_support_shmem_zc(ub)) + return -EOPNOTSUPP; + + memset(&buf_reg, 0, sizeof(buf_reg)); + if (copy_from_user(&buf_reg, argp, + min_t(size_t, header->len, sizeof(buf_reg)))) + return -EFAULT; + + if (buf_reg.flags & ~UBLK_SHMEM_BUF_READ_ONLY) + return -EINVAL; + + addr = buf_reg.addr; + size = buf_reg.len; + nr_pages = size >> PAGE_SHIFT; + + if (!size || !PAGE_ALIGNED(size) || !PAGE_ALIGNED(addr)) + return -EINVAL; + + disk = ublk_get_disk(ub); + if (!disk) + return -ENODEV; + + /* Pin pages before quiescing (may sleep) */ + ubuf = kzalloc(sizeof(*ubuf), GFP_KERNEL); + if (!ubuf) { + ret = -ENOMEM; + goto put_disk; + } + + ubuf->pages = kvmalloc_array(nr_pages, sizeof(*ubuf->pages), + GFP_KERNEL); + if (!ubuf->pages) { + ret = -ENOMEM; + goto err_free; + } + + gup_flags = FOLL_LONGTERM; + if (!(buf_reg.flags & UBLK_SHMEM_BUF_READ_ONLY)) + gup_flags |= FOLL_WRITE; + + pinned = pin_user_pages_fast(addr, nr_pages, gup_flags, ubuf->pages); + if (pinned < 0) { + ret = pinned; + goto err_free_pages; + } + if (pinned != nr_pages) { + ret = -EFAULT; + goto err_unpin; + } + ubuf->nr_pages = nr_pages; + + /* + * Drain inflight I/O and quiesce 
the queue so no new requests + * are dispatched while we modify the maple tree. Keep freeze + * and mutex non-nested to avoid lock dependency. + */ + ublk_quiesce_and_release(disk); + + mutex_lock(&ub->mutex); + + ret = xa_alloc(&ub->bufs_xa, &index, ubuf, xa_limit_16b, GFP_KERNEL); + if (ret) + goto err_unlock; + + ret = __ublk_ctrl_reg_buf(ub, ubuf, index, buf_reg.flags); + if (ret) { + xa_erase(&ub->bufs_xa, index); + goto err_unlock; + } + + mutex_unlock(&ub->mutex); + + ublk_unquiesce_and_resume(disk); + ublk_put_disk(disk); + return index; + +err_unlock: + mutex_unlock(&ub->mutex); + ublk_unquiesce_and_resume(disk); +err_unpin: + unpin_user_pages(ubuf->pages, pinned); +err_free_pages: + kvfree(ubuf->pages); +err_free: + kfree(ubuf); +put_disk: + ublk_put_disk(disk); + return ret; +} + +static void __ublk_ctrl_unreg_buf(struct ublk_device *ub, + struct ublk_buf *ubuf) +{ + ublk_buf_erase_ranges(ub, ubuf, ubuf->nr_pages); + unpin_user_pages(ubuf->pages, ubuf->nr_pages); + kvfree(ubuf->pages); + kfree(ubuf); +} + +static int ublk_ctrl_unreg_buf(struct ublk_device *ub, + struct ublksrv_ctrl_cmd *header) +{ + int index = (int)header->data[0]; + struct gendisk *disk; + struct ublk_buf *ubuf; + + if (!ublk_dev_support_shmem_zc(ub)) + return -EOPNOTSUPP; + + disk = ublk_get_disk(ub); + if (!disk) + return -ENODEV; + + /* Drain inflight I/O before modifying the maple tree */ + ublk_quiesce_and_release(disk); + + mutex_lock(&ub->mutex); + + ubuf = xa_erase(&ub->bufs_xa, index); + if (!ubuf) { + mutex_unlock(&ub->mutex); + ublk_unquiesce_and_resume(disk); + ublk_put_disk(disk); + return -ENOENT; + } + + __ublk_ctrl_unreg_buf(ub, ubuf); + + mutex_unlock(&ub->mutex); + + ublk_unquiesce_and_resume(disk); + ublk_put_disk(disk); + return 0; +} + +static void ublk_buf_cleanup(struct ublk_device *ub) +{ + struct ublk_buf *ubuf; + unsigned long index; + + xa_for_each(&ub->bufs_xa, index, ubuf) + __ublk_ctrl_unreg_buf(ub, ubuf); + xa_destroy(&ub->bufs_xa); + 
mtree_destroy(&ub->buf_tree); +} + + + static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, u32 cmd_op, struct ublksrv_ctrl_cmd *header) { @@ -5230,6 +5517,8 @@ static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, case UBLK_CMD_UPDATE_SIZE: case UBLK_CMD_QUIESCE_DEV: case UBLK_CMD_TRY_STOP_DEV: + case UBLK_CMD_REG_BUF: + case UBLK_CMD_UNREG_BUF: mask = MAY_READ | MAY_WRITE; break; default: @@ -5355,6 +5644,12 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, case UBLK_CMD_TRY_STOP_DEV: ret = ublk_ctrl_try_stop_dev(ub); break; + case UBLK_CMD_REG_BUF: + ret = ublk_ctrl_reg_buf(ub, &header); + break; + case UBLK_CMD_UNREG_BUF: + ret = ublk_ctrl_unreg_buf(ub, &header); + break; default: ret = -EOPNOTSUPP; break; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index a88876756805..5b71c19d3b9c 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -57,6 +57,44 @@ _IOWR('u', 0x16, struct ublksrv_ctrl_cmd) #define UBLK_U_CMD_TRY_STOP_DEV \ _IOWR('u', 0x17, struct ublksrv_ctrl_cmd) +/* + * Register a shared memory buffer for zero-copy I/O. + * Input: ctrl_cmd.addr points to struct ublk_shmem_buf_reg (buffer VA + size) + * ctrl_cmd.len = sizeof(struct ublk_shmem_buf_reg) + * Result: >= 0 is the assigned buffer index, < 0 is error + * + * The kernel pins pages from the calling process's address space + * and inserts PFN ranges into a per-device maple tree. When a block + * request's pages match registered pages, the driver sets + * UBLK_IO_F_SHMEM_ZC and encodes the buffer index + offset in addr, + * allowing the server to access the data via its own mapping of the + * same shared memory — true zero copy. + * + * The memory can be backed by memfd, hugetlbfs, or any GUP-compatible + * shared mapping. Queue freeze is handled internally. 
+ * + * The buffer VA and size are passed via a user buffer (not inline in + * ctrl_cmd) so that unprivileged devices can prepend the device path + * to ctrl_cmd.addr without corrupting the VA. + */ +#define UBLK_U_CMD_REG_BUF \ + _IOWR('u', 0x18, struct ublksrv_ctrl_cmd) +/* + * Unregister a shared memory buffer. + * Input: ctrl_cmd.data[0] = buffer index + */ +#define UBLK_U_CMD_UNREG_BUF \ + _IOWR('u', 0x19, struct ublksrv_ctrl_cmd) + +/* Parameter buffer for UBLK_U_CMD_REG_BUF, pointed to by ctrl_cmd.addr */ +struct ublk_shmem_buf_reg { + __u64 addr; /* userspace virtual address of shared memory */ + __u32 len; /* buffer size in bytes (page-aligned, max 4GB) */ + __u32 flags; +}; + +/* Pin pages without FOLL_WRITE; usable with write-sealed memfd */ +#define UBLK_SHMEM_BUF_READ_ONLY (1U << 0) /* * 64bits are enough now, and it should be easy to extend in case of * running out of feature flags @@ -370,6 +408,7 @@ /* Disable automatic partition scanning when device is started */ #define UBLK_F_NO_AUTO_PART_SCAN (1ULL << 18) + /* device state */ #define UBLK_S_DEV_DEAD 0 #define UBLK_S_DEV_LIVE 1 @@ -469,6 +508,12 @@ struct ublksrv_ctrl_dev_info { #define UBLK_IO_F_NEED_REG_BUF (1U << 17) /* Request has an integrity data buffer */ #define UBLK_IO_F_INTEGRITY (1UL << 18) +/* + * I/O buffer is in a registered shared memory buffer. When set, the addr + * field in ublksrv_io_desc encodes buffer index and byte offset instead + * of a userspace virtual address. + */ +#define UBLK_IO_F_SHMEM_ZC (1U << 19) /* * io cmd is described by this structure, and stored in share memory, indexed @@ -743,4 +788,31 @@ struct ublk_params { struct ublk_param_integrity integrity; }; +/* + * Shared memory zero-copy addr encoding for UBLK_IO_F_SHMEM_ZC. 
+ * + * When UBLK_IO_F_SHMEM_ZC is set, ublksrv_io_desc.addr is encoded as: + * bits [0:31] = byte offset within the buffer (up to 4GB) + * bits [32:47] = buffer index (up to 65536) + * bits [48:63] = reserved (must be zero) + */ +#define UBLK_SHMEM_ZC_OFF_MASK 0xffffffffULL +#define UBLK_SHMEM_ZC_IDX_OFF 32 +#define UBLK_SHMEM_ZC_IDX_MASK 0xffffULL + +static inline __u64 ublk_shmem_zc_addr(__u16 index, __u32 offset) +{ + return ((__u64)index << UBLK_SHMEM_ZC_IDX_OFF) | offset; +} + +static inline __u16 ublk_shmem_zc_index(__u64 addr) +{ + return (addr >> UBLK_SHMEM_ZC_IDX_OFF) & UBLK_SHMEM_ZC_IDX_MASK; +} + +static inline __u32 ublk_shmem_zc_offset(__u64 addr) +{ + return (__u32)(addr & UBLK_SHMEM_ZC_OFF_MASK); +} + #endif From 4d4a512a1f87b156f694d25c800e3d525aa56e8a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 31 Mar 2026 23:31:53 +0800 Subject: [PATCH 123/146] ublk: add PFN-based buffer matching in I/O path Add ublk_try_buf_match() which walks a request's bio_vecs, looks up each page's PFN in the per-device maple tree, and verifies all pages belong to the same registered buffer at contiguous offsets. Add ublk_iod_is_shmem_zc() inline helper for checking whether a request uses the shmem zero-copy path. Integrate into the I/O path: - ublk_setup_iod(): if pages match a registered buffer, set UBLK_IO_F_SHMEM_ZC and encode buffer index + offset in addr - ublk_start_io(): skip ublk_map_io() for zero-copy requests - __ublk_complete_rq(): skip ublk_unmap_io() for zero-copy requests The feature remains disabled (ublk_support_shmem_zc() returns false) until the UBLK_F_SHMEM_ZC flag is enabled in the next patch. 
Signed-off-by: Ming Lei Link: https://patch.msgid.link/20260331153207.3635125-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 77 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 76 insertions(+), 1 deletion(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index e6a10a1c8cdb..264b41ceedd8 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -356,6 +356,8 @@ struct ublk_params_header { static void ublk_io_release(void *priv); static void ublk_stop_dev_unlocked(struct ublk_device *ub); +static bool ublk_try_buf_match(struct ublk_device *ub, struct request *rq, + u32 *buf_idx, u32 *buf_off); static void ublk_buf_cleanup(struct ublk_device *ub); static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq); static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, @@ -426,6 +428,12 @@ static inline bool ublk_support_shmem_zc(const struct ublk_queue *ubq) return false; } +static inline bool ublk_iod_is_shmem_zc(const struct ublk_queue *ubq, + unsigned int tag) +{ + return ublk_get_iod(ubq, tag)->op_flags & UBLK_IO_F_SHMEM_ZC; +} + static inline bool ublk_dev_support_shmem_zc(const struct ublk_device *ub) { return false; @@ -1494,6 +1502,18 @@ static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) iod->nr_sectors = blk_rq_sectors(req); iod->start_sector = blk_rq_pos(req); + /* Try shmem zero-copy match before setting addr */ + if (ublk_support_shmem_zc(ubq) && ublk_rq_has_data(req)) { + u32 buf_idx, buf_off; + + if (ublk_try_buf_match(ubq->dev, req, + &buf_idx, &buf_off)) { + iod->op_flags |= UBLK_IO_F_SHMEM_ZC; + iod->addr = ublk_shmem_zc_addr(buf_idx, buf_off); + return BLK_STS_OK; + } + } + iod->addr = io->buf.addr; return BLK_STS_OK; @@ -1539,6 +1559,10 @@ static inline void __ublk_complete_rq(struct request *req, struct ublk_io *io, req_op(req) != REQ_OP_DRV_IN) goto exit; + /* shmem zero copy: no data to unmap, pages already 
shared */ + if (ublk_iod_is_shmem_zc(req->mq_hctx->driver_data, req->tag)) + goto exit; + /* for READ request, writing data in iod->addr to rq buffers */ unmapped_bytes = ublk_unmap_io(need_map, req, io); @@ -1697,8 +1721,13 @@ static void ublk_auto_buf_dispatch(const struct ublk_queue *ubq, static bool ublk_start_io(const struct ublk_queue *ubq, struct request *req, struct ublk_io *io) { - unsigned mapped_bytes = ublk_map_io(ubq, req, io); + unsigned mapped_bytes; + /* shmem zero copy: skip data copy, pages already shared */ + if (ublk_iod_is_shmem_zc(ubq, req->tag)) + return true; + + mapped_bytes = ublk_map_io(ubq, req, io); /* partially mapped, update io descriptor */ if (unlikely(mapped_bytes != blk_rq_bytes(req))) { @@ -5458,7 +5487,53 @@ static void ublk_buf_cleanup(struct ublk_device *ub) mtree_destroy(&ub->buf_tree); } +/* Check if request pages match a registered shared memory buffer */ +static bool ublk_try_buf_match(struct ublk_device *ub, + struct request *rq, + u32 *buf_idx, u32 *buf_off) +{ + struct req_iterator iter; + struct bio_vec bv; + int index = -1; + unsigned long expected_offset = 0; + bool first = true; + rq_for_each_bvec(bv, rq, iter) { + unsigned long pfn = page_to_pfn(bv.bv_page); + struct ublk_buf_range *range; + unsigned long off; + + range = mtree_load(&ub->buf_tree, pfn); + if (!range) + return false; + + off = range->base_offset + + (pfn - range->base_pfn) * PAGE_SIZE + bv.bv_offset; + + if (first) { + /* Read-only buffer can't serve READ (kernel writes) */ + if ((range->flags & UBLK_SHMEM_BUF_READ_ONLY) && + req_op(rq) != REQ_OP_WRITE) + return false; + index = range->buf_index; + expected_offset = off; + *buf_off = off; + first = false; + } else { + if (range->buf_index != index) + return false; + if (off != expected_offset) + return false; + } + expected_offset += bv.bv_len; + } + + if (first) + return false; + + *buf_idx = index; + return true; +} static int ublk_ctrl_uring_cmd_permission(struct ublk_device *ub, u32 cmd_op, 
struct ublksrv_ctrl_cmd *header) From 08677040a91199175149d1fd465c02e3b3fc768a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 31 Mar 2026 23:31:54 +0800 Subject: [PATCH 124/146] ublk: enable UBLK_F_SHMEM_ZC feature flag Add UBLK_F_SHMEM_ZC (1ULL << 19) to the UAPI header and UBLK_F_ALL. Switch ublk_support_shmem_zc() and ublk_dev_support_shmem_zc() from returning false to checking the actual flag, enabling the shared memory zero-copy feature for devices that request it. Signed-off-by: Ming Lei Link: https://patch.msgid.link/20260331153207.3635125-4-ming.lei@redhat.com [axboe: ublk_buf_reg -> ublk_shmem_buf_reg errors] Signed-off-by: Jens Axboe --- Documentation/block/ublk.rst | 117 ++++++++++++++++++++++++++++++++++ drivers/block/ublk_drv.c | 7 +- include/uapi/linux/ublk_cmd.h | 7 ++ 3 files changed, 128 insertions(+), 3 deletions(-) diff --git a/Documentation/block/ublk.rst b/Documentation/block/ublk.rst index 6ad28039663d..e80cc415a739 100644 --- a/Documentation/block/ublk.rst +++ b/Documentation/block/ublk.rst @@ -485,6 +485,123 @@ Limitations in case that too many ublk devices are handled by this single io_ring_ctx and each one has very large queue depth +Shared Memory Zero Copy (UBLK_F_SHMEM_ZC) +------------------------------------------ + +The ``UBLK_F_SHMEM_ZC`` feature provides an alternative zero-copy path +that works by sharing physical memory pages between the client application +and the ublk server. Unlike the io_uring fixed buffer approach above, +shared memory zero copy does not require io_uring buffer registration +per I/O — instead, it relies on the kernel matching page frame numbers +(PFNs) at I/O time. This allows the ublk server to access the shared +buffer directly, which is unlikely for the io_uring fixed buffer +approach. + +Motivation +~~~~~~~~~~ + +Shared memory zero copy takes a different approach: if the client +application and the ublk server both map the same physical memory, there is +nothing to copy. 
The kernel detects the shared pages automatically and +tells the server where the data already lives. + +``UBLK_F_SHMEM_ZC`` can be thought of as a supplement for optimized client +applications — when the client is willing to allocate I/O buffers from +shared memory, the entire data path becomes zero-copy without any per-I/O +overhead. + +Use Cases +~~~~~~~~~ + +This feature is useful when the client application can be configured to +use a specific shared memory region for its I/O buffers: + +- **Custom storage clients** that allocate I/O buffers from shared memory + (memfd, hugetlbfs) and issue direct I/O to the ublk device +- **Database engines** that use pre-allocated buffer pools with O_DIRECT + +How It Works +~~~~~~~~~~~~ + +1. The ublk server and client both ``mmap()`` the same file (memfd or + hugetlbfs) with ``MAP_SHARED``. This gives both processes access to the + same physical pages. + +2. The ublk server registers its mapping with the kernel:: + + struct ublk_shmem_buf_reg buf = { .addr = mmap_va, .len = size }; + ublk_ctrl_cmd(UBLK_U_CMD_REG_BUF, .addr = &buf); + + The kernel pins the pages and builds a PFN lookup tree. + +3. When the client issues direct I/O (``O_DIRECT``) to ``/dev/ublkb*``, + the kernel checks whether the I/O buffer pages match any registered + pages by comparing PFNs. + +4. On a match, the kernel sets ``UBLK_IO_F_SHMEM_ZC`` in the I/O + descriptor and encodes the buffer index and offset in ``addr``:: + + if (iod->op_flags & UBLK_IO_F_SHMEM_ZC) { + /* Data is already in our shared mapping — zero copy */ + index = ublk_shmem_zc_index(iod->addr); + offset = ublk_shmem_zc_offset(iod->addr); + buf = shmem_table[index].mmap_base + offset; + } + +5. If pages do not match (e.g., the client used a non-shared buffer), + the I/O falls back to the normal copy path silently. + +The shared memory can be set up via two methods: + +- **Socket-based**: the client sends a memfd to the ublk server via + ``SCM_RIGHTS`` on a unix socket. 
The server mmaps and registers it. +- **Hugetlbfs-based**: both processes ``mmap(MAP_SHARED)`` the same + hugetlbfs file. No IPC needed — same file gives same physical pages. + +Advantages +~~~~~~~~~~ + +- **Simple**: no per-I/O buffer registration or unregistration commands. + Once the shared buffer is registered, all matching I/O is zero-copy + automatically. +- **Direct buffer access**: the ublk server can read and write the shared + buffer directly via its own mmap, without going through io_uring fixed + buffer operations. This is more friendly for server implementations. +- **Fast**: PFN matching is a single maple tree lookup per bvec. No + io_uring command round-trips for buffer management. +- **Compatible**: non-matching I/O silently falls back to the copy path. + The device works normally for any client, with zero-copy as an + optimization when shared memory is available. + +Limitations +~~~~~~~~~~~ + +- **Requires client cooperation**: the client must allocate its I/O + buffers from the shared memory region. This requires a custom or + configured client — standard applications using their own buffers + will not benefit. +- **Direct I/O only**: buffered I/O (without ``O_DIRECT``) goes through + the page cache, which allocates its own pages. These kernel-allocated + pages will never match the registered shared buffer. Only ``O_DIRECT`` + puts the client's buffer pages directly into the block I/O. + +Control Commands +~~~~~~~~~~~~~~~~ + +- ``UBLK_U_CMD_REG_BUF`` + + Register a shared memory buffer. ``ctrl_cmd.addr`` points to a + ``struct ublk_shmem_buf_reg`` containing the buffer virtual address and size. + Returns the assigned buffer index (>= 0) on success. The kernel pins + pages and builds the PFN lookup tree. Queue freeze is handled + internally. + +- ``UBLK_U_CMD_UNREG_BUF`` + + Unregister a previously registered buffer. ``ctrl_cmd.data[0]`` is the + buffer index. Unpins pages and removes PFN entries from the lookup + tree. 
+ References ========== diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 264b41ceedd8..bdb1de41d526 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -85,7 +85,8 @@ | (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ? UBLK_F_INTEGRITY : 0) \ | UBLK_F_SAFE_STOP_DEV \ | UBLK_F_BATCH_IO \ - | UBLK_F_NO_AUTO_PART_SCAN) + | UBLK_F_NO_AUTO_PART_SCAN \ + | UBLK_F_SHMEM_ZC) #define UBLK_F_ALL_RECOVERY_FLAGS (UBLK_F_USER_RECOVERY \ | UBLK_F_USER_RECOVERY_REISSUE \ @@ -425,7 +426,7 @@ static inline bool ublk_dev_support_zero_copy(const struct ublk_device *ub) static inline bool ublk_support_shmem_zc(const struct ublk_queue *ubq) { - return false; + return ubq->flags & UBLK_F_SHMEM_ZC; } static inline bool ublk_iod_is_shmem_zc(const struct ublk_queue *ubq, @@ -436,7 +437,7 @@ static inline bool ublk_iod_is_shmem_zc(const struct ublk_queue *ubq, static inline bool ublk_dev_support_shmem_zc(const struct ublk_device *ub) { - return false; + return ub->dev_info.flags & UBLK_F_SHMEM_ZC; } static inline bool ublk_support_auto_buf_reg(const struct ublk_queue *ubq) diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index 5b71c19d3b9c..a7078b798791 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -408,6 +408,13 @@ struct ublk_shmem_buf_reg { /* Disable automatic partition scanning when device is started */ #define UBLK_F_NO_AUTO_PART_SCAN (1ULL << 18) +/* + * Enable shared memory zero copy. When enabled, the server can register + * shared memory buffers via UBLK_U_CMD_REG_BUF. If a block request's + * pages match a registered buffer, UBLK_IO_F_SHMEM_ZC is set and addr + * encodes the buffer index + offset instead of a userspace buffer address. 
+ */ +#define UBLK_F_SHMEM_ZC (1ULL << 19) /* device state */ #define UBLK_S_DEV_DEAD 0 From 8a34e88769f617dc980edb5a0079e347bd1b9a89 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 31 Mar 2026 23:31:55 +0800 Subject: [PATCH 125/146] ublk: eliminate permanent pages[] array from struct ublk_buf The pages[] array (kvmalloc'd, 8 bytes per page = 2MB for a 1GB buffer) was stored permanently in struct ublk_buf but only needed during pin_user_pages_fast() and maple tree construction. Since the maple tree already stores PFN ranges via ublk_buf_range, struct page pointers can be recovered via pfn_to_page() during unregistration. Make pages[] a temporary allocation in ublk_ctrl_reg_buf(), freed immediately after the maple tree is built. Rewrite __ublk_ctrl_unreg_buf() to iterate the maple tree for matching buf_index entries, recovering struct page pointers via pfn_to_page() and unpinning in batches of 32. Simplify ublk_buf_erase_ranges() to iterate the maple tree by buf_index instead of walking the now-removed pages[] array. 
Signed-off-by: Ming Lei Link: https://patch.msgid.link/20260331153207.3635125-5-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 87 +++++++++++++++++++++++++--------------- 1 file changed, 55 insertions(+), 32 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index bdb1de41d526..1af42850f5b1 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -296,7 +296,6 @@ struct ublk_queue { /* Per-registered shared memory buffer */ struct ublk_buf { - struct page **pages; unsigned int nr_pages; }; @@ -5261,27 +5260,25 @@ static void ublk_unquiesce_and_resume(struct gendisk *disk) blk_mq_unquiesce_queue(disk->queue); } -/* Erase coalesced PFN ranges from the maple tree for pages [0, nr_pages) */ -static void ublk_buf_erase_ranges(struct ublk_device *ub, - struct ublk_buf *ubuf, - unsigned long nr_pages) +/* Erase coalesced PFN ranges from the maple tree matching buf_index */ +static void ublk_buf_erase_ranges(struct ublk_device *ub, int buf_index) { - unsigned long i; + MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX); + struct ublk_buf_range *range; - for (i = 0; i < nr_pages; ) { - unsigned long pfn = page_to_pfn(ubuf->pages[i]); - unsigned long start = i; - - while (i + 1 < nr_pages && - page_to_pfn(ubuf->pages[i + 1]) == pfn + (i - start) + 1) - i++; - i++; - kfree(mtree_erase(&ub->buf_tree, pfn)); + mas_lock(&mas); + mas_for_each(&mas, range, ULONG_MAX) { + if (range->buf_index == buf_index) { + mas_erase(&mas); + kfree(range); + } } + mas_unlock(&mas); } static int __ublk_ctrl_reg_buf(struct ublk_device *ub, - struct ublk_buf *ubuf, int index, + struct ublk_buf *ubuf, + struct page **pages, int index, unsigned short flags) { unsigned long nr_pages = ubuf->nr_pages; @@ -5289,13 +5286,13 @@ static int __ublk_ctrl_reg_buf(struct ublk_device *ub, int ret; for (i = 0; i < nr_pages; ) { - unsigned long pfn = page_to_pfn(ubuf->pages[i]); + unsigned long pfn = page_to_pfn(pages[i]); unsigned long start = i; 
struct ublk_buf_range *range; /* Find run of consecutive PFNs */ while (i + 1 < nr_pages && - page_to_pfn(ubuf->pages[i + 1]) == pfn + (i - start) + 1) + page_to_pfn(pages[i + 1]) == pfn + (i - start) + 1) i++; i++; /* past the last page in this run */ @@ -5320,7 +5317,7 @@ static int __ublk_ctrl_reg_buf(struct ublk_device *ub, return 0; unwind: - ublk_buf_erase_ranges(ub, ubuf, i); + ublk_buf_erase_ranges(ub, index); return ret; } @@ -5335,6 +5332,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub, void __user *argp = (void __user *)(unsigned long)header->addr; struct ublk_shmem_buf_reg buf_reg; unsigned long addr, size, nr_pages; + struct page **pages = NULL; unsigned int gup_flags; struct gendisk *disk; struct ublk_buf *ubuf; @@ -5371,9 +5369,8 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub, goto put_disk; } - ubuf->pages = kvmalloc_array(nr_pages, sizeof(*ubuf->pages), - GFP_KERNEL); - if (!ubuf->pages) { + pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL); + if (!pages) { ret = -ENOMEM; goto err_free; } @@ -5382,7 +5379,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub, if (!(buf_reg.flags & UBLK_SHMEM_BUF_READ_ONLY)) gup_flags |= FOLL_WRITE; - pinned = pin_user_pages_fast(addr, nr_pages, gup_flags, ubuf->pages); + pinned = pin_user_pages_fast(addr, nr_pages, gup_flags, pages); if (pinned < 0) { ret = pinned; goto err_free_pages; @@ -5406,7 +5403,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub, if (ret) goto err_unlock; - ret = __ublk_ctrl_reg_buf(ub, ubuf, index, buf_reg.flags); + ret = __ublk_ctrl_reg_buf(ub, ubuf, pages, index, buf_reg.flags); if (ret) { xa_erase(&ub->bufs_xa, index); goto err_unlock; @@ -5414,6 +5411,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub, mutex_unlock(&ub->mutex); + kvfree(pages); ublk_unquiesce_and_resume(disk); ublk_put_disk(disk); return index; @@ -5422,9 +5420,9 @@ err_unlock: mutex_unlock(&ub->mutex); ublk_unquiesce_and_resume(disk); err_unpin: - unpin_user_pages(ubuf->pages, 
pinned); + unpin_user_pages(pages, pinned); err_free_pages: - kvfree(ubuf->pages); + kvfree(pages); err_free: kfree(ubuf); put_disk: @@ -5433,11 +5431,36 @@ put_disk: } static void __ublk_ctrl_unreg_buf(struct ublk_device *ub, - struct ublk_buf *ubuf) + struct ublk_buf *ubuf, int buf_index) { - ublk_buf_erase_ranges(ub, ubuf, ubuf->nr_pages); - unpin_user_pages(ubuf->pages, ubuf->nr_pages); - kvfree(ubuf->pages); + MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX); + struct ublk_buf_range *range; + struct page *pages[32]; + + mas_lock(&mas); + mas_for_each(&mas, range, ULONG_MAX) { + unsigned long base, nr, off; + + if (range->buf_index != buf_index) + continue; + + base = range->base_pfn; + nr = mas.last - mas.index + 1; + mas_erase(&mas); + + for (off = 0; off < nr; ) { + unsigned int batch = min_t(unsigned long, + nr - off, 32); + unsigned int j; + + for (j = 0; j < batch; j++) + pages[j] = pfn_to_page(base + off + j); + unpin_user_pages(pages, batch); + off += batch; + } + kfree(range); + } + mas_unlock(&mas); kfree(ubuf); } @@ -5468,7 +5491,7 @@ static int ublk_ctrl_unreg_buf(struct ublk_device *ub, return -ENOENT; } - __ublk_ctrl_unreg_buf(ub, ubuf); + __ublk_ctrl_unreg_buf(ub, ubuf, index); mutex_unlock(&ub->mutex); @@ -5483,7 +5506,7 @@ static void ublk_buf_cleanup(struct ublk_device *ub) unsigned long index; xa_for_each(&ub->bufs_xa, index, ubuf) - __ublk_ctrl_unreg_buf(ub, ubuf); + __ublk_ctrl_unreg_buf(ub, ubuf, index); xa_destroy(&ub->bufs_xa); mtree_destroy(&ub->buf_tree); } From 166b476b8dee61dc6501f6eb91619d28c3430f75 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 31 Mar 2026 23:31:56 +0800 Subject: [PATCH 126/146] selftests/ublk: add shared memory zero-copy support in kublk Add infrastructure for UBLK_F_SHMEM_ZC shared memory zero-copy: - kublk.h: struct ublk_shmem_entry and table for tracking registered shared memory buffers - kublk.c: per-device unix socket listener that accepts memfd registrations from clients via SCM_RIGHTS fd passing. 
The listener mmaps the memfd and registers the VA range with the kernel for PFN matching. Also adds --shmem_zc command line option. - kublk.c: --htlb option to open a pre-allocated hugetlbfs file, mmap it with MAP_SHARED|MAP_POPULATE, and register it with the kernel via ublk_ctrl_reg_buf(). Any process that mmaps the same hugetlbfs file shares the same physical pages, enabling zero-copy without socket-based fd passing. Signed-off-by: Ming Lei Link: https://patch.msgid.link/20260331153207.3635125-6-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 340 ++++++++++++++++++++++++++- tools/testing/selftests/ublk/kublk.h | 14 ++ 2 files changed, 352 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index e5b787ba2175..fce3f80c3eba 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -4,6 +4,7 @@ */ #include +#include #include "kublk.h" #define MAX_NR_TGT_ARG 64 @@ -1092,13 +1093,312 @@ static int ublk_send_dev_event(const struct dev_ctx *ctx, struct ublk_dev *dev, } +/* + * Shared memory registration socket listener. + * + * The parent daemon context listens on a per-device unix socket at + * /run/ublk/ublkb.sock for shared memory registration requests + * from clients. Clients send a memfd via SCM_RIGHTS; the server + * registers it with the kernel, mmaps it, and returns the assigned index. 
+ */ +#define UBLK_SHMEM_SOCK_DIR "/run/ublk" + +/* defined in kublk.h, shared with file_backed.c (loop target) */ +struct ublk_shmem_entry shmem_table[UBLK_BUF_MAX]; +int shmem_count; + +static void ublk_shmem_sock_path(int dev_id, char *buf, size_t len) +{ + snprintf(buf, len, "%s/ublkb%d.sock", UBLK_SHMEM_SOCK_DIR, dev_id); +} + +static int ublk_shmem_sock_create(int dev_id) +{ + struct sockaddr_un addr = { .sun_family = AF_UNIX }; + char path[108]; + int fd; + + mkdir(UBLK_SHMEM_SOCK_DIR, 0755); + ublk_shmem_sock_path(dev_id, path, sizeof(path)); + unlink(path); + + fd = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0); + if (fd < 0) + return -1; + + snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", path); + if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) { + close(fd); + return -1; + } + + listen(fd, 4); + ublk_dbg(UBLK_DBG_DEV, "shmem socket created: %s\n", path); + return fd; +} + +static void ublk_shmem_sock_destroy(int dev_id, int sock_fd) +{ + char path[108]; + + if (sock_fd >= 0) + close(sock_fd); + ublk_shmem_sock_path(dev_id, path, sizeof(path)); + unlink(path); +} + +/* Receive a memfd from a client via SCM_RIGHTS */ +static int ublk_shmem_recv_fd(int client_fd) +{ + char buf[1]; + struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) }; + union { + char cmsg_buf[CMSG_SPACE(sizeof(int))]; + struct cmsghdr align; + } u; + struct msghdr msg = { + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = u.cmsg_buf, + .msg_controllen = sizeof(u.cmsg_buf), + }; + struct cmsghdr *cmsg; + + if (recvmsg(client_fd, &msg, 0) <= 0) + return -1; + + cmsg = CMSG_FIRSTHDR(&msg); + if (!cmsg || cmsg->cmsg_level != SOL_SOCKET || + cmsg->cmsg_type != SCM_RIGHTS) + return -1; + + return *(int *)CMSG_DATA(cmsg); +} + +/* Register a shared memory buffer: store fd, mmap it, return index */ +static int ublk_shmem_register(int shmem_fd) +{ + off_t size; + void *base; + int idx; + + if (shmem_count >= UBLK_BUF_MAX) + return -1; + + size = lseek(shmem_fd, 0, 
SEEK_END); + if (size <= 0) + return -1; + + base = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, + shmem_fd, 0); + if (base == MAP_FAILED) + return -1; + + idx = shmem_count++; + shmem_table[idx].fd = shmem_fd; + shmem_table[idx].mmap_base = base; + shmem_table[idx].size = size; + + ublk_dbg(UBLK_DBG_DEV, "shmem registered: index=%d fd=%d size=%zu\n", + idx, shmem_fd, (size_t)size); + return idx; +} + +static void ublk_shmem_unregister_all(void) +{ + int i; + + for (i = 0; i < shmem_count; i++) { + if (shmem_table[i].mmap_base) { + munmap(shmem_table[i].mmap_base, + shmem_table[i].size); + close(shmem_table[i].fd); + shmem_table[i].mmap_base = NULL; + } + } + shmem_count = 0; +} + +static int ublk_ctrl_reg_buf(struct ublk_dev *dev, void *addr, size_t size) +{ + struct ublk_shmem_buf_reg buf_reg = { + .addr = (unsigned long)addr, + .len = size, + }; + struct ublk_ctrl_cmd_data data = { + .cmd_op = UBLK_U_CMD_REG_BUF, + .flags = CTRL_CMD_HAS_BUF, + .addr = (unsigned long)&buf_reg, + .len = sizeof(buf_reg), + }; + + return __ublk_ctrl_cmd(dev, &data); +} + +/* + * Handle one client connection: receive memfd, mmap it, register + * the VA range with kernel, send back the assigned index. 
+ */ +static void ublk_shmem_handle_client(int sock_fd, struct ublk_dev *dev) +{ + int client_fd, memfd, idx, ret; + int32_t reply; + off_t size; + void *base; + + client_fd = accept(sock_fd, NULL, NULL); + if (client_fd < 0) + return; + + memfd = ublk_shmem_recv_fd(client_fd); + if (memfd < 0) { + reply = -1; + goto out; + } + + /* mmap the memfd in server address space */ + size = lseek(memfd, 0, SEEK_END); + if (size <= 0) { + reply = -1; + close(memfd); + goto out; + } + base = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, memfd, 0); + if (base == MAP_FAILED) { + reply = -1; + close(memfd); + goto out; + } + + /* Register server's VA range with kernel for PFN matching */ + ret = ublk_ctrl_reg_buf(dev, base, size); + if (ret < 0) { + ublk_dbg(UBLK_DBG_DEV, + "shmem_zc: kernel reg failed %d\n", ret); + munmap(base, size); + close(memfd); + reply = ret; + goto out; + } + + /* Store in table for I/O handling */ + idx = ublk_shmem_register(memfd); + if (idx >= 0) { + shmem_table[idx].mmap_base = base; + shmem_table[idx].size = size; + } + reply = idx; +out: + send(client_fd, &reply, sizeof(reply), 0); + close(client_fd); +} + +struct shmem_listener_info { + int dev_id; + int stop_efd; /* eventfd to signal listener to stop */ + int sock_fd; /* listener socket fd (output) */ + struct ublk_dev *dev; +}; + +/* + * Socket listener thread: runs in the parent daemon context alongside + * the I/O threads. Accepts shared memory registration requests from + * clients via SCM_RIGHTS. Exits when stop_efd is signaled. 
+ */ +static void *ublk_shmem_listener_fn(void *data) +{ + struct shmem_listener_info *info = data; + struct pollfd pfds[2]; + + info->sock_fd = ublk_shmem_sock_create(info->dev_id); + if (info->sock_fd < 0) + return NULL; + + pfds[0].fd = info->sock_fd; + pfds[0].events = POLLIN; + pfds[1].fd = info->stop_efd; + pfds[1].events = POLLIN; + + while (1) { + int ret = poll(pfds, 2, -1); + + if (ret < 0) + break; + + /* Stop signal from parent */ + if (pfds[1].revents & POLLIN) + break; + + /* Client connection */ + if (pfds[0].revents & POLLIN) + ublk_shmem_handle_client(info->sock_fd, info->dev); + } + + return NULL; +} + +static int ublk_shmem_htlb_setup(const struct dev_ctx *ctx, + struct ublk_dev *dev) +{ + int fd, idx, ret; + struct stat st; + void *base; + + fd = open(ctx->htlb_path, O_RDWR); + if (fd < 0) { + ublk_err("htlb: can't open %s\n", ctx->htlb_path); + return -errno; + } + + if (fstat(fd, &st) < 0 || st.st_size <= 0) { + ublk_err("htlb: invalid file size\n"); + close(fd); + return -EINVAL; + } + + base = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + if (base == MAP_FAILED) { + ublk_err("htlb: mmap failed\n"); + close(fd); + return -ENOMEM; + } + + ret = ublk_ctrl_reg_buf(dev, base, st.st_size); + if (ret < 0) { + ublk_err("htlb: reg_buf failed: %d\n", ret); + munmap(base, st.st_size); + close(fd); + return ret; + } + + if (shmem_count >= UBLK_BUF_MAX) { + munmap(base, st.st_size); + close(fd); + return -ENOMEM; + } + + idx = shmem_count++; + shmem_table[idx].fd = fd; + shmem_table[idx].mmap_base = base; + shmem_table[idx].size = st.st_size; + + ublk_dbg(UBLK_DBG_DEV, "htlb registered: index=%d size=%zu\n", + idx, (size_t)st.st_size); + return 0; +} + static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) { const struct ublksrv_ctrl_dev_info *dinfo = &dev->dev_info; + struct shmem_listener_info linfo = {}; struct ublk_thread_info *tinfo; unsigned long long extra_flags = 0; cpu_set_t 
*affinity_buf; unsigned char (*q_thread_map)[UBLK_MAX_QUEUES] = NULL; + uint64_t stop_val = 1; + pthread_t listener; void *thread_ret; sem_t ready; int ret, i; @@ -1187,15 +1487,44 @@ static int ublk_start_daemon(const struct dev_ctx *ctx, struct ublk_dev *dev) goto fail_start; } + if (ctx->htlb_path) { + ret = ublk_shmem_htlb_setup(ctx, dev); + if (ret < 0) { + ublk_err("htlb setup failed: %d\n", ret); + ublk_ctrl_stop_dev(dev); + goto fail_start; + } + } + ublk_ctrl_get_info(dev); if (ctx->fg) ublk_ctrl_dump(dev); else ublk_send_dev_event(ctx, dev, dev->dev_info.dev_id); fail_start: - /* wait until we are terminated */ - for (i = 0; i < dev->nthreads; i++) + /* + * Wait for I/O threads to exit. While waiting, a listener + * thread accepts shared memory registration requests from + * clients via a per-device unix socket (SCM_RIGHTS fd passing). + */ + linfo.dev_id = dinfo->dev_id; + linfo.dev = dev; + linfo.stop_efd = eventfd(0, 0); + if (linfo.stop_efd >= 0) + pthread_create(&listener, NULL, + ublk_shmem_listener_fn, &linfo); + + for (i = 0; i < (int)dev->nthreads; i++) pthread_join(tinfo[i].thread, &thread_ret); + + /* Signal listener thread to stop and wait for it */ + if (linfo.stop_efd >= 0) { + write(linfo.stop_efd, &stop_val, sizeof(stop_val)); + pthread_join(listener, NULL); + close(linfo.stop_efd); + ublk_shmem_sock_destroy(dinfo->dev_id, linfo.sock_fd); + } + ublk_shmem_unregister_all(); free(tinfo); fail: for (i = 0; i < dinfo->nr_hw_queues; i++) @@ -1625,6 +1954,7 @@ static int cmd_dev_get_features(void) FEAT_NAME(UBLK_F_SAFE_STOP_DEV), FEAT_NAME(UBLK_F_BATCH_IO), FEAT_NAME(UBLK_F_NO_AUTO_PART_SCAN), + FEAT_NAME(UBLK_F_SHMEM_ZC), }; struct ublk_dev *dev; __u64 features = 0; @@ -1797,6 +2127,8 @@ int main(int argc, char *argv[]) { "safe", 0, NULL, 0 }, { "batch", 0, NULL, 'b'}, { "no_auto_part_scan", 0, NULL, 0 }, + { "shmem_zc", 0, NULL, 0 }, + { "htlb", 1, NULL, 0 }, { 0, 0, 0, 0 } }; const struct ublk_tgt_ops *ops = NULL; @@ -1912,6 +2244,10 @@ int 
main(int argc, char *argv[]) ctx.safe_stop = 1; if (!strcmp(longopts[option_idx].name, "no_auto_part_scan")) ctx.flags |= UBLK_F_NO_AUTO_PART_SCAN; + if (!strcmp(longopts[option_idx].name, "shmem_zc")) + ctx.flags |= UBLK_F_SHMEM_ZC; + if (!strcmp(longopts[option_idx].name, "htlb")) + ctx.htlb_path = strdup(optarg); break; case '?': /* diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 6d1762aa30df..8ed2efc3ecb9 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -96,6 +96,8 @@ struct dev_ctx { /* for 'update_size' command */ unsigned long long size; + char *htlb_path; + union { struct stripe_ctx stripe; struct fault_inject_ctx fault_inject; @@ -602,6 +604,18 @@ static inline void ublk_queued_tgt_io(struct ublk_thread *t, struct ublk_queue * } } +/* shared memory zero-copy support */ +#define UBLK_BUF_MAX 256 + +struct ublk_shmem_entry { + int fd; + void *mmap_base; + size_t size; +}; + +extern struct ublk_shmem_entry shmem_table[UBLK_BUF_MAX]; +extern int shmem_count; + extern const struct ublk_tgt_ops null_tgt_ops; extern const struct ublk_tgt_ops loop_tgt_ops; extern const struct ublk_tgt_ops stripe_tgt_ops; From ec20aa44ac2629943c9b2b5524bcb55d778f746c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 31 Mar 2026 23:31:57 +0800 Subject: [PATCH 127/146] selftests/ublk: add UBLK_F_SHMEM_ZC support for loop target MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add loop_queue_shmem_zc_io() which handles I/O requests marked with UBLK_IO_F_SHMEM_ZC. When the kernel sets this flag, the request data lives in a registered shared memory buffer — decode index + offset from iod->addr and use the server's mmap as the I/O buffer. The dispatch check in loop_queue_tgt_rw_io() routes SHMEM_ZC requests to this new function, bypassing the normal buffer registration path. 
Signed-off-by: Ming Lei Link: https://patch.msgid.link/20260331153207.3635125-7-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/file_backed.c | 38 ++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tools/testing/selftests/ublk/file_backed.c b/tools/testing/selftests/ublk/file_backed.c index 228af2580ac6..d28da98f917a 100644 --- a/tools/testing/selftests/ublk/file_backed.c +++ b/tools/testing/selftests/ublk/file_backed.c @@ -27,6 +27,40 @@ static int loop_queue_flush_io(struct ublk_thread *t, struct ublk_queue *q, return 1; } +/* + * Shared memory zero-copy I/O: when UBLK_IO_F_SHMEM_ZC is set, the + * request's data lives in a registered shared memory buffer. Decode + * index + offset from iod->addr and use the server's mmap of that + * buffer as the I/O buffer for the backing file. + */ +static int loop_queue_shmem_zc_io(struct ublk_thread *t, struct ublk_queue *q, + const struct ublksrv_io_desc *iod, int tag) +{ + unsigned ublk_op = ublksrv_get_op(iod); + enum io_uring_op op = ublk_to_uring_op(iod, 0); + __u64 file_offset = iod->start_sector << 9; + __u32 len = iod->nr_sectors << 9; + __u32 shmem_idx = ublk_shmem_zc_index(iod->addr); + __u32 shmem_off = ublk_shmem_zc_offset(iod->addr); + struct io_uring_sqe *sqe[1]; + void *addr; + + if (shmem_idx >= UBLK_BUF_MAX || !shmem_table[shmem_idx].mmap_base) + return -EINVAL; + + addr = shmem_table[shmem_idx].mmap_base + shmem_off; + + ublk_io_alloc_sqes(t, sqe, 1); + if (!sqe[0]) + return -ENOMEM; + + io_uring_prep_rw(op, sqe[0], ublk_get_registered_fd(q, 1), + addr, len, file_offset); + io_uring_sqe_set_flags(sqe[0], IOSQE_FIXED_FILE); + sqe[0]->user_data = build_user_data(tag, ublk_op, 0, q->q_id, 1); + return 1; +} + static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, const struct ublksrv_io_desc *iod, int tag) { @@ -41,6 +75,10 @@ static int loop_queue_tgt_rw_io(struct ublk_thread *t, struct ublk_queue *q, void *addr = io->buf_addr; unsigned 
short buf_index = ublk_io_buf_idx(t, q, tag); + /* shared memory zero-copy path */ + if (iod->op_flags & UBLK_IO_F_SHMEM_ZC) + return loop_queue_shmem_zc_io(t, q, iod, tag); + if (iod->op_flags & UBLK_IO_F_INTEGRITY) { ublk_io_alloc_sqes(t, sqe, 1); /* Use second backing file for integrity data */ From 2f1e9468bdcba7e7572e16defd3c516f24281f14 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 31 Mar 2026 23:31:58 +0800 Subject: [PATCH 128/146] selftests/ublk: add shared memory zero-copy test Add test_shmemzc_01.sh which tests UBLK_IO_F_SHMEM_ZC on the null target using a hugetlbfs shared buffer. Both kublk (--htlb) and fio (--mem=mmaphuge:) mmap the same hugetlbfs file with MAP_SHARED, sharing physical pages. The kernel PFN match enables zero-copy I/O. Uses standard fio --mem=mmaphuge: (supported since fio 1.10), no patched fio required. Signed-off-by: Ming Lei Link: https://patch.msgid.link/20260331153207.3635125-8-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 2 + .../testing/selftests/ublk/test_shmemzc_01.sh | 72 +++++++++++++++++++ 2 files changed, 74 insertions(+) create mode 100755 tools/testing/selftests/ublk/test_shmemzc_01.sh diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index d338668c5a5f..bf5e9fcf36b8 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -52,6 +52,8 @@ TEST_PROGS += test_stripe_06.sh TEST_PROGS += test_part_01.sh TEST_PROGS += test_part_02.sh +TEST_PROGS += test_shmemzc_01.sh + TEST_PROGS += test_stress_01.sh TEST_PROGS += test_stress_02.sh TEST_PROGS += test_stress_03.sh diff --git a/tools/testing/selftests/ublk/test_shmemzc_01.sh b/tools/testing/selftests/ublk/test_shmemzc_01.sh new file mode 100755 index 000000000000..47210af2aa20 --- /dev/null +++ b/tools/testing/selftests/ublk/test_shmemzc_01.sh @@ -0,0 +1,72 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Test: shmem_zc with hugetlbfs buffer on
null target +# +# kublk and fio both mmap the same hugetlbfs file (MAP_SHARED), +# so they share physical pages. The kernel PFN match enables +# zero-copy I/O without socket-based fd passing. + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +_prep_test "shmem_zc" "null target hugetlbfs shmem zero-copy test" + +if ! _have_program fio; then + echo "SKIP: fio not available" + exit "$UBLK_SKIP_CODE" +fi + +if ! grep -q hugetlbfs /proc/filesystems; then + echo "SKIP: hugetlbfs not supported" + exit "$UBLK_SKIP_CODE" +fi + +# Allocate hugepages +OLD_NR_HP=$(cat /proc/sys/vm/nr_hugepages) +echo 10 > /proc/sys/vm/nr_hugepages +NR_HP=$(cat /proc/sys/vm/nr_hugepages) +if [ "$NR_HP" -lt 2 ]; then + echo "SKIP: cannot allocate hugepages" + echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + exit "$UBLK_SKIP_CODE" +fi + +# Mount hugetlbfs +HTLB_MNT=$(mktemp -d "${UBLK_TEST_DIR}/htlb_mnt_XXXXXX") +if ! mount -t hugetlbfs none "$HTLB_MNT"; then + echo "SKIP: cannot mount hugetlbfs" + rmdir "$HTLB_MNT" + echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + exit "$UBLK_SKIP_CODE" +fi + +HTLB_FILE="$HTLB_MNT/ublk_buf" +fallocate -l 4M "$HTLB_FILE" + +dev_id=$(_add_ublk_dev -t null --shmem_zc --htlb "$HTLB_FILE") +_check_add_dev $TID $? + +fio --name=htlb_zc \ + --filename=/dev/ublkb"${dev_id}" \ + --ioengine=io_uring \ + --rw=randwrite \ + --direct=1 \ + --bs=4k \ + --size=4M \ + --iodepth=32 \ + --mem=mmaphuge:"$HTLB_FILE" \ + > /dev/null 2>&1 +ERR_CODE=$? 
+ +# Delete device first so daemon releases the htlb mmap +_ublk_del_dev "${dev_id}" + +rm -f "$HTLB_FILE" +umount "$HTLB_MNT" +rmdir "$HTLB_MNT" +echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + +_cleanup_test "shmem_zc" + +_show_result $TID $ERR_CODE From d4866503324c062f70dddfdd2e59957d335fc230 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 31 Mar 2026 23:31:59 +0800 Subject: [PATCH 129/146] selftests/ublk: add hugetlbfs shmem_zc test for loop target MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add test_shmemzc_02.sh which tests the UBLK_IO_F_SHMEM_ZC zero-copy path on the loop target using a hugetlbfs shared buffer. Both kublk and fio mmap the same hugetlbfs file with MAP_SHARED, sharing physical pages. The kernel's PFN matching enables zero-copy — the loop target reads/writes directly from the shared buffer to the backing file. Uses standard fio --mem=mmaphuge: (supported since fio 1.10), no patched fio required. Signed-off-by: Ming Lei Link: https://patch.msgid.link/20260331153207.3635125-9-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 1 + .../testing/selftests/ublk/test_shmemzc_02.sh | 68 +++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100755 tools/testing/selftests/ublk/test_shmemzc_02.sh diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index bf5e9fcf36b8..799cdcc39643 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -53,6 +53,7 @@ TEST_PROGS += test_part_01.sh TEST_PROGS += test_part_02.sh TEST_PROGS += test_shmemzc_01.sh +TEST_PROGS += test_shmemzc_02.sh TEST_PROGS += test_stress_01.sh TEST_PROGS += test_stress_02.sh diff --git a/tools/testing/selftests/ublk/test_shmemzc_02.sh b/tools/testing/selftests/ublk/test_shmemzc_02.sh new file mode 100755 index 000000000000..aed9262494e9 --- /dev/null +++ b/tools/testing/selftests/ublk/test_shmemzc_02.sh @@ -0,0
+1,68 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Test: shmem_zc with hugetlbfs buffer on loop target +# +# kublk and fio both mmap the same hugetlbfs file (MAP_SHARED), +# so they share physical pages. The kernel PFN match enables +# zero-copy I/O without socket-based fd passing. + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +_prep_test "shmem_zc" "loop target hugetlbfs shmem zero-copy test" + +if ! _have_program fio; then + echo "SKIP: fio not available" + exit "$UBLK_SKIP_CODE" +fi + +if ! grep -q hugetlbfs /proc/filesystems; then + echo "SKIP: hugetlbfs not supported" + exit "$UBLK_SKIP_CODE" +fi + +# Allocate hugepages +OLD_NR_HP=$(cat /proc/sys/vm/nr_hugepages) +echo 10 > /proc/sys/vm/nr_hugepages +NR_HP=$(cat /proc/sys/vm/nr_hugepages) +if [ "$NR_HP" -lt 2 ]; then + echo "SKIP: cannot allocate hugepages" + echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + exit "$UBLK_SKIP_CODE" +fi + +# Mount hugetlbfs +HTLB_MNT=$(mktemp -d "${UBLK_TEST_DIR}/htlb_mnt_XXXXXX") +if ! mount -t hugetlbfs none "$HTLB_MNT"; then + echo "SKIP: cannot mount hugetlbfs" + rmdir "$HTLB_MNT" + echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + exit "$UBLK_SKIP_CODE" +fi + +HTLB_FILE="$HTLB_MNT/ublk_buf" +fallocate -l 4M "$HTLB_FILE" + +_create_backfile 0 128M +BACKFILE="${UBLK_BACKFILES[0]}" + +dev_id=$(_add_ublk_dev -t loop --shmem_zc --htlb "$HTLB_FILE" "$BACKFILE") +_check_add_dev $TID $? + +_run_fio_verify_io --filename=/dev/ublkb"${dev_id}" \ + --size=128M \ + --mem=mmaphuge:"$HTLB_FILE" +ERR_CODE=$? 
+ +# Delete device first so daemon releases the htlb mmap +_ublk_del_dev "${dev_id}" + +rm -f "$HTLB_FILE" +umount "$HTLB_MNT" +rmdir "$HTLB_MNT" +echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + +_cleanup_test "shmem_zc" + +_show_result $TID $ERR_CODE From 12075992c62ee330b2c531fa066b19be21698115 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 31 Mar 2026 23:32:00 +0800 Subject: [PATCH 130/146] selftests/ublk: add filesystem fio verify test for shmem_zc Add test_shmemzc_03.sh which exercises shmem_zc through the full filesystem stack: mkfs ext4 on the ublk device, mount it, then run fio verify on a file inside the filesystem with --mem=mmaphuge. Extend _mkfs_mount_test() to accept an optional command that runs between mount and umount. The function cd's into the mount directory so the command can use relative file paths. Existing callers that pass only the device are unaffected. Signed-off-by: Ming Lei Link: https://patch.msgid.link/20260331153207.3635125-10-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 1 + tools/testing/selftests/ublk/test_common.sh | 17 +++-- .../testing/selftests/ublk/test_shmemzc_03.sh | 69 +++++++++++++++++++ 3 files changed, 82 insertions(+), 5 deletions(-) create mode 100755 tools/testing/selftests/ublk/test_shmemzc_03.sh diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index 799cdcc39643..c453cd369088 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -54,6 +54,7 @@ TEST_PROGS += test_part_02.sh TEST_PROGS += test_shmemzc_01.sh TEST_PROGS += test_shmemzc_02.sh +TEST_PROGS += test_shmemzc_03.sh TEST_PROGS += test_stress_01.sh TEST_PROGS += test_stress_02.sh diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index 163a40007910..af2ea4fa1111 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -88,6 
+88,7 @@ _remove_tmp_dir() { _mkfs_mount_test() { local dev=$1 + shift local err_code=0 local mnt_dir; @@ -99,12 +100,17 @@ _mkfs_mount_test() fi mount -t ext4 "$dev" "$mnt_dir" > /dev/null 2>&1 - umount "$dev" - err_code=$? - _remove_tmp_dir "$mnt_dir" - if [ $err_code -ne 0 ]; then - return $err_code + if [ $# -gt 0 ]; then + cd "$mnt_dir" && "$@" + err_code=$? + cd - > /dev/null fi + umount "$dev" + if [ $err_code -eq 0 ]; then + err_code=$? + fi + _remove_tmp_dir "$mnt_dir" + return $err_code } _check_root() { @@ -132,6 +138,7 @@ _prep_test() { local base_dir=${TMPDIR:-./ublktest-dir} mkdir -p "$base_dir" UBLK_TEST_DIR=$(mktemp -d ${base_dir}/${TID}.XXXXXX) + UBLK_TEST_DIR=$(realpath ${UBLK_TEST_DIR}) UBLK_TMP=$(mktemp ${UBLK_TEST_DIR}/ublk_test_XXXXX) [ "$UBLK_TEST_QUIET" -eq 0 ] && echo "ublk $type: $*" echo "ublk selftest: $TID starting at $(date '+%F %T')" | tee /dev/kmsg diff --git a/tools/testing/selftests/ublk/test_shmemzc_03.sh b/tools/testing/selftests/ublk/test_shmemzc_03.sh new file mode 100755 index 000000000000..db967a9ffe81 --- /dev/null +++ b/tools/testing/selftests/ublk/test_shmemzc_03.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Test: shmem_zc with fio verify over filesystem on loop target +# +# mkfs + mount ext4 on the ublk device, then run fio verify on a +# file inside that filesystem. Exercises the full stack: +# filesystem -> block layer -> ublk shmem_zc -> loop target backing file. + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +_prep_test "shmem_zc" "loop target hugetlbfs shmem zero-copy fs verify test" + +if ! _have_program fio; then + echo "SKIP: fio not available" + exit "$UBLK_SKIP_CODE" +fi + +if ! 
grep -q hugetlbfs /proc/filesystems; then + echo "SKIP: hugetlbfs not supported" + exit "$UBLK_SKIP_CODE" +fi + +# Allocate hugepages +OLD_NR_HP=$(cat /proc/sys/vm/nr_hugepages) +echo 10 > /proc/sys/vm/nr_hugepages +NR_HP=$(cat /proc/sys/vm/nr_hugepages) +if [ "$NR_HP" -lt 2 ]; then + echo "SKIP: cannot allocate hugepages" + echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + exit "$UBLK_SKIP_CODE" +fi + +# Mount hugetlbfs +HTLB_MNT=$(mktemp -d "${UBLK_TEST_DIR}/htlb_mnt_XXXXXX") +if ! mount -t hugetlbfs none "$HTLB_MNT"; then + echo "SKIP: cannot mount hugetlbfs" + rmdir "$HTLB_MNT" + echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + exit "$UBLK_SKIP_CODE" +fi + +HTLB_FILE="$HTLB_MNT/ublk_buf" +fallocate -l 4M "$HTLB_FILE" + +_create_backfile 0 256M +BACKFILE="${UBLK_BACKFILES[0]}" + +dev_id=$(_add_ublk_dev -t loop --shmem_zc --htlb "$HTLB_FILE" "$BACKFILE") +_check_add_dev $TID $? + +_mkfs_mount_test /dev/ublkb"${dev_id}" \ + _run_fio_verify_io --filename=testfile \ + --size=128M \ + --mem=mmaphuge:"$HTLB_FILE" +ERR_CODE=$? + +# Delete device first so daemon releases the htlb mmap +_ublk_del_dev "${dev_id}" + +rm -f "$HTLB_FILE" +umount "$HTLB_MNT" +rmdir "$HTLB_MNT" +echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + +_cleanup_test "shmem_zc" + +_show_result $TID $ERR_CODE From affb5f67d73c1e0bd412e7807a55691502b5679e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 31 Mar 2026 23:32:01 +0800 Subject: [PATCH 131/146] selftests/ublk: add read-only buffer registration test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add --rdonly_shmem_buf option to kublk that registers shared memory buffers with UBLK_SHMEM_BUF_READ_ONLY (read-only pinning without FOLL_WRITE) and mmaps with PROT_READ only. Add test_shmemzc_04.sh which exercises the new flag with a null target, hugetlbfs buffer, and write workload. 
Write I/O works because the server only reads from the shared buffer — the data flows from client to kernel to the shared pages, and the server reads them out. Signed-off-by: Ming Lei Link: https://patch.msgid.link/20260331153207.3635125-11-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/Makefile | 1 + tools/testing/selftests/ublk/kublk.c | 15 ++-- tools/testing/selftests/ublk/kublk.h | 1 + .../testing/selftests/ublk/test_shmemzc_04.sh | 72 +++++++++++++++++++ 4 files changed, 85 insertions(+), 4 deletions(-) create mode 100755 tools/testing/selftests/ublk/test_shmemzc_04.sh diff --git a/tools/testing/selftests/ublk/Makefile b/tools/testing/selftests/ublk/Makefile index c453cd369088..ec6a8ce83d38 100644 --- a/tools/testing/selftests/ublk/Makefile +++ b/tools/testing/selftests/ublk/Makefile @@ -55,6 +55,7 @@ TEST_PROGS += test_part_02.sh TEST_PROGS += test_shmemzc_01.sh TEST_PROGS += test_shmemzc_02.sh TEST_PROGS += test_shmemzc_03.sh +TEST_PROGS += test_shmemzc_04.sh TEST_PROGS += test_stress_01.sh TEST_PROGS += test_stress_02.sh diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index fce3f80c3eba..fbd9b1e7342a 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -1219,11 +1219,13 @@ static void ublk_shmem_unregister_all(void) shmem_count = 0; } -static int ublk_ctrl_reg_buf(struct ublk_dev *dev, void *addr, size_t size) +static int ublk_ctrl_reg_buf(struct ublk_dev *dev, void *addr, size_t size, + __u32 flags) { struct ublk_shmem_buf_reg buf_reg = { .addr = (unsigned long)addr, .len = size, + .flags = flags, }; struct ublk_ctrl_cmd_data data = { .cmd_op = UBLK_U_CMD_REG_BUF, @@ -1272,7 +1274,7 @@ static void ublk_shmem_handle_client(int sock_fd, struct ublk_dev *dev) } /* Register server's VA range with kernel for PFN matching */ - ret = ublk_ctrl_reg_buf(dev, base, size); + ret = ublk_ctrl_reg_buf(dev, base, size, 0); if (ret < 0) { 
ublk_dbg(UBLK_DBG_DEV, "shmem_zc: kernel reg failed %d\n", ret); @@ -1357,7 +1359,8 @@ static int ublk_shmem_htlb_setup(const struct dev_ctx *ctx, return -EINVAL; } - base = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, + base = mmap(NULL, st.st_size, + ctx->rdonly_shmem_buf ? PROT_READ : PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd, 0); if (base == MAP_FAILED) { ublk_err("htlb: mmap failed\n"); @@ -1365,7 +1368,8 @@ static int ublk_shmem_htlb_setup(const struct dev_ctx *ctx, return -ENOMEM; } - ret = ublk_ctrl_reg_buf(dev, base, st.st_size); + ret = ublk_ctrl_reg_buf(dev, base, st.st_size, + ctx->rdonly_shmem_buf ? UBLK_SHMEM_BUF_READ_ONLY : 0); if (ret < 0) { ublk_err("htlb: reg_buf failed: %d\n", ret); munmap(base, st.st_size); @@ -2129,6 +2133,7 @@ int main(int argc, char *argv[]) { "no_auto_part_scan", 0, NULL, 0 }, { "shmem_zc", 0, NULL, 0 }, { "htlb", 1, NULL, 0 }, + { "rdonly_shmem_buf", 0, NULL, 0 }, { 0, 0, 0, 0 } }; const struct ublk_tgt_ops *ops = NULL; @@ -2248,6 +2253,8 @@ int main(int argc, char *argv[]) ctx.flags |= UBLK_F_SHMEM_ZC; if (!strcmp(longopts[option_idx].name, "htlb")) ctx.htlb_path = strdup(optarg); + if (!strcmp(longopts[option_idx].name, "rdonly_shmem_buf")) + ctx.rdonly_shmem_buf = 1; break; case '?': /* diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 8ed2efc3ecb9..742c41d77df1 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -81,6 +81,7 @@ struct dev_ctx { unsigned int no_ublk_fixed_fd:1; unsigned int safe_stop:1; unsigned int no_auto_part_scan:1; + unsigned int rdonly_shmem_buf:1; __u32 integrity_flags; __u8 metadata_size; __u8 pi_offset; diff --git a/tools/testing/selftests/ublk/test_shmemzc_04.sh b/tools/testing/selftests/ublk/test_shmemzc_04.sh new file mode 100755 index 000000000000..899de088ece4 --- /dev/null +++ b/tools/testing/selftests/ublk/test_shmemzc_04.sh @@ -0,0 +1,72 @@ +#!/bin/bash +# SPDX-License-Identifier: 
GPL-2.0 +# Test: shmem_zc with read-only buffer registration on null target +# +# Same as test_shmemzc_01 but with --rdonly_shmem_buf: pages are pinned +# without FOLL_WRITE (UBLK_SHMEM_BUF_READ_ONLY). Write I/O works because +# the server only reads from the shared buffer. + +. "$(cd "$(dirname "$0")" && pwd)"/test_common.sh + +ERR_CODE=0 + +_prep_test "shmem_zc" "null target hugetlbfs shmem zero-copy rdonly_buf test" + +if ! _have_program fio; then + echo "SKIP: fio not available" + exit "$UBLK_SKIP_CODE" +fi + +if ! grep -q hugetlbfs /proc/filesystems; then + echo "SKIP: hugetlbfs not supported" + exit "$UBLK_SKIP_CODE" +fi + +# Allocate hugepages +OLD_NR_HP=$(cat /proc/sys/vm/nr_hugepages) +echo 10 > /proc/sys/vm/nr_hugepages +NR_HP=$(cat /proc/sys/vm/nr_hugepages) +if [ "$NR_HP" -lt 2 ]; then + echo "SKIP: cannot allocate hugepages" + echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + exit "$UBLK_SKIP_CODE" +fi + +# Mount hugetlbfs +HTLB_MNT=$(mktemp -d "${UBLK_TEST_DIR}/htlb_mnt_XXXXXX") +if ! mount -t hugetlbfs none "$HTLB_MNT"; then + echo "SKIP: cannot mount hugetlbfs" + rmdir "$HTLB_MNT" + echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + exit "$UBLK_SKIP_CODE" +fi + +HTLB_FILE="$HTLB_MNT/ublk_buf" +fallocate -l 4M "$HTLB_FILE" + +dev_id=$(_add_ublk_dev -t null --shmem_zc --htlb "$HTLB_FILE" --rdonly_shmem_buf) +_check_add_dev $TID $? + +fio --name=htlb_zc_rdonly \ + --filename=/dev/ublkb"${dev_id}" \ + --ioengine=io_uring \ + --rw=randwrite \ + --direct=1 \ + --bs=4k \ + --size=4M \ + --iodepth=32 \ + --mem=mmaphuge:"$HTLB_FILE" \ + > /dev/null 2>&1 +ERR_CODE=$?
+ +# Delete device first so daemon releases the htlb mmap +_ublk_del_dev "${dev_id}" + +rm -f "$HTLB_FILE" +umount "$HTLB_MNT" +rmdir "$HTLB_MNT" +echo "$OLD_NR_HP" > /proc/sys/vm/nr_hugepages + +_cleanup_test "shmem_zc" + +_show_result $TID $ERR_CODE From 452c8f6cbd0ef1408474a875c5c4149a02c7610f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Apr 2026 16:05:24 +0200 Subject: [PATCH 132/146] xfs: fix number of GC bvecs GC scratch allocations can wrap around and use the same buffer twice, and the current code fails to account for that. So far this worked due to rounding in the block layer, but changes to the bio allocator drop the over-provisioning and generic/256 or generic/361 will now usually fail when running against the current block tree. Simplify the allocation to always pass the maximum value that is easier to verify, as a saving of up to one bvec per allocation isn't worth the effort to verify a complicated calculated value. Fixes: 102f444b57b3 ("xfs: rework zone GC buffer management") Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hans Holmberg Link: https://patch.msgid.link/20260407140538.633364-2-hch@lst.de Signed-off-by: Jens Axboe --- fs/xfs/xfs_zone_gc.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index 7efeecd2d85f..f279dcca53cc 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -671,7 +671,6 @@ xfs_zone_gc_start_chunk( struct xfs_inode *ip; struct bio *bio; xfs_daddr_t daddr; - unsigned int len; bool is_seq; if (xfs_is_shutdown(mp)) @@ -686,15 +685,16 @@ xfs_zone_gc_start_chunk( return false; } - len = XFS_FSB_TO_B(mp, irec.rm_blockcount); - bio = bio_alloc_bioset(bdev, - min(howmany(len, XFS_GC_BUF_SIZE) + 1, XFS_GC_NR_BUFS), - REQ_OP_READ, GFP_NOFS, &data->bio_set); - + /* + * Scratch allocation can wrap around to the same buffer again, + * provision an extra bvec for that case. 
+ */ + bio = bio_alloc_bioset(bdev, XFS_GC_NR_BUFS + 1, REQ_OP_READ, GFP_NOFS, + &data->bio_set); chunk = container_of(bio, struct xfs_gc_bio, bio); chunk->ip = ip; chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset); - chunk->len = len; + chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount); chunk->old_startblock = xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock); chunk->new_daddr = daddr; @@ -708,8 +708,9 @@ xfs_zone_gc_start_chunk( bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock); bio->bi_end_io = xfs_zone_gc_end_io; xfs_zone_gc_add_data(chunk); - data->scratch_head = (data->scratch_head + len) % data->scratch_size; - data->scratch_available -= len; + data->scratch_head = + (data->scratch_head + chunk->len) % data->scratch_size; + data->scratch_available -= chunk->len; XFS_STATS_INC(mp, xs_gc_read_calls); From 65565ca5f99b42fe62b9a10117cca04f4311dc66 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Apr 2026 16:05:25 +0200 Subject: [PATCH 133/146] block: unify the synchronous bi_end_io callbacks Put the bio in bio_await_chain after waiting for the completion, and share the now identical callbacks between submit_bio_wait and bio_await_chain. 
Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Link: https://patch.msgid.link/20260407140538.633364-3-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/block/bio.c b/block/bio.c index c8234d347fc5..434e41182c05 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1462,7 +1462,7 @@ void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty) bio_iov_iter_unbounce_read(bio, is_error, mark_dirty); } -static void submit_bio_wait_endio(struct bio *bio) +static void bio_wait_end_io(struct bio *bio) { complete(bio->bi_private); } @@ -1484,7 +1484,7 @@ int submit_bio_wait(struct bio *bio) bio->bi_bdev->bd_disk->lockdep_map); bio->bi_private = &done; - bio->bi_end_io = submit_bio_wait_endio; + bio->bi_end_io = bio_wait_end_io; bio->bi_opf |= REQ_SYNC; submit_bio(bio); blk_wait_io(&done); @@ -1523,12 +1523,6 @@ int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, } EXPORT_SYMBOL_GPL(bdev_rw_virt); -static void bio_wait_end_io(struct bio *bio) -{ - complete(bio->bi_private); - bio_put(bio); -} - /* * bio_await_chain - ends @bio and waits for every chained bio to complete */ @@ -1541,6 +1535,7 @@ void bio_await_chain(struct bio *bio) bio->bi_end_io = bio_wait_end_io; bio_endio(bio); blk_wait_io(&done); + bio_put(bio); } void __bio_advance(struct bio *bio, unsigned bytes) From 6fa747550e35f0a74e649b19d97055988a25b2e4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Apr 2026 16:05:26 +0200 Subject: [PATCH 134/146] block: factor out a bio_await helper Add a new helper to wait for a bio and anything chained off it to complete synchronously after submitting it. This factors common code out of submit_bio_wait and bio_await_chain and will also be useful for file system code and thus is exported. Note that this will now set REQ_SYNC also for the bio_await case for consistency. 
Nothing should look at the flag in the end_io handler, but if something does having the flag set makes more sense. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Link: https://patch.msgid.link/20260407140538.633364-4-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 53 +++++++++++++++++++++++++++++++-------------- include/linux/bio.h | 2 ++ 2 files changed, 39 insertions(+), 16 deletions(-) diff --git a/block/bio.c b/block/bio.c index 434e41182c05..61d65c544bcc 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1467,6 +1467,36 @@ static void bio_wait_end_io(struct bio *bio) complete(bio->bi_private); } +/** + * bio_await - call a function on a bio, and wait until it completes + * @bio: the bio which describes the I/O + * @submit: function called to submit the bio + * @priv: private data passed to @submit + * + * Wait for the bio as well as any bio chained off it after executing the + * passed in callback @submit. The wait for the bio is set up before calling + * @submit to ensure that the completion is captured. If @submit is %NULL, + * submit_bio() is used instead to submit the bio. + * + * Note: this overrides the bi_private and bi_end_io fields in the bio. 
+ */ +void bio_await(struct bio *bio, void *priv, + void (*submit)(struct bio *bio, void *priv)) +{ + DECLARE_COMPLETION_ONSTACK_MAP(done, + bio->bi_bdev->bd_disk->lockdep_map); + + bio->bi_private = &done; + bio->bi_end_io = bio_wait_end_io; + bio->bi_opf |= REQ_SYNC; + if (submit) + submit(bio, priv); + else + submit_bio(bio); + blk_wait_io(&done); +} +EXPORT_SYMBOL_GPL(bio_await); + /** * submit_bio_wait - submit a bio, and wait until it completes * @bio: The &struct bio which describes the I/O @@ -1480,19 +1510,16 @@ static void bio_wait_end_io(struct bio *bio) */ int submit_bio_wait(struct bio *bio) { - DECLARE_COMPLETION_ONSTACK_MAP(done, - bio->bi_bdev->bd_disk->lockdep_map); - - bio->bi_private = &done; - bio->bi_end_io = bio_wait_end_io; - bio->bi_opf |= REQ_SYNC; - submit_bio(bio); - blk_wait_io(&done); - + bio_await(bio, NULL, NULL); return blk_status_to_errno(bio->bi_status); } EXPORT_SYMBOL(submit_bio_wait); +static void bio_endio_cb(struct bio *bio, void *priv) +{ + bio_endio(bio); +} + /** * bdev_rw_virt - synchronously read into / write from kernel mapping * @bdev: block device to access @@ -1528,13 +1555,7 @@ EXPORT_SYMBOL_GPL(bdev_rw_virt); */ void bio_await_chain(struct bio *bio) { - DECLARE_COMPLETION_ONSTACK_MAP(done, - bio->bi_bdev->bd_disk->lockdep_map); - - bio->bi_private = &done; - bio->bi_end_io = bio_wait_end_io; - bio_endio(bio); - blk_wait_io(&done); + bio_await(bio, NULL, bio_endio_cb); bio_put(bio); } diff --git a/include/linux/bio.h b/include/linux/bio.h index 984844d2870b..97d747320b35 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -432,6 +432,8 @@ extern void bio_uninit(struct bio *); void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf); void bio_reuse(struct bio *bio, blk_opf_t opf); void bio_chain(struct bio *, struct bio *); +void bio_await(struct bio *bio, void *priv, + void (*submit)(struct bio *bio, void *priv)); int __must_check bio_add_page(struct bio *bio, struct page *page, unsigned 
len, unsigned off); From 92c3737a2473ff5b83f90f5c1b353a27492a10f2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Apr 2026 16:05:27 +0200 Subject: [PATCH 135/146] block: add a bio_submit_or_kill helper Factor the common logic for the ioctl helpers to either submit a bio or end if the process is being killed. As this is now the only user of bio_await_chain, open code that. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Link: https://patch.msgid.link/20260407140538.633364-5-hch@lst.de Signed-off-by: Jens Axboe --- block/bio.c | 23 ++++++++++++++--------- block/blk-lib.c | 16 ++-------------- block/blk.h | 2 +- block/ioctl.c | 11 ++--------- 4 files changed, 19 insertions(+), 33 deletions(-) diff --git a/block/bio.c b/block/bio.c index 61d65c544bcc..641ef0928d73 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1520,6 +1520,20 @@ static void bio_endio_cb(struct bio *bio, void *priv) bio_endio(bio); } +/* + * Submit @bio synchronously, or call bio_endio on it if the current process + * is being killed. 
+ */ +int bio_submit_or_kill(struct bio *bio, unsigned int flags) +{ + if ((flags & BLKDEV_ZERO_KILLABLE) && fatal_signal_pending(current)) { + bio_await(bio, NULL, bio_endio_cb); + return -EINTR; + } + + return submit_bio_wait(bio); +} + /** * bdev_rw_virt - synchronously read into / write from kernel mapping * @bdev: block device to access @@ -1550,15 +1564,6 @@ int bdev_rw_virt(struct block_device *bdev, sector_t sector, void *data, } EXPORT_SYMBOL_GPL(bdev_rw_virt); -/* - * bio_await_chain - ends @bio and waits for every chained bio to complete - */ -void bio_await_chain(struct bio *bio) -{ - bio_await(bio, NULL, bio_endio_cb); - bio_put(bio); -} - void __bio_advance(struct bio *bio, unsigned bytes) { if (bio_integrity(bio)) diff --git a/block/blk-lib.c b/block/blk-lib.c index 3213afc7f0d5..688bc67cbf73 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -155,13 +155,7 @@ static int blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector, __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp, &bio, flags, limit); if (bio) { - if ((flags & BLKDEV_ZERO_KILLABLE) && - fatal_signal_pending(current)) { - bio_await_chain(bio); - blk_finish_plug(&plug); - return -EINTR; - } - ret = submit_bio_wait(bio); + ret = bio_submit_or_kill(bio, flags); bio_put(bio); } blk_finish_plug(&plug); @@ -236,13 +230,7 @@ static int blkdev_issue_zero_pages(struct block_device *bdev, sector_t sector, blk_start_plug(&plug); __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp, &bio, flags); if (bio) { - if ((flags & BLKDEV_ZERO_KILLABLE) && - fatal_signal_pending(current)) { - bio_await_chain(bio); - blk_finish_plug(&plug); - return -EINTR; - } - ret = submit_bio_wait(bio); + ret = bio_submit_or_kill(bio, flags); bio_put(bio); } blk_finish_plug(&plug); diff --git a/block/blk.h b/block/blk.h index 103cb1d0b9cb..ec4674cdf2ea 100644 --- a/block/blk.h +++ b/block/blk.h @@ -55,7 +55,7 @@ bool __blk_freeze_queue_start(struct request_queue *q, struct task_struct *owner); int 
__bio_queue_enter(struct request_queue *q, struct bio *bio); void submit_bio_noacct_nocheck(struct bio *bio, bool split); -void bio_await_chain(struct bio *bio); +int bio_submit_or_kill(struct bio *bio, unsigned int flags); static inline bool blk_try_enter_queue(struct request_queue *q, bool pm) { diff --git a/block/ioctl.c b/block/ioctl.c index 0b04661ac809..fc3be0549aa7 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -153,13 +153,7 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, nr_sects = len >> SECTOR_SHIFT; blk_start_plug(&plug); - while (1) { - if (fatal_signal_pending(current)) { - if (prev) - bio_await_chain(prev); - err = -EINTR; - goto out_unplug; - } + while (!fatal_signal_pending(current)) { bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects, GFP_KERNEL); if (!bio) @@ -167,12 +161,11 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, prev = bio_chain_and_submit(prev, bio); } if (prev) { - err = submit_bio_wait(prev); + err = bio_submit_or_kill(prev, BLKDEV_ZERO_KILLABLE); if (err == -EOPNOTSUPP) err = 0; bio_put(prev); } -out_unplug: blk_finish_plug(&plug); fail: filemap_invalidate_unlock(bdev->bd_mapping); From 2d148a214b24b4a2525f649cced0c3e9e57281cd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Apr 2026 16:05:28 +0200 Subject: [PATCH 136/146] xfs: use bio_await in xfs_zone_gc_reset_sync Replace the open-coded bio wait logic with the new bio_await helper.
Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Link: https://patch.msgid.link/20260407140538.633364-6-hch@lst.de Signed-off-by: Jens Axboe --- fs/xfs/xfs_zone_gc.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index f279dcca53cc..441b99727bbc 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -902,9 +902,10 @@ out: static void xfs_submit_zone_reset_bio( - struct xfs_rtgroup *rtg, - struct bio *bio) + struct bio *bio, + void *priv) { + struct xfs_rtgroup *rtg = priv; struct xfs_mount *mp = rtg_mount(rtg); trace_xfs_zone_reset(rtg); @@ -936,26 +937,16 @@ xfs_submit_zone_reset_bio( submit_bio(bio); } -static void xfs_bio_wait_endio(struct bio *bio) -{ - complete(bio->bi_private); -} - int xfs_zone_gc_reset_sync( struct xfs_rtgroup *rtg) { - DECLARE_COMPLETION_ONSTACK(done); struct bio bio; int error; bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0, REQ_OP_ZONE_RESET | REQ_SYNC); - bio.bi_private = &done; - bio.bi_end_io = xfs_bio_wait_endio; - xfs_submit_zone_reset_bio(rtg, &bio); - wait_for_completion_io(&done); - + bio_await(&bio, rtg, xfs_submit_zone_reset_bio); error = blk_status_to_errno(bio.bi_status); bio_uninit(&bio); return error; @@ -992,7 +983,7 @@ xfs_zone_gc_reset_zones( chunk->data = data; WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); list_add_tail(&chunk->entry, &data->resetting); - xfs_submit_zone_reset_bio(rtg, bio); + xfs_submit_zone_reset_bio(bio, rtg); } while (next); } From 23b3b6f0b584b70a427d5bb826d320151890d7da Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 9 Apr 2026 21:30:13 +0800 Subject: [PATCH 137/146] ublk: widen ublk_shmem_buf_reg.len to __u64 for 4GB buffer support The __u32 len field cannot represent a 4GB buffer (0x100000000 overflows to 0). Change it to __u64 so buffers up to 4GB can be registered. Add a reserved field for alignment and validate it is zero. 
The kernel enforces a default max of 4GB (UBLK_SHMEM_BUF_SIZE_MAX) which may be increased in future. Signed-off-by: Ming Lei Link: https://patch.msgid.link/20260409133020.3780098-2-tom.leiming@gmail.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 9 ++++++++- include/uapi/linux/ublk_cmd.h | 3 ++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 1af42850f5b1..3f8bb80b1e8f 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -63,6 +63,9 @@ #define UBLK_CMD_REG_BUF _IOC_NR(UBLK_U_CMD_REG_BUF) #define UBLK_CMD_UNREG_BUF _IOC_NR(UBLK_U_CMD_UNREG_BUF) +/* Default max shmem buffer size: 4GB (may be increased in future) */ +#define UBLK_SHMEM_BUF_SIZE_MAX (1ULL << 32) + #define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF) #define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF) @@ -5351,11 +5354,15 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub, if (buf_reg.flags & ~UBLK_SHMEM_BUF_READ_ONLY) return -EINVAL; + if (buf_reg.reserved) + return -EINVAL; + addr = buf_reg.addr; size = buf_reg.len; nr_pages = size >> PAGE_SHIFT; - if (!size || !PAGE_ALIGNED(size) || !PAGE_ALIGNED(addr)) + if (!size || size > UBLK_SHMEM_BUF_SIZE_MAX || + !PAGE_ALIGNED(size) || !PAGE_ALIGNED(addr)) return -EINVAL; disk = ublk_get_disk(ub); diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index a7078b798791..6991370a72ce 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -89,8 +89,9 @@ /* Parameter buffer for UBLK_U_CMD_REG_BUF, pointed to by ctrl_cmd.addr */ struct ublk_shmem_buf_reg { __u64 addr; /* userspace virtual address of shared memory */ - __u32 len; /* buffer size in bytes (page-aligned, max 4GB) */ + __u64 len; /* buffer size in bytes, page-aligned, default max 4GB */ __u32 flags; + __u32 reserved; }; /* Pin pages without FOLL_WRITE; usable with write-sealed memfd */ From 
211ff1602b67e26125977f8b2f369d7c2847628c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 9 Apr 2026 21:30:14 +0800 Subject: [PATCH 138/146] ublk: verify all pages in multi-page bvec fall within registered range rq_for_each_bvec() yields multi-page bvecs where bv_page is only the first page. ublk_try_buf_match() only validated the start PFN against the maple tree, but a bvec can span multiple pages past the end of a registered range. Use mas_walk() instead of mtree_load() to obtain the range boundaries stored in the maple tree, and check that the bvec's end PFN does not exceed the range. Also remove base_pfn from struct ublk_buf_range since mas.index already provides the range start PFN. Reported-by: Caleb Sander Mateos Signed-off-by: Ming Lei Link: https://patch.msgid.link/20260409133020.3780098-3-tom.leiming@gmail.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 3f8bb80b1e8f..8fef6dfee271 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -304,7 +304,6 @@ struct ublk_buf { /* Maple tree value: maps a PFN range to buffer location */ struct ublk_buf_range { - unsigned long base_pfn; unsigned short buf_index; unsigned short flags; unsigned int base_offset; /* byte offset within buffer */ @@ -5306,7 +5305,6 @@ static int __ublk_ctrl_reg_buf(struct ublk_device *ub, } range->buf_index = index; range->flags = flags; - range->base_pfn = pfn; range->base_offset = start << PAGE_SHIFT; ret = mtree_insert_range(&ub->buf_tree, pfn, @@ -5451,8 +5449,8 @@ static void __ublk_ctrl_unreg_buf(struct ublk_device *ub, if (range->buf_index != buf_index) continue; - base = range->base_pfn; - nr = mas.last - mas.index + 1; + base = mas.index; + nr = mas.last - base + 1; mas_erase(&mas); for (off = 0; off < nr; ) { @@ -5531,15 +5529,22 @@ static bool ublk_try_buf_match(struct ublk_device *ub, rq_for_each_bvec(bv, 
rq, iter) { unsigned long pfn = page_to_pfn(bv.bv_page); + unsigned long end_pfn = pfn + + ((bv.bv_offset + bv.bv_len - 1) >> PAGE_SHIFT); struct ublk_buf_range *range; unsigned long off; + MA_STATE(mas, &ub->buf_tree, pfn, pfn); - range = mtree_load(&ub->buf_tree, pfn); + range = mas_walk(&mas); if (!range) return false; + /* verify all pages in this bvec fall within the range */ + if (end_pfn > mas.last) + return false; + off = range->base_offset + - (pfn - range->base_pfn) * PAGE_SIZE + bv.bv_offset; + (pfn - mas.index) * PAGE_SIZE + bv.bv_offset; if (first) { /* Read-only buffer can't serve READ (kernel writes) */ From 8ea8566a9aeef746699d8c84bed3ac44edbfaa0e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 9 Apr 2026 21:30:15 +0800 Subject: [PATCH 139/146] ublk: simplify PFN range loop in __ublk_ctrl_reg_buf Use the for-loop increment instead of a manual `i++` past the last page, and fix the mtree_insert_range end key accordingly. Suggested-by: Caleb Sander Mateos Signed-off-by: Ming Lei Link: https://patch.msgid.link/20260409133020.3780098-4-tom.leiming@gmail.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 8fef6dfee271..1257acc4522a 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -5287,7 +5287,7 @@ static int __ublk_ctrl_reg_buf(struct ublk_device *ub, unsigned long i; int ret; - for (i = 0; i < nr_pages; ) { + for (i = 0; i < nr_pages; i++) { unsigned long pfn = page_to_pfn(pages[i]); unsigned long start = i; struct ublk_buf_range *range; @@ -5296,7 +5296,6 @@ static int __ublk_ctrl_reg_buf(struct ublk_device *ub, while (i + 1 < nr_pages && page_to_pfn(pages[i + 1]) == pfn + (i - start) + 1) i++; - i++; /* past the last page in this run */ range = kzalloc(sizeof(*range), GFP_KERNEL); if (!range) { @@ -5308,7 +5307,7 @@ static int __ublk_ctrl_reg_buf(struct ublk_device *ub, range->base_offset = 
start << PAGE_SHIFT; ret = mtree_insert_range(&ub->buf_tree, pfn, - pfn + (i - start) - 1, + pfn + (i - start), range, GFP_KERNEL); if (ret) { kfree(range); From 5e864438e2853ef5112d7905fadcc3877e2be70a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 9 Apr 2026 21:30:16 +0800 Subject: [PATCH 140/146] ublk: replace xarray with IDA for shmem buffer index allocation Remove struct ublk_buf which only contained nr_pages that was never read after registration. Use IDA for pure index allocation instead of xarray. Make __ublk_ctrl_unreg_buf() return int so the caller can detect invalid index without a separate lookup. Simplify ublk_buf_cleanup() to walk the maple tree directly and unpin all pages in one pass, instead of iterating the xarray by buffer index. Suggested-by: Caleb Sander Mateos Signed-off-by: Ming Lei Link: https://patch.msgid.link/20260409133020.3780098-5-tom.leiming@gmail.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 92 ++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 1257acc4522a..ec1c539326c9 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -297,11 +297,6 @@ struct ublk_queue { struct ublk_io ios[] __counted_by(q_depth); }; -/* Per-registered shared memory buffer */ -struct ublk_buf { - unsigned int nr_pages; -}; - /* Maple tree value: maps a PFN range to buffer location */ struct ublk_buf_range { unsigned short buf_index; @@ -345,7 +340,7 @@ struct ublk_device { /* shared memory zero copy */ struct maple_tree buf_tree; - struct xarray bufs_xa; + struct ida buf_ida; struct ublk_queue *queues[]; }; @@ -4698,7 +4693,7 @@ static int ublk_ctrl_add_dev(const struct ublksrv_ctrl_cmd *header) spin_lock_init(&ub->lock); mutex_init(&ub->cancel_mutex); mt_init(&ub->buf_tree); - xa_init_flags(&ub->bufs_xa, XA_FLAGS_ALLOC); + ida_init(&ub->buf_ida); INIT_WORK(&ub->partition_scan_work, ublk_partition_scan_work); ret = 
ublk_alloc_dev_number(ub, header->dev_id); @@ -5279,11 +5274,9 @@ static void ublk_buf_erase_ranges(struct ublk_device *ub, int buf_index) } static int __ublk_ctrl_reg_buf(struct ublk_device *ub, - struct ublk_buf *ubuf, - struct page **pages, int index, - unsigned short flags) + struct page **pages, unsigned long nr_pages, + int index, unsigned short flags) { - unsigned long nr_pages = ubuf->nr_pages; unsigned long i; int ret; @@ -5335,9 +5328,8 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub, struct page **pages = NULL; unsigned int gup_flags; struct gendisk *disk; - struct ublk_buf *ubuf; long pinned; - u32 index; + int index; int ret; if (!ublk_dev_support_shmem_zc(ub)) @@ -5367,16 +5359,10 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub, return -ENODEV; /* Pin pages before quiescing (may sleep) */ - ubuf = kzalloc(sizeof(*ubuf), GFP_KERNEL); - if (!ubuf) { - ret = -ENOMEM; - goto put_disk; - } - pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL); if (!pages) { ret = -ENOMEM; - goto err_free; + goto put_disk; } gup_flags = FOLL_LONGTERM; @@ -5392,7 +5378,6 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub, ret = -EFAULT; goto err_unpin; } - ubuf->nr_pages = nr_pages; /* * Drain inflight I/O and quiesce the queue so no new requests @@ -5403,13 +5388,15 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub, mutex_lock(&ub->mutex); - ret = xa_alloc(&ub->bufs_xa, &index, ubuf, xa_limit_16b, GFP_KERNEL); - if (ret) + index = ida_alloc_max(&ub->buf_ida, USHRT_MAX, GFP_KERNEL); + if (index < 0) { + ret = index; goto err_unlock; + } - ret = __ublk_ctrl_reg_buf(ub, ubuf, pages, index, buf_reg.flags); + ret = __ublk_ctrl_reg_buf(ub, pages, nr_pages, index, buf_reg.flags); if (ret) { - xa_erase(&ub->bufs_xa, index); + ida_free(&ub->buf_ida, index); goto err_unlock; } @@ -5427,19 +5414,17 @@ err_unpin: unpin_user_pages(pages, pinned); err_free_pages: kvfree(pages); -err_free: - kfree(ubuf); put_disk: ublk_put_disk(disk); return ret; } -static 
void __ublk_ctrl_unreg_buf(struct ublk_device *ub, - struct ublk_buf *ubuf, int buf_index) +static int __ublk_ctrl_unreg_buf(struct ublk_device *ub, int buf_index) { MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX); struct ublk_buf_range *range; struct page *pages[32]; + int ret = -ENOENT; mas_lock(&mas); mas_for_each(&mas, range, ULONG_MAX) { @@ -5448,6 +5433,7 @@ static void __ublk_ctrl_unreg_buf(struct ublk_device *ub, if (range->buf_index != buf_index) continue; + ret = 0; base = mas.index; nr = mas.last - base + 1; mas_erase(&mas); @@ -5465,7 +5451,8 @@ static void __ublk_ctrl_unreg_buf(struct ublk_device *ub, kfree(range); } mas_unlock(&mas); - kfree(ubuf); + + return ret; } static int ublk_ctrl_unreg_buf(struct ublk_device *ub, @@ -5473,11 +5460,14 @@ static int ublk_ctrl_unreg_buf(struct ublk_device *ub, { int index = (int)header->data[0]; struct gendisk *disk; - struct ublk_buf *ubuf; + int ret; if (!ublk_dev_support_shmem_zc(ub)) return -EOPNOTSUPP; + if (index < 0 || index > USHRT_MAX) + return -EINVAL; + disk = ublk_get_disk(ub); if (!disk) return -ENODEV; @@ -5487,32 +5477,42 @@ static int ublk_ctrl_unreg_buf(struct ublk_device *ub, mutex_lock(&ub->mutex); - ubuf = xa_erase(&ub->bufs_xa, index); - if (!ubuf) { - mutex_unlock(&ub->mutex); - ublk_unquiesce_and_resume(disk); - ublk_put_disk(disk); - return -ENOENT; - } - - __ublk_ctrl_unreg_buf(ub, ubuf, index); + ret = __ublk_ctrl_unreg_buf(ub, index); + if (!ret) + ida_free(&ub->buf_ida, index); mutex_unlock(&ub->mutex); ublk_unquiesce_and_resume(disk); ublk_put_disk(disk); - return 0; + return ret; } static void ublk_buf_cleanup(struct ublk_device *ub) { - struct ublk_buf *ubuf; - unsigned long index; + MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX); + struct ublk_buf_range *range; + struct page *pages[32]; - xa_for_each(&ub->bufs_xa, index, ubuf) - __ublk_ctrl_unreg_buf(ub, ubuf, index); - xa_destroy(&ub->bufs_xa); + mas_for_each(&mas, range, ULONG_MAX) { + unsigned long base = mas.index; + unsigned long nr = 
mas.last - base + 1; + unsigned long off; + + for (off = 0; off < nr; ) { + unsigned int batch = min_t(unsigned long, + nr - off, 32); + unsigned int j; + + for (j = 0; j < batch; j++) + pages[j] = pfn_to_page(base + off + j); + unpin_user_pages(pages, batch); + off += batch; + } + kfree(range); + } mtree_destroy(&ub->buf_tree); + ida_destroy(&ub->buf_ida); } /* Check if request pages match a registered shared memory buffer */ From 365ea7cc62447caac508706b429cdf031cc15a9f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 9 Apr 2026 21:30:17 +0800 Subject: [PATCH 141/146] ublk: allow buffer registration before device is started Before START_DEV, there is no disk, no queue, no I/O dispatch, so the maple tree can be safely modified under ub->mutex alone without freezing the queue. Add ublk_lock_buf_tree()/ublk_unlock_buf_tree() helpers that take ub->mutex first, then freeze the queue if device is started. This ordering (mutex -> freeze) is safe because ublk_stop_dev_unlocked() already holds ub->mutex when calling del_gendisk() which freezes the queue. Suggested-by: Caleb Sander Mateos Signed-off-by: Ming Lei Link: https://patch.msgid.link/20260409133020.3780098-6-tom.leiming@gmail.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 79 ++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 51 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index ec1c539326c9..247c1ce8ce8a 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -5238,23 +5238,29 @@ exit: } /* - * Drain inflight I/O and quiesce the queue. Freeze drains all inflight - * requests, quiesce_nowait marks the queue so no new requests dispatch, - * then unfreeze allows new submissions (which won't dispatch due to - * quiesce). This keeps freeze and ub->mutex non-nested. - */ -static void ublk_quiesce_and_release(struct gendisk *disk) + * Lock for maple tree modification: acquire ub->mutex, then freeze queue + * if device is started. 
If device is not yet started, only mutex is + * needed since no I/O path can access the tree. + * + * This ordering (mutex -> freeze) is safe because ublk_stop_dev_unlocked() + * already holds ub->mutex when calling del_gendisk() which freezes the queue. +*/ +static unsigned int ublk_lock_buf_tree(struct ublk_device *ub) { - unsigned int memflags; + unsigned int memflags = 0; - memflags = blk_mq_freeze_queue(disk->queue); - blk_mq_quiesce_queue_nowait(disk->queue); - blk_mq_unfreeze_queue(disk->queue, memflags); + mutex_lock(&ub->mutex); + if (ub->ub_disk) + memflags = blk_mq_freeze_queue(ub->ub_disk->queue); + + return memflags; } -static void ublk_unquiesce_and_resume(struct gendisk *disk) +static void ublk_unlock_buf_tree(struct ublk_device *ub, unsigned int memflags) { - blk_mq_unquiesce_queue(disk->queue); + if (ub->ub_disk) + blk_mq_unfreeze_queue(ub->ub_disk->queue, memflags); + mutex_unlock(&ub->mutex); } /* Erase coalesced PFN ranges from the maple tree matching buf_index */ @@ -5327,7 +5333,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub, unsigned long addr, size, nr_pages; struct page **pages = NULL; unsigned int gup_flags; - struct gendisk *disk; + unsigned int memflags; long pinned; int index; int ret; @@ -5354,16 +5360,10 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub, !PAGE_ALIGNED(size) || !PAGE_ALIGNED(addr)) return -EINVAL; - disk = ublk_get_disk(ub); - if (!disk) - return -ENODEV; - - /* Pin pages before quiescing (may sleep) */ + /* Pin pages before any locks (may sleep) */ pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL); - if (!pages) { - ret = -ENOMEM; - goto put_disk; - } + if (!pages) + return -ENOMEM; gup_flags = FOLL_LONGTERM; if (!(buf_reg.flags & UBLK_SHMEM_BUF_READ_ONLY)) @@ -5379,14 +5379,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub, goto err_unpin; } - /* - * Drain inflight I/O and quiesce the queue so no new requests - * are dispatched while we modify the maple tree. 
Keep freeze - * and mutex non-nested to avoid lock dependency. - */ - ublk_quiesce_and_release(disk); - - mutex_lock(&ub->mutex); + memflags = ublk_lock_buf_tree(ub); index = ida_alloc_max(&ub->buf_ida, USHRT_MAX, GFP_KERNEL); if (index < 0) { @@ -5400,22 +5393,16 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub, goto err_unlock; } - mutex_unlock(&ub->mutex); - + ublk_unlock_buf_tree(ub, memflags); kvfree(pages); - ublk_unquiesce_and_resume(disk); - ublk_put_disk(disk); return index; err_unlock: - mutex_unlock(&ub->mutex); - ublk_unquiesce_and_resume(disk); + ublk_unlock_buf_tree(ub, memflags); err_unpin: unpin_user_pages(pages, pinned); err_free_pages: kvfree(pages); -put_disk: - ublk_put_disk(disk); return ret; } @@ -5459,7 +5446,7 @@ static int ublk_ctrl_unreg_buf(struct ublk_device *ub, struct ublksrv_ctrl_cmd *header) { int index = (int)header->data[0]; - struct gendisk *disk; + unsigned int memflags; int ret; if (!ublk_dev_support_shmem_zc(ub)) @@ -5468,23 +5455,13 @@ static int ublk_ctrl_unreg_buf(struct ublk_device *ub, if (index < 0 || index > USHRT_MAX) return -EINVAL; - disk = ublk_get_disk(ub); - if (!disk) - return -ENODEV; - - /* Drain inflight I/O before modifying the maple tree */ - ublk_quiesce_and_release(disk); - - mutex_lock(&ub->mutex); + memflags = ublk_lock_buf_tree(ub); ret = __ublk_ctrl_unreg_buf(ub, index); if (!ret) ida_free(&ub->buf_ida, index); - mutex_unlock(&ub->mutex); - - ublk_unquiesce_and_resume(disk); - ublk_put_disk(disk); + ublk_unlock_buf_tree(ub, memflags); return ret; } From 289653bb76c46149f88939c3cfef55cdb236ace2 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 9 Apr 2026 21:30:18 +0800 Subject: [PATCH 142/146] Documentation: ublk: address review comments for SHMEM_ZC docs - Use "physical pages" instead of "page frame numbers (PFNs)" for clarity - Remove "without any per-I/O overhead" claim from zero-copy description - Add scatter/gather limitation: each I/O's data must be contiguous within a single registered 
buffer Suggested-by: Caleb Sander Mateos Signed-off-by: Ming Lei Link: https://patch.msgid.link/20260409133020.3780098-7-tom.leiming@gmail.com Signed-off-by: Jens Axboe --- Documentation/block/ublk.rst | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Documentation/block/ublk.rst b/Documentation/block/ublk.rst index e80cc415a739..0413dcd9ef69 100644 --- a/Documentation/block/ublk.rst +++ b/Documentation/block/ublk.rst @@ -492,8 +492,8 @@ The ``UBLK_F_SHMEM_ZC`` feature provides an alternative zero-copy path that works by sharing physical memory pages between the client application and the ublk server. Unlike the io_uring fixed buffer approach above, shared memory zero copy does not require io_uring buffer registration -per I/O — instead, it relies on the kernel matching page frame numbers -(PFNs) at I/O time. This allows the ublk server to access the shared +per I/O — instead, it relies on the kernel matching physical pages +at I/O time. This allows the ublk server to access the shared buffer directly, which is unlikely for the io_uring fixed buffer approach. @@ -507,8 +507,7 @@ tells the server where the data already lives. ``UBLK_F_SHMEM_ZC`` can be thought of as a supplement for optimized client applications — when the client is willing to allocate I/O buffers from -shared memory, the entire data path becomes zero-copy without any per-I/O -overhead. +shared memory, the entire data path becomes zero-copy. Use Cases ~~~~~~~~~ @@ -584,6 +583,9 @@ Limitations the page cache, which allocates its own pages. These kernel-allocated pages will never match the registered shared buffer. Only ``O_DIRECT`` puts the client's buffer pages directly into the block I/O. +- **Contiguous data only**: each I/O request's data must be contiguous + within a single registered buffer. Scatter/gather I/O that spans + multiple non-adjacent registered buffers cannot use the zero-copy path. 
Control Commands ~~~~~~~~~~~~~~~~ From b774765fb804045ee774476ded8e52482ae5ecb7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 9 Apr 2026 21:30:19 +0800 Subject: [PATCH 143/146] MAINTAINERS: update ublk driver maintainer email Update the ublk userspace block driver maintainer email address from ming.lei@redhat.com to tom.leiming@gmail.com as the original email will become invalid. Signed-off-by: Ming Lei Link: https://patch.msgid.link/20260409133020.3780098-8-tom.leiming@gmail.com Signed-off-by: Jens Axboe --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 77fdfcb55f06..4abb3345bc4e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -26992,7 +26992,7 @@ F: Documentation/filesystems/ubifs.rst F: fs/ubifs/ UBLK USERSPACE BLOCK DRIVER -M: Ming Lei +M: Ming Lei L: linux-block@vger.kernel.org S: Maintained F: Documentation/block/ublk.rst From 539fb773a3f7c07cf7fd00617f33ed4e33058d72 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Mar 2026 10:00:32 +0100 Subject: [PATCH 144/146] block: refactor blkdev_zone_mgmt_ioctl Split the zone reset case into a separate helper so that the conditional locking goes away. 
Signed-off-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Bart Van Assche Reviewed-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Link: https://patch.msgid.link/20260327090032.3722065-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-zoned.c | 41 ++++++++++++++++++----------------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index bfd9733ebd31..30cad2bb9291 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -417,20 +417,32 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, return 0; } -static int blkdev_truncate_zone_range(struct block_device *bdev, - blk_mode_t mode, const struct blk_zone_range *zrange) +static int blkdev_reset_zone(struct block_device *bdev, blk_mode_t mode, + struct blk_zone_range *zrange) { loff_t start, end; + int ret = -EINVAL; + inode_lock(bdev->bd_mapping->host); + filemap_invalidate_lock(bdev->bd_mapping); if (zrange->sector + zrange->nr_sectors <= zrange->sector || zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk)) /* Out of range */ - return -EINVAL; + goto out_unlock; start = zrange->sector << SECTOR_SHIFT; end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1; - return truncate_bdev_range(bdev, mode, start, end); + ret = truncate_bdev_range(bdev, mode, start, end); + if (ret) + goto out_unlock; + + ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, zrange->sector, + zrange->nr_sectors); +out_unlock: + filemap_invalidate_unlock(bdev->bd_mapping); + inode_unlock(bdev->bd_mapping->host); + return ret; } /* @@ -443,7 +455,6 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, void __user *argp = (void __user *)arg; struct blk_zone_range zrange; enum req_op op; - int ret; if (!argp) return -EINVAL; @@ -459,15 +470,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, switch (cmd) { case BLKRESETZONE: - op = REQ_OP_ZONE_RESET; - - /* Invalidate the page 
cache, including dirty pages. */ - inode_lock(bdev->bd_mapping->host); - filemap_invalidate_lock(bdev->bd_mapping); - ret = blkdev_truncate_zone_range(bdev, mode, &zrange); - if (ret) - goto fail; - break; + return blkdev_reset_zone(bdev, mode, &zrange); case BLKOPENZONE: op = REQ_OP_ZONE_OPEN; break; @@ -481,15 +484,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, return -ENOTTY; } - ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors); - -fail: - if (cmd == BLKRESETZONE) { - filemap_invalidate_unlock(bdev->bd_mapping); - inode_unlock(bdev->bd_mapping->host); - } - - return ret; + return blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors); } static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone) From 0a42ca4d2bff6306dd574a7897258fd02c2e6930 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 10 Apr 2026 13:14:52 +0300 Subject: [PATCH 145/146] scsi: bsg: fix buffer overflow in scsi_bsg_uring_cmd() The bounds checking in scsi_bsg_uring_cmd() does not work because cmd->request_len is a u32 and scmd->cmd_len is a u16. We check that scmd->cmd_len is valid but if the cmd->request_len is more than USHRT_MAX it would still lead to a buffer overflow when we do the copy_from_user(). 
Fixes: 7b6d3255e7f8 ("scsi: bsg: add io_uring passthrough handler") Signed-off-by: Dan Carpenter Link: https://patch.msgid.link/adjNnMYK7A7KMNkA@stanley.mountain Signed-off-by: Jens Axboe --- drivers/scsi/scsi_bsg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/scsi_bsg.c b/drivers/scsi/scsi_bsg.c index c3ce497a3b94..e80dec53174e 100644 --- a/drivers/scsi/scsi_bsg.c +++ b/drivers/scsi/scsi_bsg.c @@ -137,11 +137,11 @@ static int scsi_bsg_uring_cmd(struct request_queue *q, struct io_uring_cmd *iouc return PTR_ERR(req); scmd = blk_mq_rq_to_pdu(req); - scmd->cmd_len = cmd->request_len; - if (scmd->cmd_len > sizeof(scmd->cmnd)) { + if (cmd->request_len > sizeof(scmd->cmnd)) { ret = -EINVAL; goto out_free_req; } + scmd->cmd_len = cmd->request_len; scmd->allowed = SG_DEFAULT_RETRIES; if (copy_from_user(scmd->cmnd, uptr64(cmd->request), cmd->request_len)) { From 36446de0c30c62b9d89502fd36c4904996d86ecd Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 10 Apr 2026 20:41:36 +0800 Subject: [PATCH 146/146] ublk: fix tautological comparison warning in ublk_ctrl_reg_buf On 32-bit architectures, 'unsigned long size' can never exceed UBLK_SHMEM_BUF_SIZE_MAX (1ULL << 32), causing a tautological comparison warning. Validate buf_reg.len (__u64) directly before using it, and consolidate all input validation into a single check. Also remove the unnecessary local variables 'addr' and 'size' since buf_reg.addr and buf_reg.len can be used directly. 
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202604101952.3NOzqnu9-lkp@intel.com/ Fixes: 23b3b6f0b584 ("ublk: widen ublk_shmem_buf_reg.len to __u64 for 4GB buffer support") Signed-off-by: Ming Lei Link: https://patch.msgid.link/20260410124136.3983429-1-tom.leiming@gmail.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 247c1ce8ce8a..49fb584e392b 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -5330,7 +5330,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub, { void __user *argp = (void __user *)(unsigned long)header->addr; struct ublk_shmem_buf_reg buf_reg; - unsigned long addr, size, nr_pages; + unsigned long nr_pages; struct page **pages = NULL; unsigned int gup_flags; unsigned int memflags; @@ -5352,14 +5352,12 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub, if (buf_reg.reserved) return -EINVAL; - addr = buf_reg.addr; - size = buf_reg.len; - nr_pages = size >> PAGE_SHIFT; - - if (!size || size > UBLK_SHMEM_BUF_SIZE_MAX || - !PAGE_ALIGNED(size) || !PAGE_ALIGNED(addr)) + if (!buf_reg.len || buf_reg.len > UBLK_SHMEM_BUF_SIZE_MAX || + !PAGE_ALIGNED(buf_reg.len) || !PAGE_ALIGNED(buf_reg.addr)) return -EINVAL; + nr_pages = buf_reg.len >> PAGE_SHIFT; + /* Pin pages before any locks (may sleep) */ pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL); if (!pages) @@ -5369,7 +5367,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub, if (!(buf_reg.flags & UBLK_SHMEM_BUF_READ_ONLY)) gup_flags |= FOLL_WRITE; - pinned = pin_user_pages_fast(addr, nr_pages, gup_flags, pages); + pinned = pin_user_pages_fast(buf_reg.addr, nr_pages, gup_flags, pages); if (pinned < 0) { ret = pinned; goto err_free_pages;