Merge tag 'v7.1-rc-part1-smbdirect-fixes' of git://git.samba.org/ksmbd

Pull smbdirect updates from Steve French:
 "Move smbdirect server and client code to common directory:

   - temporarily use smbdirect_all_c_files.c to allow micro steps (see the sketch below)

   - factor out common functions into a new smbdirect.ko

   - convert cifs.ko to use smbdirect.ko

   - convert ksmbd.ko to use smbdirect.ko

   - let smbdirect.ko use global workqueues

   - move ib_client logic from ksmbd.ko into smbdirect.ko

   - remove smbdirect_all_c_files.c hack again

   - some locking and teardown related fixes on top"
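
The smbdirect_all_c_files.c bullet refers to a stepping stone: one
translation unit temporarily #included the per-topic .c files so each
split could land as a small commit without Makefile churn, and the hack
was removed once the split was complete. A minimal sketch of the trick,
assuming it covered the sources now listed in the new smbdirect
Makefile (the guard macro comes from the "remove unused
SMBDIRECT_USE_INLINE_C_FILES logic" commit below):

/* smbdirect_all_c_files.c -- hypothetical sketch, deleted again later */
#ifdef SMBDIRECT_USE_INLINE_C_FILES
#include "smbdirect_socket.c"
#include "smbdirect_connection.c"
#include "smbdirect_mr.c"
/* ... and the remaining per-topic files ... */
#endif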

* tag 'v7.1-rc-part1-smbdirect-fixes' of git://git.samba.org/ksmbd: (145 commits)
  smb: smbdirect: let smbdirect_connection_deregister_mr_io unlock while waiting
  smb: smbdirect: fix the logic in smbdirect_socket_destroy_sync() without an error
  smb: smbdirect: fix copyright header of smbdirect.h
  smb: smbdirect: change smbdirect_socket_parameters.{initiator_depth,responder_resources} to __u16
  smb: smbdirect: remove unused SMBDIRECT_USE_INLINE_C_FILES logic
  smb: server: no longer use smbdirect_socket_set_custom_workqueue()
  smb: client: no longer use smbdirect_socket_set_custom_workqueue()
  smb: smbdirect: introduce global workqueues
  smb: smbdirect: prepare use of dedicated workqueues for different steps
  smb: smbdirect: remove unused smbdirect_connection_mr_io_recovery_work()
  smb: smbdirect: wrap rdma_disconnect() in rdma_[un]lock_handler()
  smb: server: make use of smbdirect_netdev_rdma_capable_mode_type()
  smb: smbdirect: introduce smbdirect_netdev_rdma_capable_mode_type()
  smb: server: make use of smbdirect.ko
  smb: server: remove unused ksmbd_transport_ops.prepare()
  smb: server: make use of smbdirect_socket_{listen,accept}()
  smb: server: only use public smbdirect functions
  smb: server: make use of smbdirect_socket_create_accepting()/smbdirect_socket_release()
  smb: server: make use of smbdirect_{socket_init_accepting,connection_wait_for_connected}()
  smb: server: make use of smbdirect_connection_send_iter() and related functions
  ...
Committed by Linus Torvalds, 2026-04-16 08:25:04 -07:00
30 changed files with 7133 additions and 5837 deletions


@@ -4,6 +4,7 @@
source "fs/smb/client/Kconfig"
source "fs/smb/server/Kconfig"
source "fs/smb/common/smbdirect/Kconfig"
config SMBFS
tristate


@@ -180,7 +180,9 @@ if CIFS
config CIFS_SMB_DIRECT
bool "SMB Direct support"
depends on CIFS=m && INFINIBAND && INFINIBAND_ADDR_TRANS || CIFS=y && INFINIBAND=y && INFINIBAND_ADDR_TRANS=y
depends on CIFS && INFINIBAND && INFINIBAND_ADDR_TRANS
depends on CIFS=m || INFINIBAND=y
select SMB_COMMON_SMBDIRECT
help
Enables SMB Direct support for SMB 3.0, 3.02 and 3.1.1.
SMB Direct allows transferring SMB packets over RDMA. If unsure,


@@ -23,7 +23,6 @@
#endif
#ifdef CONFIG_CIFS_SMB_DIRECT
#include "smbdirect.h"
#include "../common/smbdirect/smbdirect_pdu.h"
#endif
#include "cifs_swn.h"
#include "cached_dir.h"
@@ -452,11 +451,6 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
c = 0;
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
#ifdef CONFIG_CIFS_SMB_DIRECT
struct smbdirect_socket *sc;
struct smbdirect_socket_parameters *sp;
#endif
/* channel info will be printed as a part of sessions below */
if (SERVER_IS_CHAN(server))
continue;
@@ -471,66 +465,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
seq_printf(m, "\nClientGUID: %pUL", server->client_guid);
spin_unlock(&server->srv_lock);
#ifdef CONFIG_CIFS_SMB_DIRECT
if (!server->rdma)
goto skip_rdma;
if (!server->smbd_conn) {
seq_printf(m, "\nSMBDirect transport not available");
goto skip_rdma;
}
sc = &server->smbd_conn->socket;
sp = &sc->parameters;
seq_printf(m, "\nSMBDirect protocol version: 0x%x "
"transport status: %s (%u)",
SMBDIRECT_V1,
smbdirect_socket_status_string(sc->status),
sc->status);
seq_printf(m, "\nConn receive_credit_max: %u "
"send_credit_target: %u max_send_size: %u",
sp->recv_credit_max,
sp->send_credit_target,
sp->max_send_size);
seq_printf(m, "\nConn max_fragmented_recv_size: %u "
"max_fragmented_send_size: %u max_receive_size:%u",
sp->max_fragmented_recv_size,
sp->max_fragmented_send_size,
sp->max_recv_size);
seq_printf(m, "\nConn keep_alive_interval: %u "
"max_readwrite_size: %u rdma_readwrite_threshold: %u",
sp->keepalive_interval_msec * 1000,
sp->max_read_write_size,
server->rdma_readwrite_threshold);
seq_printf(m, "\nDebug count_get_receive_buffer: %llu "
"count_put_receive_buffer: %llu count_send_empty: %llu",
sc->statistics.get_receive_buffer,
sc->statistics.put_receive_buffer,
sc->statistics.send_empty);
seq_printf(m, "\nRead Queue "
"count_enqueue_reassembly_queue: %llu "
"count_dequeue_reassembly_queue: %llu "
"reassembly_data_length: %u "
"reassembly_queue_length: %u",
sc->statistics.enqueue_reassembly_queue,
sc->statistics.dequeue_reassembly_queue,
sc->recv_io.reassembly.data_length,
sc->recv_io.reassembly.queue_length);
seq_printf(m, "\nCurrent Credits send_credits: %u "
"receive_credits: %u receive_credit_target: %u",
atomic_read(&sc->send_io.credits.count),
atomic_read(&sc->recv_io.credits.count),
sc->recv_io.credits.target);
seq_printf(m, "\nPending send_pending: %u ",
atomic_read(&sc->send_io.pending.count));
seq_printf(m, "\nMR responder_resources: %u "
"max_frmr_depth: %u mr_type: 0x%x",
sp->responder_resources,
sp->max_frmr_depth,
sc->mr_io.type);
seq_printf(m, "\nMR mr_ready_count: %u mr_used_count: %u",
atomic_read(&sc->mr_io.ready.count),
atomic_read(&sc->mr_io.used.count));
skip_rdma:
smbd_debug_proc_show(server, m);
#endif
seq_printf(m, "\nNumber of credits: %d,%d,%d Dialect 0x%x",
server->credits,


@@ -36,7 +36,6 @@
#include "../common/smb2status.h"
#include "smb2glob.h"
#include "cifs_spnego.h"
#include "../common/smbdirect/smbdirect.h"
#include "smbdirect.h"
#include "trace.h"
#ifdef CONFIG_CIFS_DFS_UPCALL
@@ -4554,9 +4553,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
req->ReadChannelInfoLength =
cpu_to_le16(sizeof(struct smbdirect_buffer_descriptor_v1));
v1 = (struct smbdirect_buffer_descriptor_v1 *) &req->Buffer[0];
v1->offset = cpu_to_le64(rdata->mr->mr->iova);
v1->token = cpu_to_le32(rdata->mr->mr->rkey);
v1->length = cpu_to_le32(rdata->mr->mr->length);
smbd_mr_fill_buffer_descriptor(rdata->mr, v1);
*total_len += sizeof(*v1) - 1;
}
@@ -5155,9 +5152,7 @@ smb2_async_writev(struct cifs_io_subrequest *wdata)
req->WriteChannelInfoLength =
cpu_to_le16(sizeof(struct smbdirect_buffer_descriptor_v1));
v1 = (struct smbdirect_buffer_descriptor_v1 *) &req->Buffer[0];
v1->offset = cpu_to_le64(wdata->mr->mr->iova);
v1->token = cpu_to_le32(wdata->mr->mr->rkey);
v1->length = cpu_to_le32(wdata->mr->mr->length);
smbd_mr_fill_buffer_descriptor(wdata->mr, v1);
rqst.rq_iov[0].iov_len += sizeof(*v1);

File diff suppressed because it is too large


@@ -11,12 +11,8 @@
#define cifs_rdma_enabled(server) ((server)->rdma)
#include "cifsglob.h"
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <linux/mempool.h>
#include "../common/smbdirect/smbdirect.h"
#include "../common/smbdirect/smbdirect_socket.h"
extern int rdma_readwrite_threshold;
extern int smbd_max_frmr_depth;
@@ -27,17 +23,8 @@ extern int smbd_max_send_size;
extern int smbd_send_credit_target;
extern int smbd_receive_credit_max;
/*
* The context for the SMBDirect transport
* Everything related to the transport is here. It has several logical parts
* 1. RDMA related structures
* 2. SMBDirect connection parameters
* 3. Memory registrations
* 4. Receive and reassembly queues for data receive path
* 5. mempools for allocating packets
*/
struct smbd_connection {
struct smbdirect_socket socket;
struct smbdirect_socket *socket;
};
/* Create a SMBDirect session */
@@ -60,8 +47,12 @@ int smbd_send(struct TCP_Server_Info *server,
struct smbdirect_mr_io *smbd_register_mr(
struct smbd_connection *info, struct iov_iter *iter,
bool writing, bool need_invalidate);
void smbd_mr_fill_buffer_descriptor(struct smbdirect_mr_io *mr,
struct smbdirect_buffer_descriptor_v1 *v1);
void smbd_deregister_mr(struct smbdirect_mr_io *mr);
void smbd_debug_proc_show(struct TCP_Server_Info *server, struct seq_file *m);
#else
#define cifs_rdma_enabled(server) 0
struct smbd_connection {};


@@ -4,3 +4,4 @@
#
obj-$(CONFIG_SMBFS) += cifs_md4.o
obj-$(CONFIG_SMB_COMMON_SMBDIRECT) += smbdirect/


@@ -0,0 +1,9 @@
# SPDX-License-Identifier: GPL-2.0-or-later
#
# smbdirect configuration
config SMB_COMMON_SMBDIRECT
def_tristate n
depends on INFINIBAND && INFINIBAND_ADDR_TRANS
depends on m || INFINIBAND=y
select SG_POOL


@@ -0,0 +1,18 @@
# SPDX-License-Identifier: GPL-2.0-or-later
#
# Makefile for smbdirect support
#
obj-$(CONFIG_SMB_COMMON_SMBDIRECT) += smbdirect.o
smbdirect-y := \
smbdirect_socket.o \
smbdirect_connection.o \
smbdirect_mr.o \
smbdirect_rw.o \
smbdirect_debug.o \
smbdirect_connect.o \
smbdirect_listen.o \
smbdirect_accept.o \
smbdirect_devices.o \
smbdirect_main.o


@@ -1,7 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Copyright (C) 2017, Microsoft Corporation.
* Copyright (C) 2018, LG Electronics.
* Copyright (C) 2025 Stefan Metzmacher
*/
#ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_H__
@@ -25,12 +24,15 @@ struct smbdirect_buffer_descriptor_v1 {
* Some values are important for the upper layer.
*/
struct smbdirect_socket_parameters {
__u64 flags;
#define SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB ((__u64)0x1)
#define SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW ((__u64)0x2)
__u32 resolve_addr_timeout_msec;
__u32 resolve_route_timeout_msec;
__u32 rdma_connect_timeout_msec;
__u32 negotiate_timeout_msec;
__u8 initiator_depth;
__u8 responder_resources;
__u16 initiator_depth; /* limited to U8_MAX */
__u16 responder_resources; /* limited to U8_MAX */
__u16 recv_credit_max;
__u16 send_credit_target;
__u32 max_send_size;
@@ -43,4 +45,8 @@ struct smbdirect_socket_parameters {
__u32 keepalive_timeout_msec;
} __packed;
#define SMBDIRECT_FLAG_PORT_RANGE_MASK ( \
SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB | \
SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW)
#endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_H__ */
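
The widened fields are still capped at U8_MAX because struct
rdma_conn_param carries initiator_depth and responder_resources as u8.
A hedged sketch of the clamp a caller would apply before filling the
connection parameters (example_* is a hypothetical name, not from the
diff):

static void example_clamp_rd_atomic(struct smbdirect_socket_parameters *sp)
{
	/* the rdma_cm representation is u8, so never exceed U8_MAX */
	sp->initiator_depth = min_t(__u16, sp->initiator_depth, U8_MAX);
	sp->responder_resources = min_t(__u16, sp->responder_resources, U8_MAX);
}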


@@ -0,0 +1,857 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2017, Microsoft Corporation.
* Copyright (C) 2018, LG Electronics.
* Copyright (c) 2025, Stefan Metzmacher
*/
#include "smbdirect_internal.h"
#include <net/sock.h>
#include "../../common/smb2status.h"
static int smbdirect_accept_rdma_event_handler(struct rdma_cm_id *id,
struct rdma_cm_event *event);
static int smbdirect_accept_init_params(struct smbdirect_socket *sc);
static void smbdirect_accept_negotiate_recv_done(struct ib_cq *cq, struct ib_wc *wc);
static void smbdirect_accept_negotiate_send_done(struct ib_cq *cq, struct ib_wc *wc);
int smbdirect_accept_connect_request(struct smbdirect_socket *sc,
const struct rdma_conn_param *param)
{
struct smbdirect_socket_parameters *sp = &sc->parameters;
struct smbdirect_recv_io *recv_io;
u8 peer_initiator_depth;
u8 peer_responder_resources;
struct rdma_conn_param conn_param;
__be32 ird_ord_hdr[2];
int ret;
if (SMBDIRECT_CHECK_STATUS_WARN(sc, SMBDIRECT_SOCKET_CREATED))
return -EINVAL;
/*
* First set what we as the server are able to support
*/
sp->initiator_depth = min_t(u8, sp->initiator_depth,
sc->ib.dev->attrs.max_qp_rd_atom);
peer_initiator_depth = param->initiator_depth;
peer_responder_resources = param->responder_resources;
smbdirect_connection_negotiate_rdma_resources(sc,
peer_initiator_depth,
peer_responder_resources,
param);
ret = smbdirect_accept_init_params(sc);
if (ret) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"smbdirect_accept_init_params() failed %1pe\n",
SMBDIRECT_DEBUG_ERR_PTR(ret));
goto init_params_failed;
}
ret = smbdirect_connection_create_qp(sc);
if (ret) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"smbdirect_connection_create_qp() failed %1pe\n",
SMBDIRECT_DEBUG_ERR_PTR(ret));
goto create_qp_failed;
}
ret = smbdirect_connection_create_mem_pools(sc);
if (ret) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"smbdirect_connection_create_mem_pools() failed %1pe\n",
SMBDIRECT_DEBUG_ERR_PTR(ret));
goto create_mem_failed;
}
recv_io = smbdirect_connection_get_recv_io(sc);
if (WARN_ON_ONCE(!recv_io)) {
ret = -EINVAL;
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"smbdirect_connection_get_recv_io() failed %1pe\n",
SMBDIRECT_DEBUG_ERR_PTR(ret));
goto get_recv_io_failed;
}
recv_io->cqe.done = smbdirect_accept_negotiate_recv_done;
/*
* Now post the recv_io buffer in order to get
* the negotiate request
*/
sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REQ;
ret = smbdirect_connection_post_recv_io(recv_io);
if (ret) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"smbdirect_connection_post_recv_io() failed %1pe\n",
SMBDIRECT_DEBUG_ERR_PTR(ret));
goto post_recv_io_failed;
}
/*
* From here recv_io is known to the RDMA QP and needs ib_drain_qp and
* smbdirect_accept_negotiate_recv_done to cleanup...
*/
recv_io = NULL;
/* already checked with SMBDIRECT_CHECK_STATUS_WARN above */
WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED);
sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED;
/*
* We already negotiated sp->initiator_depth
* and sp->responder_resources above.
*/
memset(&conn_param, 0, sizeof(conn_param));
conn_param.initiator_depth = sp->initiator_depth;
conn_param.responder_resources = sp->responder_resources;
if (sc->rdma.legacy_iwarp) {
ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources);
ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth);
conn_param.private_data = ird_ord_hdr;
conn_param.private_data_len = sizeof(ird_ord_hdr);
} else {
conn_param.private_data = NULL;
conn_param.private_data_len = 0;
}
conn_param.retry_count = SMBDIRECT_RDMA_CM_RETRY;
conn_param.rnr_retry_count = SMBDIRECT_RDMA_CM_RNR_RETRY;
conn_param.flow_control = 0;
/* explicitly set above */
WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED);
sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING;
sc->rdma.expected_event = RDMA_CM_EVENT_ESTABLISHED;
sc->rdma.cm_id->event_handler = smbdirect_accept_rdma_event_handler;
ret = rdma_accept(sc->rdma.cm_id, &conn_param);
if (ret) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"rdma_accept() failed %1pe\n",
SMBDIRECT_DEBUG_ERR_PTR(ret));
goto rdma_accept_failed;
}
/*
* start with the negotiate timeout and SMBDIRECT_KEEPALIVE_PENDING
* so that the timer will cause a disconnect.
*/
INIT_DELAYED_WORK(&sc->idle.timer_work, smbdirect_connection_idle_timer_work);
sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
mod_delayed_work(sc->workqueues.idle, &sc->idle.timer_work,
msecs_to_jiffies(sp->negotiate_timeout_msec));
return 0;
rdma_accept_failed:
/*
* smbdirect_connection_destroy_qp() calls ib_drain_qp(),
* so that smbdirect_accept_negotiate_recv_done() will
* call smbdirect_connection_put_recv_io()
*/
post_recv_io_failed:
if (recv_io)
smbdirect_connection_put_recv_io(recv_io);
get_recv_io_failed:
smbdirect_connection_destroy_mem_pools(sc);
create_mem_failed:
smbdirect_connection_destroy_qp(sc);
create_qp_failed:
init_params_failed:
return ret;
}
static int smbdirect_accept_init_params(struct smbdirect_socket *sc)
{
const struct smbdirect_socket_parameters *sp = &sc->parameters;
int max_send_sges;
unsigned int maxpages;
/* need 3 more SGEs, because an SMB_DIRECT header, an SMB2 header,
* and an SMB2 response could be mapped.
*/
max_send_sges = DIV_ROUND_UP(sp->max_send_size, PAGE_SIZE) + 3;
if (max_send_sges > SMBDIRECT_SEND_IO_MAX_SGE) {
pr_err("max_send_size %d is too large\n", sp->max_send_size);
return -EINVAL;
}
/*
* There is only a single batch credit
*/
atomic_set(&sc->send_io.bcredits.count, 1);
/*
* Initialize the local credits to post
* IB_WR_SEND[_WITH_INV].
*/
atomic_set(&sc->send_io.lcredits.count, sp->send_credit_target);
if (sp->max_read_write_size) {
maxpages = DIV_ROUND_UP(sp->max_read_write_size, PAGE_SIZE);
sc->rw_io.credits.max = rdma_rw_mr_factor(sc->ib.dev,
sc->rdma.cm_id->port_num,
maxpages);
sc->rw_io.credits.num_pages = DIV_ROUND_UP(maxpages, sc->rw_io.credits.max);
/* add one extra in order to handle unaligned pages */
sc->rw_io.credits.max += 1;
}
sc->recv_io.credits.target = 1;
atomic_set(&sc->rw_io.credits.count, sc->rw_io.credits.max);
return 0;
}
static void smbdirect_accept_negotiate_recv_work(struct work_struct *work);
static void smbdirect_accept_negotiate_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct smbdirect_recv_io *recv_io =
container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe);
struct smbdirect_socket *sc = recv_io->socket;
unsigned long flags;
if (unlikely(wc->status != IB_WC_SUCCESS || WARN_ON_ONCE(wc->opcode != IB_WC_RECV))) {
if (wc->status != IB_WC_WR_FLUSH_ERR)
smbdirect_log_rdma_recv(sc, SMBDIRECT_LOG_ERR,
"wc->status=%s (%d) wc->opcode=%d\n",
ib_wc_status_msg(wc->status), wc->status, wc->opcode);
goto error;
}
smbdirect_log_rdma_recv(sc, SMBDIRECT_LOG_INFO,
"smbdirect_recv_io completed. status='%s (%d)', opcode=%d\n",
ib_wc_status_msg(wc->status), wc->status, wc->opcode);
/*
* This is an internal error!
*/
if (WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_NEGOTIATE_REQ))
goto error;
/*
* Don't reset the timer to the keepalive interval here;
* this will be done in smbdirect_accept_negotiate_recv_work().
*/
ib_dma_sync_single_for_cpu(sc->ib.dev,
recv_io->sge.addr,
recv_io->sge.length,
DMA_FROM_DEVICE);
/*
* Only remember recv_io if it has enough bytes,
* this gives smbdirect_accept_negotiate_recv_work enough
* information in order to disconnect if it was not
* valid.
*/
sc->recv_io.reassembly.full_packet_received = true;
if (wc->byte_len >= sizeof(struct smbdirect_negotiate_req))
smbdirect_connection_reassembly_append_recv_io(sc, recv_io, 0);
else
smbdirect_connection_put_recv_io(recv_io);
/*
* Some drivers (at least mlx5_ib and irdma) might post a
* recv completion before RDMA_CM_EVENT_ESTABLISHED,
* we need to adjust our expectation in that case.
*
* So we defer further processing of the negotiation
* to smbdirect_accept_negotiate_recv_work().
*
* If we are already in SMBDIRECT_SOCKET_NEGOTIATE_NEEDED
* we queue the work directly otherwise
* smbdirect_accept_rdma_event_handler() will do it, when
* RDMA_CM_EVENT_ESTABLISHED arrived.
*/
spin_lock_irqsave(&sc->connect.lock, flags);
if (!sc->first_error) {
INIT_WORK(&sc->connect.work, smbdirect_accept_negotiate_recv_work);
if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_NEEDED)
queue_work(sc->workqueues.accept, &sc->connect.work);
}
spin_unlock_irqrestore(&sc->connect.lock, flags);
return;
error:
/*
* recv_io.posted.refill_work is still disabled,
* so smbdirect_connection_put_recv_io() won't
* start it.
*/
smbdirect_connection_put_recv_io(recv_io);
smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
}
static void smbdirect_accept_negotiate_recv_work(struct work_struct *work)
{
struct smbdirect_socket *sc =
container_of(work, struct smbdirect_socket, connect.work);
struct smbdirect_socket_parameters *sp = &sc->parameters;
struct smbdirect_recv_io *recv_io;
struct smbdirect_negotiate_req *nreq;
unsigned long flags;
u16 min_version;
u16 max_version;
u16 credits_requested;
u32 preferred_send_size;
u32 max_receive_size;
u32 max_fragmented_size;
u32 ntstatus;
if (sc->first_error)
return;
/*
* make sure we won't start again...
*/
disable_work(work);
/*
* Reset timer to the keepalive interval in
* order to trigger our next keepalive message.
*/
sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
mod_delayed_work(sc->workqueues.idle, &sc->idle.timer_work,
msecs_to_jiffies(sp->keepalive_interval_msec));
/*
* If smbdirect_accept_negotiate_recv_done() detected an
* invalid request we want to disconnect.
*/
recv_io = smbdirect_connection_reassembly_first_recv_io(sc);
if (!recv_io) {
smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
return;
}
spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
sc->recv_io.reassembly.queue_length--;
list_del(&recv_io->list);
spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
smbdirect_connection_put_recv_io(recv_io);
if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_NEGOTIATE_NEEDED))
return;
sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING;
/*
* Note recv_io is already part of the free list,
* as we just called smbdirect_connection_put_recv_io(),
* but it won't be reused before we call
* smbdirect_connection_recv_io_refill() below.
*/
nreq = (struct smbdirect_negotiate_req *)recv_io->packet;
min_version = le16_to_cpu(nreq->min_version);
max_version = le16_to_cpu(nreq->max_version);
credits_requested = le16_to_cpu(nreq->credits_requested);
preferred_send_size = le32_to_cpu(nreq->preferred_send_size);
max_receive_size = le32_to_cpu(nreq->max_receive_size);
max_fragmented_size = le32_to_cpu(nreq->max_fragmented_size);
smbdirect_log_negotiate(sc, SMBDIRECT_LOG_INFO,
"ReqIn: %s%x, %s%x, %s%u, %s%u, %s%u, %s%u\n",
"MinVersion=0x",
le16_to_cpu(nreq->min_version),
"MaxVersion=0x",
le16_to_cpu(nreq->max_version),
"CreditsRequested=",
le16_to_cpu(nreq->credits_requested),
"PreferredSendSize=",
le32_to_cpu(nreq->preferred_send_size),
"MaxRecvSize=",
le32_to_cpu(nreq->max_receive_size),
"MaxFragmentedSize=",
le32_to_cpu(nreq->max_fragmented_size));
if (!(min_version <= SMBDIRECT_V1 && max_version >= SMBDIRECT_V1)) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"invalid: min_version=0x%x max_version=0x%x\n",
min_version, max_version);
ntstatus = le32_to_cpu(STATUS_NOT_SUPPORTED);
goto not_supported;
}
if (credits_requested == 0) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"invalid: credits_requested == 0\n");
smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
return;
}
if (max_receive_size < SMBDIRECT_MIN_RECEIVE_SIZE) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"invalid: max_receive_size=%u < %u\n",
max_receive_size,
SMBDIRECT_MIN_RECEIVE_SIZE);
smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
return;
}
if (max_fragmented_size < SMBDIRECT_MIN_FRAGMENTED_SIZE) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"invalid: max_fragmented_size=%u < %u\n",
max_fragmented_size,
SMBDIRECT_MIN_FRAGMENTED_SIZE);
smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
return;
}
/*
* At least the value of SMBDIRECT_MIN_RECEIVE_SIZE is used.
*/
sp->max_recv_size = min_t(u32, sp->max_recv_size, preferred_send_size);
sp->max_recv_size = max_t(u32, sp->max_recv_size, SMBDIRECT_MIN_RECEIVE_SIZE);
/*
* The maximum fragmented upper-layer payload receive size supported
*
* Assume max_payload_per_credit is
* smb_direct_max_receive_size - 24 = 1340
*
* The maximum number would be
* smb_direct_receive_credit_max * max_payload_per_credit
*
* 1340 * 255 = 341700 (0x536C4)
*
* The minimum value from the spec is 131072 (0x20000)
*
* For now we use the logic we used in ksmbd before:
* (1364 * 255) / 2 = 173910 (0x2A756)
*
* We need to adjust this here in case the peer
* lowered sp->max_recv_size.
*
* TODO: instead of adjusting max_fragmented_recv_size
* we should adjust the number of available buffers,
* but for now we keep the logic as it was used
* in ksmbd before.
*/
sp->max_fragmented_recv_size = (sp->recv_credit_max * sp->max_recv_size) / 2;
/*
* We take the value from the peer, which is checked to be higher than 0,
* but we limit it to the max value we support in order to have
* the main logic simpler.
*/
sc->recv_io.credits.target = credits_requested;
sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target,
sp->recv_credit_max);
/*
* Note nreq->max_receive_size was already checked against
* SMBDIRECT_MIN_RECEIVE_SIZE above.
*/
sp->max_send_size = min_t(u32, sp->max_send_size, max_receive_size);
/*
* Note nreq->max_fragmented_size was already checked against
* SMBDIRECT_MIN_FRAGMENTED_SIZE above.
*/
sp->max_fragmented_send_size = max_fragmented_size;
if (sc->accept.listener) {
struct smbdirect_socket *lsc = sc->accept.listener;
unsigned long flags;
spin_lock_irqsave(&lsc->listen.lock, flags);
list_del(&sc->accept.list);
list_add_tail(&sc->accept.list, &lsc->listen.ready);
wake_up(&lsc->listen.wait_queue);
spin_unlock_irqrestore(&lsc->listen.lock, flags);
/*
* smbdirect_socket_accept() will call
* smbdirect_accept_negotiate_finish(nsc, 0);
*
* So that we don't send the negotiation
* response that grants credits to the peer
* before the socket is accepted by the
* application.
*/
return;
}
ntstatus = le32_to_cpu(STATUS_SUCCESS);
not_supported:
smbdirect_accept_negotiate_finish(sc, ntstatus);
}
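/*
 * Worked example (illustrative, not from the diff): with the defaults
 * quoted in the comment above, recv_credit_max = 255 and
 * max_recv_size = 1364, the assignment yields
 *
 *	sp->max_fragmented_recv_size = (255 * 1364) / 2 = 173910 (0x2A756)
 *
 * which stays above the spec minimum of 131072 (0x20000) and at half
 * of the 255 * 1340 = 341700 (0x536C4) per-credit payload ceiling.
 */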
void smbdirect_accept_negotiate_finish(struct smbdirect_socket *sc, u32 ntstatus)
{
const struct smbdirect_socket_parameters *sp = &sc->parameters;
struct smbdirect_recv_io *recv_io;
struct smbdirect_send_io *send_io;
struct smbdirect_negotiate_resp *nrep;
int posted;
u16 new_credits;
int ret;
if (ntstatus)
goto not_supported;
/*
* Prepare for receiving data_transfer messages
*/
sc->recv_io.reassembly.full_packet_received = true;
sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER;
list_for_each_entry(recv_io, &sc->recv_io.free.list, list)
recv_io->cqe.done = smbdirect_connection_recv_io_done;
recv_io = NULL;
/*
* We should at least post 1 smbdirect_recv_io!
*/
posted = smbdirect_connection_recv_io_refill(sc);
if (posted < 1) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"smbdirect_connection_recv_io_refill() failed %1pe\n",
SMBDIRECT_DEBUG_ERR_PTR(posted));
smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
return;
}
/*
* The response will grant credits for all posted
* smbdirect_recv_io messages.
*/
new_credits = smbdirect_connection_grant_recv_credits(sc);
not_supported:
send_io = smbdirect_connection_alloc_send_io(sc);
if (IS_ERR(send_io)) {
ret = PTR_ERR(send_io);
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"smbdirect_connection_alloc_send_io() failed %1pe\n",
SMBDIRECT_DEBUG_ERR_PTR(ret));
smbdirect_socket_schedule_cleanup(sc, ret);
return;
}
send_io->cqe.done = smbdirect_accept_negotiate_send_done;
nrep = (struct smbdirect_negotiate_resp *)send_io->packet;
nrep->min_version = cpu_to_le16(SMBDIRECT_V1);
nrep->max_version = cpu_to_le16(SMBDIRECT_V1);
if (ntstatus == 0) {
nrep->negotiated_version = cpu_to_le16(SMBDIRECT_V1);
nrep->reserved = 0;
nrep->credits_requested = cpu_to_le16(sp->send_credit_target);
nrep->credits_granted = cpu_to_le16(new_credits);
nrep->status = cpu_to_le32(ntstatus);
nrep->max_readwrite_size = cpu_to_le32(sp->max_read_write_size);
nrep->preferred_send_size = cpu_to_le32(sp->max_send_size);
nrep->max_receive_size = cpu_to_le32(sp->max_recv_size);
nrep->max_fragmented_size = cpu_to_le32(sp->max_fragmented_recv_size);
} else {
nrep->negotiated_version = 0;
nrep->reserved = 0;
nrep->credits_requested = 0;
nrep->credits_granted = 0;
nrep->status = cpu_to_le32(ntstatus);
nrep->max_readwrite_size = 0;
nrep->preferred_send_size = 0;
nrep->max_receive_size = 0;
nrep->max_fragmented_size = 0;
}
smbdirect_log_negotiate(sc, SMBDIRECT_LOG_INFO,
"RepOut: %s%x, %s%x, %s%x, %s%u, %s%u, %s%x, %s%u, %s%u, %s%u, %s%u\n",
"MinVersion=0x",
le16_to_cpu(nrep->min_version),
"MaxVersion=0x",
le16_to_cpu(nrep->max_version),
"NegotiatedVersion=0x",
le16_to_cpu(nrep->negotiated_version),
"CreditsRequested=",
le16_to_cpu(nrep->credits_requested),
"CreditsGranted=",
le16_to_cpu(nrep->credits_granted),
"Status=0x",
le32_to_cpu(nrep->status),
"MaxReadWriteSize=",
le32_to_cpu(nrep->max_readwrite_size),
"PreferredSendSize=",
le32_to_cpu(nrep->preferred_send_size),
"MaxRecvSize=",
le32_to_cpu(nrep->max_receive_size),
"MaxFragmentedSize=",
le32_to_cpu(nrep->max_fragmented_size));
send_io->sge[0].addr = ib_dma_map_single(sc->ib.dev,
nrep,
sizeof(*nrep),
DMA_TO_DEVICE);
ret = ib_dma_mapping_error(sc->ib.dev, send_io->sge[0].addr);
if (ret) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"ib_dma_mapping_error() failed %1pe\n",
SMBDIRECT_DEBUG_ERR_PTR(ret));
smbdirect_connection_free_send_io(send_io);
smbdirect_socket_schedule_cleanup(sc, ret);
return;
}
send_io->sge[0].length = sizeof(*nrep);
send_io->sge[0].lkey = sc->ib.pd->local_dma_lkey;
send_io->num_sge = 1;
ib_dma_sync_single_for_device(sc->ib.dev,
send_io->sge[0].addr,
send_io->sge[0].length,
DMA_TO_DEVICE);
send_io->wr.next = NULL;
send_io->wr.wr_cqe = &send_io->cqe;
send_io->wr.sg_list = send_io->sge;
send_io->wr.num_sge = send_io->num_sge;
send_io->wr.opcode = IB_WR_SEND;
send_io->wr.send_flags = IB_SEND_SIGNALED;
ret = smbdirect_connection_post_send_wr(sc, &send_io->wr);
if (ret) {
/* if we reach here, post send failed */
smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_ERR,
"smbdirect_connection_post_send_wr() failed %1pe\n",
SMBDIRECT_DEBUG_ERR_PTR(ret));
/*
* Note smbdirect_connection_free_send_io()
* does ib_dma_unmap_page()
*/
smbdirect_connection_free_send_io(send_io);
smbdirect_socket_schedule_cleanup(sc, ret);
return;
}
/*
* smbdirect_accept_negotiate_send_done
* will do all remaining work...
*/
}
static void smbdirect_accept_negotiate_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct smbdirect_send_io *send_io =
container_of(wc->wr_cqe, struct smbdirect_send_io, cqe);
struct smbdirect_socket *sc = send_io->socket;
struct smbdirect_negotiate_resp *nrep;
u32 ntstatus;
smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_INFO,
"smbdirect_send_io completed. status='%s (%d)', opcode=%d\n",
ib_wc_status_msg(wc->status), wc->status, wc->opcode);
nrep = (struct smbdirect_negotiate_resp *)send_io->packet;
ntstatus = le32_to_cpu(nrep->status);
/* Note this frees wc->wr_cqe, but not wc */
smbdirect_connection_free_send_io(send_io);
atomic_dec(&sc->send_io.pending.count);
if (unlikely(wc->status != IB_WC_SUCCESS || WARN_ON_ONCE(wc->opcode != IB_WC_SEND))) {
if (wc->status != IB_WC_WR_FLUSH_ERR)
smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_ERR,
"wc->status=%s (%d) wc->opcode=%d\n",
ib_wc_status_msg(wc->status), wc->status, wc->opcode);
smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
return;
}
/*
* If we send a smbdirect_negotiate_resp without NT_STATUS_OK (0)
* we need to disconnect now.
*
* Otherwise smbdirect_connection_negotiation_done()
* will setup all required things and wake up
* the waiter.
*/
if (ntstatus)
smbdirect_socket_schedule_cleanup(sc, -EOPNOTSUPP);
else
smbdirect_connection_negotiation_done(sc);
}
static int smbdirect_accept_rdma_event_handler(struct rdma_cm_id *id,
struct rdma_cm_event *event)
{
struct smbdirect_socket *sc = id->context;
unsigned long flags;
/*
* cma_cm_event_handler() has
* lockdep_assert_held(&id_priv->handler_mutex);
*
* Mutexes are not allowed in interrupts,
* and we rely on not being in an interrupt here,
* as we might sleep.
*
* We didn't time out, so we cancel our idle timer;
* it will be scheduled again if needed.
*/
WARN_ON_ONCE(in_interrupt());
if (event->status || event->event != sc->rdma.expected_event) {
int ret = -ECONNABORTED;
if (event->event == RDMA_CM_EVENT_REJECTED)
ret = -ECONNREFUSED;
if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
ret = -ENETDOWN;
if (IS_ERR(SMBDIRECT_DEBUG_ERR_PTR(event->status)))
ret = event->status;
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"%s (first_error=%1pe, expected=%s) => event=%s status=%d => ret=%1pe\n",
smbdirect_socket_status_string(sc->status),
SMBDIRECT_DEBUG_ERR_PTR(sc->first_error),
rdma_event_msg(sc->rdma.expected_event),
rdma_event_msg(event->event),
event->status,
SMBDIRECT_DEBUG_ERR_PTR(ret));
smbdirect_socket_schedule_cleanup(sc, ret);
return 0;
}
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
"%s (first_error=%1pe) event=%s\n",
smbdirect_socket_status_string(sc->status),
SMBDIRECT_DEBUG_ERR_PTR(sc->first_error),
rdma_event_msg(event->event));
if (sc->first_error)
return 0;
switch (event->event) {
case RDMA_CM_EVENT_ESTABLISHED:
smbdirect_connection_rdma_established(sc);
/*
* Some drivers (at least mlx5_ib and irdma) might post a
* recv completion before RDMA_CM_EVENT_ESTABLISHED,
* we need to adjust our expectation in that case.
*
* If smbdirect_accept_negotiate_recv_done() was called first,
* it initialized sc->connect.work but left it for us to start,
* so that we move to SMBDIRECT_SOCKET_NEGOTIATE_NEEDED before
* smbdirect_accept_negotiate_recv_work() runs.
*
* If smbdirect_accept_negotiate_recv_done() didn't happen yet,
* sc->connect.work is still disabled and queue_work() is a
* no-op.
*/
if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING))
return 0;
sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED;
spin_lock_irqsave(&sc->connect.lock, flags);
if (!sc->first_error)
queue_work(sc->workqueues.accept, &sc->connect.work);
spin_unlock_irqrestore(&sc->connect.lock, flags);
/*
* wait for smbdirect_accept_negotiate_recv_done()
* to get the negotiate request.
*/
return 0;
default:
break;
}
/*
* This is an internal error
*/
WARN_ON_ONCE(sc->rdma.expected_event != RDMA_CM_EVENT_ESTABLISHED);
smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
return 0;
}
static long smbdirect_socket_wait_for_accept(struct smbdirect_socket *lsc, long timeo)
{
long ret;
ret = wait_event_interruptible_timeout(lsc->listen.wait_queue,
!list_empty_careful(&lsc->listen.ready) ||
lsc->status != SMBDIRECT_SOCKET_LISTENING ||
lsc->first_error,
timeo);
if (lsc->status != SMBDIRECT_SOCKET_LISTENING)
return -EINVAL;
if (lsc->first_error)
return lsc->first_error;
if (!ret)
ret = -ETIMEDOUT;
if (ret < 0)
return ret;
return 0;
}
struct smbdirect_socket *smbdirect_socket_accept(struct smbdirect_socket *lsc,
long timeo,
struct proto_accept_arg *arg)
{
struct smbdirect_socket *nsc;
unsigned long flags;
if (lsc->status != SMBDIRECT_SOCKET_LISTENING) {
arg->err = -EINVAL;
return NULL;
}
if (lsc->first_error) {
arg->err = lsc->first_error;
return NULL;
}
if (list_empty_careful(&lsc->listen.ready)) {
int ret;
if (timeo == 0) {
arg->err = -EAGAIN;
return NULL;
}
ret = smbdirect_socket_wait_for_accept(lsc, timeo);
if (ret) {
arg->err = ret;
return NULL;
}
}
spin_lock_irqsave(&lsc->listen.lock, flags);
nsc = list_first_entry_or_null(&lsc->listen.ready,
struct smbdirect_socket,
accept.list);
if (nsc) {
nsc->accept.listener = NULL;
list_del_init_careful(&nsc->accept.list);
arg->is_empty = list_empty_careful(&lsc->listen.ready);
}
spin_unlock_irqrestore(&lsc->listen.lock, flags);
if (!nsc) {
arg->err = -EAGAIN;
return NULL;
}
/*
* We did not send the negotiation response
* yet, so we did not grant any credits to the client,
* so it didn't grant any credits to us.
*
* The caller expects a connected socket
* now as there are no credits anyway.
*
* Then we send the negotiation response in
* order to grant credits to the peer.
*/
nsc->status = SMBDIRECT_SOCKET_CONNECTED;
smbdirect_accept_negotiate_finish(nsc, 0);
return nsc;
}
__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_accept);
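
A hedged usage sketch of smbdirect_socket_accept() as declared above
(the 5-second timeout and the example_* name are arbitrary; struct
proto_accept_arg is the regular socket-accept argument that carries
err and is_empty back to the caller):

static struct smbdirect_socket *example_accept_one(struct smbdirect_socket *lsc)
{
	struct proto_accept_arg arg = {};
	struct smbdirect_socket *nsc;

	/* waits up to 5 seconds for a fully negotiated connection */
	nsc = smbdirect_socket_accept(lsc, msecs_to_jiffies(5000), &arg);
	if (!nsc)
		pr_debug("smbdirect accept failed: %d\n", arg.err);
	return nsc;
}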


@@ -0,0 +1,925 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (c) 2012,2016,2017,2025 Stefan Metzmacher
*/
#include "smbdirect_internal.h"
#include "../../common/smb2status.h"
static int smbdirect_connect_setup_connection(struct smbdirect_socket *sc);
static int smbdirect_connect_resolve_addr(struct smbdirect_socket *sc,
const struct sockaddr *src,
const struct sockaddr *dst);
static int smbdirect_connect_rdma_event_handler(struct rdma_cm_id *id,
struct rdma_cm_event *event);
static int smbdirect_connect_negotiate_start(struct smbdirect_socket *sc);
static void smbdirect_connect_negotiate_send_done(struct ib_cq *cq, struct ib_wc *wc);
static void smbdirect_connect_negotiate_recv_done(struct ib_cq *cq, struct ib_wc *wc);
int smbdirect_connect(struct smbdirect_socket *sc, const struct sockaddr *dst)
{
const struct sockaddr *src = NULL;
union {
struct sockaddr sa;
struct sockaddr_storage ss;
} src_addr = {
.sa = {
.sa_family = AF_UNSPEC,
},
};
int ret;
if (sc->first_error)
return -ENOTCONN;
if (sc->status != SMBDIRECT_SOCKET_CREATED)
return -EALREADY;
if (WARN_ON_ONCE(!sc->rdma.cm_id))
return -EINVAL;
src_addr.ss = sc->rdma.cm_id->route.addr.src_addr;
if (src_addr.sa.sa_family != AF_UNSPEC)
src = &src_addr.sa;
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
"connect: src: %pISpsfc dst: %pISpsfc\n",
src, dst);
ret = smbdirect_connect_setup_connection(sc);
if (ret)
return ret;
ret = smbdirect_connect_resolve_addr(sc, src, dst);
if (ret)
return ret;
/*
* The rest happens async via smbdirect_connect_rdma_event_handler()
* the caller will decide to wait or not.
*/
return 0;
}
__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connect);
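/*
 * Hedged usage sketch (not part of this file): since smbdirect_connect()
 * only starts address resolution, a caller wanting synchronous semantics
 * pairs it with a wait. smbdirect_connection_wait_for_connected() is
 * named in the commit subjects above; its exact signature is assumed.
 */
static int example_connect_and_wait(struct smbdirect_socket *sc,
				    const struct sockaddr *dst)
{
	int ret;

	ret = smbdirect_connect(sc, dst);
	if (ret)
		return ret;

	/* blocks until the socket is connected or hits first_error */
	return smbdirect_connection_wait_for_connected(sc);
}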
static int smbdirect_connect_setup_connection(struct smbdirect_socket *sc)
{
rdma_lock_handler(sc->rdma.cm_id);
sc->rdma.cm_id->event_handler = smbdirect_connect_rdma_event_handler;
rdma_unlock_handler(sc->rdma.cm_id);
if (SMBDIRECT_CHECK_STATUS_WARN(sc, SMBDIRECT_SOCKET_CREATED))
return -EINVAL;
sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED;
return 0;
}
static int smbdirect_connect_resolve_addr(struct smbdirect_socket *sc,
const struct sockaddr *src,
const struct sockaddr *dst)
{
const struct smbdirect_socket_parameters *sp = &sc->parameters;
struct sockaddr *src_addr = NULL;
struct sockaddr *dst_addr = NULL;
int ret;
src_addr = (struct sockaddr *)src;
if (src_addr && src_addr->sa_family == AF_UNSPEC)
src_addr = NULL;
dst_addr = (struct sockaddr *)dst;
if (SMBDIRECT_CHECK_STATUS_WARN(sc, SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED))
return -EINVAL;
sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING;
sc->rdma.expected_event = RDMA_CM_EVENT_ADDR_RESOLVED;
ret = rdma_resolve_addr(sc->rdma.cm_id, src_addr, dst_addr,
sp->resolve_addr_timeout_msec);
if (ret) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"rdma_resolve_addr() failed %1pe\n",
SMBDIRECT_DEBUG_ERR_PTR(ret));
return ret;
}
return 0;
}
static int smbdirect_connect_resolve_route(struct smbdirect_socket *sc)
{
const struct smbdirect_socket_parameters *sp = &sc->parameters;
int ret;
if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED))
return sc->first_error;
sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING;
sc->rdma.expected_event = RDMA_CM_EVENT_ROUTE_RESOLVED;
ret = rdma_resolve_route(sc->rdma.cm_id, sp->resolve_route_timeout_msec);
if (ret) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"rdma_resolve_route() failed %1pe\n",
SMBDIRECT_DEBUG_ERR_PTR(ret));
return ret;
}
return 0;
}
static int smbdirect_connect_rdma_connect(struct smbdirect_socket *sc)
{
struct smbdirect_socket_parameters *sp = &sc->parameters;
struct rdma_conn_param conn_param;
__be32 ird_ord_hdr[2];
int ret;
sc->ib.dev = sc->rdma.cm_id->device;
if (!smbdirect_frwr_is_supported(&sc->ib.dev->attrs)) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"Fast Registration Work Requests (FRWR) is not supported device %.*s\n",
IB_DEVICE_NAME_MAX,
sc->ib.dev->name);
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"Device capability flags = %llx max_fast_reg_page_list_len = %u\n",
sc->ib.dev->attrs.device_cap_flags,
sc->ib.dev->attrs.max_fast_reg_page_list_len);
return -EPROTONOSUPPORT;
}
if (sp->flags & SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB &&
!rdma_ib_or_roce(sc->ib.dev, sc->rdma.cm_id->port_num)) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"Not IB: device: %.*s IW:%u local: %pISpsfc remote: %pISpsfc\n",
IB_DEVICE_NAME_MAX,
sc->ib.dev->name,
rdma_protocol_iwarp(sc->ib.dev, sc->rdma.cm_id->port_num),
&sc->rdma.cm_id->route.addr.src_addr,
&sc->rdma.cm_id->route.addr.dst_addr);
return -EPROTONOSUPPORT;
}
if (sp->flags & SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW &&
!rdma_protocol_iwarp(sc->ib.dev, sc->rdma.cm_id->port_num)) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"Not IW: device: %.*s IB:%u local: %pISpsfc remote: %pISpsfc\n",
IB_DEVICE_NAME_MAX,
sc->ib.dev->name,
rdma_ib_or_roce(sc->ib.dev, sc->rdma.cm_id->port_num),
&sc->rdma.cm_id->route.addr.src_addr,
&sc->rdma.cm_id->route.addr.dst_addr);
return -EPROTONOSUPPORT;
}
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
"rdma connect: device: %.*s local: %pISpsfc remote: %pISpsfc\n",
IB_DEVICE_NAME_MAX,
sc->ib.dev->name,
&sc->rdma.cm_id->route.addr.src_addr,
&sc->rdma.cm_id->route.addr.dst_addr);
sp->max_frmr_depth = min_t(u32, sp->max_frmr_depth,
sc->ib.dev->attrs.max_fast_reg_page_list_len);
sc->mr_io.type = IB_MR_TYPE_MEM_REG;
if (sc->ib.dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG)
sc->mr_io.type = IB_MR_TYPE_SG_GAPS;
sp->responder_resources = min_t(u8, sp->responder_resources,
sc->ib.dev->attrs.max_qp_rd_atom);
smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_INFO,
"responder_resources=%d\n",
sp->responder_resources);
ret = smbdirect_connection_create_qp(sc);
if (ret) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"smbdirect_connection_create_qp() failed %1pe\n",
SMBDIRECT_DEBUG_ERR_PTR(ret));
return ret;
}
memset(&conn_param, 0, sizeof(conn_param));
conn_param.initiator_depth = sp->initiator_depth;
conn_param.responder_resources = sp->responder_resources;
/* Need to send IRD/ORD in private data for iWARP */
if (rdma_protocol_iwarp(sc->ib.dev, sc->rdma.cm_id->port_num)) {
ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources);
ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth);
conn_param.private_data = ird_ord_hdr;
conn_param.private_data_len = sizeof(ird_ord_hdr);
} else {
conn_param.private_data = NULL;
conn_param.private_data_len = 0;
}
conn_param.retry_count = SMBDIRECT_RDMA_CM_RETRY;
conn_param.rnr_retry_count = SMBDIRECT_RDMA_CM_RNR_RETRY;
conn_param.flow_control = 0;
if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED))
return sc->first_error;
sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING;
sc->rdma.expected_event = RDMA_CM_EVENT_ESTABLISHED;
ret = rdma_connect_locked(sc->rdma.cm_id, &conn_param);
if (ret) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"rdma_connect_locked() failed %1pe\n",
SMBDIRECT_DEBUG_ERR_PTR(ret));
return ret;
}
/*
* start with the rdma connect timeout and SMBDIRECT_KEEPALIVE_PENDING
* so that the timer will cause a disconnect.
*/
INIT_DELAYED_WORK(&sc->idle.timer_work, smbdirect_connection_idle_timer_work);
sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
mod_delayed_work(sc->workqueues.idle, &sc->idle.timer_work,
msecs_to_jiffies(sp->rdma_connect_timeout_msec));
return 0;
}
static int smbdirect_connect_rdma_event_handler(struct rdma_cm_id *id,
struct rdma_cm_event *event)
{
struct smbdirect_socket *sc = id->context;
u8 peer_initiator_depth;
u8 peer_responder_resources;
int ret;
/*
* cma_cm_event_handler() has
* lockdep_assert_held(&id_priv->handler_mutex);
*
* Mutexes are not allowed in interrupts,
* and we rely on not being in an interrupt here,
* as we might sleep.
*
* We didn't time out, so we cancel our idle timer;
* it will be scheduled again if needed.
*/
WARN_ON_ONCE(in_interrupt());
sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
cancel_delayed_work_sync(&sc->idle.timer_work);
if (event->status || event->event != sc->rdma.expected_event) {
int lvl = SMBDIRECT_LOG_ERR;
ret = -ECONNABORTED;
if (event->event == RDMA_CM_EVENT_REJECTED)
ret = -ECONNREFUSED;
if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
ret = -ENETDOWN;
if (IS_ERR(SMBDIRECT_DEBUG_ERR_PTR(event->status)))
ret = event->status;
if (ret == -ENODEV)
lvl = SMBDIRECT_LOG_INFO;
smbdirect_log_rdma_event(sc, lvl,
"%s (first_error=%1pe, expected=%s) => event=%s status=%d => ret=%1pe\n",
smbdirect_socket_status_string(sc->status),
SMBDIRECT_DEBUG_ERR_PTR(sc->first_error),
rdma_event_msg(sc->rdma.expected_event),
rdma_event_msg(event->event),
event->status,
SMBDIRECT_DEBUG_ERR_PTR(ret));
smbdirect_socket_schedule_cleanup_lvl(sc,
lvl,
ret);
return 0;
}
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
"%s (first_error=%1pe) event=%s\n",
smbdirect_socket_status_string(sc->status),
SMBDIRECT_DEBUG_ERR_PTR(sc->first_error),
rdma_event_msg(event->event));
if (sc->first_error)
return 0;
switch (event->event) {
case RDMA_CM_EVENT_ADDR_RESOLVED:
if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING))
return 0;
sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED;
ret = smbdirect_connect_resolve_route(sc);
if (ret)
smbdirect_socket_schedule_cleanup(sc, ret);
return 0;
case RDMA_CM_EVENT_ROUTE_RESOLVED:
if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING))
return 0;
sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED;
ret = smbdirect_connect_rdma_connect(sc);
if (ret)
smbdirect_socket_schedule_cleanup(sc, ret);
return 0;
case RDMA_CM_EVENT_ESTABLISHED:
smbdirect_connection_rdma_established(sc);
if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING))
return 0;
sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED;
/*
* Here we work around an inconsistency between
* iWarp and other devices (at least rxe and irdma using RoCEv2)
*/
if (rdma_protocol_iwarp(id->device, id->port_num)) {
/*
* iWarp devices report the peer's values
* with the perspective of the peer here.
* Tested with siw and irdma (in iwarp mode)
* We need to change to our perspective here,
* so we need to switch the values.
*/
peer_initiator_depth = event->param.conn.responder_resources;
peer_responder_resources = event->param.conn.initiator_depth;
} else {
/*
* Non iWarp devices report the peer's values
* already changed to our perspective here.
* Tested with rxe and irdma (in roce mode).
*/
peer_initiator_depth = event->param.conn.initiator_depth;
peer_responder_resources = event->param.conn.responder_resources;
}
smbdirect_connection_negotiate_rdma_resources(sc,
peer_initiator_depth,
peer_responder_resources,
&event->param.conn);
ret = smbdirect_connect_negotiate_start(sc);
if (ret)
smbdirect_socket_schedule_cleanup(sc, ret);
return 0;
default:
break;
}
/*
* This is an internal error
*/
WARN_ON_ONCE(sc->rdma.expected_event != RDMA_CM_EVENT_ESTABLISHED);
smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
return 0;
}
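/*
 * Hedged distillation of the iWarp branch above into a helper
 * (example_* is a hypothetical name; the logic mirrors the event
 * handler exactly):
 */
static void example_peer_rd_atomic(const struct rdma_cm_id *id,
				   const struct rdma_conn_param *conn,
				   u8 *peer_initiator_depth,
				   u8 *peer_responder_resources)
{
	if (rdma_protocol_iwarp(id->device, id->port_num)) {
		/* iWarp reports the peer's own perspective: swap */
		*peer_initiator_depth = conn->responder_resources;
		*peer_responder_resources = conn->initiator_depth;
	} else {
		*peer_initiator_depth = conn->initiator_depth;
		*peer_responder_resources = conn->responder_resources;
	}
}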
static int smbdirect_connect_negotiate_start(struct smbdirect_socket *sc)
{
const struct smbdirect_socket_parameters *sp = &sc->parameters;
struct smbdirect_recv_io *recv_io = NULL;
struct smbdirect_send_io *send_io = NULL;
struct smbdirect_negotiate_req *nreq = NULL;
int ret;
if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_NEGOTIATE_NEEDED))
return sc->first_error;
sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING;
ret = smbdirect_connection_create_mem_pools(sc);
if (ret) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"smbdirect_connection_create_mem_pools() failed %1pe\n",
SMBDIRECT_DEBUG_ERR_PTR(ret));
goto create_mem_pools_failed;
}
/*
* There is only a single batch credit
*/
atomic_set(&sc->send_io.bcredits.count, 1);
/*
* Initialize the local credits to post
* IB_WR_SEND[_WITH_INV].
*/
atomic_set(&sc->send_io.lcredits.count, sp->send_credit_target);
recv_io = smbdirect_connection_get_recv_io(sc);
if (WARN_ON_ONCE(!recv_io)) {
ret = -EINVAL;
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"smbdirect_connection_get_recv_io() failed %1pe\n",
SMBDIRECT_DEBUG_ERR_PTR(ret));
goto get_recv_io_failed;
}
recv_io->cqe.done = smbdirect_connect_negotiate_recv_done;
send_io = smbdirect_connection_alloc_send_io(sc);
if (IS_ERR(send_io)) {
ret = PTR_ERR(send_io);
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"smbdirect_connection_alloc_send_io() failed %1pe\n",
SMBDIRECT_DEBUG_ERR_PTR(ret));
goto alloc_send_io_failed;
}
send_io->cqe.done = smbdirect_connect_negotiate_send_done;
nreq = (struct smbdirect_negotiate_req *)send_io->packet;
nreq->min_version = cpu_to_le16(SMBDIRECT_V1);
nreq->max_version = cpu_to_le16(SMBDIRECT_V1);
nreq->reserved = 0;
nreq->credits_requested = cpu_to_le16(sp->send_credit_target);
nreq->preferred_send_size = cpu_to_le32(sp->max_send_size);
nreq->max_receive_size = cpu_to_le32(sp->max_recv_size);
nreq->max_fragmented_size = cpu_to_le32(sp->max_fragmented_recv_size);
smbdirect_log_negotiate(sc, SMBDIRECT_LOG_INFO,
"ReqOut: %s%x, %s%x, %s%u, %s%u, %s%u, %s%u\n",
"MinVersion=0x",
le16_to_cpu(nreq->min_version),
"MaxVersion=0x",
le16_to_cpu(nreq->max_version),
"CreditsRequested=",
le16_to_cpu(nreq->credits_requested),
"PreferredSendSize=",
le32_to_cpu(nreq->preferred_send_size),
"MaxRecvSize=",
le32_to_cpu(nreq->max_receive_size),
"MaxFragmentedSize=",
le32_to_cpu(nreq->max_fragmented_size));
send_io->sge[0].addr = ib_dma_map_single(sc->ib.dev,
nreq,
sizeof(*nreq),
DMA_TO_DEVICE);
ret = ib_dma_mapping_error(sc->ib.dev, send_io->sge[0].addr);
if (ret) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"ib_dma_mapping_error() failed %1pe\n",
SMBDIRECT_DEBUG_ERR_PTR(ret));
goto dma_mapping_failed;
}
send_io->sge[0].length = sizeof(*nreq);
send_io->sge[0].lkey = sc->ib.pd->local_dma_lkey;
send_io->num_sge = 1;
ib_dma_sync_single_for_device(sc->ib.dev,
send_io->sge[0].addr,
send_io->sge[0].length,
DMA_TO_DEVICE);
smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_INFO,
"sge addr=0x%llx length=%u lkey=0x%x\n",
send_io->sge[0].addr,
send_io->sge[0].length,
send_io->sge[0].lkey);
/*
* Now post the recv_io buffer in order to get
* the negotiate response
*/
sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REP;
ret = smbdirect_connection_post_recv_io(recv_io);
if (ret) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"smbdirect_connection_post_recv_io() failed %1pe\n",
SMBDIRECT_DEBUG_ERR_PTR(ret));
goto post_recv_io_failed;
}
send_io->wr.next = NULL;
send_io->wr.wr_cqe = &send_io->cqe;
send_io->wr.sg_list = send_io->sge;
send_io->wr.num_sge = send_io->num_sge;
send_io->wr.opcode = IB_WR_SEND;
send_io->wr.send_flags = IB_SEND_SIGNALED;
ret = smbdirect_connection_post_send_wr(sc, &send_io->wr);
if (ret) {
/* if we reach here, post send failed */
smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_ERR,
"smbdirect_connection_post_send_wr() failed %1pe\n",
SMBDIRECT_DEBUG_ERR_PTR(ret));
goto post_send_wr_failed;
}
/*
* start with the negotiate timeout and SMBDIRECT_KEEPALIVE_PENDING
* so that the timer will cause a disconnect.
*/
sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
mod_delayed_work(sc->workqueues.idle, &sc->idle.timer_work,
msecs_to_jiffies(sp->negotiate_timeout_msec));
return 0;
post_send_wr_failed:
/*
* ib_dma_unmap_single is called in
* smbdirect_connection_free_send_io()
*/
smbdirect_connection_free_send_io(send_io);
/*
* recv_io is given to the rdma layer,
* we should not put it even on error
* nor call smbdirect_connection_destroy_mem_pools()
* it will be cleaned up during disconnect.
*/
return ret;
post_recv_io_failed:
/*
* ib_dma_unmap_single is called in
* smbdirect_connection_free_send_io()
*/
dma_mapping_failed:
smbdirect_connection_free_send_io(send_io);
alloc_send_io_failed:
smbdirect_connection_put_recv_io(recv_io);
get_recv_io_failed:
smbdirect_connection_destroy_mem_pools(sc);
create_mem_pools_failed:
return ret;
}
static void smbdirect_connect_negotiate_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct smbdirect_send_io *send_io =
container_of(wc->wr_cqe, struct smbdirect_send_io, cqe);
struct smbdirect_socket *sc = send_io->socket;
smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_INFO,
"smbdirect_send_io completed. status='%s (%d)', opcode=%d\n",
ib_wc_status_msg(wc->status), wc->status, wc->opcode);
/* Note this frees wc->wr_cqe, but not wc */
smbdirect_connection_free_send_io(send_io);
atomic_dec(&sc->send_io.pending.count);
if (unlikely(wc->status != IB_WC_SUCCESS || WARN_ON_ONCE(wc->opcode != IB_WC_SEND))) {
if (wc->status != IB_WC_WR_FLUSH_ERR)
smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_ERR,
"wc->status=%s (%d) wc->opcode=%d\n",
ib_wc_status_msg(wc->status), wc->status, wc->opcode);
smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
return;
}
}
static void smbdirect_connect_negotiate_recv_work(struct work_struct *work);
static void smbdirect_connect_negotiate_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct smbdirect_recv_io *recv_io =
container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe);
struct smbdirect_socket *sc = recv_io->socket;
unsigned long flags;
if (unlikely(wc->status != IB_WC_SUCCESS || WARN_ON_ONCE(wc->opcode != IB_WC_RECV))) {
if (wc->status != IB_WC_WR_FLUSH_ERR)
smbdirect_log_rdma_recv(sc, SMBDIRECT_LOG_ERR,
"wc->status=%s (%d) wc->opcode=%d\n",
ib_wc_status_msg(wc->status), wc->status, wc->opcode);
goto error;
}
smbdirect_log_rdma_recv(sc, SMBDIRECT_LOG_INFO,
"smbdirect_recv_io completed. status='%s (%d)', opcode=%d\n",
ib_wc_status_msg(wc->status), wc->status, wc->opcode);
/*
* This is an internal error!
*/
if (WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_NEGOTIATE_REP))
goto error;
/*
* Don't reset the timer to the keepalive interval here;
* this will be done in smbdirect_connect_negotiate_recv_work().
*/
ib_dma_sync_single_for_cpu(sc->ib.dev,
recv_io->sge.addr,
recv_io->sge.length,
DMA_FROM_DEVICE);
/*
* Only remember recv_io if it has enough bytes,
* this gives smbdirect_connect_negotiate_recv_work() enough
* information in order to disconnect if it was not
* valid.
*/
sc->recv_io.reassembly.full_packet_received = true;
if (wc->byte_len >= sizeof(struct smbdirect_negotiate_resp))
smbdirect_connection_reassembly_append_recv_io(sc, recv_io, 0);
else
smbdirect_connection_put_recv_io(recv_io);
/*
* We continue via the workqueue as we may have
* complex work that might sleep.
*
* So we defer further processing of the negotiation
* to smbdirect_connect_negotiate_recv_work().
*/
spin_lock_irqsave(&sc->connect.lock, flags);
if (!sc->first_error) {
INIT_WORK(&sc->connect.work, smbdirect_connect_negotiate_recv_work);
if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_RUNNING)
queue_work(sc->workqueues.connect, &sc->connect.work);
}
spin_unlock_irqrestore(&sc->connect.lock, flags);
return;
error:
/*
* recv_io.posted.refill_work is still disabled,
* so smbdirect_connection_put_recv_io() won't
* start it.
*/
smbdirect_connection_put_recv_io(recv_io);
smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
}
static void smbdirect_connect_negotiate_recv_work(struct work_struct *work)
{
struct smbdirect_socket *sc =
container_of(work, struct smbdirect_socket, connect.work);
struct smbdirect_socket_parameters *sp = &sc->parameters;
struct smbdirect_recv_io *recv_io;
struct smbdirect_negotiate_resp *nrep;
unsigned long flags;
u16 negotiated_version;
u16 credits_requested;
u16 credits_granted;
u32 status;
u32 max_readwrite_size;
u32 preferred_send_size;
u32 max_receive_size;
u32 max_fragmented_size;
int posted;
int ret;
if (sc->first_error)
return;
/*
* make sure we won't start again...
*/
disable_work(work);
/*
* Reset timer to the keepalive interval in
* order to trigger our next keepalive message.
*/
sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
mod_delayed_work(sc->workqueues.idle, &sc->idle.timer_work,
msecs_to_jiffies(sp->keepalive_interval_msec));
/*
* If smbdirect_connect_negotiate_recv_done() detected an
* invalid request we want to disconnect.
*/
recv_io = smbdirect_connection_reassembly_first_recv_io(sc);
if (!recv_io) {
smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
return;
}
spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
sc->recv_io.reassembly.queue_length--;
list_del(&recv_io->list);
spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
smbdirect_connection_put_recv_io(recv_io);
if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_NEGOTIATE_RUNNING))
return;
/*
* Note recv_io is already part of the free list,
* as we just called smbdirect_connection_put_recv_io(),
* but it won't be reused before we call
* smbdirect_connection_recv_io_refill() below.
*/
nrep = (struct smbdirect_negotiate_resp *)recv_io->packet;
negotiated_version = le16_to_cpu(nrep->negotiated_version);
credits_requested = le16_to_cpu(nrep->credits_requested);
credits_granted = le16_to_cpu(nrep->credits_granted);
status = le32_to_cpu(nrep->status);
max_readwrite_size = le32_to_cpu(nrep->max_readwrite_size);
preferred_send_size = le32_to_cpu(nrep->preferred_send_size);
max_receive_size = le32_to_cpu(nrep->max_receive_size);
max_fragmented_size = le32_to_cpu(nrep->max_fragmented_size);
smbdirect_log_negotiate(sc, SMBDIRECT_LOG_INFO,
"RepIn: %s%x, %s%x, %s%x, %s%u, %s%u, %s%x, %s%u, %s%u, %s%u, %s%u\n",
"MinVersion=0x",
le16_to_cpu(nrep->min_version),
"MaxVersion=0x",
le16_to_cpu(nrep->max_version),
"NegotiatedVersion=0x",
le16_to_cpu(nrep->negotiated_version),
"CreditsRequested=",
le16_to_cpu(nrep->credits_requested),
"CreditsGranted=",
le16_to_cpu(nrep->credits_granted),
"Status=0x",
le32_to_cpu(nrep->status),
"MaxReadWriteSize=",
le32_to_cpu(nrep->max_readwrite_size),
"PreferredSendSize=",
le32_to_cpu(nrep->preferred_send_size),
"MaxRecvSize=",
le32_to_cpu(nrep->max_receive_size),
"MaxFragmentedSize=",
le32_to_cpu(nrep->max_fragmented_size));
if (negotiated_version != SMBDIRECT_V1) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"invalid: negotiated_version=0x%x\n",
negotiated_version);
smbdirect_socket_schedule_cleanup(sc, -ECONNREFUSED);
return;
}
if (status != le32_to_cpu(STATUS_SUCCESS)) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"invalid: status=0x%x != 0x0\n",
status);
smbdirect_socket_schedule_cleanup(sc, -ECONNREFUSED);
return;
}
if (max_receive_size < SMBDIRECT_MIN_RECEIVE_SIZE) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"invalid: max_receive_size=%u < %u\n",
max_receive_size,
SMBDIRECT_MIN_RECEIVE_SIZE);
smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
return;
}
if (max_fragmented_size < SMBDIRECT_MIN_FRAGMENTED_SIZE) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"invalid: max_fragmented_size=%u < %u\n",
max_fragmented_size,
SMBDIRECT_MIN_FRAGMENTED_SIZE);
smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
return;
}
if (credits_granted == 0) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"invalid: credits_granted == 0\n");
smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
return;
}
if (credits_requested == 0) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"invalid: credits_requested == 0\n");
smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
return;
}
if (preferred_send_size > sp->max_recv_size) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"invalid: preferred_send_size=%u > max_recv_size=%u\n",
preferred_send_size,
sp->max_recv_size);
smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
return;
}
/*
* We take the value from the peer (checked above to be
* non-zero), but we limit it to the max value we support
* in order to keep the main logic simple.
*/
sc->recv_io.credits.target = credits_requested;
sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target,
sp->recv_credit_max);
/*
* At least the value of SMBDIRECT_MIN_RECEIVE_SIZE is used.
*/
sp->max_recv_size = min_t(u32, sp->max_recv_size, preferred_send_size);
sp->max_recv_size = max_t(u32, sp->max_recv_size, SMBDIRECT_MIN_RECEIVE_SIZE);
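/*
* Worked example (illustrative values): if we offered
* max_recv_size=8192 and the peer answers with
* preferred_send_size=1364, max_recv_size shrinks to 1364.
* The max_t() only matters if the peer (invalidly) prefers
* less than SMBDIRECT_MIN_RECEIVE_SIZE (128 bytes), which,
* unlike max_receive_size, is not checked above.
*/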
/*
* We already sent our sp->max_fragmented_recv_size
* to the peer, so we can't lower it here any more.
*
* TODO: but if the peer lowered sp->max_recv_size
* we will have to adjust our number of buffers.
*
* But for now we keep the behaviour the
* cifs.ko code had before.
*/
/*
* Note nrep->max_receive_size was already checked against
* SMBDIRECT_MIN_RECEIVE_SIZE above.
*/
sp->max_send_size = min_t(u32, sp->max_send_size, max_receive_size);
/*
* Make sure the resulting max_frmr_depth is at least 1,
* which means max_read_write_size needs to be at least PAGE_SIZE.
*/
sp->max_read_write_size = min_t(u32, sp->max_frmr_depth * PAGE_SIZE,
max_readwrite_size);
if (sp->max_read_write_size < PAGE_SIZE) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"invalid: max_readwrite_size=%u < PAGE_SIZE(%lu)\n",
max_readwrite_size,
PAGE_SIZE);
smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
return;
}
sp->max_frmr_depth = sp->max_read_write_size / PAGE_SIZE;
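/*
* Illustrative arithmetic: with PAGE_SIZE=4096 a local
* max_frmr_depth of 2048 would allow 8MiB per MR; if the
* peer only advertises max_readwrite_size=1MiB, then
* max_read_write_size becomes 1MiB and max_frmr_depth is
* recalculated as 1MiB / 4096 = 256 pages.
*/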
/*
* Note nrep->credits_granted was already checked against 0 above.
*/
atomic_set(&sc->send_io.credits.count, credits_granted);
/*
* Note nrep->max_fragmented_size was already checked against
* SMBDIRECT_MIN_FRAGMENTED_SIZE above.
*/
sp->max_fragmented_send_size = max_fragmented_size;
ret = smbdirect_connection_create_mr_list(sc);
if (ret) {
smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
"smbdirect_connection_create_mr_list() failed %1pe\n",
SMBDIRECT_DEBUG_ERR_PTR(ret));
smbdirect_socket_schedule_cleanup(sc, ret);
return;
}
/*
* Prepare for receiving data_transfer messages
*/
sc->recv_io.reassembly.full_packet_received = true;
sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER;
list_for_each_entry(recv_io, &sc->recv_io.free.list, list)
recv_io->cqe.done = smbdirect_connection_recv_io_done;
recv_io = NULL;
/*
* We should at least post 1 smbdirect_recv_io!
*/
posted = smbdirect_connection_recv_io_refill(sc);
if (posted < 1) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"smbdirect_connection_recv_io_refill() failed %1pe\n",
SMBDIRECT_DEBUG_ERR_PTR(posted));
smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
return;
}
/*
* smbdirect_connection_negotiation_done()
* will setup all required things and wake up
* the waiter.
*/
smbdirect_connection_negotiation_done(sc);
}
int smbdirect_connect_sync(struct smbdirect_socket *sc,
const struct sockaddr *dst)
{
int ret;
ret = smbdirect_connect(sc, dst);
if (ret) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"smbdirect_connect(%pISpsfc) failed %1pe\n",
dst, SMBDIRECT_DEBUG_ERR_PTR(ret));
return ret;
}
ret = smbdirect_connection_wait_for_connected(sc);
if (ret) {
int lvl = SMBDIRECT_LOG_ERR;
if (ret == -ENODEV)
lvl = SMBDIRECT_LOG_INFO;
smbdirect_log_rdma_event(sc, lvl,
"wait for smbdirect_connect(%pISpsfc) failed %1pe\n",
dst, SMBDIRECT_DEBUG_ERR_PTR(ret));
return ret;
}
return 0;
}
__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connect_sync);
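/*
* Minimal client-side sketch (illustrative only; intermediate
* error handling is elided and the parameter values below are
* placeholders, not recommendations):
*
*	struct smbdirect_socket *sc;
*	struct smbdirect_socket_parameters sp = {
*		.recv_credit_max = 255,
*		.send_credit_target = 255,
*		.keepalive_interval_msec = 120 * 1000,
*	};
*
*	ret = smbdirect_socket_create_kern(net, &sc);
*	ret = smbdirect_socket_set_initial_parameters(sc, &sp);
*	ret = smbdirect_socket_set_kernel_settings(sc, IB_POLL_SOFTIRQ, GFP_KERNEL);
*	ret = smbdirect_connect_sync(sc, (struct sockaddr *)&dst);
*	...
*	smbdirect_socket_release(sc);
*/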

File diff suppressed because it is too large

View File

@@ -0,0 +1,88 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2017, Microsoft Corporation.
* Copyright (c) 2025, Stefan Metzmacher
*/
#include "smbdirect_internal.h"
#include <linux/seq_file.h>
void smbdirect_connection_legacy_debug_proc_show(struct smbdirect_socket *sc,
unsigned int rdma_readwrite_threshold,
struct seq_file *m)
{
const struct smbdirect_socket_parameters *sp;
if (!sc)
return;
sp = &sc->parameters;
seq_puts(m, "\n");
seq_printf(m, "SMBDirect protocol version: 0x%x ",
SMBDIRECT_V1);
seq_printf(m, "transport status: %s (%u)",
smbdirect_socket_status_string(sc->status),
sc->status);
seq_puts(m, "\n");
seq_printf(m, "Conn receive_credit_max: %u ",
sp->recv_credit_max);
seq_printf(m, "send_credit_target: %u max_send_size: %u",
sp->send_credit_target,
sp->max_send_size);
seq_puts(m, "\n");
seq_printf(m, "Conn max_fragmented_recv_size: %u ",
sp->max_fragmented_recv_size);
seq_printf(m, "max_fragmented_send_size: %u max_receive_size:%u",
sp->max_fragmented_send_size,
sp->max_recv_size);
seq_puts(m, "\n");
seq_printf(m, "Conn keep_alive_interval: %u ",
sp->keepalive_interval_msec / 1000);
seq_printf(m, "max_readwrite_size: %u rdma_readwrite_threshold: %u",
sp->max_read_write_size,
rdma_readwrite_threshold);
seq_puts(m, "\n");
seq_printf(m, "Debug count_get_receive_buffer: %llu ",
sc->statistics.get_receive_buffer);
seq_printf(m, "count_put_receive_buffer: %llu count_send_empty: %llu",
sc->statistics.put_receive_buffer,
sc->statistics.send_empty);
seq_puts(m, "\n");
seq_printf(m, "Read Queue count_enqueue_reassembly_queue: %llu ",
sc->statistics.enqueue_reassembly_queue);
seq_printf(m, "count_dequeue_reassembly_queue: %llu ",
sc->statistics.dequeue_reassembly_queue);
seq_printf(m, "reassembly_data_length: %u ",
sc->recv_io.reassembly.data_length);
seq_printf(m, "reassembly_queue_length: %u",
sc->recv_io.reassembly.queue_length);
seq_puts(m, "\n");
seq_printf(m, "Current Credits send_credits: %u ",
atomic_read(&sc->send_io.credits.count));
seq_printf(m, "receive_credits: %u receive_credit_target: %u",
atomic_read(&sc->recv_io.credits.count),
sc->recv_io.credits.target);
seq_puts(m, "\n");
seq_printf(m, "Pending send_pending: %u ",
atomic_read(&sc->send_io.pending.count));
seq_puts(m, "\n");
seq_printf(m, "MR responder_resources: %u ",
sp->responder_resources);
seq_printf(m, "max_frmr_depth: %u mr_type: 0x%x",
sp->max_frmr_depth,
sc->mr_io.type);
seq_puts(m, "\n");
seq_printf(m, "MR mr_ready_count: %u mr_used_count: %u",
atomic_read(&sc->mr_io.ready.count),
atomic_read(&sc->mr_io.used.count));
}
__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_legacy_debug_proc_show);

View File

@@ -0,0 +1,277 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2017, Microsoft Corporation.
* Copyright (C) 2018, LG Electronics.
* Copyright (c) 2025 Stefan Metzmacher
*/
#include "smbdirect_internal.h"
static u8 smbdirect_ib_device_rdma_capable_node_type(struct ib_device *ib_dev)
{
if (!smbdirect_frwr_is_supported(&ib_dev->attrs))
return RDMA_NODE_UNSPECIFIED;
switch (ib_dev->node_type) {
case RDMA_NODE_IB_CA: /* Infiniband, RoCE v1 and v2 */
case RDMA_NODE_RNIC: /* iWarp */
return ib_dev->node_type;
}
return RDMA_NODE_UNSPECIFIED;
}
static int smbdirect_ib_client_add(struct ib_device *ib_dev)
{
u8 node_type = smbdirect_ib_device_rdma_capable_node_type(ib_dev);
struct smbdirect_device *sdev;
const char *node_str;
const char *action;
u32 pidx;
switch (node_type) {
case RDMA_NODE_IB_CA:
node_str = "IB_CA";
action = "added";
break;
case RDMA_NODE_RNIC:
node_str = "RNIC";
action = "added";
break;
case RDMA_NODE_UNSPECIFIED:
node_str = "UNSPECIFIED";
action = "ignored";
break;
default:
node_str = "UNKNOWN";
action = "ignored";
node_type = RDMA_NODE_UNSPECIFIED;
break;
}
pr_info("ib_dev[%.*s]: %s: %s %s=%u %s=0x%llx %s=0x%llx %s=0x%llx\n",
IB_DEVICE_NAME_MAX,
ib_dev->name,
action,
node_str,
"max_fast_reg_page_list_len",
ib_dev->attrs.max_fast_reg_page_list_len,
"device_cap_flags",
ib_dev->attrs.device_cap_flags,
"kernel_cap_flags",
ib_dev->attrs.kernel_cap_flags,
"page_size_cap",
ib_dev->attrs.page_size_cap);
if (node_type == RDMA_NODE_UNSPECIFIED)
return 0;
pr_info("ib_dev[%.*s]: %s=%u %s=%u %s=%u %s=%u %s=%u %s=%u %s=%u %s=%u %s=%u\n",
IB_DEVICE_NAME_MAX,
ib_dev->name,
"num_ports",
rdma_end_port(ib_dev),
"max_qp_rd_atom",
ib_dev->attrs.max_qp_rd_atom,
"max_qp_init_rd_atom",
ib_dev->attrs.max_qp_init_rd_atom,
"max_sgl_rd",
ib_dev->attrs.max_sgl_rd,
"max_sge_rd",
ib_dev->attrs.max_sge_rd,
"max_cqe",
ib_dev->attrs.max_cqe,
"max_qp_wr",
ib_dev->attrs.max_qp_wr,
"max_send_sge",
ib_dev->attrs.max_send_sge,
"max_recv_sge",
ib_dev->attrs.max_recv_sge);
rdma_for_each_port(ib_dev, pidx) {
const struct ib_port_immutable *ib_pi =
ib_port_immutable_read(ib_dev, pidx);
u32 core_cap_flags = ib_pi ? ib_pi->core_cap_flags : 0;
pr_info("ib_dev[%.*s]PORT[%u]: %s=%u %s=%u %s=%u %s=%u %s=%u %s=0x%x\n",
IB_DEVICE_NAME_MAX,
ib_dev->name,
pidx,
"iwarp",
rdma_protocol_iwarp(ib_dev, pidx),
"ib",
rdma_protocol_ib(ib_dev, pidx),
"roce",
rdma_protocol_roce(ib_dev, pidx),
"v1",
rdma_protocol_roce_eth_encap(ib_dev, pidx),
"v2",
rdma_protocol_roce_udp_encap(ib_dev, pidx),
"core_cap_flags",
core_cap_flags);
}
sdev = kzalloc_obj(*sdev);
if (!sdev)
return -ENOMEM;
sdev->ib_dev = ib_dev;
snprintf(sdev->ib_name, ARRAY_SIZE(sdev->ib_name), "%.*s",
IB_DEVICE_NAME_MAX, ib_dev->name);
write_lock(&smbdirect_globals.devices.lock);
list_add(&sdev->list, &smbdirect_globals.devices.list);
write_unlock(&smbdirect_globals.devices.lock);
return 0;
}
static void smbdirect_ib_client_remove(struct ib_device *ib_dev, void *client_data)
{
struct smbdirect_device *sdev, *tmp;
write_lock(&smbdirect_globals.devices.lock);
list_for_each_entry_safe(sdev, tmp, &smbdirect_globals.devices.list, list) {
if (sdev->ib_dev == ib_dev) {
list_del(&sdev->list);
pr_info("ib_dev[%.*s] removed\n",
IB_DEVICE_NAME_MAX, sdev->ib_name);
kfree(sdev);
break;
}
}
write_unlock(&smbdirect_globals.devices.lock);
}
static void smbdirect_ib_client_rename(struct ib_device *ib_dev, void *client_data)
{
struct smbdirect_device *sdev;
write_lock(&smbdirect_globals.devices.lock);
list_for_each_entry(sdev, &smbdirect_globals.devices.list, list) {
if (sdev->ib_dev == ib_dev) {
pr_info("ib_dev[%.*s] renamed to [%.*s]\n",
IB_DEVICE_NAME_MAX, sdev->ib_name,
IB_DEVICE_NAME_MAX, ib_dev->name);
snprintf(sdev->ib_name, ARRAY_SIZE(sdev->ib_name), "%.*s",
IB_DEVICE_NAME_MAX, ib_dev->name);
break;
}
}
write_unlock(&smbdirect_globals.devices.lock);
}
static struct ib_client smbdirect_ib_client = {
.name = "smbdirect_ib_client",
.add = smbdirect_ib_client_add,
.remove = smbdirect_ib_client_remove,
.rename = smbdirect_ib_client_rename,
};
static u8 smbdirect_netdev_find_rdma_capable_node_type(struct net_device *netdev)
{
struct smbdirect_device *sdev;
u8 node_type = RDMA_NODE_UNSPECIFIED;
read_lock(&smbdirect_globals.devices.lock);
list_for_each_entry(sdev, &smbdirect_globals.devices.list, list) {
u32 pi;
rdma_for_each_port(sdev->ib_dev, pi) {
struct net_device *ndev;
ndev = ib_device_get_netdev(sdev->ib_dev, pi);
if (!ndev)
continue;
if (ndev == netdev) {
dev_put(ndev);
node_type = sdev->ib_dev->node_type;
goto out;
}
dev_put(ndev);
}
}
out:
read_unlock(&smbdirect_globals.devices.lock);
if (node_type == RDMA_NODE_UNSPECIFIED) {
struct ib_device *ibdev;
ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_UNKNOWN);
if (ibdev) {
node_type = smbdirect_ib_device_rdma_capable_node_type(ibdev);
ib_device_put(ibdev);
}
}
return node_type;
}
/*
* Returns RDMA_NODE_UNSPECIFIED when the netdev has
* no support for smbdirect capable rdma.
*
* Otherwise RDMA_NODE_RNIC is returned for iwarp devices
* and RDMA_NODE_IB_CA for Infiniband and RoCE (v1 and v2).
*/
u8 smbdirect_netdev_rdma_capable_node_type(struct net_device *netdev)
{
struct net_device *lower_dev;
struct list_head *iter;
u8 node_type = RDMA_NODE_UNSPECIFIED;
node_type = smbdirect_netdev_find_rdma_capable_node_type(netdev);
if (node_type != RDMA_NODE_UNSPECIFIED)
return node_type;
/* check if netdev is bridge or VLAN */
if (netif_is_bridge_master(netdev) || netdev->priv_flags & IFF_802_1Q_VLAN)
netdev_for_each_lower_dev(netdev, lower_dev, iter) {
node_type = smbdirect_netdev_find_rdma_capable_node_type(lower_dev);
if (node_type != RDMA_NODE_UNSPECIFIED)
return node_type;
}
/* check if netdev is IPoIB safely without layer violation */
if (netdev->type == ARPHRD_INFINIBAND)
return RDMA_NODE_IB_CA;
return RDMA_NODE_UNSPECIFIED;
}
__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_netdev_rdma_capable_node_type);
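/*
* Illustrative caller sketch: a server can use the returned
* node type to decide whether an interface should advertise
* SMB-Direct support at all:
*
*	u8 nt = smbdirect_netdev_rdma_capable_node_type(netdev);
*
*	if (nt != RDMA_NODE_UNSPECIFIED)
*		... mark netdev as RDMA capable ...
*/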
__init int smbdirect_devices_init(void)
{
int ret;
rwlock_init(&smbdirect_globals.devices.lock);
INIT_LIST_HEAD(&smbdirect_globals.devices.list);
ret = ib_register_client(&smbdirect_ib_client);
if (ret) {
pr_crit("failed to ib_register_client: %d %1pe\n",
ret, SMBDIRECT_DEBUG_ERR_PTR(ret));
return ret;
}
return 0;
}
__exit void smbdirect_devices_exit(void)
{
struct smbdirect_device *sdev, *tmp;
/*
* On exit we just clean up so that
* smbdirect_ib_client_remove() won't
* print removals of devices.
*/
write_lock(&smbdirect_globals.devices.lock);
list_for_each_entry_safe(sdev, tmp, &smbdirect_globals.devices.list, list) {
list_del(&sdev->list);
kfree(sdev);
}
write_unlock(&smbdirect_globals.devices.lock);
ib_unregister_client(&smbdirect_ib_client);
}

View File

@@ -0,0 +1,141 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Copyright (c) 2025, Stefan Metzmacher
*/
#ifndef __FS_SMB_COMMON_SMBDIRECT_INTERNAL_H__
#define __FS_SMB_COMMON_SMBDIRECT_INTERNAL_H__
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "smbdirect.h"
#include "smbdirect_pdu.h"
#include "smbdirect_public.h"
#include <linux/mutex.h>
struct smbdirect_module_state {
struct mutex mutex;
struct {
struct workqueue_struct *accept;
struct workqueue_struct *connect;
struct workqueue_struct *idle;
struct workqueue_struct *refill;
struct workqueue_struct *immediate;
struct workqueue_struct *cleanup;
} workqueues;
struct {
rwlock_t lock;
struct list_head list;
} devices;
};
extern struct smbdirect_module_state smbdirect_globals;
#include "smbdirect_socket.h"
struct smbdirect_device {
struct list_head list;
struct ib_device *ib_dev;
/*
* copy of ib_dev->name,
* in order to print renames
*/
char ib_name[IB_DEVICE_NAME_MAX];
};
int smbdirect_socket_init_new(struct net *net, struct smbdirect_socket *sc);
int smbdirect_socket_init_accepting(struct rdma_cm_id *id, struct smbdirect_socket *sc);
void __smbdirect_socket_schedule_cleanup(struct smbdirect_socket *sc,
const char *macro_name,
unsigned int lvl,
const char *func,
unsigned int line,
int error,
enum smbdirect_socket_status *force_status);
#define smbdirect_socket_schedule_cleanup(__sc, __error) \
__smbdirect_socket_schedule_cleanup(__sc, \
"smbdirect_socket_schedule_cleanup", SMBDIRECT_LOG_ERR, \
__func__, __LINE__, __error, NULL)
#define smbdirect_socket_schedule_cleanup_lvl(__sc, __lvl, __error) \
__smbdirect_socket_schedule_cleanup(__sc, \
"smbdirect_socket_schedule_cleanup_lvl", __lvl, \
__func__, __LINE__, __error, NULL)
#define smbdirect_socket_schedule_cleanup_status(__sc, __lvl, __error, __status) do { \
enum smbdirect_socket_status __force_status = __status; \
__smbdirect_socket_schedule_cleanup(__sc, \
"smbdirect_socket_schedule_cleanup_status", __lvl, \
__func__, __LINE__, __error, &__force_status); \
} while (0)
void smbdirect_socket_destroy_sync(struct smbdirect_socket *sc);
int smbdirect_socket_wait_for_credits(struct smbdirect_socket *sc,
enum smbdirect_socket_status expected_status,
int unexpected_errno,
wait_queue_head_t *waitq,
atomic_t *total_credits,
int needed);
void smbdirect_connection_rdma_established(struct smbdirect_socket *sc);
void smbdirect_connection_negotiation_done(struct smbdirect_socket *sc);
int smbdirect_connection_create_qp(struct smbdirect_socket *sc);
void smbdirect_connection_destroy_qp(struct smbdirect_socket *sc);
int smbdirect_connection_create_mem_pools(struct smbdirect_socket *sc);
void smbdirect_connection_destroy_mem_pools(struct smbdirect_socket *sc);
struct smbdirect_send_io *smbdirect_connection_alloc_send_io(struct smbdirect_socket *sc);
void smbdirect_connection_free_send_io(struct smbdirect_send_io *msg);
struct smbdirect_recv_io *smbdirect_connection_get_recv_io(struct smbdirect_socket *sc);
void smbdirect_connection_put_recv_io(struct smbdirect_recv_io *msg);
void smbdirect_connection_reassembly_append_recv_io(struct smbdirect_socket *sc,
struct smbdirect_recv_io *msg,
u32 data_length);
struct smbdirect_recv_io *
smbdirect_connection_reassembly_first_recv_io(struct smbdirect_socket *sc);
void smbdirect_connection_negotiate_rdma_resources(struct smbdirect_socket *sc,
u8 peer_initiator_depth,
u8 peer_responder_resources,
const struct rdma_conn_param *param);
void smbdirect_connection_idle_timer_work(struct work_struct *work);
u16 smbdirect_connection_grant_recv_credits(struct smbdirect_socket *sc);
int smbdirect_connection_post_send_wr(struct smbdirect_socket *sc,
struct ib_send_wr *wr);
int smbdirect_connection_post_recv_io(struct smbdirect_recv_io *msg);
void smbdirect_connection_recv_io_done(struct ib_cq *cq, struct ib_wc *wc);
int smbdirect_connection_recv_io_refill(struct smbdirect_socket *sc);
int smbdirect_connection_create_mr_list(struct smbdirect_socket *sc);
void smbdirect_connection_destroy_mr_list(struct smbdirect_socket *sc);
int smbdirect_accept_connect_request(struct smbdirect_socket *sc,
const struct rdma_conn_param *param);
void smbdirect_accept_negotiate_finish(struct smbdirect_socket *sc, u32 ntstatus);
__init int smbdirect_devices_init(void);
__exit void smbdirect_devices_exit(void);
#endif /* __FS_SMB_COMMON_SMBDIRECT_INTERNAL_H__ */

View File

@@ -0,0 +1,308 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2017, Microsoft Corporation.
* Copyright (C) 2018, LG Electronics.
* Copyright (c) 2025, Stefan Metzmacher
*/
#include "smbdirect_internal.h"
static int smbdirect_listen_rdma_event_handler(struct rdma_cm_id *id,
struct rdma_cm_event *event);
int smbdirect_socket_listen(struct smbdirect_socket *sc, int backlog)
{
int ret;
if (backlog < 0)
return -EINVAL;
if (!backlog)
backlog = 1; /* use 1 as default for now */
if (sc->first_error)
return -EINVAL;
if (sc->status != SMBDIRECT_SOCKET_CREATED)
return -EINVAL;
if (WARN_ON_ONCE(!sc->rdma.cm_id))
return -EINVAL;
if (sc->rdma.cm_id->device)
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
"try to listen on addr: %pISpsfc dev: %.*s\n",
&sc->rdma.cm_id->route.addr.src_addr,
IB_DEVICE_NAME_MAX,
sc->rdma.cm_id->device->name);
else
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
"try to listen on addr: %pISpsfc\n",
&sc->rdma.cm_id->route.addr.src_addr);
/* already checked above */
WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED);
sc->status = SMBDIRECT_SOCKET_LISTENING;
sc->rdma.expected_event = RDMA_CM_EVENT_CONNECT_REQUEST;
rdma_lock_handler(sc->rdma.cm_id);
sc->rdma.cm_id->event_handler = smbdirect_listen_rdma_event_handler;
rdma_unlock_handler(sc->rdma.cm_id);
ret = rdma_listen(sc->rdma.cm_id, backlog);
if (ret) {
sc->first_error = ret;
sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
if (sc->rdma.cm_id->device)
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
"listening failed %1pe on addr: %pISpsfc dev: %.*s\n",
SMBDIRECT_DEBUG_ERR_PTR(ret),
&sc->rdma.cm_id->route.addr.src_addr,
IB_DEVICE_NAME_MAX,
sc->rdma.cm_id->device->name);
else
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
"listening failed %1pe on addr: %pISpsfc\n",
SMBDIRECT_DEBUG_ERR_PTR(ret),
&sc->rdma.cm_id->route.addr.src_addr);
return ret;
}
/*
* This is a value > 0, checked above,
* so we are able to use sc->listen.backlog == -1,
* as an indication that the socket was never
* a listener.
*/
sc->listen.backlog = backlog;
if (sc->rdma.cm_id->device)
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
"listening on addr: %pISpsfc dev: %.*s\n",
&sc->rdma.cm_id->route.addr.src_addr,
IB_DEVICE_NAME_MAX,
sc->rdma.cm_id->device->name);
else
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
"listening on addr: %pISpsfc\n",
&sc->rdma.cm_id->route.addr.src_addr);
/*
* The rest happens async via smbdirect_listen_rdma_event_handler()
*/
return 0;
}
__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_listen);
static int smbdirect_new_rdma_event_handler(struct rdma_cm_id *new_id,
struct rdma_cm_event *event)
{
int ret = -ESTALE;
/*
* This should be replaced before any real work
* starts! So it should never be called!
*/
if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
ret = -ENETDOWN;
if (IS_ERR(SMBDIRECT_DEBUG_ERR_PTR(event->status)))
ret = event->status;
WARN_ONCE(1,
"%s should not be called! event=%s status=%d => ret=%1pe\n",
__func__,
rdma_event_msg(event->event),
event->status,
SMBDIRECT_DEBUG_ERR_PTR(ret));
return -ESTALE;
}
static int smbdirect_listen_connect_request(struct smbdirect_socket *lsc,
struct rdma_cm_id *new_id,
const struct rdma_cm_event *event);
static int smbdirect_listen_rdma_event_handler(struct rdma_cm_id *new_id,
struct rdma_cm_event *event)
{
struct smbdirect_socket *lsc = new_id->context;
int ret;
if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) {
new_id->context = NULL;
new_id->event_handler = smbdirect_new_rdma_event_handler;
} else
new_id = NULL;
/*
* cma_cm_event_handler() has
* lockdep_assert_held(&id_priv->handler_mutex);
*
* Mutexes are not allowed in interrupts,
* and we rely on not being in an interrupt here,
* as we might sleep.
*/
WARN_ON_ONCE(in_interrupt());
if (event->status || event->event != lsc->rdma.expected_event) {
ret = -ECONNABORTED;
if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
ret = -ENETDOWN;
if (IS_ERR(SMBDIRECT_DEBUG_ERR_PTR(event->status)))
ret = event->status;
smbdirect_log_rdma_event(lsc, SMBDIRECT_LOG_ERR,
"%s (first_error=%1pe, expected=%s) => event=%s status=%d => ret=%1pe\n",
smbdirect_socket_status_string(lsc->status),
SMBDIRECT_DEBUG_ERR_PTR(lsc->first_error),
rdma_event_msg(lsc->rdma.expected_event),
rdma_event_msg(event->event),
event->status,
SMBDIRECT_DEBUG_ERR_PTR(ret));
/*
* In case of error return it and let the caller
* destroy new_id
*/
smbdirect_socket_schedule_cleanup(lsc, ret);
return new_id ? ret : 0;
}
smbdirect_log_rdma_event(lsc, SMBDIRECT_LOG_INFO,
"%s (first_error=%1pe) event=%s\n",
smbdirect_socket_status_string(lsc->status),
SMBDIRECT_DEBUG_ERR_PTR(lsc->first_error),
rdma_event_msg(event->event));
/*
* In case of error return it and let the caller
* destroy new_id
*/
if (lsc->first_error)
return new_id ? lsc->first_error : 0;
switch (event->event) {
case RDMA_CM_EVENT_CONNECT_REQUEST:
WARN_ON_ONCE(lsc->status != SMBDIRECT_SOCKET_LISTENING);
/*
* In case of error return it and let the caller
* destroy new_id
*/
ret = smbdirect_listen_connect_request(lsc, new_id, event);
if (ret)
return ret;
return 0;
default:
break;
}
/*
* This is an internal error
*/
WARN_ON_ONCE(lsc->rdma.expected_event != RDMA_CM_EVENT_CONNECT_REQUEST);
smbdirect_socket_schedule_cleanup(lsc, -EINVAL);
return 0;
}
static int smbdirect_listen_connect_request(struct smbdirect_socket *lsc,
struct rdma_cm_id *new_id,
const struct rdma_cm_event *event)
{
const struct smbdirect_socket_parameters *lsp = &lsc->parameters;
struct smbdirect_socket *nsc;
unsigned long flags;
size_t backlog = max_t(size_t, 1, lsc->listen.backlog);
size_t psockets;
size_t rsockets;
int ret;
if (!smbdirect_frwr_is_supported(&new_id->device->attrs)) {
smbdirect_log_rdma_event(lsc, SMBDIRECT_LOG_ERR,
"Fast Registration Work Requests (FRWR) is not supported device %.*s\n",
IB_DEVICE_NAME_MAX,
new_id->device->name);
smbdirect_log_rdma_event(lsc, SMBDIRECT_LOG_ERR,
"Device capability flags = %llx max_fast_reg_page_list_len = %u\n",
new_id->device->attrs.device_cap_flags,
new_id->device->attrs.max_fast_reg_page_list_len);
return -EPROTONOSUPPORT;
}
if (lsp->flags & SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB &&
!rdma_ib_or_roce(new_id->device, new_id->port_num)) {
smbdirect_log_rdma_event(lsc, SMBDIRECT_LOG_ERR,
"Not IB: device: %.*s IW:%u local: %pISpsfc remote: %pISpsfc\n",
IB_DEVICE_NAME_MAX,
new_id->device->name,
rdma_protocol_iwarp(new_id->device, new_id->port_num),
&new_id->route.addr.src_addr,
&new_id->route.addr.dst_addr);
return -EPROTONOSUPPORT;
}
if (lsp->flags & SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW &&
!rdma_protocol_iwarp(new_id->device, new_id->port_num)) {
smbdirect_log_rdma_event(lsc, SMBDIRECT_LOG_ERR,
"Not IW: device: %.*s IB:%u local: %pISpsfc remote: %pISpsfc\n",
IB_DEVICE_NAME_MAX,
new_id->device->name,
rdma_ib_or_roce(new_id->device, new_id->port_num),
&new_id->route.addr.src_addr,
&new_id->route.addr.dst_addr);
return -EPROTONOSUPPORT;
}
spin_lock_irqsave(&lsc->listen.lock, flags);
psockets = list_count_nodes(&lsc->listen.pending);
rsockets = list_count_nodes(&lsc->listen.ready);
spin_unlock_irqrestore(&lsc->listen.lock, flags);
if (psockets > backlog ||
rsockets > backlog ||
(psockets + rsockets) > backlog) {
smbdirect_log_rdma_event(lsc, SMBDIRECT_LOG_ERR,
"Backlog[%d][%zu] full pending[%zu] ready[%zu]\n",
lsc->listen.backlog, backlog, psockets, rsockets);
return -EBUSY;
}
ret = smbdirect_socket_create_accepting(new_id, &nsc);
if (ret)
goto socket_init_failed;
nsc->logging = lsc->logging;
ret = smbdirect_socket_set_initial_parameters(nsc, &lsc->parameters);
if (ret)
goto set_params_failed;
ret = smbdirect_socket_set_kernel_settings(nsc,
lsc->ib.poll_ctx,
lsc->send_io.mem.gfp_mask);
if (ret)
goto set_settings_failed;
spin_lock_irqsave(&lsc->listen.lock, flags);
list_add_tail(&nsc->accept.list, &lsc->listen.pending);
nsc->accept.listener = lsc;
spin_unlock_irqrestore(&lsc->listen.lock, flags);
ret = smbdirect_accept_connect_request(nsc, &event->param.conn);
if (ret)
goto accept_connect_failed;
return 0;
accept_connect_failed:
spin_lock_irqsave(&lsc->listen.lock, flags);
list_del_init(&nsc->accept.list);
nsc->accept.listener = NULL;
spin_unlock_irqrestore(&lsc->listen.lock, flags);
set_settings_failed:
set_params_failed:
/*
* The caller will destroy new_id
*/
nsc->ib.dev = NULL;
nsc->rdma.cm_id = NULL;
smbdirect_socket_release(nsc);
socket_init_failed:
return ret;
}

View File

@@ -0,0 +1,121 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (c) 2025, Stefan Metzmacher
*/
#include "smbdirect_internal.h"
#include <linux/module.h>
struct smbdirect_module_state smbdirect_globals = {
.mutex = __MUTEX_INITIALIZER(smbdirect_globals.mutex),
};
static __init int smbdirect_module_init(void)
{
int ret = -ENOMEM;
pr_notice("subsystem loading...\n");
mutex_lock(&smbdirect_globals.mutex);
smbdirect_globals.workqueues.accept = alloc_workqueue("smbdirect-accept",
WQ_SYSFS |
WQ_PERCPU |
WQ_POWER_EFFICIENT,
0);
if (smbdirect_globals.workqueues.accept == NULL)
goto alloc_accept_wq_failed;
smbdirect_globals.workqueues.connect = alloc_workqueue("smbdirect-connect",
WQ_SYSFS |
WQ_PERCPU |
WQ_POWER_EFFICIENT,
0);
if (smbdirect_globals.workqueues.connect == NULL)
goto alloc_connect_wq_failed;
smbdirect_globals.workqueues.idle = alloc_workqueue("smbdirect-idle",
WQ_SYSFS |
WQ_PERCPU |
WQ_POWER_EFFICIENT,
0);
if (smbdirect_globals.workqueues.idle == NULL)
goto alloc_idle_wq_failed;
smbdirect_globals.workqueues.refill = alloc_workqueue("smbdirect-refill",
WQ_HIGHPRI |
WQ_SYSFS |
WQ_PERCPU |
WQ_POWER_EFFICIENT,
0);
if (smbdirect_globals.workqueues.refill == NULL)
goto alloc_refill_wq_failed;
smbdirect_globals.workqueues.immediate = alloc_workqueue("smbdirect-immediate",
WQ_HIGHPRI |
WQ_SYSFS |
WQ_PERCPU |
WQ_POWER_EFFICIENT,
0);
if (smbdirect_globals.workqueues.immediate == NULL)
goto alloc_immediate_wq_failed;
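/*
* Note: only the cleanup workqueue below is created with
* WQ_MEM_RECLAIM, as connection teardown may have to make
* progress under memory pressure; the other workqueues are
* not on the reclaim path.
*/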
smbdirect_globals.workqueues.cleanup = alloc_workqueue("smbdirect-cleanup",
WQ_MEM_RECLAIM |
WQ_HIGHPRI |
WQ_SYSFS |
WQ_PERCPU |
WQ_POWER_EFFICIENT,
0);
if (smbdirect_globals.workqueues.cleanup == NULL)
goto alloc_cleanup_wq_failed;
ret = smbdirect_devices_init();
if (ret)
goto devices_init_failed;
mutex_unlock(&smbdirect_globals.mutex);
pr_notice("subsystem loaded\n");
return 0;
devices_init_failed:
destroy_workqueue(smbdirect_globals.workqueues.cleanup);
alloc_cleanup_wq_failed:
destroy_workqueue(smbdirect_globals.workqueues.immediate);
alloc_immediate_wq_failed:
destroy_workqueue(smbdirect_globals.workqueues.refill);
alloc_refill_wq_failed:
destroy_workqueue(smbdirect_globals.workqueues.idle);
alloc_idle_wq_failed:
destroy_workqueue(smbdirect_globals.workqueues.connect);
alloc_connect_wq_failed:
destroy_workqueue(smbdirect_globals.workqueues.accept);
alloc_accept_wq_failed:
mutex_unlock(&smbdirect_globals.mutex);
pr_crit("failed to loaded: %d (%1pe)\n",
ret, SMBDIRECT_DEBUG_ERR_PTR(ret));
return ret;
}
static __exit void smbdirect_module_exit(void)
{
pr_notice("subsystem unloading...\n");
mutex_lock(&smbdirect_globals.mutex);
smbdirect_devices_exit();
destroy_workqueue(smbdirect_globals.workqueues.accept);
destroy_workqueue(smbdirect_globals.workqueues.connect);
destroy_workqueue(smbdirect_globals.workqueues.idle);
destroy_workqueue(smbdirect_globals.workqueues.refill);
destroy_workqueue(smbdirect_globals.workqueues.immediate);
destroy_workqueue(smbdirect_globals.workqueues.cleanup);
mutex_unlock(&smbdirect_globals.mutex);
pr_notice("subsystem unloaded\n");
}
module_init(smbdirect_module_init);
module_exit(smbdirect_module_exit);
MODULE_DESCRIPTION("smbdirect subsystem");
MODULE_LICENSE("GPL");

View File

@@ -0,0 +1,493 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2017, Microsoft Corporation.
* Copyright (c) 2025, Stefan Metzmacher
*/
#include "smbdirect_internal.h"
/*
* Allocate MRs used for RDMA read/write.
* The number of MRs will not exceed the hardware capability in
* responder_resources. All MRs are kept in sc->mr_io.all.list and are
* recycled after use: the content of a list entry changes as MRs are
* used and recovered for I/O, but the list links will not change.
*/
int smbdirect_connection_create_mr_list(struct smbdirect_socket *sc)
{
const struct smbdirect_socket_parameters *sp = &sc->parameters;
struct smbdirect_mr_io *mr;
int ret;
u32 i;
if (sp->responder_resources == 0) {
smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
"responder_resources negotiated as 0\n");
return -EINVAL;
}
/* Allocate more MRs (2x) than hardware responder_resources */
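/*
* Example (illustrative): with responder_resources negotiated
* as 16, 32 MRs are allocated, each able to map up to
* max_frmr_depth pages.
*/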
for (i = 0; i < sp->responder_resources * 2; i++) {
mr = kzalloc_obj(*mr);
if (!mr) {
ret = -ENOMEM;
goto kzalloc_mr_failed;
}
kref_init(&mr->kref);
mutex_init(&mr->mutex);
mr->mr = ib_alloc_mr(sc->ib.pd,
sc->mr_io.type,
sp->max_frmr_depth);
if (IS_ERR(mr->mr)) {
ret = PTR_ERR(mr->mr);
smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
"ib_alloc_mr failed ret=%d (%1pe) type=0x%x max_frmr_depth=%u\n",
ret, SMBDIRECT_DEBUG_ERR_PTR(ret),
sc->mr_io.type, sp->max_frmr_depth);
goto ib_alloc_mr_failed;
}
mr->sgt.sgl = kzalloc_objs(struct scatterlist, sp->max_frmr_depth);
if (!mr->sgt.sgl) {
ret = -ENOMEM;
smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
"failed to allocate sgl, max_frmr_depth=%u\n",
sp->max_frmr_depth);
goto kcalloc_sgl_failed;
}
mr->state = SMBDIRECT_MR_READY;
mr->socket = sc;
list_add_tail(&mr->list, &sc->mr_io.all.list);
atomic_inc(&sc->mr_io.ready.count);
}
return 0;
kcalloc_sgl_failed:
ib_dereg_mr(mr->mr);
ib_alloc_mr_failed:
mutex_destroy(&mr->mutex);
kfree(mr);
kzalloc_mr_failed:
smbdirect_connection_destroy_mr_list(sc);
return ret;
}
static void smbdirect_mr_io_disable_locked(struct smbdirect_mr_io *mr)
{
struct smbdirect_socket *sc = mr->socket;
lockdep_assert_held(&mr->mutex);
if (mr->state == SMBDIRECT_MR_DISABLED)
return;
if (mr->mr)
ib_dereg_mr(mr->mr);
if (mr->sgt.nents)
ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
kfree(mr->sgt.sgl);
mr->mr = NULL;
mr->sgt.sgl = NULL;
mr->sgt.nents = 0;
mr->state = SMBDIRECT_MR_DISABLED;
}
static void smbdirect_mr_io_free_locked(struct kref *kref)
{
struct smbdirect_mr_io *mr =
container_of(kref, struct smbdirect_mr_io, kref);
lockdep_assert_held(&mr->mutex);
/*
* smbdirect_mr_io_disable_locked() should already be called!
*/
if (WARN_ON_ONCE(mr->state != SMBDIRECT_MR_DISABLED))
smbdirect_mr_io_disable_locked(mr);
mutex_unlock(&mr->mutex);
mutex_destroy(&mr->mutex);
kfree(mr);
}
void smbdirect_connection_destroy_mr_list(struct smbdirect_socket *sc)
{
struct smbdirect_mr_io *mr, *tmp;
LIST_HEAD(all_list);
unsigned long flags;
spin_lock_irqsave(&sc->mr_io.all.lock, flags);
list_splice_tail_init(&sc->mr_io.all.list, &all_list);
spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
list_for_each_entry_safe(mr, tmp, &all_list, list) {
mutex_lock(&mr->mutex);
smbdirect_mr_io_disable_locked(mr);
list_del(&mr->list);
mr->socket = NULL;
/*
* No kref_put_mutex() as it's already locked.
*
* If smbdirect_mr_io_free_locked() is called
* and the mutex is unlocked and mr is gone,
* in that case kref_put() returned 1.
*
* If kref_put() returned 0 we know that
* smbdirect_mr_io_free_locked() didn't
* run. Not by us nor by anyone else, as we
* still hold the mutex, so we need to unlock.
*
* If the mr is still registered it will
* be dangling (detached from the connection),
* waiting for smbdirect_connection_deregister_mr_io()
* to be called in order to free the memory.
*/
if (!kref_put(&mr->kref, smbdirect_mr_io_free_locked))
mutex_unlock(&mr->mutex);
}
}
/*
* Get a MR from mr_list. This function waits until there is at least one MR
* available in the list. There may be several CPUs issuing I/O trying to get MR
* at the same time, mr_list_lock is used to protect this situation.
*/
static struct smbdirect_mr_io *
smbdirect_connection_get_mr_io(struct smbdirect_socket *sc)
{
struct smbdirect_mr_io *mr;
unsigned long flags;
int ret;
again:
ret = wait_event_interruptible(sc->mr_io.ready.wait_queue,
atomic_read(&sc->mr_io.ready.count) ||
sc->status != SMBDIRECT_SOCKET_CONNECTED);
if (ret) {
smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
"wait_event_interruptible ret=%d (%1pe)\n",
ret, SMBDIRECT_DEBUG_ERR_PTR(ret));
return NULL;
}
if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
"sc->status=%s sc->first_error=%1pe\n",
smbdirect_socket_status_string(sc->status),
SMBDIRECT_DEBUG_ERR_PTR(sc->first_error));
return NULL;
}
spin_lock_irqsave(&sc->mr_io.all.lock, flags);
list_for_each_entry(mr, &sc->mr_io.all.list, list) {
if (mr->state == SMBDIRECT_MR_READY) {
mr->state = SMBDIRECT_MR_REGISTERED;
kref_get(&mr->kref);
spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
atomic_dec(&sc->mr_io.ready.count);
atomic_inc(&sc->mr_io.used.count);
return mr;
}
}
spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
/*
* It is possible that we could fail to get MR because other processes may
* try to acquire a MR at the same time. If this is the case, retry it.
*/
goto again;
}
static void smbdirect_connection_mr_io_register_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct smbdirect_mr_io *mr =
container_of(wc->wr_cqe, struct smbdirect_mr_io, cqe);
struct smbdirect_socket *sc = mr->socket;
if (wc->status != IB_WC_SUCCESS) {
smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
"wc->status=%s opcode=%d\n",
ib_wc_status_msg(wc->status), wc->opcode);
smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
}
}
static void smbdirect_connection_mr_io_local_inv_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct smbdirect_mr_io *mr =
container_of(wc->wr_cqe, struct smbdirect_mr_io, cqe);
struct smbdirect_socket *sc = mr->socket;
mr->state = SMBDIRECT_MR_INVALIDATED;
if (wc->status != IB_WC_SUCCESS) {
smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
"invalidate failed status=%s\n",
ib_wc_status_msg(wc->status));
smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
}
complete(&mr->invalidate_done);
}
/*
* Transcribe the pages from an iterator into an MR scatterlist.
*/
static int smbdirect_iter_to_sgt(struct iov_iter *iter,
struct sg_table *sgt,
unsigned int max_sg)
{
int ret;
memset(sgt->sgl, 0, max_sg * sizeof(struct scatterlist));
ret = extract_iter_to_sg(iter, iov_iter_count(iter), sgt, max_sg, 0);
WARN_ON(ret < 0);
if (sgt->nents > 0)
sg_mark_end(&sgt->sgl[sgt->nents - 1]);
return ret;
}
/*
* Register memory for RDMA read/write
* iter: the buffer to register memory with
* writing: true if this is an RDMA write (SMB read), false for an RDMA read
* need_invalidate: true if this MR needs to be locally invalidated after I/O
* return value: the MR registered, NULL on failure.
*/
struct smbdirect_mr_io *
smbdirect_connection_register_mr_io(struct smbdirect_socket *sc,
struct iov_iter *iter,
bool writing,
bool need_invalidate)
{
const struct smbdirect_socket_parameters *sp = &sc->parameters;
struct smbdirect_mr_io *mr;
int ret, num_pages;
struct ib_reg_wr *reg_wr;
num_pages = iov_iter_npages(iter, sp->max_frmr_depth + 1);
if (num_pages > sp->max_frmr_depth) {
smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
"num_pages=%d max_frmr_depth=%d\n",
num_pages, sp->max_frmr_depth);
WARN_ON_ONCE(1);
return NULL;
}
mr = smbdirect_connection_get_mr_io(sc);
if (!mr) {
smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
"smbdirect_connection_get_mr_io returning NULL\n");
return NULL;
}
mutex_lock(&mr->mutex);
mr->dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
mr->need_invalidate = need_invalidate;
mr->sgt.nents = 0;
mr->sgt.orig_nents = 0;
smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_INFO,
"num_pages=%u count=%zu depth=%u\n",
num_pages, iov_iter_count(iter), sp->max_frmr_depth);
smbdirect_iter_to_sgt(iter, &mr->sgt, sp->max_frmr_depth);
ret = ib_dma_map_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
if (!ret) {
smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
"ib_dma_map_sg num_pages=%u dir=%x ret=%d (%1pe)\n",
num_pages, mr->dir, ret, SMBDIRECT_DEBUG_ERR_PTR(ret));
goto dma_map_error;
}
ret = ib_map_mr_sg(mr->mr, mr->sgt.sgl, mr->sgt.nents, NULL, PAGE_SIZE);
if (ret != mr->sgt.nents) {
smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
"ib_map_mr_sg failed ret = %d nents = %u\n",
ret, mr->sgt.nents);
goto map_mr_error;
}
ib_update_fast_reg_key(mr->mr, ib_inc_rkey(mr->mr->rkey));
reg_wr = &mr->wr;
reg_wr->wr.opcode = IB_WR_REG_MR;
mr->cqe.done = smbdirect_connection_mr_io_register_done;
reg_wr->wr.wr_cqe = &mr->cqe;
reg_wr->wr.num_sge = 0;
reg_wr->wr.send_flags = IB_SEND_SIGNALED;
reg_wr->mr = mr->mr;
reg_wr->key = mr->mr->rkey;
reg_wr->access = writing ?
IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
IB_ACCESS_REMOTE_READ;
/*
* There is no need to wait for completion of ib_post_send
* for IB_WR_REG_MR. The hardware enforces a barrier and ordering
* on the next ib_post_send when we actually send I/O to the remote peer.
*/
ret = ib_post_send(sc->ib.qp, &reg_wr->wr, NULL);
if (!ret) {
/*
* smbdirect_connection_get_mr_io() gave us a reference
* via kref_get(&mr->kref), we keep that and let
* the caller use smbdirect_connection_deregister_mr_io()
* to remove it again.
*/
mutex_unlock(&mr->mutex);
return mr;
}
smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
"ib_post_send failed ret=%d (%1pe) reg_wr->key=0x%x\n",
ret, SMBDIRECT_DEBUG_ERR_PTR(ret), reg_wr->key);
map_mr_error:
ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
dma_map_error:
mr->sgt.nents = 0;
mr->state = SMBDIRECT_MR_ERROR;
atomic_dec(&sc->mr_io.used.count);
smbdirect_socket_schedule_cleanup(sc, ret);
/*
* smbdirect_connection_get_mr_io() gave us a reference
* via kref_get(&mr->kref), we need to remove it again
* on error.
*
* No kref_put_mutex() as it's already locked.
*
* If smbdirect_mr_io_free_locked() is called
* and the mutex is unlocked and mr is gone,
* in that case kref_put() returned 1.
*
* If kref_put() returned 0 we know that
* smbdirect_mr_io_free_locked() didn't
* run. Not by us nor by anyone else, as we
* still hold the mutex, so we need to unlock.
*/
if (!kref_put(&mr->kref, smbdirect_mr_io_free_locked))
mutex_unlock(&mr->mutex);
return NULL;
}
__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_register_mr_io);
void smbdirect_mr_io_fill_buffer_descriptor(struct smbdirect_mr_io *mr,
struct smbdirect_buffer_descriptor_v1 *v1)
{
mutex_lock(&mr->mutex);
if (mr->state == SMBDIRECT_MR_REGISTERED) {
v1->offset = cpu_to_le64(mr->mr->iova);
v1->token = cpu_to_le32(mr->mr->rkey);
v1->length = cpu_to_le32(mr->mr->length);
} else {
v1->offset = cpu_to_le64(U64_MAX);
v1->token = cpu_to_le32(U32_MAX);
v1->length = cpu_to_le32(U32_MAX);
}
mutex_unlock(&mr->mutex);
}
__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_mr_io_fill_buffer_descriptor);
/*
* Deregister an MR after I/O is done.
* This function may wait if remote invalidation is not used
* and we have to locally invalidate the buffer to prevent the data
* from being modified by the remote peer after the upper layer consumes it.
*/
void smbdirect_connection_deregister_mr_io(struct smbdirect_mr_io *mr)
{
struct smbdirect_socket *sc = mr->socket;
int ret = 0;
lock_again:
mutex_lock(&mr->mutex);
if (mr->state == SMBDIRECT_MR_DISABLED)
goto put_kref;
if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
smbdirect_mr_io_disable_locked(mr);
goto put_kref;
}
if (mr->need_invalidate) {
struct ib_send_wr *wr = &mr->inv_wr;
/* Need to finish local invalidation before returning */
wr->opcode = IB_WR_LOCAL_INV;
mr->cqe.done = smbdirect_connection_mr_io_local_inv_done;
wr->wr_cqe = &mr->cqe;
wr->num_sge = 0;
wr->ex.invalidate_rkey = mr->mr->rkey;
wr->send_flags = IB_SEND_SIGNALED;
init_completion(&mr->invalidate_done);
ret = ib_post_send(sc->ib.qp, wr, NULL);
if (ret) {
smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
"ib_post_send failed ret=%d (%1pe)\n",
ret, SMBDIRECT_DEBUG_ERR_PTR(ret));
smbdirect_mr_io_disable_locked(mr);
smbdirect_socket_schedule_cleanup(sc, ret);
goto done;
}
/*
* We still hold the reference to mr
* so we can unlock while waiting.
*/
mutex_unlock(&mr->mutex);
wait_for_completion(&mr->invalidate_done);
mr->need_invalidate = false;
goto lock_again;
} else {
/*
* For remote invalidation, just set it to SMBDIRECT_MR_INVALIDATED
* and let the common recovery code below make the MR ready for
* the next use
*/
mr->state = SMBDIRECT_MR_INVALIDATED;
}
if (mr->sgt.nents) {
ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
mr->sgt.nents = 0;
}
WARN_ONCE(mr->state != SMBDIRECT_MR_INVALIDATED,
"mr->state[%u] != SMBDIRECT_MR_INVALIDATED[%u]\n",
mr->state, SMBDIRECT_MR_INVALIDATED);
mr->state = SMBDIRECT_MR_READY;
if (atomic_inc_return(&sc->mr_io.ready.count) == 1)
wake_up(&sc->mr_io.ready.wait_queue);
done:
atomic_dec(&sc->mr_io.used.count);
put_kref:
/*
* No kref_put_mutex() as it's already locked.
*
* If smbdirect_mr_io_free_locked() is called
* and the mutex is unlocked and mr is gone,
* in that case kref_put() returned 1.
*
* If kref_put() returned 0 we know that
* smbdirect_mr_io_free_locked() didn't
* run. Not by us nor by anyone else, as we
* still hold the mutex, so we need to unlock
* and keep the mr in SMBDIRECT_MR_READY or
* SMBDIRECT_MR_ERROR state.
*/
if (!kref_put(&mr->kref, smbdirect_mr_io_free_locked))
mutex_unlock(&mr->mutex);
}
__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_deregister_mr_io);
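/*
* Illustrative MR life cycle for an RDMA write (assumptions:
* iter describes the payload, the caller embeds the resulting
* descriptor in its SMB2 request, error handling is elided):
*
*	struct smbdirect_buffer_descriptor_v1 v1;
*	struct smbdirect_mr_io *mr;
*
*	mr = smbdirect_connection_register_mr_io(sc, &iter, true, true);
*	if (!mr)
*		return -EAGAIN;
*	smbdirect_mr_io_fill_buffer_descriptor(mr, &v1);
*	... send the request carrying v1 and wait for the peer ...
*	smbdirect_connection_deregister_mr_io(mr);
*/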

View File

@@ -8,6 +8,10 @@
#define SMBDIRECT_V1 0x0100
/* SMBD minimum receive size and fragmented sized defined in [MS-SMBD] */
#define SMBDIRECT_MIN_RECEIVE_SIZE 128
#define SMBDIRECT_MIN_FRAGMENTED_SIZE 131072
/* SMBD negotiation request packet [MS-SMBD] 2.2.1 */
struct smbdirect_negotiate_req {
__le16 min_version;

View File

@@ -0,0 +1,148 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Copyright (C) 2025, Stefan Metzmacher
*/
#ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PUBLIC_H__
#define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PUBLIC_H__
struct smbdirect_buffer_descriptor_v1;
struct smbdirect_socket_parameters;
struct smbdirect_socket;
struct smbdirect_send_batch;
struct smbdirect_mr_io;
#define __SMBDIRECT_EXPORT_SYMBOL__(__sym) EXPORT_SYMBOL_FOR_MODULES(__sym, "cifs,ksmbd")
#include <rdma/rw.h>
u8 smbdirect_netdev_rdma_capable_node_type(struct net_device *netdev);
bool smbdirect_frwr_is_supported(const struct ib_device_attr *attrs);
int smbdirect_socket_create_kern(struct net *net, struct smbdirect_socket **_sc);
int smbdirect_socket_create_accepting(struct rdma_cm_id *id, struct smbdirect_socket **_sc);
int smbdirect_socket_set_initial_parameters(struct smbdirect_socket *sc,
const struct smbdirect_socket_parameters *sp);
const struct smbdirect_socket_parameters *
smbdirect_socket_get_current_parameters(struct smbdirect_socket *sc);
int smbdirect_socket_set_kernel_settings(struct smbdirect_socket *sc,
enum ib_poll_context poll_ctx,
gfp_t gfp_mask);
#define SMBDIRECT_LOG_ERR 0x0
#define SMBDIRECT_LOG_INFO 0x1
#define SMBDIRECT_LOG_OUTGOING 0x1
#define SMBDIRECT_LOG_INCOMING 0x2
#define SMBDIRECT_LOG_READ 0x4
#define SMBDIRECT_LOG_WRITE 0x8
#define SMBDIRECT_LOG_RDMA_SEND 0x10
#define SMBDIRECT_LOG_RDMA_RECV 0x20
#define SMBDIRECT_LOG_KEEP_ALIVE 0x40
#define SMBDIRECT_LOG_RDMA_EVENT 0x80
#define SMBDIRECT_LOG_RDMA_MR 0x100
#define SMBDIRECT_LOG_RDMA_RW 0x200
#define SMBDIRECT_LOG_NEGOTIATE 0x400
void smbdirect_socket_set_logging(struct smbdirect_socket *sc,
void *private_ptr,
bool (*needed)(struct smbdirect_socket *sc,
void *private_ptr,
unsigned int lvl,
unsigned int cls),
void (*vaprintf)(struct smbdirect_socket *sc,
const char *func,
unsigned int line,
void *private_ptr,
unsigned int lvl,
unsigned int cls,
struct va_format *vaf));
bool smbdirect_connection_is_connected(struct smbdirect_socket *sc);
int smbdirect_connection_wait_for_connected(struct smbdirect_socket *sc);
int smbdirect_socket_bind(struct smbdirect_socket *sc, struct sockaddr *addr);
void smbdirect_socket_shutdown(struct smbdirect_socket *sc);
void smbdirect_socket_release(struct smbdirect_socket *sc);
int smbdirect_connection_send_batch_flush(struct smbdirect_socket *sc,
struct smbdirect_send_batch *batch,
bool is_last);
/*
* This is only temporary and only needed
* as long as the client still needs
* to use smbdirect_connection_send_single_iter()
*/
struct smbdirect_send_batch_storage {
union {
struct list_head __msg_list;
__aligned_u64 __space[5];
};
};
struct smbdirect_send_batch *
smbdirect_init_send_batch_storage(struct smbdirect_send_batch_storage *storage,
bool need_invalidate_rkey,
unsigned int remote_key);
int smbdirect_connection_send_single_iter(struct smbdirect_socket *sc,
struct smbdirect_send_batch *batch,
struct iov_iter *iter,
unsigned int flags,
u32 remaining_data_length);
int smbdirect_connection_send_wait_zero_pending(struct smbdirect_socket *sc);
int smbdirect_connection_send_iter(struct smbdirect_socket *sc,
struct iov_iter *iter,
unsigned int flags,
bool need_invalidate,
unsigned int remote_key);
int smbdirect_connection_recvmsg(struct smbdirect_socket *sc,
struct msghdr *msg,
unsigned int flags);
int smbdirect_connect(struct smbdirect_socket *sc,
const struct sockaddr *dst);
int smbdirect_connect_sync(struct smbdirect_socket *sc,
const struct sockaddr *dst);
int smbdirect_socket_listen(struct smbdirect_socket *sc, int backlog);
struct smbdirect_socket *smbdirect_socket_accept(struct smbdirect_socket *lsc,
long timeo,
struct proto_accept_arg *arg);
int smbdirect_connection_rdma_xmit(struct smbdirect_socket *sc,
void *buf, size_t buf_len,
struct smbdirect_buffer_descriptor_v1 *desc,
size_t desc_len,
bool is_read);
struct smbdirect_mr_io *
smbdirect_connection_register_mr_io(struct smbdirect_socket *sc,
struct iov_iter *iter,
bool writing,
bool need_invalidate);
void smbdirect_mr_io_fill_buffer_descriptor(struct smbdirect_mr_io *mr,
struct smbdirect_buffer_descriptor_v1 *v1);
void smbdirect_connection_deregister_mr_io(struct smbdirect_mr_io *mr);
void smbdirect_connection_legacy_debug_proc_show(struct smbdirect_socket *sc,
unsigned int rdma_readwrite_threshold,
struct seq_file *m);
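/*
* Illustrative server-side sketch (assumptions: addr and timeo
* are provided by the caller, arg is passed as NULL just for
* illustration, return value checks are elided):
*
*	struct smbdirect_socket *lsc, *nsc;
*
*	ret = smbdirect_socket_create_kern(net, &lsc);
*	ret = smbdirect_socket_bind(lsc, (struct sockaddr *)&addr);
*	ret = smbdirect_socket_listen(lsc, 16);
*	nsc = smbdirect_socket_accept(lsc, timeo, NULL);
*	ret = smbdirect_connection_wait_for_connected(nsc);
*/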
#endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PUBLIC_H__ */

View File

@@ -0,0 +1,255 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2017, Microsoft Corporation.
* Copyright (C) 2018, LG Electronics.
* Copyright (c) 2025, Stefan Metzmacher
*/
#include "smbdirect_internal.h"
static int smbdirect_connection_wait_for_rw_credits(struct smbdirect_socket *sc,
int credits)
{
return smbdirect_socket_wait_for_credits(sc,
SMBDIRECT_SOCKET_CONNECTED,
-ENOTCONN,
&sc->rw_io.credits.wait_queue,
&sc->rw_io.credits.count,
credits);
}
static int smbdirect_connection_calc_rw_credits(struct smbdirect_socket *sc,
const void *buf,
size_t len)
{
return DIV_ROUND_UP(smbdirect_get_buf_page_count(buf, len),
sc->rw_io.credits.num_pages);
}
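/*
* Illustrative: with sc->rw_io.credits.num_pages == 4, a
* descriptor spanning 9 pages costs DIV_ROUND_UP(9, 4) == 3
* credits.
*/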
static int smbdirect_connection_rdma_get_sg_list(void *buf,
size_t size,
struct scatterlist *sg_list,
size_t nentries)
{
bool high = is_vmalloc_addr(buf);
struct page *page;
size_t offset, len;
int i = 0;
if (size == 0 || nentries < smbdirect_get_buf_page_count(buf, size))
return -EINVAL;
offset = offset_in_page(buf);
buf -= offset;
while (size > 0) {
len = min_t(size_t, PAGE_SIZE - offset, size);
if (high)
page = vmalloc_to_page(buf);
else
page = kmap_to_page(buf);
if (!sg_list)
return -EINVAL;
sg_set_page(sg_list, page, len, offset);
sg_list = sg_next(sg_list);
buf += PAGE_SIZE;
size -= len;
offset = 0;
i++;
}
return i;
}
static void smbdirect_connection_rw_io_free(struct smbdirect_rw_io *msg,
enum dma_data_direction dir)
{
struct smbdirect_socket *sc = msg->socket;
rdma_rw_ctx_destroy(&msg->rdma_ctx,
sc->ib.qp,
sc->ib.qp->port,
msg->sgt.sgl,
msg->sgt.nents,
dir);
sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
kfree(msg);
}
static void smbdirect_connection_rdma_rw_done(struct ib_cq *cq, struct ib_wc *wc,
enum dma_data_direction dir)
{
struct smbdirect_rw_io *msg =
container_of(wc->wr_cqe, struct smbdirect_rw_io, cqe);
struct smbdirect_socket *sc = msg->socket;
if (wc->status != IB_WC_SUCCESS) {
msg->error = -EIO;
pr_err("read/write error. opcode = %d, status = %s(%d)\n",
wc->opcode, ib_wc_status_msg(wc->status), wc->status);
if (wc->status != IB_WC_WR_FLUSH_ERR)
smbdirect_socket_schedule_cleanup(sc, msg->error);
}
complete(msg->completion);
}
static void smbdirect_connection_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc)
{
smbdirect_connection_rdma_rw_done(cq, wc, DMA_FROM_DEVICE);
}
static void smbdirect_connection_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
{
smbdirect_connection_rdma_rw_done(cq, wc, DMA_TO_DEVICE);
}
int smbdirect_connection_rdma_xmit(struct smbdirect_socket *sc,
void *buf, size_t buf_len,
struct smbdirect_buffer_descriptor_v1 *desc,
size_t desc_len,
bool is_read)
{
const struct smbdirect_socket_parameters *sp = &sc->parameters;
enum dma_data_direction direction = is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
struct smbdirect_rw_io *msg, *next_msg;
size_t i;
int ret;
DECLARE_COMPLETION_ONSTACK(completion);
struct ib_send_wr *first_wr;
LIST_HEAD(msg_list);
u8 *desc_buf;
int credits_needed;
size_t desc_buf_len, desc_num = 0;
if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
return -ENOTCONN;
if (buf_len > sp->max_read_write_size)
return -EINVAL;
/* calculate needed credits */
credits_needed = 0;
desc_buf = buf;
for (i = 0; i < desc_len / sizeof(*desc); i++) {
if (!buf_len)
break;
desc_buf_len = le32_to_cpu(desc[i].length);
if (!desc_buf_len)
return -EINVAL;
if (desc_buf_len > buf_len) {
/*
* Clamp the descriptor to the remaining buffer;
* the common buf_len -= desc_buf_len below then
* reaches exactly zero instead of underflowing.
*/
desc_buf_len = buf_len;
desc[i].length = cpu_to_le32(desc_buf_len);
}
credits_needed += smbdirect_connection_calc_rw_credits(sc,
desc_buf,
desc_buf_len);
desc_buf += desc_buf_len;
buf_len -= desc_buf_len;
desc_num++;
}
smbdirect_log_rdma_rw(sc, SMBDIRECT_LOG_INFO,
"RDMA %s, len %zu, needed credits %d\n",
str_read_write(is_read), buf_len, credits_needed);
ret = smbdirect_connection_wait_for_rw_credits(sc, credits_needed);
if (ret < 0)
return ret;
/* build rdma_rw_ctx for each descriptor */
desc_buf = buf;
for (i = 0; i < desc_num; i++) {
size_t page_count;
msg = kzalloc_flex(*msg, sg_list, SG_CHUNK_SIZE,
sc->rw_io.mem.gfp_mask);
if (!msg) {
ret = -ENOMEM;
goto out;
}
desc_buf_len = le32_to_cpu(desc[i].length);
page_count = smbdirect_get_buf_page_count(desc_buf, desc_buf_len);
msg->socket = sc;
msg->cqe.done = is_read ?
smbdirect_connection_rdma_read_done :
smbdirect_connection_rdma_write_done;
msg->completion = &completion;
msg->sgt.sgl = &msg->sg_list[0];
ret = sg_alloc_table_chained(&msg->sgt,
page_count,
msg->sg_list,
SG_CHUNK_SIZE);
if (ret) {
ret = -ENOMEM;
goto free_msg;
}
ret = smbdirect_connection_rdma_get_sg_list(desc_buf,
desc_buf_len,
msg->sgt.sgl,
msg->sgt.orig_nents);
if (ret < 0)
goto free_table;
ret = rdma_rw_ctx_init(&msg->rdma_ctx,
sc->ib.qp,
sc->ib.qp->port,
msg->sgt.sgl,
page_count,
0,
le64_to_cpu(desc[i].offset),
le32_to_cpu(desc[i].token),
direction);
if (ret < 0) {
pr_err("failed to init rdma_rw_ctx: %d\n", ret);
goto free_table;
}
list_add_tail(&msg->list, &msg_list);
desc_buf += desc_buf_len;
}
/* concatenate work requests of rdma_rw_ctxs */
first_wr = NULL;
list_for_each_entry_reverse(msg, &msg_list, list) {
first_wr = rdma_rw_ctx_wrs(&msg->rdma_ctx,
sc->ib.qp,
sc->ib.qp->port,
&msg->cqe,
first_wr);
}
ret = ib_post_send(sc->ib.qp, first_wr, NULL);
if (ret) {
pr_err("failed to post send wr for RDMA R/W: %d\n", ret);
goto out;
}
msg = list_last_entry(&msg_list, struct smbdirect_rw_io, list);
wait_for_completion(&completion);
ret = msg->error;
out:
list_for_each_entry_safe(msg, next_msg, &msg_list, list) {
list_del(&msg->list);
smbdirect_connection_rw_io_free(msg, direction);
}
atomic_add(credits_needed, &sc->rw_io.credits.count);
wake_up(&sc->rw_io.credits.wait_queue);
return ret;
free_table:
sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
free_msg:
kfree(msg);
goto out;
}
__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_rdma_xmit);

View File

@@ -0,0 +1,743 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C) 2017, Microsoft Corporation.
* Copyright (c) 2025, Stefan Metzmacher
*/
#include "smbdirect_internal.h"
bool smbdirect_frwr_is_supported(const struct ib_device_attr *attrs)
{
/*
* Test if FRWR (Fast Registration Work Requests) is supported on the
* device. This implementation requires FRWR for RDMA read/write.
* Return value: true if it is supported.
*/
if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
return false;
if (attrs->max_fast_reg_page_list_len == 0)
return false;
return true;
}
__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_frwr_is_supported);
static void smbdirect_socket_cleanup_work(struct work_struct *work);
static int smbdirect_socket_rdma_event_handler(struct rdma_cm_id *id,
struct rdma_cm_event *event)
{
struct smbdirect_socket *sc = id->context;
int ret = -ESTALE;
/*
* This should be replaced before any real work
* starts! So it should never be called!
*/
if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
ret = -ENETDOWN;
if (IS_ERR(SMBDIRECT_DEBUG_ERR_PTR(event->status)))
ret = event->status;
pr_err("%s (first_error=%1pe, expected=%s) => event=%s status=%d => ret=%1pe\n",
smbdirect_socket_status_string(sc->status),
SMBDIRECT_DEBUG_ERR_PTR(sc->first_error),
rdma_event_msg(sc->rdma.expected_event),
rdma_event_msg(event->event),
event->status,
SMBDIRECT_DEBUG_ERR_PTR(ret));
WARN_ONCE(1, "%s should not be called!\n", __func__);
sc->rdma.cm_id = NULL;
return -ESTALE;
}
int smbdirect_socket_init_new(struct net *net, struct smbdirect_socket *sc)
{
struct rdma_cm_id *id;
int ret;
smbdirect_socket_init(sc);
id = rdma_create_id(net,
smbdirect_socket_rdma_event_handler,
sc,
RDMA_PS_TCP,
IB_QPT_RC);
if (IS_ERR(id)) {
pr_err("%s: rdma_create_id() failed %1pe\n", __func__, id);
return PTR_ERR(id);
}
ret = rdma_set_afonly(id, 1);
if (ret) {
rdma_destroy_id(id);
pr_err("%s: rdma_set_afonly() failed %1pe\n",
__func__, SMBDIRECT_DEBUG_ERR_PTR(ret));
return ret;
}
sc->rdma.cm_id = id;
INIT_WORK(&sc->disconnect_work, smbdirect_socket_cleanup_work);
return 0;
}
int smbdirect_socket_create_kern(struct net *net, struct smbdirect_socket **_sc)
{
struct smbdirect_socket *sc;
int ret;
ret = -ENOMEM;
sc = kzalloc_obj(*sc);
if (!sc)
goto alloc_failed;
ret = smbdirect_socket_init_new(net, sc);
if (ret)
goto init_failed;
kref_init(&sc->refs.destroy);
*_sc = sc;
return 0;
init_failed:
kfree(sc);
alloc_failed:
return ret;
}
__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_create_kern);
int smbdirect_socket_init_accepting(struct rdma_cm_id *id, struct smbdirect_socket *sc)
{
smbdirect_socket_init(sc);
sc->rdma.cm_id = id;
sc->rdma.cm_id->context = sc;
sc->rdma.cm_id->event_handler = smbdirect_socket_rdma_event_handler;
sc->ib.dev = sc->rdma.cm_id->device;
INIT_WORK(&sc->disconnect_work, smbdirect_socket_cleanup_work);
return 0;
}
int smbdirect_socket_create_accepting(struct rdma_cm_id *id, struct smbdirect_socket **_sc)
{
struct smbdirect_socket *sc;
int ret;
ret = -ENOMEM;
sc = kzalloc_obj(*sc);
if (!sc)
goto alloc_failed;
ret = smbdirect_socket_init_accepting(id, sc);
if (ret)
goto init_failed;
kref_init(&sc->refs.destroy);
*_sc = sc;
return 0;
init_failed:
kfree(sc);
alloc_failed:
return ret;
}
__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_create_accepting);
int smbdirect_socket_set_initial_parameters(struct smbdirect_socket *sc,
const struct smbdirect_socket_parameters *sp)
{
/*
* This is only allowed before connect or accept
*/
WARN_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED,
"status=%s first_error=%1pe",
smbdirect_socket_status_string(sc->status),
SMBDIRECT_DEBUG_ERR_PTR(sc->first_error));
if (sc->status != SMBDIRECT_SOCKET_CREATED)
return -EINVAL;
if (sp->flags & ~SMBDIRECT_FLAG_PORT_RANGE_MASK)
return -EINVAL;
if (sp->initiator_depth > U8_MAX)
return -EINVAL;
if (sp->responder_resources > U8_MAX)
return -EINVAL;
if (sp->flags & SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB &&
sp->flags & SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW)
return -EINVAL;
else if (sp->flags & SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB)
rdma_restrict_node_type(sc->rdma.cm_id, RDMA_NODE_IB_CA);
else if (sp->flags & SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW)
rdma_restrict_node_type(sc->rdma.cm_id, RDMA_NODE_RNIC);
/*
 * Make a copy of the caller's parameters;
 * from here on we only work on the copy.
*
* TODO: do we want consistency checking?
*/
sc->parameters = *sp;
return 0;
}
__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_set_initial_parameters);
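/*
 * Illustrative sketch (made-up, incomplete values): parameters can
 * only be set while the socket is still in SMBDIRECT_SOCKET_CREATED,
 * and the two PORT_RANGE_ONLY flags are mutually exclusive; all
 * other fields keep their zero defaults in this sketch.
 */
static int __maybe_unused example_set_params(struct smbdirect_socket *sc)
{
	struct smbdirect_socket_parameters sp = {
		.flags = SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB,
		.initiator_depth = 1,
		.responder_resources = 32,
	};

	return smbdirect_socket_set_initial_parameters(sc, &sp);
}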
const struct smbdirect_socket_parameters *
smbdirect_socket_get_current_parameters(struct smbdirect_socket *sc)
{
return &sc->parameters;
}
__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_get_current_parameters);
int smbdirect_socket_set_kernel_settings(struct smbdirect_socket *sc,
enum ib_poll_context poll_ctx,
gfp_t gfp_mask)
{
/*
* This is only allowed before connect or accept
*/
WARN_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED,
"status=%s first_error=%1pe",
smbdirect_socket_status_string(sc->status),
SMBDIRECT_DEBUG_ERR_PTR(sc->first_error));
if (sc->status != SMBDIRECT_SOCKET_CREATED)
return -EINVAL;
sc->ib.poll_ctx = poll_ctx;
sc->send_io.mem.gfp_mask = gfp_mask;
sc->recv_io.mem.gfp_mask = gfp_mask;
sc->rw_io.mem.gfp_mask = gfp_mask;
return 0;
}
__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_set_kernel_settings);
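/*
 * Illustrative sketch (hypothetical caller): a consumer that must
 * not recurse into filesystem reclaim could move the memory pools
 * to GFP_NOFS and poll completions from a bound workqueue.
 */
static int __maybe_unused example_kernel_settings(struct smbdirect_socket *sc)
{
	return smbdirect_socket_set_kernel_settings(sc,
						    IB_POLL_WORKQUEUE,
						    GFP_NOFS);
}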
void smbdirect_socket_set_logging(struct smbdirect_socket *sc,
void *private_ptr,
bool (*needed)(struct smbdirect_socket *sc,
void *private_ptr,
unsigned int lvl,
unsigned int cls),
void (*vaprintf)(struct smbdirect_socket *sc,
const char *func,
unsigned int line,
void *private_ptr,
unsigned int lvl,
unsigned int cls,
struct va_format *vaf))
{
sc->logging.private_ptr = private_ptr;
sc->logging.needed = needed;
sc->logging.vaprintf = vaprintf;
}
__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_set_logging);
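/*
 * Illustrative sketch (hypothetical callbacks): needed() filters by
 * level/class before any message is formatted, and vaprintf() maps
 * the message onto the consumer's own logging; %pV expands the
 * preformatted va_format.
 */
static bool __maybe_unused example_log_needed(struct smbdirect_socket *sc,
					      void *private_ptr,
					      unsigned int lvl,
					      unsigned int cls)
{
	return lvl == SMBDIRECT_LOG_ERR;
}

static void __maybe_unused example_log_vaprintf(struct smbdirect_socket *sc,
						const char *func,
						unsigned int line,
						void *private_ptr,
						unsigned int lvl,
						unsigned int cls,
						struct va_format *vaf)
{
	pr_err("%s:%u: %pV", func, line, vaf);
}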
static void smbdirect_socket_wake_up_all(struct smbdirect_socket *sc)
{
/*
* Wake up all waiters in all wait queues
 * so that they notice the broken connection.
*/
wake_up_all(&sc->status_wait);
wake_up_all(&sc->listen.wait_queue);
wake_up_all(&sc->send_io.bcredits.wait_queue);
wake_up_all(&sc->send_io.lcredits.wait_queue);
wake_up_all(&sc->send_io.credits.wait_queue);
wake_up_all(&sc->send_io.pending.zero_wait_queue);
wake_up_all(&sc->recv_io.reassembly.wait_queue);
wake_up_all(&sc->rw_io.credits.wait_queue);
wake_up_all(&sc->mr_io.ready.wait_queue);
}
void __smbdirect_socket_schedule_cleanup(struct smbdirect_socket *sc,
const char *macro_name,
unsigned int lvl,
const char *func,
unsigned int line,
int error,
enum smbdirect_socket_status *force_status)
{
struct smbdirect_socket *psc, *tsc;
unsigned long flags;
bool was_first = false;
if (!sc->first_error) {
___smbdirect_log_generic(sc, func, line,
lvl,
SMBDIRECT_LOG_RDMA_EVENT,
"%s(%1pe%s%s) called from %s in line=%u status=%s\n",
macro_name,
SMBDIRECT_DEBUG_ERR_PTR(error),
force_status ? ", " : "",
force_status ? smbdirect_socket_status_string(*force_status) : "",
func, line,
smbdirect_socket_status_string(sc->status));
if (error)
sc->first_error = error;
else
sc->first_error = -ECONNABORTED;
was_first = true;
}
/*
 * Make sure work other than disconnect_work
 * is not queued again. Here we must not block,
 * so we avoid disable[_delayed]_work_sync().
*/
disable_work(&sc->connect.work);
disable_work(&sc->recv_io.posted.refill_work);
disable_work(&sc->idle.immediate_work);
sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
disable_delayed_work(&sc->idle.timer_work);
/*
* In case we were a listener we need to
* disconnect all pending and ready sockets
*
* First we move ready sockets to pending again.
*/
spin_lock_irqsave(&sc->listen.lock, flags);
list_splice_init(&sc->listen.ready, &sc->listen.pending);
list_for_each_entry_safe(psc, tsc, &sc->listen.pending, accept.list)
smbdirect_socket_schedule_cleanup(psc, sc->first_error);
spin_unlock_irqrestore(&sc->listen.lock, flags);
switch (sc->status) {
case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
case SMBDIRECT_SOCKET_ERROR:
case SMBDIRECT_SOCKET_DISCONNECTING:
case SMBDIRECT_SOCKET_DISCONNECTED:
case SMBDIRECT_SOCKET_DESTROYED:
/*
* Keep the current error status
*/
break;
case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED;
break;
case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED;
break;
case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED;
break;
case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
break;
case SMBDIRECT_SOCKET_CREATED:
case SMBDIRECT_SOCKET_LISTENING:
sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
break;
case SMBDIRECT_SOCKET_CONNECTED:
sc->status = SMBDIRECT_SOCKET_ERROR;
break;
}
if (force_status && (was_first || *force_status > sc->status))
sc->status = *force_status;
/*
* Wake up all waiters in all wait queues
 * so that they notice the broken connection.
*/
smbdirect_socket_wake_up_all(sc);
queue_work(sc->workqueues.cleanup, &sc->disconnect_work);
}
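/*
 * Illustrative sketch (hypothetical completion handler): error paths
 * funnel into the cleanup machinery via
 * smbdirect_socket_schedule_cleanup() instead of tearing down inline.
 */
static void __maybe_unused example_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct smbdirect_socket *sc = cq->cq_context;

	if (wc->status != IB_WC_SUCCESS)
		smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
}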
static void smbdirect_socket_cleanup_work(struct work_struct *work)
{
struct smbdirect_socket *sc =
container_of(work, struct smbdirect_socket, disconnect_work);
struct smbdirect_socket *psc, *tsc;
unsigned long flags;
/*
 * This should never be called in an interrupt!
*/
WARN_ON_ONCE(in_interrupt());
if (!sc->first_error) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
"%s called with first_error==0\n",
smbdirect_socket_status_string(sc->status));
sc->first_error = -ECONNABORTED;
}
/*
 * Make sure this and other work is not queued again.
 * Here we must not block, so we avoid
 * disable[_delayed]_work_sync().
*/
disable_work(&sc->disconnect_work);
disable_work(&sc->connect.work);
disable_work(&sc->recv_io.posted.refill_work);
disable_work(&sc->idle.immediate_work);
sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
disable_delayed_work(&sc->idle.timer_work);
/*
* In case we were a listener we need to
* disconnect all pending and ready sockets
*
* First we move ready sockets to pending again.
*/
spin_lock_irqsave(&sc->listen.lock, flags);
list_splice_init(&sc->listen.ready, &sc->listen.pending);
list_for_each_entry_safe(psc, tsc, &sc->listen.pending, accept.list)
smbdirect_socket_schedule_cleanup(psc, sc->first_error);
spin_unlock_irqrestore(&sc->listen.lock, flags);
switch (sc->status) {
case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
case SMBDIRECT_SOCKET_CONNECTED:
case SMBDIRECT_SOCKET_ERROR:
sc->status = SMBDIRECT_SOCKET_DISCONNECTING;
/*
* Make sure we hold the callback lock
 * in order to coordinate with the
* rdma_event handlers, typically
* smbdirect_connection_rdma_event_handler(),
* and smbdirect_socket_destroy().
*
* So that the order of ib_drain_qp()
* and rdma_disconnect() is controlled
* by the mutex.
*/
rdma_lock_handler(sc->rdma.cm_id);
rdma_disconnect(sc->rdma.cm_id);
rdma_unlock_handler(sc->rdma.cm_id);
break;
case SMBDIRECT_SOCKET_CREATED:
case SMBDIRECT_SOCKET_LISTENING:
case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
/*
* rdma_{accept,connect}() never reached
* RDMA_CM_EVENT_ESTABLISHED
*/
sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
break;
case SMBDIRECT_SOCKET_DISCONNECTING:
case SMBDIRECT_SOCKET_DISCONNECTED:
case SMBDIRECT_SOCKET_DESTROYED:
break;
}
/*
* Wake up all waiters in all wait queues
 * so that they notice the broken connection.
*/
smbdirect_socket_wake_up_all(sc);
}
static void smbdirect_socket_destroy(struct smbdirect_socket *sc)
{
struct smbdirect_socket *psc, *tsc;
size_t psockets;
struct smbdirect_recv_io *recv_io;
struct smbdirect_recv_io *recv_tmp;
LIST_HEAD(all_list);
unsigned long flags;
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
"status=%s first_error=%1pe",
smbdirect_socket_status_string(sc->status),
SMBDIRECT_DEBUG_ERR_PTR(sc->first_error));
/*
 * This should never be called in an interrupt!
*/
WARN_ON_ONCE(in_interrupt());
if (sc->status == SMBDIRECT_SOCKET_DESTROYED)
return;
WARN_ONCE(sc->status != SMBDIRECT_SOCKET_DISCONNECTED,
"status=%s first_error=%1pe",
smbdirect_socket_status_string(sc->status),
SMBDIRECT_DEBUG_ERR_PTR(sc->first_error));
/*
 * The listener should clear this before we reach this point.
*/
WARN_ONCE(sc->accept.listener,
"status=%s first_error=%1pe",
smbdirect_socket_status_string(sc->status),
SMBDIRECT_DEBUG_ERR_PTR(sc->first_error));
/*
* Wake up all waiters in all wait queues
 * so that they notice the broken connection.
*
* Most likely this was already called via
* smbdirect_socket_cleanup_work(), but call it again...
*/
smbdirect_socket_wake_up_all(sc);
disable_work_sync(&sc->disconnect_work);
disable_work_sync(&sc->connect.work);
disable_work_sync(&sc->recv_io.posted.refill_work);
disable_work_sync(&sc->idle.immediate_work);
disable_delayed_work_sync(&sc->idle.timer_work);
if (sc->rdma.cm_id)
rdma_lock_handler(sc->rdma.cm_id);
if (sc->ib.qp) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
"drain qp\n");
ib_drain_qp(sc->ib.qp);
}
/*
* In case we were a listener we need to
* disconnect all pending and ready sockets
*
 * We collect both ready and pending sockets into one list.
*/
spin_lock_irqsave(&sc->listen.lock, flags);
list_splice_tail_init(&sc->listen.ready, &all_list);
list_splice_tail_init(&sc->listen.pending, &all_list);
spin_unlock_irqrestore(&sc->listen.lock, flags);
psockets = list_count_nodes(&all_list);
if (sc->listen.backlog != -1) /* was a listener */
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
"release %zu pending sockets\n", psockets);
list_for_each_entry_safe(psc, tsc, &all_list, accept.list) {
list_del_init(&psc->accept.list);
psc->accept.listener = NULL;
smbdirect_socket_release(psc);
}
if (sc->listen.backlog != -1) /* was a listener */
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
"released %zu pending sockets\n", psockets);
INIT_LIST_HEAD(&all_list);
/* It's not possible for the upper layer to get to the reassembly queue */
if (sc->listen.backlog == -1) /* was not a listener */
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
"drain the reassembly queue\n");
spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
list_splice_tail_init(&sc->recv_io.reassembly.list, &all_list);
spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
list_for_each_entry_safe(recv_io, recv_tmp, &all_list, list)
smbdirect_connection_put_recv_io(recv_io);
sc->recv_io.reassembly.data_length = 0;
if (sc->listen.backlog == -1) /* was not a listener */
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
"freeing mr list\n");
smbdirect_connection_destroy_mr_list(sc);
if (sc->listen.backlog == -1) /* was not a listener */
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
"destroying qp\n");
smbdirect_connection_destroy_qp(sc);
if (sc->rdma.cm_id) {
rdma_unlock_handler(sc->rdma.cm_id);
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
"destroying cm_id\n");
rdma_destroy_id(sc->rdma.cm_id);
sc->rdma.cm_id = NULL;
}
if (sc->listen.backlog == -1) /* was not a listener */
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
"destroying mem pools\n");
smbdirect_connection_destroy_mem_pools(sc);
sc->status = SMBDIRECT_SOCKET_DESTROYED;
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
"rdma session destroyed\n");
}
void smbdirect_socket_destroy_sync(struct smbdirect_socket *sc)
{
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
"status=%s first_error=%1pe",
smbdirect_socket_status_string(sc->status),
SMBDIRECT_DEBUG_ERR_PTR(sc->first_error));
/*
 * This should never be called in an interrupt!
*/
WARN_ON_ONCE(in_interrupt());
/*
 * First we try to disable the work
 * in a non-blocking way, without
 * disable_work_sync(); if it's already
 * running it will be handled by
 * disable_work_sync() below.
*
* Here we just want to make sure queue_work() in
* smbdirect_socket_schedule_cleanup_lvl()
* is a no-op.
*/
disable_work(&sc->disconnect_work);
if (!sc->first_error)
/*
* SMBDIRECT_LOG_INFO is enough here
* as this is the typical case where
 * we terminate the connection ourselves.
*/
smbdirect_socket_schedule_cleanup_lvl(sc,
SMBDIRECT_LOG_INFO,
-ESHUTDOWN);
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
"cancelling and disable disconnect_work\n");
disable_work_sync(&sc->disconnect_work);
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
"destroying rdma session\n");
if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING)
smbdirect_socket_cleanup_work(&sc->disconnect_work);
if (sc->status < SMBDIRECT_SOCKET_DISCONNECTED) {
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
"wait for transport being disconnected\n");
wait_event(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
"waited for transport being disconnected\n");
}
/*
* Once we reached SMBDIRECT_SOCKET_DISCONNECTED,
* we should call smbdirect_socket_destroy()
*/
smbdirect_socket_destroy(sc);
smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
"status=%s first_error=%1pe",
smbdirect_socket_status_string(sc->status),
SMBDIRECT_DEBUG_ERR_PTR(sc->first_error));
}
int smbdirect_socket_bind(struct smbdirect_socket *sc, struct sockaddr *addr)
{
int ret;
if (sc->status != SMBDIRECT_SOCKET_CREATED)
return -EINVAL;
ret = rdma_bind_addr(sc->rdma.cm_id, addr);
if (ret)
return ret;
return 0;
}
__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_bind);
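/*
 * Illustrative sketch (hypothetical listener setup): binding to the
 * IPv6 wildcard address on port 5445 (the port used for SMB Direct)
 * before moving the socket to the LISTENING state.
 */
static int __maybe_unused example_bind_wildcard(struct smbdirect_socket *sc)
{
	struct sockaddr_in6 sin6 = {
		.sin6_family = AF_INET6,
		.sin6_addr = IN6ADDR_ANY_INIT,
		.sin6_port = cpu_to_be16(5445),
	};

	return smbdirect_socket_bind(sc, (struct sockaddr *)&sin6);
}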
void smbdirect_socket_shutdown(struct smbdirect_socket *sc)
{
smbdirect_socket_schedule_cleanup_lvl(sc, SMBDIRECT_LOG_INFO, -ESHUTDOWN);
}
__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_shutdown);
static void smbdirect_socket_release_disconnect(struct kref *kref)
{
struct smbdirect_socket *sc =
container_of(kref, struct smbdirect_socket, refs.disconnect);
/*
* For now do a sync disconnect/destroy
*/
smbdirect_socket_destroy_sync(sc);
}
static void smbdirect_socket_release_destroy(struct kref *kref)
{
struct smbdirect_socket *sc =
container_of(kref, struct smbdirect_socket, refs.destroy);
/*
* Do a sync disconnect/destroy...
 * hopefully a no-op, as it should already be
 * in the DESTROYED state before we free the memory.
*/
smbdirect_socket_destroy_sync(sc);
kfree(sc);
}
void smbdirect_socket_release(struct smbdirect_socket *sc)
{
/*
* We expect only 1 disconnect reference
* and if it is already 0, it's a use after free!
*/
WARN_ON_ONCE(kref_read(&sc->refs.disconnect) != 1);
WARN_ON(!kref_put(&sc->refs.disconnect, smbdirect_socket_release_disconnect));
/*
* This may not trigger smbdirect_socket_release_destroy(),
* if struct smbdirect_socket is embedded in another structure
 * as indicated by REFCOUNT_MAX.
*/
kref_put(&sc->refs.destroy, smbdirect_socket_release_destroy);
}
__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_release);
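/*
 * Illustrative sketch (hypothetical owner): dropping the last
 * frontend reference triggers a synchronous disconnect/destroy;
 * sockets created via the create helpers are then freed, while
 * embedded sockets (refs.destroy == REFCOUNT_MAX) are left to
 * their containing structure.
 */
static void __maybe_unused example_owner_teardown(struct smbdirect_socket *sc)
{
	smbdirect_socket_shutdown(sc);	/* optional: record -ESHUTDOWN early */
	smbdirect_socket_release(sc);	/* disconnect, destroy, maybe free */
}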
int smbdirect_socket_wait_for_credits(struct smbdirect_socket *sc,
enum smbdirect_socket_status expected_status,
int unexpected_errno,
wait_queue_head_t *waitq,
atomic_t *total_credits,
int needed)
{
int ret;
if (WARN_ON_ONCE(needed < 0))
return -EINVAL;
do {
if (atomic_sub_return(needed, total_credits) >= 0)
return 0;
atomic_add(needed, total_credits);
ret = wait_event_interruptible(*waitq,
atomic_read(total_credits) >= needed ||
sc->status != expected_status);
if (sc->status != expected_status)
return unexpected_errno;
else if (ret < 0)
return ret;
} while (true);
}
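/*
 * Illustrative sketch (hypothetical caller; the credit fields follow
 * the struct smbdirect_socket layout): a sender takes one send
 * credit and maps a lost connection to -EAGAIN; on success the
 * credit has already been subtracted atomically.
 */
static int __maybe_unused example_take_send_credit(struct smbdirect_socket *sc)
{
	return smbdirect_socket_wait_for_credits(sc,
						 SMBDIRECT_SOCKET_CONNECTED,
						 -EAGAIN,
						 &sc->send_io.credits.wait_queue,
						 &sc->send_io.credits.count,
						 1);
}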

View File

@@ -6,10 +6,18 @@
#ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__
#define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__
#include <linux/wait.h>
#include <linux/workqueue.h>
#include <linux/kref.h>
#include <linux/mempool.h>
#include <linux/spinlock.h>
#include <linux/mutex.h>
#include <linux/completion.h>
#include <rdma/rw.h>
enum smbdirect_socket_status {
SMBDIRECT_SOCKET_CREATED,
SMBDIRECT_SOCKET_LISTENING,
SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED,
SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING,
SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED,
@@ -35,6 +43,8 @@ const char *smbdirect_socket_status_string(enum smbdirect_socket_status status)
switch (status) {
case SMBDIRECT_SOCKET_CREATED:
return "CREATED";
case SMBDIRECT_SOCKET_LISTENING:
return "LISTENING";
case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
return "RESOLVE_ADDR_NEEDED";
case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
@@ -99,18 +109,59 @@ struct smbdirect_socket {
int first_error;
/*
* This points to the workqueue to
* This points to the workqueues to
* be used for this socket.
* It can be per socket (on the client)
* or point to a global workqueue (on the server)
*/
struct workqueue_struct *workqueue;
struct {
struct workqueue_struct *accept;
struct workqueue_struct *connect;
struct workqueue_struct *idle;
struct workqueue_struct *refill;
struct workqueue_struct *immediate;
struct workqueue_struct *cleanup;
} workqueues;
struct work_struct disconnect_work;
/*
* The reference counts.
*/
struct {
/*
 * This holds the references held by the
 * frontend, typically the smb layer.
*
* It is typically 1 and a disconnect
* will happen if it reaches 0.
*/
struct kref disconnect;
/*
* This holds the reference by the
* backend, the code that manages
* the lifetime of the whole
* struct smbdirect_socket,
 * if this reaches 0 it will
 * be freed.
 *
 * Can be REFCOUNT_MAX if it is part
 * of another structure.
*
 * This is equal to or higher than
* the disconnect refcount.
*/
struct kref destroy;
} refs;
/* RDMA related */
struct {
struct rdma_cm_id *cm_id;
/*
* The expected event in our current
* cm_id->event_handler, all other events
* are treated as an error.
*/
enum rdma_cm_event_type expected_event;
/*
* This is for iWarp MPA v1
*/
@@ -120,6 +171,7 @@ struct smbdirect_socket {
/* IB verbs related */
struct {
struct ib_pd *pd;
enum ib_poll_context poll_ctx;
struct ib_cq *send_cq;
struct ib_cq *recv_cq;
@@ -149,6 +201,35 @@ struct smbdirect_socket {
struct delayed_work timer_work;
} idle;
/*
* The state for listen sockets
*/
struct {
spinlock_t lock;
struct list_head pending;
struct list_head ready;
wait_queue_head_t wait_queue;
/*
* This starts as -1 and a value != -1
* means this socket was in LISTENING state
 * before. Note that a valid backlog can
* only be > 0.
*/
int backlog;
} listen;
/*
* The state for sockets waiting
* for accept, either still waiting
* for the negotiation to finish
* or already ready with a usable
* connection.
*/
struct {
struct smbdirect_socket *listener;
struct list_head list;
} accept;
/*
* The state for posted send buffers
*/
@@ -158,8 +239,9 @@ struct smbdirect_socket {
* smbdirect_send_io buffers
*/
struct {
struct kmem_cache *cache;
mempool_t *pool;
struct kmem_cache *cache;
mempool_t *pool;
gfp_t gfp_mask;
} mem;
/*
@@ -194,10 +276,6 @@ struct smbdirect_socket {
*/
struct {
atomic_t count;
/*
* woken when count is decremented
*/
wait_queue_head_t dec_wait_queue;
/*
* woken when count reached zero
*/
@@ -223,8 +301,9 @@ struct smbdirect_socket {
* smbdirect_recv_io buffers
*/
struct {
struct kmem_cache *cache;
mempool_t *pool;
struct kmem_cache *cache;
mempool_t *pool;
gfp_t gfp_mask;
} mem;
/*
@@ -310,19 +389,20 @@ struct smbdirect_socket {
struct {
atomic_t count;
} used;
struct work_struct recovery_work;
/* Used by transport to wait until all MRs are returned */
struct {
wait_queue_head_t wait_queue;
} cleanup;
} mr_io;
/*
* The state for RDMA read/write requests on the server
*/
struct {
/*
* Memory hints for
* smbdirect_rw_io structs
*/
struct {
gfp_t gfp_mask;
} mem;
/*
* The credit state for the send side
*/
@@ -352,20 +432,6 @@ struct smbdirect_socket {
} statistics;
struct {
#define SMBDIRECT_LOG_ERR 0x0
#define SMBDIRECT_LOG_INFO 0x1
#define SMBDIRECT_LOG_OUTGOING 0x1
#define SMBDIRECT_LOG_INCOMING 0x2
#define SMBDIRECT_LOG_READ 0x4
#define SMBDIRECT_LOG_WRITE 0x8
#define SMBDIRECT_LOG_RDMA_SEND 0x10
#define SMBDIRECT_LOG_RDMA_RECV 0x20
#define SMBDIRECT_LOG_KEEP_ALIVE 0x40
#define SMBDIRECT_LOG_RDMA_EVENT 0x80
#define SMBDIRECT_LOG_RDMA_MR 0x100
#define SMBDIRECT_LOG_RDMA_RW 0x200
#define SMBDIRECT_LOG_NEGOTIATE 0x400
void *private_ptr;
bool (*needed)(struct smbdirect_socket *sc,
void *private_ptr,
@@ -493,9 +559,23 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc)
init_waitqueue_head(&sc->status_wait);
sc->workqueues.accept = smbdirect_globals.workqueues.accept;
sc->workqueues.connect = smbdirect_globals.workqueues.connect;
sc->workqueues.idle = smbdirect_globals.workqueues.idle;
sc->workqueues.refill = smbdirect_globals.workqueues.refill;
sc->workqueues.immediate = smbdirect_globals.workqueues.immediate;
sc->workqueues.cleanup = smbdirect_globals.workqueues.cleanup;
INIT_WORK(&sc->disconnect_work, __smbdirect_socket_disabled_work);
disable_work_sync(&sc->disconnect_work);
kref_init(&sc->refs.disconnect);
sc->refs.destroy = (struct kref) KREF_INIT(REFCOUNT_MAX);
sc->rdma.expected_event = RDMA_CM_EVENT_INTERNAL;
sc->ib.poll_ctx = IB_POLL_UNBOUND_WORKQUEUE;
spin_lock_init(&sc->connect.lock);
INIT_WORK(&sc->connect.work, __smbdirect_socket_disabled_work);
disable_work_sync(&sc->connect.work);
@@ -505,6 +585,16 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc)
INIT_DELAYED_WORK(&sc->idle.timer_work, __smbdirect_socket_disabled_work);
disable_delayed_work_sync(&sc->idle.timer_work);
spin_lock_init(&sc->listen.lock);
INIT_LIST_HEAD(&sc->listen.pending);
INIT_LIST_HEAD(&sc->listen.ready);
sc->listen.backlog = -1; /* not a listener */
init_waitqueue_head(&sc->listen.wait_queue);
INIT_LIST_HEAD(&sc->accept.list);
sc->send_io.mem.gfp_mask = GFP_KERNEL;
atomic_set(&sc->send_io.bcredits.count, 0);
init_waitqueue_head(&sc->send_io.bcredits.wait_queue);
@@ -515,9 +605,10 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc)
init_waitqueue_head(&sc->send_io.credits.wait_queue);
atomic_set(&sc->send_io.pending.count, 0);
init_waitqueue_head(&sc->send_io.pending.dec_wait_queue);
init_waitqueue_head(&sc->send_io.pending.zero_wait_queue);
sc->recv_io.mem.gfp_mask = GFP_KERNEL;
INIT_LIST_HEAD(&sc->recv_io.free.list);
spin_lock_init(&sc->recv_io.free.lock);
@@ -532,6 +623,7 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc)
spin_lock_init(&sc->recv_io.reassembly.lock);
init_waitqueue_head(&sc->recv_io.reassembly.wait_queue);
sc->rw_io.mem.gfp_mask = GFP_KERNEL;
atomic_set(&sc->rw_io.credits.count, 0);
init_waitqueue_head(&sc->rw_io.credits.wait_queue);
@@ -540,9 +632,6 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc)
atomic_set(&sc->mr_io.ready.count, 0);
init_waitqueue_head(&sc->mr_io.ready.wait_queue);
atomic_set(&sc->mr_io.used.count, 0);
INIT_WORK(&sc->mr_io.recovery_work, __smbdirect_socket_disabled_work);
disable_work_sync(&sc->mr_io.recovery_work);
init_waitqueue_head(&sc->mr_io.cleanup.wait_queue);
sc->logging.private_ptr = NULL;
sc->logging.needed = __smbdirect_log_needed;
@@ -602,6 +691,11 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc)
#define SMBDIRECT_CHECK_STATUS_WARN(__sc, __expected_status) \
__SMBDIRECT_CHECK_STATUS_WARN(__sc, __expected_status, /* nothing */)
#ifndef __SMBDIRECT_SOCKET_DISCONNECT
#define __SMBDIRECT_SOCKET_DISCONNECT(__sc) \
smbdirect_socket_schedule_cleanup(__sc, -ECONNABORTED)
#endif /* ! __SMBDIRECT_SOCKET_DISCONNECT */
#define SMBDIRECT_CHECK_STATUS_DISCONNECT(__sc, __expected_status) \
__SMBDIRECT_CHECK_STATUS_WARN(__sc, __expected_status, \
__SMBDIRECT_SOCKET_DISCONNECT(__sc);)
@@ -720,4 +814,19 @@ struct smbdirect_rw_io {
struct scatterlist sg_list[];
};
static inline size_t smbdirect_get_buf_page_count(const void *buf, size_t size)
{
return DIV_ROUND_UP((uintptr_t)buf + size, PAGE_SIZE) -
(uintptr_t)buf / PAGE_SIZE;
}
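/*
 * Worked example: with PAGE_SIZE 4096, a 100-byte buffer starting at
 * byte offset 4090 spans two pages:
 * DIV_ROUND_UP(4090 + 100, 4096) - 4090 / 4096 = 2 - 0 = 2.
 */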
/*
* Maximum number of retries on data transfer operations
*/
#define SMBDIRECT_RDMA_CM_RETRY 6
/*
* No need to retry on Receiver Not Ready since SMB_DIRECT manages credits
*/
#define SMBDIRECT_RDMA_CM_RNR_RETRY 0
#endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__ */

View File

@@ -47,8 +47,9 @@ if SMB_SERVER
config SMB_SERVER_SMBDIRECT
bool "Support for SMB Direct protocol"
depends on SMB_SERVER=m && INFINIBAND && INFINIBAND_ADDR_TRANS || SMB_SERVER=y && INFINIBAND=y && INFINIBAND_ADDR_TRANS=y
select SG_POOL
depends on SMB_SERVER && INFINIBAND && INFINIBAND_ADDR_TRANS
depends on SMB_SERVER=m || INFINIBAND=y
select SMB_COMMON_SMBDIRECT
default n
help

View File

@@ -376,9 +376,6 @@ int ksmbd_conn_handler_loop(void *p)
mutex_init(&conn->srv_mutex);
__module_get(THIS_MODULE);
if (t->ops->prepare && t->ops->prepare(t))
goto out;
max_req = server_conf.max_inflight_req;
conn->last_active = jiffies;
set_freezable();
@@ -470,7 +467,6 @@ recheck:
}
}
out:
ksmbd_conn_set_releasing(conn);
/* Wait till all reference dropped to the Server object*/
ksmbd_debug(CONN, "Wait for all pending requests(%d)\n", atomic_read(&conn->r_count));
@@ -566,6 +562,5 @@ void ksmbd_conn_transport_destroy(void)
ksmbd_tcp_destroy();
ksmbd_rdma_stop_listening();
stop_sessions();
ksmbd_rdma_destroy();
mutex_unlock(&init_lock);
}

View File

@@ -127,7 +127,6 @@ struct ksmbd_conn_ops {
};
struct ksmbd_transport_ops {
int (*prepare)(struct ksmbd_transport *t);
void (*disconnect)(struct ksmbd_transport *t);
void (*shutdown)(struct ksmbd_transport *t);
int (*read)(struct ksmbd_transport *t, char *buf,

View File

@@ -24,7 +24,6 @@
#include "asn1.h"
#include "connection.h"
#include "transport_ipc.h"
#include "../common/smbdirect/smbdirect.h"
#include "transport_rdma.h"
#include "vfs.h"
#include "vfs_cache.h"

File diff suppressed because it is too large

View File

@@ -14,17 +14,17 @@
#ifdef CONFIG_SMB_SERVER_SMBDIRECT
int ksmbd_rdma_init(void);
void ksmbd_rdma_stop_listening(void);
void ksmbd_rdma_destroy(void);
bool ksmbd_rdma_capable_netdev(struct net_device *netdev);
void init_smbd_max_io_size(unsigned int sz);
unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt);
#else
static inline int ksmbd_rdma_init(void) { return 0; }
static inline void ksmbd_rdma_stop_listening(void) { }
static inline void ksmbd_rdma_destroy(void) { }
static inline bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { return false; }
static inline void init_smbd_max_io_size(unsigned int sz) { }
static inline unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt) { return 0; }
#endif
#include "../common/smbdirect/smbdirect.h"
#endif /* __KSMBD_TRANSPORT_RDMA_H__ */