diff --git a/fs/smb/Kconfig b/fs/smb/Kconfig
index 85f7ad5fbc5e..b4b2cfdc2a6b 100644
--- a/fs/smb/Kconfig
+++ b/fs/smb/Kconfig
@@ -4,6 +4,7 @@
 
 source "fs/smb/client/Kconfig"
 source "fs/smb/server/Kconfig"
+source "fs/smb/common/smbdirect/Kconfig"
 
 config SMBFS
 	tristate
diff --git a/fs/smb/client/Kconfig b/fs/smb/client/Kconfig
index d112da38c881..63831242fddf 100644
--- a/fs/smb/client/Kconfig
+++ b/fs/smb/client/Kconfig
@@ -180,7 +180,9 @@ if CIFS
 
 config CIFS_SMB_DIRECT
 	bool "SMB Direct support"
-	depends on CIFS=m && INFINIBAND && INFINIBAND_ADDR_TRANS || CIFS=y && INFINIBAND=y && INFINIBAND_ADDR_TRANS=y
+	depends on CIFS && INFINIBAND && INFINIBAND_ADDR_TRANS
+	depends on CIFS=m || INFINIBAND=y
+	select SMB_COMMON_SMBDIRECT
 	help
 	  Enables SMB Direct support for SMB 3.0, 3.02 and 3.1.1.
 	  SMB Direct allows transferring SMB packets over RDMA. If unsure,
diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c
index 217444e3e6d0..0691d2a3e04b 100644
--- a/fs/smb/client/cifs_debug.c
+++ b/fs/smb/client/cifs_debug.c
@@ -23,7 +23,6 @@
 #endif
 #ifdef CONFIG_CIFS_SMB_DIRECT
 #include "smbdirect.h"
-#include "../common/smbdirect/smbdirect_pdu.h"
 #endif
 #include "cifs_swn.h"
 #include "cached_dir.h"
@@ -452,11 +451,6 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 	c = 0;
 	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
-#ifdef CONFIG_CIFS_SMB_DIRECT
-		struct smbdirect_socket *sc;
-		struct smbdirect_socket_parameters *sp;
-#endif
-
 		/* channel info will be printed as a part of sessions below */
 		if (SERVER_IS_CHAN(server))
 			continue;
@@ -471,66 +465,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 		seq_printf(m, "\nClientGUID: %pUL", server->client_guid);
 		spin_unlock(&server->srv_lock);
 #ifdef CONFIG_CIFS_SMB_DIRECT
-		if (!server->rdma)
-			goto skip_rdma;
-
-		if (!server->smbd_conn) {
-			seq_printf(m, "\nSMBDirect transport not available");
-			goto skip_rdma;
-		}
-		sc = &server->smbd_conn->socket;
-		sp = &sc->parameters;
-
-		seq_printf(m, "\nSMBDirect protocol version: 0x%x " "transport status: %s (%u)",
-			SMBDIRECT_V1,
-			smbdirect_socket_status_string(sc->status),
-			sc->status);
-		seq_printf(m, "\nConn receive_credit_max: %u " "send_credit_target: %u max_send_size: %u",
-			sp->recv_credit_max,
-			sp->send_credit_target,
-			sp->max_send_size);
-		seq_printf(m, "\nConn max_fragmented_recv_size: %u " "max_fragmented_send_size: %u max_receive_size:%u",
-			sp->max_fragmented_recv_size,
-			sp->max_fragmented_send_size,
-			sp->max_recv_size);
-		seq_printf(m, "\nConn keep_alive_interval: %u " "max_readwrite_size: %u rdma_readwrite_threshold: %u",
-			sp->keepalive_interval_msec * 1000,
-			sp->max_read_write_size,
-			server->rdma_readwrite_threshold);
-		seq_printf(m, "\nDebug count_get_receive_buffer: %llu " "count_put_receive_buffer: %llu count_send_empty: %llu",
-			sc->statistics.get_receive_buffer,
-			sc->statistics.put_receive_buffer,
-			sc->statistics.send_empty);
-		seq_printf(m, "\nRead Queue " "count_enqueue_reassembly_queue: %llu " "count_dequeue_reassembly_queue: %llu " "reassembly_data_length: %u " "reassembly_queue_length: %u",
-			sc->statistics.enqueue_reassembly_queue,
-			sc->statistics.dequeue_reassembly_queue,
-			sc->recv_io.reassembly.data_length,
-			sc->recv_io.reassembly.queue_length);
-		seq_printf(m, "\nCurrent Credits send_credits: %u " "receive_credits: %u receive_credit_target: %u",
-			atomic_read(&sc->send_io.credits.count),
-			atomic_read(&sc->recv_io.credits.count),
-			sc->recv_io.credits.target);
- seq_printf(m, "\nPending send_pending: %u ", - atomic_read(&sc->send_io.pending.count)); - seq_printf(m, "\nMR responder_resources: %u " - "max_frmr_depth: %u mr_type: 0x%x", - sp->responder_resources, - sp->max_frmr_depth, - sc->mr_io.type); - seq_printf(m, "\nMR mr_ready_count: %u mr_used_count: %u", - atomic_read(&sc->mr_io.ready.count), - atomic_read(&sc->mr_io.used.count)); -skip_rdma: + smbd_debug_proc_show(server, m); #endif seq_printf(m, "\nNumber of credits: %d,%d,%d Dialect 0x%x", server->credits, diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 59d7418cc480..957aca2222b5 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -36,7 +36,6 @@ #include "../common/smb2status.h" #include "smb2glob.h" #include "cifs_spnego.h" -#include "../common/smbdirect/smbdirect.h" #include "smbdirect.h" #include "trace.h" #ifdef CONFIG_CIFS_DFS_UPCALL @@ -4554,9 +4553,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len, req->ReadChannelInfoLength = cpu_to_le16(sizeof(struct smbdirect_buffer_descriptor_v1)); v1 = (struct smbdirect_buffer_descriptor_v1 *) &req->Buffer[0]; - v1->offset = cpu_to_le64(rdata->mr->mr->iova); - v1->token = cpu_to_le32(rdata->mr->mr->rkey); - v1->length = cpu_to_le32(rdata->mr->mr->length); + smbd_mr_fill_buffer_descriptor(rdata->mr, v1); *total_len += sizeof(*v1) - 1; } @@ -5155,9 +5152,7 @@ smb2_async_writev(struct cifs_io_subrequest *wdata) req->WriteChannelInfoLength = cpu_to_le16(sizeof(struct smbdirect_buffer_descriptor_v1)); v1 = (struct smbdirect_buffer_descriptor_v1 *) &req->Buffer[0]; - v1->offset = cpu_to_le64(wdata->mr->mr->iova); - v1->token = cpu_to_le32(wdata->mr->mr->rkey); - v1->length = cpu_to_le32(wdata->mr->mr->length); + smbd_mr_fill_buffer_descriptor(wdata->mr, v1); rqst.rq_iov[0].iov_len += sizeof(*v1); diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 461658105013..9e67adcdc7d3 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -4,60 +4,12 @@ * * Author(s): Long Li */ -#include -#include -#include -#define __SMBDIRECT_SOCKET_DISCONNECT(__sc) smbd_disconnect_rdma_connection(__sc) -#include "../common/smbdirect/smbdirect_pdu.h" + #include "smbdirect.h" #include "cifs_debug.h" #include "cifsproto.h" #include "smb2proto.h" - -const struct smbdirect_socket_parameters *smbd_get_parameters(struct smbd_connection *conn) -{ - struct smbdirect_socket *sc = &conn->socket; - - return &sc->parameters; -} - -static struct smbdirect_recv_io *get_receive_buffer( - struct smbdirect_socket *sc); -static void put_receive_buffer( - struct smbdirect_socket *sc, - struct smbdirect_recv_io *response); -static int allocate_receive_buffers(struct smbdirect_socket *sc, int num_buf); -static void destroy_receive_buffers(struct smbdirect_socket *sc); - -static void enqueue_reassembly( - struct smbdirect_socket *sc, - struct smbdirect_recv_io *response, int data_length); -static struct smbdirect_recv_io *_get_first_reassembly( - struct smbdirect_socket *sc); - -static int smbd_post_send(struct smbdirect_socket *sc, - struct smbdirect_send_batch *batch, - struct smbdirect_send_io *request); - -static int smbd_post_recv( - struct smbdirect_socket *sc, - struct smbdirect_recv_io *response); - -static int smbd_post_send_empty(struct smbdirect_socket *sc); - -static void destroy_mr_list(struct smbdirect_socket *sc); -static int allocate_mr_list(struct smbdirect_socket *sc); - -struct smb_extract_to_rdma { - struct ib_sge *sge; - unsigned int nr_sge; - unsigned int max_sge; - struct ib_device 
*device; - u32 local_dma_lkey; - enum dma_data_direction direction; -}; -static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len, - struct smb_extract_to_rdma *rdma); +#include "../common/smbdirect/smbdirect_public.h" /* Port numbers for SMBD transport */ #define SMB_PORT 445 @@ -72,21 +24,12 @@ static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len, /* The timeout to wait for a keepalive message from peer in seconds */ #define KEEPALIVE_RECV_TIMEOUT 5 -/* SMBD minimum receive size and fragmented sized defined in [MS-SMBD] */ -#define SMBD_MIN_RECEIVE_SIZE 128 -#define SMBD_MIN_FRAGMENTED_SIZE 131072 - /* * Default maximum number of RDMA read/write outstanding on this connection * This value is possibly decreased during QP creation on hardware limit */ #define SMBD_CM_RESPONDER_RESOURCES 32 -/* Maximum number of retries on data transfer operations */ -#define SMBD_CM_RETRY 6 -/* No need to retry on Receiver Not Ready since SMBD manages credits */ -#define SMBD_CM_RNR_RETRY 0 - /* * User configurable initial values per SMBD transport connection * as defined in [MS-SMBD] 3.1.1.1 @@ -162,6 +105,43 @@ module_param(smbd_logging_level, uint, 0644); MODULE_PARM_DESC(smbd_logging_level, "Logging level for SMBD transport, 0 (default): error, 1: info"); +static bool smbd_logging_needed(struct smbdirect_socket *sc, + void *private_ptr, + unsigned int lvl, + unsigned int cls) +{ +#define BUILD_BUG_SAME(x) BUILD_BUG_ON(x != SMBDIRECT_LOG_ ##x) + BUILD_BUG_SAME(ERR); + BUILD_BUG_SAME(INFO); +#undef BUILD_BUG_SAME +#define BUILD_BUG_SAME(x) BUILD_BUG_ON(x != SMBDIRECT_ ##x) + BUILD_BUG_SAME(LOG_OUTGOING); + BUILD_BUG_SAME(LOG_INCOMING); + BUILD_BUG_SAME(LOG_READ); + BUILD_BUG_SAME(LOG_WRITE); + BUILD_BUG_SAME(LOG_RDMA_SEND); + BUILD_BUG_SAME(LOG_RDMA_RECV); + BUILD_BUG_SAME(LOG_KEEP_ALIVE); + BUILD_BUG_SAME(LOG_RDMA_EVENT); + BUILD_BUG_SAME(LOG_RDMA_MR); +#undef BUILD_BUG_SAME + + if (lvl <= smbd_logging_level || cls & smbd_logging_class) + return true; + return false; +} + +static void smbd_logging_vaprintf(struct smbdirect_socket *sc, + const char *func, + unsigned int line, + void *private_ptr, + unsigned int lvl, + unsigned int cls, + struct va_format *vaf) +{ + cifs_dbg(VFS, "%s:%u %pV", func, line, vaf); +} + #define log_rdma(level, class, fmt, args...) \ do { \ if (level <= smbd_logging_level || class & smbd_logging_class) \ @@ -185,1703 +165,34 @@ do { \ #define log_rdma_mr(level, fmt, args...) \ log_rdma(level, LOG_RDMA_MR, fmt, ##args) -static void smbd_disconnect_wake_up_all(struct smbdirect_socket *sc) -{ - /* - * Wake up all waiters in all wait queues - * in order to notice the broken connection. 
- */ - wake_up_all(&sc->status_wait); - wake_up_all(&sc->send_io.lcredits.wait_queue); - wake_up_all(&sc->send_io.credits.wait_queue); - wake_up_all(&sc->send_io.pending.dec_wait_queue); - wake_up_all(&sc->send_io.pending.zero_wait_queue); - wake_up_all(&sc->recv_io.reassembly.wait_queue); - wake_up_all(&sc->mr_io.ready.wait_queue); - wake_up_all(&sc->mr_io.cleanup.wait_queue); -} - -static void smbd_disconnect_rdma_work(struct work_struct *work) -{ - struct smbdirect_socket *sc = - container_of(work, struct smbdirect_socket, disconnect_work); - - if (sc->first_error == 0) - sc->first_error = -ECONNABORTED; - - /* - * make sure this and other work is not queued again - * but here we don't block and avoid - * disable[_delayed]_work_sync() - */ - disable_work(&sc->disconnect_work); - disable_work(&sc->recv_io.posted.refill_work); - disable_work(&sc->mr_io.recovery_work); - disable_work(&sc->idle.immediate_work); - disable_delayed_work(&sc->idle.timer_work); - - switch (sc->status) { - case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: - case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING: - case SMBDIRECT_SOCKET_NEGOTIATE_FAILED: - case SMBDIRECT_SOCKET_CONNECTED: - case SMBDIRECT_SOCKET_ERROR: - sc->status = SMBDIRECT_SOCKET_DISCONNECTING; - rdma_disconnect(sc->rdma.cm_id); - break; - - case SMBDIRECT_SOCKET_CREATED: - case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED: - case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING: - case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED: - case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED: - case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING: - case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED: - case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED: - case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING: - case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED: - /* - * rdma_connect() never reached - * RDMA_CM_EVENT_ESTABLISHED - */ - sc->status = SMBDIRECT_SOCKET_DISCONNECTED; - break; - - case SMBDIRECT_SOCKET_DISCONNECTING: - case SMBDIRECT_SOCKET_DISCONNECTED: - case SMBDIRECT_SOCKET_DESTROYED: - break; - } - - /* - * Wake up all waiters in all wait queues - * in order to notice the broken connection. 
- */ - smbd_disconnect_wake_up_all(sc); -} - -static void smbd_disconnect_rdma_connection(struct smbdirect_socket *sc) -{ - if (sc->first_error == 0) - sc->first_error = -ECONNABORTED; - - /* - * make sure other work (than disconnect_work) is - * not queued again but here we don't block and avoid - * disable[_delayed]_work_sync() - */ - disable_work(&sc->recv_io.posted.refill_work); - disable_work(&sc->mr_io.recovery_work); - disable_work(&sc->idle.immediate_work); - disable_delayed_work(&sc->idle.timer_work); - - switch (sc->status) { - case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED: - case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED: - case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED: - case SMBDIRECT_SOCKET_NEGOTIATE_FAILED: - case SMBDIRECT_SOCKET_ERROR: - case SMBDIRECT_SOCKET_DISCONNECTING: - case SMBDIRECT_SOCKET_DISCONNECTED: - case SMBDIRECT_SOCKET_DESTROYED: - /* - * Keep the current error status - */ - break; - - case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED: - case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING: - sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED; - break; - - case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED: - case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING: - sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED; - break; - - case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED: - case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING: - sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED; - break; - - case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: - case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING: - sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; - break; - - case SMBDIRECT_SOCKET_CREATED: - sc->status = SMBDIRECT_SOCKET_DISCONNECTED; - break; - - case SMBDIRECT_SOCKET_CONNECTED: - sc->status = SMBDIRECT_SOCKET_ERROR; - break; - } - - /* - * Wake up all waiters in all wait queues - * in order to notice the broken connection. 
- */ - smbd_disconnect_wake_up_all(sc); - - queue_work(sc->workqueue, &sc->disconnect_work); -} - -/* Upcall from RDMA CM */ -static int smbd_conn_upcall( - struct rdma_cm_id *id, struct rdma_cm_event *event) -{ - struct smbdirect_socket *sc = id->context; - struct smbdirect_socket_parameters *sp = &sc->parameters; - const char *event_name = rdma_event_msg(event->event); - u8 peer_initiator_depth; - u8 peer_responder_resources; - - log_rdma_event(INFO, "event=%s status=%d\n", - event_name, event->status); - - switch (event->event) { - case RDMA_CM_EVENT_ADDR_RESOLVED: - if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING)) - break; - sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED; - wake_up(&sc->status_wait); - break; - - case RDMA_CM_EVENT_ROUTE_RESOLVED: - if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING)) - break; - sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED; - wake_up(&sc->status_wait); - break; - - case RDMA_CM_EVENT_ADDR_ERROR: - log_rdma_event(ERR, "connecting failed event=%s\n", event_name); - sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED; - smbd_disconnect_rdma_work(&sc->disconnect_work); - break; - - case RDMA_CM_EVENT_ROUTE_ERROR: - log_rdma_event(ERR, "connecting failed event=%s\n", event_name); - sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED; - smbd_disconnect_rdma_work(&sc->disconnect_work); - break; - - case RDMA_CM_EVENT_ESTABLISHED: - log_rdma_event(INFO, "connected event=%s\n", event_name); - - /* - * Here we work around an inconsistency between - * iWarp and other devices (at least rxe and irdma using RoCEv2) - */ - if (rdma_protocol_iwarp(id->device, id->port_num)) { - /* - * iWarp devices report the peer's values - * with the perspective of the peer here. - * Tested with siw and irdma (in iwarp mode) - * We need to change to our perspective here, - * so we need to switch the values. - */ - peer_initiator_depth = event->param.conn.responder_resources; - peer_responder_resources = event->param.conn.initiator_depth; - } else { - /* - * Non iWarp devices report the peer's values - * already changed to our perspective here. - * Tested with rxe and irdma (in roce mode). - */ - peer_initiator_depth = event->param.conn.initiator_depth; - peer_responder_resources = event->param.conn.responder_resources; - } - if (rdma_protocol_iwarp(id->device, id->port_num) && - event->param.conn.private_data_len == 8) { - /* - * Legacy clients with only iWarp MPA v1 support - * need a private blob in order to negotiate - * the IRD/ORD values. - */ - const __be32 *ird_ord_hdr = event->param.conn.private_data; - u32 ird32 = be32_to_cpu(ird_ord_hdr[0]); - u32 ord32 = be32_to_cpu(ird_ord_hdr[1]); - - /* - * cifs.ko sends the legacy IRD/ORD negotiation - * event if iWarp MPA v2 was used. - * - * Here we check that the values match and only - * mark the client as legacy if they don't match. - */ - if ((u32)event->param.conn.initiator_depth != ird32 || - (u32)event->param.conn.responder_resources != ord32) { - /* - * There are broken clients (old cifs.ko) - * using little endian and also - * struct rdma_conn_param only uses u8 - * for initiator_depth and responder_resources, - * so we truncate the value to U8_MAX. - * - * smb_direct_accept_client() will then - * do the real negotiation in order to - * select the minimum between client and - * server. 
- */ - ird32 = min_t(u32, ird32, U8_MAX); - ord32 = min_t(u32, ord32, U8_MAX); - - sc->rdma.legacy_iwarp = true; - peer_initiator_depth = (u8)ird32; - peer_responder_resources = (u8)ord32; - } - } - - /* - * negotiate the value by using the minimum - * between client and server if the client provided - * non 0 values. - */ - if (peer_initiator_depth != 0) - sp->initiator_depth = - min_t(u8, sp->initiator_depth, - peer_initiator_depth); - if (peer_responder_resources != 0) - sp->responder_resources = - min_t(u8, sp->responder_resources, - peer_responder_resources); - - if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING)) - break; - sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED; - wake_up(&sc->status_wait); - break; - - case RDMA_CM_EVENT_CONNECT_ERROR: - case RDMA_CM_EVENT_UNREACHABLE: - case RDMA_CM_EVENT_REJECTED: - log_rdma_event(ERR, "connecting failed event=%s\n", event_name); - sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED; - smbd_disconnect_rdma_work(&sc->disconnect_work); - break; - - case RDMA_CM_EVENT_DEVICE_REMOVAL: - case RDMA_CM_EVENT_DISCONNECTED: - /* This happens when we fail the negotiation */ - if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_FAILED) { - log_rdma_event(ERR, "event=%s during negotiation\n", event_name); - } - - sc->status = SMBDIRECT_SOCKET_DISCONNECTED; - smbd_disconnect_rdma_work(&sc->disconnect_work); - break; - - default: - log_rdma_event(ERR, "unexpected event=%s status=%d\n", - event_name, event->status); - break; - } - - return 0; -} - -/* Upcall from RDMA QP */ -static void -smbd_qp_async_error_upcall(struct ib_event *event, void *context) -{ - struct smbdirect_socket *sc = context; - - log_rdma_event(ERR, "%s on device %s socket %p\n", - ib_event_msg(event->event), event->device->name, sc); - - switch (event->event) { - case IB_EVENT_CQ_ERR: - case IB_EVENT_QP_FATAL: - smbd_disconnect_rdma_connection(sc); - break; - - default: - break; - } -} - -static inline void *smbdirect_send_io_payload(struct smbdirect_send_io *request) -{ - return (void *)request->packet; -} - -static inline void *smbdirect_recv_io_payload(struct smbdirect_recv_io *response) -{ - return (void *)response->packet; -} - -static struct smbdirect_send_io *smbd_alloc_send_io(struct smbdirect_socket *sc) -{ - struct smbdirect_send_io *msg; - - msg = mempool_alloc(sc->send_io.mem.pool, GFP_KERNEL); - if (!msg) - return ERR_PTR(-ENOMEM); - msg->socket = sc; - INIT_LIST_HEAD(&msg->sibling_list); - msg->num_sge = 0; - - return msg; -} - -static void smbd_free_send_io(struct smbdirect_send_io *msg) -{ - struct smbdirect_socket *sc = msg->socket; - size_t i; - - /* - * The list needs to be empty! - * The caller should take care of it. - */ - WARN_ON_ONCE(!list_empty(&msg->sibling_list)); - - /* - * Note we call ib_dma_unmap_page(), even if some sges are mapped using - * ib_dma_map_single(). - * - * The difference between _single() and _page() only matters for the - * ib_dma_map_*() case. - * - * For the ib_dma_unmap_*() case it does not matter as both take the - * dma_addr_t and dma_unmap_single_attrs() is just an alias to - * dma_unmap_page_attrs(). 
- */ - for (i = 0; i < msg->num_sge; i++) - ib_dma_unmap_page(sc->ib.dev, - msg->sge[i].addr, - msg->sge[i].length, - DMA_TO_DEVICE); - - mempool_free(msg, sc->send_io.mem.pool); -} - -/* Called when a RDMA send is done */ -static void send_done(struct ib_cq *cq, struct ib_wc *wc) -{ - struct smbdirect_send_io *request = - container_of(wc->wr_cqe, struct smbdirect_send_io, cqe); - struct smbdirect_socket *sc = request->socket; - struct smbdirect_send_io *sibling, *next; - int lcredits = 0; - - log_rdma_send(INFO, "smbdirect_send_io 0x%p completed wc->status=%s\n", - request, ib_wc_status_msg(wc->status)); - - if (unlikely(!(request->wr.send_flags & IB_SEND_SIGNALED))) { - /* - * This happens when smbdirect_send_io is a sibling - * before the final message, it is signaled on - * error anyway, so we need to skip - * smbdirect_connection_free_send_io here, - * otherwise is will destroy the memory - * of the siblings too, which will cause - * use after free problems for the others - * triggered from ib_drain_qp(). - */ - if (wc->status != IB_WC_SUCCESS) - goto skip_free; - - /* - * This should not happen! - * But we better just close the - * connection... - */ - log_rdma_send(ERR, - "unexpected send completion wc->status=%s (%d) wc->opcode=%d\n", - ib_wc_status_msg(wc->status), wc->status, wc->opcode); - smbd_disconnect_rdma_connection(sc); - return; - } - - /* - * Free possible siblings and then the main send_io - */ - list_for_each_entry_safe(sibling, next, &request->sibling_list, sibling_list) { - list_del_init(&sibling->sibling_list); - smbd_free_send_io(sibling); - lcredits += 1; - } - /* Note this frees wc->wr_cqe, but not wc */ - smbd_free_send_io(request); - lcredits += 1; - - if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) { -skip_free: - if (wc->status != IB_WC_WR_FLUSH_ERR) - log_rdma_send(ERR, "wc->status=%s wc->opcode=%d\n", - ib_wc_status_msg(wc->status), wc->opcode); - smbd_disconnect_rdma_connection(sc); - return; - } - - atomic_add(lcredits, &sc->send_io.lcredits.count); - wake_up(&sc->send_io.lcredits.wait_queue); - - if (atomic_dec_and_test(&sc->send_io.pending.count)) - wake_up(&sc->send_io.pending.zero_wait_queue); - - wake_up(&sc->send_io.pending.dec_wait_queue); -} - -static void dump_smbdirect_negotiate_resp(struct smbdirect_negotiate_resp *resp) -{ - log_rdma_event(INFO, "resp message min_version %u max_version %u negotiated_version %u credits_requested %u credits_granted %u status %u max_readwrite_size %u preferred_send_size %u max_receive_size %u max_fragmented_size %u\n", - resp->min_version, resp->max_version, - resp->negotiated_version, resp->credits_requested, - resp->credits_granted, resp->status, - resp->max_readwrite_size, resp->preferred_send_size, - resp->max_receive_size, resp->max_fragmented_size); -} - -/* - * Process a negotiation response message, according to [MS-SMBD]3.1.5.7 - * response, packet_length: the negotiation response message - * return value: true if negotiation is a success, false if failed - */ -static bool process_negotiation_response( - struct smbdirect_recv_io *response, int packet_length) -{ - struct smbdirect_socket *sc = response->socket; - struct smbdirect_socket_parameters *sp = &sc->parameters; - struct smbdirect_negotiate_resp *packet = smbdirect_recv_io_payload(response); - - if (packet_length < sizeof(struct smbdirect_negotiate_resp)) { - log_rdma_event(ERR, - "error: packet_length=%d\n", packet_length); - return false; - } - - if (le16_to_cpu(packet->negotiated_version) != SMBDIRECT_V1) { - log_rdma_event(ERR, 
"error: negotiated_version=%x\n", - le16_to_cpu(packet->negotiated_version)); - return false; - } - - if (packet->credits_requested == 0) { - log_rdma_event(ERR, "error: credits_requested==0\n"); - return false; - } - sc->recv_io.credits.target = le16_to_cpu(packet->credits_requested); - sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max); - - if (packet->credits_granted == 0) { - log_rdma_event(ERR, "error: credits_granted==0\n"); - return false; - } - atomic_set(&sc->send_io.lcredits.count, sp->send_credit_target); - atomic_set(&sc->send_io.credits.count, le16_to_cpu(packet->credits_granted)); - - if (le32_to_cpu(packet->preferred_send_size) > sp->max_recv_size) { - log_rdma_event(ERR, "error: preferred_send_size=%d\n", - le32_to_cpu(packet->preferred_send_size)); - return false; - } - sp->max_recv_size = le32_to_cpu(packet->preferred_send_size); - - if (le32_to_cpu(packet->max_receive_size) < SMBD_MIN_RECEIVE_SIZE) { - log_rdma_event(ERR, "error: max_receive_size=%d\n", - le32_to_cpu(packet->max_receive_size)); - return false; - } - sp->max_send_size = min_t(u32, sp->max_send_size, - le32_to_cpu(packet->max_receive_size)); - - if (le32_to_cpu(packet->max_fragmented_size) < - SMBD_MIN_FRAGMENTED_SIZE) { - log_rdma_event(ERR, "error: max_fragmented_size=%d\n", - le32_to_cpu(packet->max_fragmented_size)); - return false; - } - sp->max_fragmented_send_size = - le32_to_cpu(packet->max_fragmented_size); - - - sp->max_read_write_size = min_t(u32, - le32_to_cpu(packet->max_readwrite_size), - sp->max_frmr_depth * PAGE_SIZE); - sp->max_frmr_depth = sp->max_read_write_size / PAGE_SIZE; - - atomic_set(&sc->send_io.bcredits.count, 1); - sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER; - return true; -} - -static void smbd_post_send_credits(struct work_struct *work) -{ - int rc; - struct smbdirect_recv_io *response; - struct smbdirect_socket *sc = - container_of(work, struct smbdirect_socket, recv_io.posted.refill_work); - int posted = 0; - - if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { - return; - } - - if (sc->recv_io.credits.target > - atomic_read(&sc->recv_io.credits.count)) { - while (true) { - response = get_receive_buffer(sc); - if (!response) - break; - - response->first_segment = false; - rc = smbd_post_recv(sc, response); - if (rc) { - log_rdma_recv(ERR, - "post_recv failed rc=%d\n", rc); - put_receive_buffer(sc, response); - break; - } - - atomic_inc(&sc->recv_io.posted.count); - posted += 1; - } - } - - atomic_add(posted, &sc->recv_io.credits.available); - - /* - * If the last send credit is waiting for credits - * it can grant we need to wake it up - */ - if (posted && - atomic_read(&sc->send_io.bcredits.count) == 0 && - atomic_read(&sc->send_io.credits.count) == 0) - wake_up(&sc->send_io.credits.wait_queue); - - /* Promptly send an immediate packet as defined in [MS-SMBD] 3.1.1.1 */ - if (atomic_read(&sc->recv_io.credits.count) < - sc->recv_io.credits.target - 1) { - log_keep_alive(INFO, "schedule send of an empty message\n"); - queue_work(sc->workqueue, &sc->idle.immediate_work); - } -} - -/* Called from softirq, when recv is done */ -static void recv_done(struct ib_cq *cq, struct ib_wc *wc) -{ - struct smbdirect_data_transfer *data_transfer; - struct smbdirect_recv_io *response = - container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe); - struct smbdirect_socket *sc = response->socket; - struct smbdirect_socket_parameters *sp = &sc->parameters; - int current_recv_credits; - u16 old_recv_credit_target; - u32 data_offset = 0; - u32 
data_length = 0; - u32 remaining_data_length = 0; - bool negotiate_done = false; - - log_rdma_recv(INFO, - "response=0x%p type=%d wc status=%s wc opcode %d byte_len=%d pkey_index=%u\n", - response, sc->recv_io.expected, - ib_wc_status_msg(wc->status), wc->opcode, - wc->byte_len, wc->pkey_index); - - if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) { - if (wc->status != IB_WC_WR_FLUSH_ERR) - log_rdma_recv(ERR, "wc->status=%s opcode=%d\n", - ib_wc_status_msg(wc->status), wc->opcode); - goto error; - } - - ib_dma_sync_single_for_cpu( - wc->qp->device, - response->sge.addr, - response->sge.length, - DMA_FROM_DEVICE); - - /* - * Reset timer to the keepalive interval in - * order to trigger our next keepalive message. - */ - sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE; - mod_delayed_work(sc->workqueue, &sc->idle.timer_work, - msecs_to_jiffies(sp->keepalive_interval_msec)); - - switch (sc->recv_io.expected) { - /* SMBD negotiation response */ - case SMBDIRECT_EXPECT_NEGOTIATE_REP: - dump_smbdirect_negotiate_resp(smbdirect_recv_io_payload(response)); - sc->recv_io.reassembly.full_packet_received = true; - negotiate_done = - process_negotiation_response(response, wc->byte_len); - put_receive_buffer(sc, response); - if (SMBDIRECT_CHECK_STATUS_WARN(sc, SMBDIRECT_SOCKET_NEGOTIATE_RUNNING)) - negotiate_done = false; - if (!negotiate_done) { - sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; - smbd_disconnect_rdma_connection(sc); - } else { - sc->status = SMBDIRECT_SOCKET_CONNECTED; - wake_up(&sc->status_wait); - } - - return; - - /* SMBD data transfer packet */ - case SMBDIRECT_EXPECT_DATA_TRANSFER: - data_transfer = smbdirect_recv_io_payload(response); - - if (wc->byte_len < - offsetof(struct smbdirect_data_transfer, padding)) - goto error; - - remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length); - data_offset = le32_to_cpu(data_transfer->data_offset); - data_length = le32_to_cpu(data_transfer->data_length); - if (wc->byte_len < data_offset || - (u64)wc->byte_len < (u64)data_offset + data_length) - goto error; - - if (remaining_data_length > sp->max_fragmented_recv_size || - data_length > sp->max_fragmented_recv_size || - (u64)remaining_data_length + (u64)data_length > (u64)sp->max_fragmented_recv_size) - goto error; - - if (data_length) { - if (sc->recv_io.reassembly.full_packet_received) - response->first_segment = true; - - if (le32_to_cpu(data_transfer->remaining_data_length)) - sc->recv_io.reassembly.full_packet_received = false; - else - sc->recv_io.reassembly.full_packet_received = true; - } - - atomic_dec(&sc->recv_io.posted.count); - current_recv_credits = atomic_dec_return(&sc->recv_io.credits.count); - - old_recv_credit_target = sc->recv_io.credits.target; - sc->recv_io.credits.target = - le16_to_cpu(data_transfer->credits_requested); - sc->recv_io.credits.target = - min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max); - sc->recv_io.credits.target = - max_t(u16, sc->recv_io.credits.target, 1); - if (le16_to_cpu(data_transfer->credits_granted)) { - atomic_add(le16_to_cpu(data_transfer->credits_granted), - &sc->send_io.credits.count); - /* - * We have new send credits granted from remote peer - * If any sender is waiting for credits, unblock it - */ - wake_up(&sc->send_io.credits.wait_queue); - } - - log_incoming(INFO, "data flags %d data_offset %d data_length %d remaining_data_length %d\n", - le16_to_cpu(data_transfer->flags), - le32_to_cpu(data_transfer->data_offset), - le32_to_cpu(data_transfer->data_length), - 
le32_to_cpu(data_transfer->remaining_data_length)); - - /* Send an immediate response right away if requested */ - if (le16_to_cpu(data_transfer->flags) & - SMBDIRECT_FLAG_RESPONSE_REQUESTED) { - log_keep_alive(INFO, "schedule send of immediate response\n"); - queue_work(sc->workqueue, &sc->idle.immediate_work); - } - - /* - * If this is a packet with data playload place the data in - * reassembly queue and wake up the reading thread - */ - if (data_length) { - if (current_recv_credits <= (sc->recv_io.credits.target / 4) || - sc->recv_io.credits.target > old_recv_credit_target) - queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); - - enqueue_reassembly(sc, response, data_length); - wake_up(&sc->recv_io.reassembly.wait_queue); - } else - put_receive_buffer(sc, response); - - return; - - case SMBDIRECT_EXPECT_NEGOTIATE_REQ: - /* Only server... */ - break; - } - - /* - * This is an internal error! - */ - log_rdma_recv(ERR, "unexpected response type=%d\n", sc->recv_io.expected); - WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_DATA_TRANSFER); -error: - put_receive_buffer(sc, response); - smbd_disconnect_rdma_connection(sc); -} - -static struct rdma_cm_id *smbd_create_id( - struct smbdirect_socket *sc, - struct sockaddr *dstaddr, int port) -{ - struct smbdirect_socket_parameters *sp = &sc->parameters; - struct rdma_cm_id *id; - u8 node_type = RDMA_NODE_UNSPECIFIED; - int rc; - __be16 *sport; - - id = rdma_create_id(&init_net, smbd_conn_upcall, sc, - RDMA_PS_TCP, IB_QPT_RC); - if (IS_ERR(id)) { - rc = PTR_ERR(id); - log_rdma_event(ERR, "rdma_create_id() failed %i\n", rc); - return id; - } - - switch (port) { - case SMBD_PORT: - /* - * only allow iWarp devices - * for port 5445. - */ - node_type = RDMA_NODE_RNIC; - break; - case SMB_PORT: - /* - * only allow InfiniBand, RoCEv1 or RoCEv2 - * devices for port 445. - * - * (Basically don't allow iWarp devices) - */ - node_type = RDMA_NODE_IB_CA; - break; - } - rc = rdma_restrict_node_type(id, node_type); - if (rc) { - log_rdma_event(ERR, "rdma_restrict_node_type(%u) failed %i\n", - node_type, rc); - goto out; - } - - if (dstaddr->sa_family == AF_INET6) - sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port; - else - sport = &((struct sockaddr_in *)dstaddr)->sin_port; - - *sport = htons(port); - - WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED); - sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING; - rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)dstaddr, - sp->resolve_addr_timeout_msec); - if (rc) { - log_rdma_event(ERR, "rdma_resolve_addr() failed %i\n", rc); - goto out; - } - rc = wait_event_interruptible_timeout( - sc->status_wait, - sc->status != SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING, - msecs_to_jiffies(sp->resolve_addr_timeout_msec)); - /* e.g. 
if interrupted returns -ERESTARTSYS */ - if (rc < 0) { - log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc); - goto out; - } - if (sc->status == SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING) { - rc = -ETIMEDOUT; - log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc); - goto out; - } - if (sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED) { - rc = -EHOSTUNREACH; - log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc); - goto out; - } - - WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED); - sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING; - rc = rdma_resolve_route(id, sp->resolve_route_timeout_msec); - if (rc) { - log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc); - goto out; - } - rc = wait_event_interruptible_timeout( - sc->status_wait, - sc->status != SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING, - msecs_to_jiffies(sp->resolve_route_timeout_msec)); - /* e.g. if interrupted returns -ERESTARTSYS */ - if (rc < 0) { - log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc); - goto out; - } - if (sc->status == SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING) { - rc = -ETIMEDOUT; - log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc); - goto out; - } - if (sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED) { - rc = -ENETUNREACH; - log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc); - goto out; - } - - return id; - -out: - rdma_destroy_id(id); - return ERR_PTR(rc); -} - -/* - * Test if FRWR (Fast Registration Work Requests) is supported on the device - * This implementation requires FRWR on RDMA read/write - * return value: true if it is supported - */ -static bool frwr_is_supported(struct ib_device_attr *attrs) -{ - if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) - return false; - if (attrs->max_fast_reg_page_list_len == 0) - return false; - return true; -} - -static int smbd_ia_open( - struct smbdirect_socket *sc, - struct sockaddr *dstaddr, int port) -{ - struct smbdirect_socket_parameters *sp = &sc->parameters; - int rc; - - WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED); - sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED; - - sc->rdma.cm_id = smbd_create_id(sc, dstaddr, port); - if (IS_ERR(sc->rdma.cm_id)) { - rc = PTR_ERR(sc->rdma.cm_id); - goto out1; - } - sc->ib.dev = sc->rdma.cm_id->device; - - if (!frwr_is_supported(&sc->ib.dev->attrs)) { - log_rdma_event(ERR, "Fast Registration Work Requests (FRWR) is not supported\n"); - log_rdma_event(ERR, "Device capability flags = %llx max_fast_reg_page_list_len = %u\n", - sc->ib.dev->attrs.device_cap_flags, - sc->ib.dev->attrs.max_fast_reg_page_list_len); - rc = -EPROTONOSUPPORT; - goto out2; - } - sp->max_frmr_depth = min_t(u32, - sp->max_frmr_depth, - sc->ib.dev->attrs.max_fast_reg_page_list_len); - sc->mr_io.type = IB_MR_TYPE_MEM_REG; - if (sc->ib.dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG) - sc->mr_io.type = IB_MR_TYPE_SG_GAPS; - - return 0; - -out2: - rdma_destroy_id(sc->rdma.cm_id); - sc->rdma.cm_id = NULL; - -out1: - return rc; -} - -/* - * Send a negotiation request message to the peer - * The negotiation procedure is in [MS-SMBD] 3.1.5.2 and 3.1.5.3 - * After negotiation, the transport is connected and ready for - * carrying upper layer SMB payload - */ -static int smbd_post_send_negotiate_req(struct smbdirect_socket *sc) -{ - struct smbdirect_socket_parameters *sp = &sc->parameters; - int rc; - struct smbdirect_send_io *request; - struct smbdirect_negotiate_req *packet; - - request = smbd_alloc_send_io(sc); - if (IS_ERR(request)) - return 
PTR_ERR(request); - - packet = smbdirect_send_io_payload(request); - packet->min_version = cpu_to_le16(SMBDIRECT_V1); - packet->max_version = cpu_to_le16(SMBDIRECT_V1); - packet->reserved = 0; - packet->credits_requested = cpu_to_le16(sp->send_credit_target); - packet->preferred_send_size = cpu_to_le32(sp->max_send_size); - packet->max_receive_size = cpu_to_le32(sp->max_recv_size); - packet->max_fragmented_size = - cpu_to_le32(sp->max_fragmented_recv_size); - - request->sge[0].addr = ib_dma_map_single( - sc->ib.dev, (void *)packet, - sizeof(*packet), DMA_TO_DEVICE); - if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) { - rc = -EIO; - goto dma_mapping_failed; - } - request->num_sge = 1; - - request->sge[0].length = sizeof(*packet); - request->sge[0].lkey = sc->ib.pd->local_dma_lkey; - - rc = smbd_post_send(sc, NULL, request); - if (!rc) - return 0; - - if (rc == -EAGAIN) - rc = -EIO; - -dma_mapping_failed: - smbd_free_send_io(request); - return rc; -} - -/* - * Extend the credits to remote peer - * This implements [MS-SMBD] 3.1.5.9 - * The idea is that we should extend credits to remote peer as quickly as - * it's allowed, to maintain data flow. We allocate as much receive - * buffer as possible, and extend the receive credits to remote peer - * return value: the new credtis being granted. - */ -static int manage_credits_prior_sending(struct smbdirect_socket *sc) -{ - int missing; - int available; - int new_credits; - - if (atomic_read(&sc->recv_io.credits.count) >= sc->recv_io.credits.target) - return 0; - - missing = (int)sc->recv_io.credits.target - atomic_read(&sc->recv_io.credits.count); - available = atomic_xchg(&sc->recv_io.credits.available, 0); - new_credits = (u16)min3(U16_MAX, missing, available); - if (new_credits <= 0) { - /* - * If credits are available, but not granted - * we need to re-add them again. - */ - if (available) - atomic_add(available, &sc->recv_io.credits.available); - return 0; - } - - if (new_credits < available) { - /* - * Readd the remaining available again. - */ - available -= new_credits; - atomic_add(available, &sc->recv_io.credits.available); - } - - /* - * Remember we granted the credits - */ - atomic_add(new_credits, &sc->recv_io.credits.count); - return new_credits; -} - -/* - * Check if we need to send a KEEP_ALIVE message - * The idle connection timer triggers a KEEP_ALIVE message when expires - * SMBDIRECT_FLAG_RESPONSE_REQUESTED is set in the message flag to have peer send - * back a response. 
- * return value: - * 1 if SMBDIRECT_FLAG_RESPONSE_REQUESTED needs to be set - * 0: otherwise - */ -static int manage_keep_alive_before_sending(struct smbdirect_socket *sc) -{ - struct smbdirect_socket_parameters *sp = &sc->parameters; - - if (sc->idle.keepalive == SMBDIRECT_KEEPALIVE_PENDING) { - sc->idle.keepalive = SMBDIRECT_KEEPALIVE_SENT; - /* - * Now use the keepalive timeout (instead of keepalive interval) - * in order to wait for a response - */ - mod_delayed_work(sc->workqueue, &sc->idle.timer_work, - msecs_to_jiffies(sp->keepalive_timeout_msec)); - return 1; - } - return 0; -} - -static int smbd_ib_post_send(struct smbdirect_socket *sc, - struct ib_send_wr *wr) -{ - int ret; - - atomic_inc(&sc->send_io.pending.count); - ret = ib_post_send(sc->ib.qp, wr, NULL); - if (ret) { - pr_err("failed to post send: %d\n", ret); - smbd_disconnect_rdma_connection(sc); - ret = -EAGAIN; - } - return ret; -} - -/* Post the send request */ -static int smbd_post_send(struct smbdirect_socket *sc, - struct smbdirect_send_batch *batch, - struct smbdirect_send_io *request) -{ - int i; - - for (i = 0; i < request->num_sge; i++) { - log_rdma_send(INFO, - "rdma_request sge[%d] addr=0x%llx length=%u\n", - i, request->sge[i].addr, request->sge[i].length); - ib_dma_sync_single_for_device( - sc->ib.dev, - request->sge[i].addr, - request->sge[i].length, - DMA_TO_DEVICE); - } - - request->cqe.done = send_done; - request->wr.next = NULL; - request->wr.sg_list = request->sge; - request->wr.num_sge = request->num_sge; - request->wr.opcode = IB_WR_SEND; - - if (batch) { - request->wr.wr_cqe = NULL; - request->wr.send_flags = 0; - if (!list_empty(&batch->msg_list)) { - struct smbdirect_send_io *last; - - last = list_last_entry(&batch->msg_list, - struct smbdirect_send_io, - sibling_list); - last->wr.next = &request->wr; - } - list_add_tail(&request->sibling_list, &batch->msg_list); - batch->wr_cnt++; - return 0; - } - - request->wr.wr_cqe = &request->cqe; - request->wr.send_flags = IB_SEND_SIGNALED; - return smbd_ib_post_send(sc, &request->wr); -} - -static void smbd_send_batch_init(struct smbdirect_send_batch *batch, - bool need_invalidate_rkey, - unsigned int remote_key) -{ - INIT_LIST_HEAD(&batch->msg_list); - batch->wr_cnt = 0; - batch->need_invalidate_rkey = need_invalidate_rkey; - batch->remote_key = remote_key; - batch->credit = 0; -} - -static int smbd_send_batch_flush(struct smbdirect_socket *sc, - struct smbdirect_send_batch *batch, - bool is_last) -{ - struct smbdirect_send_io *first, *last; - int ret = 0; - - if (list_empty(&batch->msg_list)) - goto release_credit; - - first = list_first_entry(&batch->msg_list, - struct smbdirect_send_io, - sibling_list); - last = list_last_entry(&batch->msg_list, - struct smbdirect_send_io, - sibling_list); - - if (batch->need_invalidate_rkey) { - first->wr.opcode = IB_WR_SEND_WITH_INV; - first->wr.ex.invalidate_rkey = batch->remote_key; - batch->need_invalidate_rkey = false; - batch->remote_key = 0; - } - - last->wr.send_flags = IB_SEND_SIGNALED; - last->wr.wr_cqe = &last->cqe; - - /* - * Remove last from batch->msg_list - * and splice the rest of batch->msg_list - * to last->sibling_list. - * - * batch->msg_list is a valid empty list - * at the end. 
- */ - list_del_init(&last->sibling_list); - list_splice_tail_init(&batch->msg_list, &last->sibling_list); - batch->wr_cnt = 0; - - ret = smbd_ib_post_send(sc, &first->wr); - if (ret) { - struct smbdirect_send_io *sibling, *next; - - list_for_each_entry_safe(sibling, next, &last->sibling_list, sibling_list) { - list_del_init(&sibling->sibling_list); - smbd_free_send_io(sibling); - } - smbd_free_send_io(last); - } - -release_credit: - if (is_last && !ret && batch->credit) { - atomic_add(batch->credit, &sc->send_io.bcredits.count); - batch->credit = 0; - wake_up(&sc->send_io.bcredits.wait_queue); - } - - return ret; -} - -static int wait_for_credits(struct smbdirect_socket *sc, - wait_queue_head_t *waitq, atomic_t *total_credits, - int needed) -{ - int ret; - - do { - if (atomic_sub_return(needed, total_credits) >= 0) - return 0; - - atomic_add(needed, total_credits); - ret = wait_event_interruptible(*waitq, - atomic_read(total_credits) >= needed || - sc->status != SMBDIRECT_SOCKET_CONNECTED); - - if (sc->status != SMBDIRECT_SOCKET_CONNECTED) - return -ENOTCONN; - else if (ret < 0) - return ret; - } while (true); -} - -static int wait_for_send_bcredit(struct smbdirect_socket *sc, - struct smbdirect_send_batch *batch) -{ - int ret; - - if (batch->credit) - return 0; - - ret = wait_for_credits(sc, - &sc->send_io.bcredits.wait_queue, - &sc->send_io.bcredits.count, - 1); - if (ret) - return ret; - - batch->credit = 1; - return 0; -} - -static int wait_for_send_lcredit(struct smbdirect_socket *sc, - struct smbdirect_send_batch *batch) -{ - if (batch && (atomic_read(&sc->send_io.lcredits.count) <= 1)) { - int ret; - - ret = smbd_send_batch_flush(sc, batch, false); - if (ret) - return ret; - } - - return wait_for_credits(sc, - &sc->send_io.lcredits.wait_queue, - &sc->send_io.lcredits.count, - 1); -} - -static int wait_for_send_credits(struct smbdirect_socket *sc, - struct smbdirect_send_batch *batch) -{ - if (batch && - (batch->wr_cnt >= 16 || atomic_read(&sc->send_io.credits.count) <= 1)) { - int ret; - - ret = smbd_send_batch_flush(sc, batch, false); - if (ret) - return ret; - } - - return wait_for_credits(sc, - &sc->send_io.credits.wait_queue, - &sc->send_io.credits.count, - 1); -} - -static int smbd_post_send_iter(struct smbdirect_socket *sc, - struct smbdirect_send_batch *batch, - struct iov_iter *iter, - int *_remaining_data_length) -{ - struct smbdirect_socket_parameters *sp = &sc->parameters; - int rc; - int header_length; - int data_length; - struct smbdirect_send_io *request; - struct smbdirect_data_transfer *packet; - int new_credits = 0; - struct smbdirect_send_batch _batch; - - if (!batch) { - smbd_send_batch_init(&_batch, false, 0); - batch = &_batch; - } - - rc = wait_for_send_bcredit(sc, batch); - if (rc) { - log_outgoing(ERR, "disconnected not sending on wait_bcredit\n"); - rc = -EAGAIN; - goto err_wait_bcredit; - } - - rc = wait_for_send_lcredit(sc, batch); - if (rc) { - log_outgoing(ERR, "disconnected not sending on wait_lcredit\n"); - rc = -EAGAIN; - goto err_wait_lcredit; - } - - rc = wait_for_send_credits(sc, batch); - if (rc) { - log_outgoing(ERR, "disconnected not sending on wait_credit\n"); - rc = -EAGAIN; - goto err_wait_credit; - } - - new_credits = manage_credits_prior_sending(sc); - if (new_credits == 0 && - atomic_read(&sc->send_io.credits.count) == 0 && - atomic_read(&sc->recv_io.credits.count) == 0) { - queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); - rc = wait_event_interruptible(sc->send_io.credits.wait_queue, - atomic_read(&sc->send_io.credits.count) 
>= 1 || - atomic_read(&sc->recv_io.credits.available) >= 1 || - sc->status != SMBDIRECT_SOCKET_CONNECTED); - if (sc->status != SMBDIRECT_SOCKET_CONNECTED) - rc = -ENOTCONN; - if (rc < 0) { - log_outgoing(ERR, "disconnected not sending on last credit\n"); - rc = -EAGAIN; - goto err_wait_credit; - } - - new_credits = manage_credits_prior_sending(sc); - } - - request = smbd_alloc_send_io(sc); - if (IS_ERR(request)) { - rc = PTR_ERR(request); - goto err_alloc; - } - - memset(request->sge, 0, sizeof(request->sge)); - - /* Map the packet to DMA */ - header_length = sizeof(struct smbdirect_data_transfer); - /* If this is a packet without payload, don't send padding */ - if (!iter) - header_length = offsetof(struct smbdirect_data_transfer, padding); - - packet = smbdirect_send_io_payload(request); - request->sge[0].addr = ib_dma_map_single(sc->ib.dev, - (void *)packet, - header_length, - DMA_TO_DEVICE); - if (ib_dma_mapping_error(sc->ib.dev, request->sge[0].addr)) { - rc = -EIO; - goto err_dma; - } - - request->sge[0].length = header_length; - request->sge[0].lkey = sc->ib.pd->local_dma_lkey; - request->num_sge = 1; - - /* Fill in the data payload to find out how much data we can add */ - if (iter) { - struct smb_extract_to_rdma extract = { - .nr_sge = request->num_sge, - .max_sge = SMBDIRECT_SEND_IO_MAX_SGE, - .sge = request->sge, - .device = sc->ib.dev, - .local_dma_lkey = sc->ib.pd->local_dma_lkey, - .direction = DMA_TO_DEVICE, - }; - size_t payload_len = umin(*_remaining_data_length, - sp->max_send_size - sizeof(*packet)); - - rc = smb_extract_iter_to_rdma(iter, payload_len, - &extract); - if (rc < 0) - goto err_dma; - data_length = rc; - request->num_sge = extract.nr_sge; - *_remaining_data_length -= data_length; - } else { - data_length = 0; - } - - /* Fill in the packet header */ - packet->credits_requested = cpu_to_le16(sp->send_credit_target); - packet->credits_granted = cpu_to_le16(new_credits); - - packet->flags = 0; - if (manage_keep_alive_before_sending(sc)) - packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED); - - packet->reserved = 0; - if (!data_length) - packet->data_offset = 0; - else - packet->data_offset = cpu_to_le32(24); - packet->data_length = cpu_to_le32(data_length); - packet->remaining_data_length = cpu_to_le32(*_remaining_data_length); - packet->padding = 0; - - log_outgoing(INFO, "credits_requested=%d credits_granted=%d data_offset=%d data_length=%d remaining_data_length=%d\n", - le16_to_cpu(packet->credits_requested), - le16_to_cpu(packet->credits_granted), - le32_to_cpu(packet->data_offset), - le32_to_cpu(packet->data_length), - le32_to_cpu(packet->remaining_data_length)); - - rc = smbd_post_send(sc, batch, request); - if (!rc) { - /* - * From here request is moved to batch - * and we should not free it explicitly. 
- */ - - if (batch != &_batch) - return 0; - - rc = smbd_send_batch_flush(sc, batch, true); - if (!rc) - return 0; - - goto err_flush; - } - -err_dma: - smbd_free_send_io(request); - -err_flush: -err_alloc: - atomic_inc(&sc->send_io.credits.count); - wake_up(&sc->send_io.credits.wait_queue); - -err_wait_credit: - atomic_inc(&sc->send_io.lcredits.count); - wake_up(&sc->send_io.lcredits.wait_queue); - -err_wait_lcredit: - atomic_add(batch->credit, &sc->send_io.bcredits.count); - batch->credit = 0; - wake_up(&sc->send_io.bcredits.wait_queue); - -err_wait_bcredit: - return rc; -} - -/* - * Send an empty message - * Empty message is used to extend credits to peer to for keep live - * while there is no upper layer payload to send at the time - */ -static int smbd_post_send_empty(struct smbdirect_socket *sc) -{ - int remaining_data_length = 0; - - sc->statistics.send_empty++; - return smbd_post_send_iter(sc, NULL, NULL, &remaining_data_length); -} - static int smbd_post_send_full_iter(struct smbdirect_socket *sc, struct smbdirect_send_batch *batch, struct iov_iter *iter, - int *_remaining_data_length) + u32 remaining_data_length) { - int rc = 0; + int bytes = 0; /* - * smbd_post_send_iter() respects the + * smbdirect_connection_send_single_iter() respects the * negotiated max_send_size, so we need to * loop until the full iter is posted */ while (iov_iter_count(iter) > 0) { - rc = smbd_post_send_iter(sc, batch, iter, _remaining_data_length); + int rc; + + rc = smbdirect_connection_send_single_iter(sc, + batch, + iter, + 0, /* flags */ + remaining_data_length); if (rc < 0) - break; + return rc; + remaining_data_length -= rc; + bytes += rc; } - return rc; -} - -/* - * Post a receive request to the transport - * The remote peer can only send data when a receive request is posted - * The interaction is controlled by send/receive credit system - */ -static int smbd_post_recv( - struct smbdirect_socket *sc, struct smbdirect_recv_io *response) -{ - struct smbdirect_socket_parameters *sp = &sc->parameters; - struct ib_recv_wr recv_wr; - int rc = -EIO; - - response->sge.addr = ib_dma_map_single( - sc->ib.dev, response->packet, - sp->max_recv_size, DMA_FROM_DEVICE); - if (ib_dma_mapping_error(sc->ib.dev, response->sge.addr)) - return rc; - - response->sge.length = sp->max_recv_size; - response->sge.lkey = sc->ib.pd->local_dma_lkey; - - response->cqe.done = recv_done; - - recv_wr.wr_cqe = &response->cqe; - recv_wr.next = NULL; - recv_wr.sg_list = &response->sge; - recv_wr.num_sge = 1; - - rc = ib_post_recv(sc->ib.qp, &recv_wr, NULL); - if (rc) { - ib_dma_unmap_single(sc->ib.dev, response->sge.addr, - response->sge.length, DMA_FROM_DEVICE); - response->sge.length = 0; - smbd_disconnect_rdma_connection(sc); - log_rdma_recv(ERR, "ib_post_recv failed rc=%d\n", rc); - } - - return rc; -} - -/* Perform SMBD negotiate according to [MS-SMBD] 3.1.5.2 */ -static int smbd_negotiate(struct smbdirect_socket *sc) -{ - struct smbdirect_socket_parameters *sp = &sc->parameters; - int rc; - struct smbdirect_recv_io *response = get_receive_buffer(sc); - - WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED); - sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING; - - sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REP; - rc = smbd_post_recv(sc, response); - log_rdma_event(INFO, "smbd_post_recv rc=%d iov.addr=0x%llx iov.length=%u iov.lkey=0x%x\n", - rc, response->sge.addr, - response->sge.length, response->sge.lkey); - if (rc) { - put_receive_buffer(sc, response); - return rc; - } - - rc = 
smbd_post_send_negotiate_req(sc); - if (rc) - return rc; - - rc = wait_event_interruptible_timeout( - sc->status_wait, - sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING, - msecs_to_jiffies(sp->negotiate_timeout_msec)); - log_rdma_event(INFO, "wait_event_interruptible_timeout rc=%d\n", rc); - - if (sc->status == SMBDIRECT_SOCKET_CONNECTED) - return 0; - - if (rc == 0) - rc = -ETIMEDOUT; - else if (rc == -ERESTARTSYS) - rc = -EINTR; - else - rc = -ENOTCONN; - - return rc; -} - -/* - * Implement Connection.FragmentReassemblyBuffer defined in [MS-SMBD] 3.1.1.1 - * This is a queue for reassembling upper layer payload and present to upper - * layer. All the inncoming payload go to the reassembly queue, regardless of - * if reassembly is required. The uuper layer code reads from the queue for all - * incoming payloads. - * Put a received packet to the reassembly queue - * response: the packet received - * data_length: the size of payload in this packet - */ -static void enqueue_reassembly( - struct smbdirect_socket *sc, - struct smbdirect_recv_io *response, - int data_length) -{ - unsigned long flags; - - spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); - list_add_tail(&response->list, &sc->recv_io.reassembly.list); - sc->recv_io.reassembly.queue_length++; - /* - * Make sure reassembly_data_length is updated after list and - * reassembly_queue_length are updated. On the dequeue side - * reassembly_data_length is checked without a lock to determine - * if reassembly_queue_length and list is up to date - */ - virt_wmb(); - sc->recv_io.reassembly.data_length += data_length; - spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); - sc->statistics.enqueue_reassembly_queue++; -} - -/* - * Get the first entry at the front of reassembly queue - * Caller is responsible for locking - * return value: the first entry if any, NULL if queue is empty - */ -static struct smbdirect_recv_io *_get_first_reassembly(struct smbdirect_socket *sc) -{ - struct smbdirect_recv_io *ret = NULL; - - if (!list_empty(&sc->recv_io.reassembly.list)) { - ret = list_first_entry( - &sc->recv_io.reassembly.list, - struct smbdirect_recv_io, list); - } - return ret; -} - -/* - * Get a receive buffer - * For each remote send, we need to post a receive. The receive buffers are - * pre-allocated in advance. - * return value: the receive buffer, NULL if none is available - */ -static struct smbdirect_recv_io *get_receive_buffer(struct smbdirect_socket *sc) -{ - struct smbdirect_recv_io *ret = NULL; - unsigned long flags; - - spin_lock_irqsave(&sc->recv_io.free.lock, flags); - if (!list_empty(&sc->recv_io.free.list)) { - ret = list_first_entry( - &sc->recv_io.free.list, - struct smbdirect_recv_io, list); - list_del(&ret->list); - sc->statistics.get_receive_buffer++; - } - spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); - - return ret; -} - -/* - * Return a receive buffer - * Upon returning of a receive buffer, we can post new receive and extend - * more receive credits to remote peer. This is done immediately after a - * receive buffer is returned. 
- */ -static void put_receive_buffer( - struct smbdirect_socket *sc, struct smbdirect_recv_io *response) -{ - unsigned long flags; - - if (likely(response->sge.length != 0)) { - ib_dma_unmap_single(sc->ib.dev, - response->sge.addr, - response->sge.length, - DMA_FROM_DEVICE); - response->sge.length = 0; - } - - spin_lock_irqsave(&sc->recv_io.free.lock, flags); - list_add_tail(&response->list, &sc->recv_io.free.list); - sc->statistics.put_receive_buffer++; - spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); - - queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); -} - -/* Preallocate all receive buffer on transport establishment */ -static int allocate_receive_buffers(struct smbdirect_socket *sc, int num_buf) -{ - struct smbdirect_recv_io *response; - int i; - - for (i = 0; i < num_buf; i++) { - response = mempool_alloc(sc->recv_io.mem.pool, GFP_KERNEL); - if (!response) - goto allocate_failed; - - response->socket = sc; - response->sge.length = 0; - list_add_tail(&response->list, &sc->recv_io.free.list); - } - - return 0; - -allocate_failed: - while (!list_empty(&sc->recv_io.free.list)) { - response = list_first_entry( - &sc->recv_io.free.list, - struct smbdirect_recv_io, list); - list_del(&response->list); - - mempool_free(response, sc->recv_io.mem.pool); - } - return -ENOMEM; -} - -static void destroy_receive_buffers(struct smbdirect_socket *sc) -{ - struct smbdirect_recv_io *response; - - while ((response = get_receive_buffer(sc))) - mempool_free(response, sc->recv_io.mem.pool); -} - -static void send_immediate_empty_message(struct work_struct *work) -{ - struct smbdirect_socket *sc = - container_of(work, struct smbdirect_socket, idle.immediate_work); - - if (sc->status != SMBDIRECT_SOCKET_CONNECTED) - return; - - log_keep_alive(INFO, "send an empty message\n"); - smbd_post_send_empty(sc); -} - -/* Implement idle connection timer [MS-SMBD] 3.1.6.2 */ -static void idle_connection_timer(struct work_struct *work) -{ - struct smbdirect_socket *sc = - container_of(work, struct smbdirect_socket, idle.timer_work.work); - struct smbdirect_socket_parameters *sp = &sc->parameters; - - if (sc->idle.keepalive != SMBDIRECT_KEEPALIVE_NONE) { - log_keep_alive(ERR, - "error status sc->idle.keepalive=%d\n", - sc->idle.keepalive); - smbd_disconnect_rdma_connection(sc); - return; - } - - if (sc->status != SMBDIRECT_SOCKET_CONNECTED) - return; - - /* - * Now use the keepalive timeout (instead of keepalive interval) - * in order to wait for a response - */ - sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; - mod_delayed_work(sc->workqueue, &sc->idle.timer_work, - msecs_to_jiffies(sp->keepalive_timeout_msec)); - log_keep_alive(INFO, "schedule send of empty idle message\n"); - queue_work(sc->workqueue, &sc->idle.immediate_work); + return bytes; } /* @@ -1892,88 +203,14 @@ static void idle_connection_timer(struct work_struct *work) void smbd_destroy(struct TCP_Server_Info *server) { struct smbd_connection *info = server->smbd_conn; - struct smbdirect_socket *sc; - struct smbdirect_recv_io *response; - unsigned long flags; if (!info) { log_rdma_event(INFO, "rdma session already destroyed\n"); return; } - sc = &info->socket; - log_rdma_event(INFO, "cancelling and disable disconnect_work\n"); - disable_work_sync(&sc->disconnect_work); + smbdirect_socket_release(info->socket); - log_rdma_event(INFO, "destroying rdma session\n"); - if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING) - smbd_disconnect_rdma_work(&sc->disconnect_work); - if (sc->status < SMBDIRECT_SOCKET_DISCONNECTED) { - log_rdma_event(INFO, 
"wait for transport being disconnected\n"); - wait_event(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED); - log_rdma_event(INFO, "waited for transport being disconnected\n"); - } - - /* - * Wake up all waiters in all wait queues - * in order to notice the broken connection. - * - * Most likely this was already called via - * smbd_disconnect_rdma_work(), but call it again... - */ - smbd_disconnect_wake_up_all(sc); - - log_rdma_event(INFO, "cancelling recv_io.posted.refill_work\n"); - disable_work_sync(&sc->recv_io.posted.refill_work); - - log_rdma_event(INFO, "destroying qp\n"); - ib_drain_qp(sc->ib.qp); - rdma_destroy_qp(sc->rdma.cm_id); - sc->ib.qp = NULL; - - log_rdma_event(INFO, "cancelling idle timer\n"); - disable_delayed_work_sync(&sc->idle.timer_work); - log_rdma_event(INFO, "cancelling send immediate work\n"); - disable_work_sync(&sc->idle.immediate_work); - - /* It's not possible for upper layer to get to reassembly */ - log_rdma_event(INFO, "drain the reassembly queue\n"); - do { - spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); - response = _get_first_reassembly(sc); - if (response) { - list_del(&response->list); - spin_unlock_irqrestore( - &sc->recv_io.reassembly.lock, flags); - put_receive_buffer(sc, response); - } else - spin_unlock_irqrestore( - &sc->recv_io.reassembly.lock, flags); - } while (response); - sc->recv_io.reassembly.data_length = 0; - - log_rdma_event(INFO, "free receive buffers\n"); - destroy_receive_buffers(sc); - - log_rdma_event(INFO, "freeing mr list\n"); - destroy_mr_list(sc); - - ib_free_cq(sc->ib.send_cq); - ib_free_cq(sc->ib.recv_cq); - ib_dealloc_pd(sc->ib.pd); - rdma_destroy_id(sc->rdma.cm_id); - - /* free mempools */ - mempool_destroy(sc->send_io.mem.pool); - kmem_cache_destroy(sc->send_io.mem.cache); - - mempool_destroy(sc->recv_io.mem.pool); - kmem_cache_destroy(sc->recv_io.mem.cache); - - sc->status = SMBDIRECT_SOCKET_DESTROYED; - - destroy_workqueue(sc->workqueue); - log_rdma_event(INFO, "rdma session destroyed\n"); kfree(info); server->smbd_conn = NULL; } @@ -1995,10 +232,8 @@ int smbd_reconnect(struct TCP_Server_Info *server) * This is possible if transport is disconnected and we haven't received * notification from RDMA, but upper layer has detected timeout */ - if (server->smbd_conn->socket.status == SMBDIRECT_SOCKET_CONNECTED) { - log_rdma_event(INFO, "disconnecting transport\n"); - smbd_destroy(server); - } + log_rdma_event(INFO, "disconnecting transport\n"); + smbd_destroy(server); create_conn: log_rdma_event(INFO, "creating rdma session\n"); @@ -2014,112 +249,43 @@ create_conn: return -ENOENT; } -static void destroy_caches(struct smbdirect_socket *sc) -{ - destroy_receive_buffers(sc); - mempool_destroy(sc->recv_io.mem.pool); - kmem_cache_destroy(sc->recv_io.mem.cache); - mempool_destroy(sc->send_io.mem.pool); - kmem_cache_destroy(sc->send_io.mem.cache); -} - -#define MAX_NAME_LEN 80 -static int allocate_caches(struct smbdirect_socket *sc) -{ - struct smbdirect_socket_parameters *sp = &sc->parameters; - char name[MAX_NAME_LEN]; - int rc; - - if (WARN_ON_ONCE(sp->max_recv_size < sizeof(struct smbdirect_data_transfer))) - return -ENOMEM; - - scnprintf(name, MAX_NAME_LEN, "smbdirect_send_io_%p", sc); - sc->send_io.mem.cache = - kmem_cache_create( - name, - sizeof(struct smbdirect_send_io) + - sizeof(struct smbdirect_data_transfer), - 0, SLAB_HWCACHE_ALIGN, NULL); - if (!sc->send_io.mem.cache) - return -ENOMEM; - - sc->send_io.mem.pool = - mempool_create(sp->send_credit_target, mempool_alloc_slab, - mempool_free_slab, 
sc->send_io.mem.cache); - if (!sc->send_io.mem.pool) - goto out1; - - scnprintf(name, MAX_NAME_LEN, "smbdirect_recv_io_%p", sc); - - struct kmem_cache_args response_args = { - .align = __alignof__(struct smbdirect_recv_io), - .useroffset = (offsetof(struct smbdirect_recv_io, packet) + - sizeof(struct smbdirect_data_transfer)), - .usersize = sp->max_recv_size - sizeof(struct smbdirect_data_transfer), - }; - sc->recv_io.mem.cache = - kmem_cache_create(name, - sizeof(struct smbdirect_recv_io) + sp->max_recv_size, - &response_args, SLAB_HWCACHE_ALIGN); - if (!sc->recv_io.mem.cache) - goto out2; - - sc->recv_io.mem.pool = - mempool_create(sp->recv_credit_max, mempool_alloc_slab, - mempool_free_slab, sc->recv_io.mem.cache); - if (!sc->recv_io.mem.pool) - goto out3; - - rc = allocate_receive_buffers(sc, sp->recv_credit_max); - if (rc) { - log_rdma_event(ERR, "failed to allocate receive buffers\n"); - goto out4; - } - - return 0; - -out4: - mempool_destroy(sc->recv_io.mem.pool); -out3: - kmem_cache_destroy(sc->recv_io.mem.cache); -out2: - mempool_destroy(sc->send_io.mem.pool); -out1: - kmem_cache_destroy(sc->send_io.mem.cache); - return -ENOMEM; -} - /* Create a SMBD connection, called by upper layer */ static struct smbd_connection *_smbd_get_connection( struct TCP_Server_Info *server, struct sockaddr *dstaddr, int port) { - int rc; + struct net *net = cifs_net_ns(server); struct smbd_connection *info; struct smbdirect_socket *sc; + struct smbdirect_socket_parameters init_params = {}; struct smbdirect_socket_parameters *sp; - struct rdma_conn_param conn_param; - struct ib_qp_cap qp_cap; - struct ib_qp_init_attr qp_attr; - struct sockaddr_in *addr_in = (struct sockaddr_in *) dstaddr; - struct ib_port_immutable port_immutable; - __be32 ird_ord_hdr[2]; - char wq_name[80]; - struct workqueue_struct *workqueue; + __be16 *sport; + u64 port_flags = 0; + int ret; - info = kzalloc_obj(struct smbd_connection); - if (!info) - return NULL; - sc = &info->socket; - scnprintf(wq_name, ARRAY_SIZE(wq_name), "smbd_%p", sc); - workqueue = create_workqueue(wq_name); - if (!workqueue) - goto create_wq_failed; - smbdirect_socket_init(sc); - sc->workqueue = workqueue; - sp = &sc->parameters; - - INIT_WORK(&sc->disconnect_work, smbd_disconnect_rdma_work); + switch (port) { + case SMBD_PORT: + /* + * only allow iWarp devices + * for port 5445. + */ + port_flags |= SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW; + break; + case SMB_PORT: + /* + * only allow InfiniBand, RoCEv1 or RoCEv2 + * devices for port 445. + * + * (Basically don't allow iWarp devices) + */ + port_flags |= SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB; + break; + } + /* + * Create the initial parameters + */ + sp = &init_params; + sp->flags = port_flags; sp->resolve_addr_timeout_msec = RDMA_RESOLVE_TIMEOUT; sp->resolve_route_timeout_msec = RDMA_RESOLVE_TIMEOUT; sp->rdma_connect_timeout_msec = RDMA_RESOLVE_TIMEOUT; @@ -2135,215 +301,57 @@ static struct smbd_connection *_smbd_get_connection( sp->keepalive_interval_msec = smbd_keep_alive_interval * 1000; sp->keepalive_timeout_msec = KEEPALIVE_RECV_TIMEOUT * 1000; - rc = smbd_ia_open(sc, dstaddr, port); - if (rc) { - log_rdma_event(INFO, "smbd_ia_open rc=%d\n", rc); - goto create_id_failed; - } - - if (sp->send_credit_target > sc->ib.dev->attrs.max_cqe || - sp->send_credit_target > sc->ib.dev->attrs.max_qp_wr) { - log_rdma_event(ERR, "consider lowering send_credit_target = %d. 
Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", - sp->send_credit_target, - sc->ib.dev->attrs.max_cqe, - sc->ib.dev->attrs.max_qp_wr); - goto config_failed; - } - - if (sp->recv_credit_max > sc->ib.dev->attrs.max_cqe || - sp->recv_credit_max > sc->ib.dev->attrs.max_qp_wr) { - log_rdma_event(ERR, "consider lowering receive_credit_max = %d. Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", - sp->recv_credit_max, - sc->ib.dev->attrs.max_cqe, - sc->ib.dev->attrs.max_qp_wr); - goto config_failed; - } - - if (sc->ib.dev->attrs.max_send_sge < SMBDIRECT_SEND_IO_MAX_SGE || - sc->ib.dev->attrs.max_recv_sge < SMBDIRECT_RECV_IO_MAX_SGE) { - log_rdma_event(ERR, - "device %.*s max_send_sge/max_recv_sge = %d/%d too small\n", - IB_DEVICE_NAME_MAX, - sc->ib.dev->name, - sc->ib.dev->attrs.max_send_sge, - sc->ib.dev->attrs.max_recv_sge); - goto config_failed; - } - - sp->responder_resources = - min_t(u8, sp->responder_resources, - sc->ib.dev->attrs.max_qp_rd_atom); - log_rdma_mr(INFO, "responder_resources=%d\n", - sp->responder_resources); - - /* - * We use allocate sp->responder_resources * 2 MRs - * and each MR needs WRs for REG and INV, so - * we use '* 4'. - * - * +1 for ib_drain_qp() - */ - memset(&qp_cap, 0, sizeof(qp_cap)); - qp_cap.max_send_wr = sp->send_credit_target + sp->responder_resources * 4 + 1; - qp_cap.max_recv_wr = sp->recv_credit_max + 1; - qp_cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE; - qp_cap.max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE; - - sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0); - if (IS_ERR(sc->ib.pd)) { - rc = PTR_ERR(sc->ib.pd); - sc->ib.pd = NULL; - log_rdma_event(ERR, "ib_alloc_pd() returned %d\n", rc); - goto alloc_pd_failed; - } - - sc->ib.send_cq = - ib_alloc_cq_any(sc->ib.dev, sc, - qp_cap.max_send_wr, IB_POLL_SOFTIRQ); - if (IS_ERR(sc->ib.send_cq)) { - sc->ib.send_cq = NULL; - goto alloc_cq_failed; - } - - sc->ib.recv_cq = - ib_alloc_cq_any(sc->ib.dev, sc, - qp_cap.max_recv_wr, IB_POLL_SOFTIRQ); - if (IS_ERR(sc->ib.recv_cq)) { - sc->ib.recv_cq = NULL; - goto alloc_cq_failed; - } - - memset(&qp_attr, 0, sizeof(qp_attr)); - qp_attr.event_handler = smbd_qp_async_error_upcall; - qp_attr.qp_context = sc; - qp_attr.cap = qp_cap; - qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; - qp_attr.qp_type = IB_QPT_RC; - qp_attr.send_cq = sc->ib.send_cq; - qp_attr.recv_cq = sc->ib.recv_cq; - qp_attr.port_num = ~0; - - rc = rdma_create_qp(sc->rdma.cm_id, sc->ib.pd, &qp_attr); - if (rc) { - log_rdma_event(ERR, "rdma_create_qp failed %i\n", rc); - goto create_qp_failed; - } - sc->ib.qp = sc->rdma.cm_id->qp; - - memset(&conn_param, 0, sizeof(conn_param)); - conn_param.initiator_depth = sp->initiator_depth; - conn_param.responder_resources = sp->responder_resources; - - /* Need to send IRD/ORD in private data for iWARP */ - sc->ib.dev->ops.get_port_immutable( - sc->ib.dev, sc->rdma.cm_id->port_num, &port_immutable); - if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) { - ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources); - ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth); - conn_param.private_data = ird_ord_hdr; - conn_param.private_data_len = sizeof(ird_ord_hdr); - } else { - conn_param.private_data = NULL; - conn_param.private_data_len = 0; - } - - conn_param.retry_count = SMBD_CM_RETRY; - conn_param.rnr_retry_count = SMBD_CM_RNR_RETRY; - conn_param.flow_control = 0; - - log_rdma_event(INFO, "connecting to IP %pI4 port %d\n", - &addr_in->sin_addr, port); - - WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED); - sc->status = 
SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING; - rc = rdma_connect(sc->rdma.cm_id, &conn_param); - if (rc) { - log_rdma_event(ERR, "rdma_connect() failed with %i\n", rc); - goto rdma_connect_failed; - } - - wait_event_interruptible_timeout( - sc->status_wait, - sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING, - msecs_to_jiffies(sp->rdma_connect_timeout_msec)); - - if (sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED) { - log_rdma_event(ERR, "rdma_connect failed port=%d\n", port); - goto rdma_connect_failed; - } - - log_rdma_event(INFO, "rdma_connect connected\n"); - - rc = allocate_caches(sc); - if (rc) { - log_rdma_event(ERR, "cache allocation failed\n"); - goto allocate_cache_failed; - } - - INIT_WORK(&sc->idle.immediate_work, send_immediate_empty_message); - INIT_DELAYED_WORK(&sc->idle.timer_work, idle_connection_timer); - /* - * start with the negotiate timeout and SMBDIRECT_KEEPALIVE_PENDING - * so that the timer will cause a disconnect. - */ - sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; - mod_delayed_work(sc->workqueue, &sc->idle.timer_work, - msecs_to_jiffies(sp->negotiate_timeout_msec)); - - INIT_WORK(&sc->recv_io.posted.refill_work, smbd_post_send_credits); - - rc = smbd_negotiate(sc); - if (rc) { - log_rdma_event(ERR, "smbd_negotiate rc=%d\n", rc); - goto negotiation_failed; - } - - rc = allocate_mr_list(sc); - if (rc) { - log_rdma_mr(ERR, "memory registration allocation failed\n"); - goto allocate_mr_failed; + info = kzalloc_obj(*info); + if (!info) + return NULL; + ret = smbdirect_socket_create_kern(net, &sc); + if (ret) + goto socket_init_failed; + smbdirect_socket_set_logging(sc, NULL, smbd_logging_needed, smbd_logging_vaprintf); + ret = smbdirect_socket_set_initial_parameters(sc, sp); + if (ret) + goto set_params_failed; + ret = smbdirect_socket_set_kernel_settings(sc, IB_POLL_SOFTIRQ, GFP_KERNEL); + if (ret) + goto set_settings_failed; + + if (dstaddr->sa_family == AF_INET6) + sport = &((struct sockaddr_in6 *)dstaddr)->sin6_port; + else + sport = &((struct sockaddr_in *)dstaddr)->sin_port; + + *sport = htons(port); + + ret = smbdirect_connect_sync(sc, dstaddr); + if (ret) { + log_rdma_event(ERR, "connect to %pISpsfc failed: %1pe\n", + dstaddr, ERR_PTR(ret)); + goto connect_failed; } + info->socket = sc; return info; -allocate_mr_failed: - /* At this point, need to a full transport shutdown */ - server->smbd_conn = info; - smbd_destroy(server); - return NULL; - -negotiation_failed: - disable_delayed_work_sync(&sc->idle.timer_work); - destroy_caches(sc); - sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; - rdma_disconnect(sc->rdma.cm_id); - wait_event(sc->status_wait, - sc->status == SMBDIRECT_SOCKET_DISCONNECTED); - -allocate_cache_failed: -rdma_connect_failed: - rdma_destroy_qp(sc->rdma.cm_id); - -create_qp_failed: -alloc_cq_failed: - if (sc->ib.send_cq) - ib_free_cq(sc->ib.send_cq); - if (sc->ib.recv_cq) - ib_free_cq(sc->ib.recv_cq); - - ib_dealloc_pd(sc->ib.pd); - -alloc_pd_failed: -config_failed: - rdma_destroy_id(sc->rdma.cm_id); - -create_id_failed: - destroy_workqueue(sc->workqueue); -create_wq_failed: +connect_failed: +set_settings_failed: +set_params_failed: + smbdirect_socket_release(sc); +socket_init_failed: kfree(info); return NULL; } +const struct smbdirect_socket_parameters *smbd_get_parameters(struct smbd_connection *conn) +{ + if (unlikely(!conn->socket)) { + static const struct smbdirect_socket_parameters zero_params; + + return &zero_params; + } + + return smbdirect_socket_get_current_parameters(conn->socket); +} + struct smbd_connection 
*smbd_get_connection( struct TCP_Server_Info *server, struct sockaddr *dstaddr) { @@ -2362,7 +370,7 @@ try_again: if (!ret) return NULL; - sp = &ret->socket.parameters; + sp = smbd_get_parameters(ret); server->rdma_readwrite_threshold = rdma_readwrite_threshold > sp->max_fragmented_send_size ? @@ -2388,138 +396,12 @@ try_again: */ int smbd_recv(struct smbd_connection *info, struct msghdr *msg) { - struct smbdirect_socket *sc = &info->socket; - struct smbdirect_recv_io *response; - struct smbdirect_data_transfer *data_transfer; - size_t size = iov_iter_count(&msg->msg_iter); - int to_copy, to_read, data_read, offset; - u32 data_length, remaining_data_length, data_offset; - int rc; + struct smbdirect_socket *sc = info->socket; - if (WARN_ON_ONCE(iov_iter_rw(&msg->msg_iter) == WRITE)) - return -EINVAL; /* It's a bug in upper layer to get there */ + if (!smbdirect_connection_is_connected(sc)) + return -ENOTCONN; -again: - /* - * No need to hold the reassembly queue lock all the time as we are - * the only one reading from the front of the queue. The transport - * may add more entries to the back of the queue at the same time - */ - log_read(INFO, "size=%zd sc->recv_io.reassembly.data_length=%d\n", size, - sc->recv_io.reassembly.data_length); - if (sc->recv_io.reassembly.data_length >= size) { - int queue_length; - int queue_removed = 0; - unsigned long flags; - - /* - * Need to make sure reassembly_data_length is read before - * reading reassembly_queue_length and calling - * _get_first_reassembly. This call is lock free - * as we never read at the end of the queue which are being - * updated in SOFTIRQ as more data is received - */ - virt_rmb(); - queue_length = sc->recv_io.reassembly.queue_length; - data_read = 0; - to_read = size; - offset = sc->recv_io.reassembly.first_entry_offset; - while (data_read < size) { - response = _get_first_reassembly(sc); - data_transfer = smbdirect_recv_io_payload(response); - data_length = le32_to_cpu(data_transfer->data_length); - remaining_data_length = - le32_to_cpu( - data_transfer->remaining_data_length); - data_offset = le32_to_cpu(data_transfer->data_offset); - - /* - * The upper layer expects RFC1002 length at the - * beginning of the payload. Return it to indicate - * the total length of the packet. This minimize the - * change to upper layer packet processing logic. This - * will be eventually remove when an intermediate - * transport layer is added - */ - if (response->first_segment && size == 4) { - unsigned int rfc1002_len = - data_length + remaining_data_length; - __be32 rfc1002_hdr = cpu_to_be32(rfc1002_len); - if (copy_to_iter(&rfc1002_hdr, sizeof(rfc1002_hdr), - &msg->msg_iter) != sizeof(rfc1002_hdr)) - return -EFAULT; - data_read = 4; - response->first_segment = false; - log_read(INFO, "returning rfc1002 length %d\n", - rfc1002_len); - goto read_rfc1002_done; - } - - to_copy = min_t(int, data_length - offset, to_read); - if (copy_to_iter((char *)data_transfer + data_offset + offset, - to_copy, &msg->msg_iter) != to_copy) - return -EFAULT; - - /* move on to the next buffer? 
*/ - if (to_copy == data_length - offset) { - queue_length--; - /* - * No need to lock if we are not at the - * end of the queue - */ - if (queue_length) - list_del(&response->list); - else { - spin_lock_irqsave( - &sc->recv_io.reassembly.lock, flags); - list_del(&response->list); - spin_unlock_irqrestore( - &sc->recv_io.reassembly.lock, flags); - } - queue_removed++; - sc->statistics.dequeue_reassembly_queue++; - put_receive_buffer(sc, response); - offset = 0; - log_read(INFO, "put_receive_buffer offset=0\n"); - } else - offset += to_copy; - - to_read -= to_copy; - data_read += to_copy; - - log_read(INFO, "_get_first_reassembly memcpy %d bytes data_transfer_length-offset=%d after that to_read=%d data_read=%d offset=%d\n", - to_copy, data_length - offset, - to_read, data_read, offset); - } - - spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); - sc->recv_io.reassembly.data_length -= data_read; - sc->recv_io.reassembly.queue_length -= queue_removed; - spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); - - sc->recv_io.reassembly.first_entry_offset = offset; - log_read(INFO, "returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n", - data_read, sc->recv_io.reassembly.data_length, - sc->recv_io.reassembly.first_entry_offset); -read_rfc1002_done: - return data_read; - } - - log_read(INFO, "wait_event on more data\n"); - rc = wait_event_interruptible( - sc->recv_io.reassembly.wait_queue, - sc->recv_io.reassembly.data_length >= size || - sc->status != SMBDIRECT_SOCKET_CONNECTED); - /* Don't return any data if interrupted */ - if (rc) - return rc; - - if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { - log_read(ERR, "disconnected\n"); - return -ECONNABORTED; - } - - goto again; + return smbdirect_connection_recvmsg(sc, msg, 0); } /* @@ -2532,16 +414,17 @@ int smbd_send(struct TCP_Server_Info *server, int num_rqst, struct smb_rqst *rqst_array) { struct smbd_connection *info = server->smbd_conn; - struct smbdirect_socket *sc = &info->socket; - struct smbdirect_socket_parameters *sp = &sc->parameters; + struct smbdirect_socket *sc = info->socket; + const struct smbdirect_socket_parameters *sp = smbd_get_parameters(info); struct smb_rqst *rqst; struct iov_iter iter; - struct smbdirect_send_batch batch; + struct smbdirect_send_batch_storage bstorage; + struct smbdirect_send_batch *batch; unsigned int remaining_data_length, klen; int rc, i, rqst_idx; int error = 0; - if (sc->status != SMBDIRECT_SOCKET_CONNECTED) + if (!smbdirect_connection_is_connected(sc)) return -EAGAIN; /* @@ -2564,7 +447,7 @@ int smbd_send(struct TCP_Server_Info *server, num_rqst, remaining_data_length); rqst_idx = 0; - smbd_send_batch_init(&batch, false, 0); + batch = smbdirect_init_send_batch_storage(&bstorage, false, 0); do { rqst = &rqst_array[rqst_idx]; @@ -2583,25 +466,27 @@ int smbd_send(struct TCP_Server_Info *server, klen += rqst->rq_iov[i].iov_len; iov_iter_kvec(&iter, ITER_SOURCE, rqst->rq_iov, rqst->rq_nvec, klen); - rc = smbd_post_send_full_iter(sc, &batch, &iter, &remaining_data_length); + rc = smbd_post_send_full_iter(sc, batch, &iter, remaining_data_length); if (rc < 0) { error = rc; break; } + remaining_data_length -= rc; if (iov_iter_count(&rqst->rq_iter) > 0) { /* And then the data pages if there are any */ - rc = smbd_post_send_full_iter(sc, &batch, &rqst->rq_iter, - &remaining_data_length); + rc = smbd_post_send_full_iter(sc, batch, &rqst->rq_iter, + remaining_data_length); if (rc < 0) { error = rc; break; } + remaining_data_length -= rc; } } while (++rqst_idx < 
num_rqst); - rc = smbd_send_batch_flush(sc, &batch, true); + rc = smbdirect_connection_send_batch_flush(sc, batch, true); if (unlikely(!rc && error)) rc = error; @@ -2612,298 +497,15 @@ int smbd_send(struct TCP_Server_Info *server, * that means all the I/Os have been out and we are good to return */ - wait_event(sc->send_io.pending.zero_wait_queue, - atomic_read(&sc->send_io.pending.count) == 0 || - sc->status != SMBDIRECT_SOCKET_CONNECTED); + error = rc; + rc = smbdirect_connection_send_wait_zero_pending(sc); + if (unlikely(rc && !error)) + error = -EAGAIN; - if (sc->status != SMBDIRECT_SOCKET_CONNECTED && rc == 0) - rc = -EAGAIN; - - return rc; -} - -static void register_mr_done(struct ib_cq *cq, struct ib_wc *wc) -{ - struct smbdirect_mr_io *mr = - container_of(wc->wr_cqe, struct smbdirect_mr_io, cqe); - struct smbdirect_socket *sc = mr->socket; - - if (wc->status) { - log_rdma_mr(ERR, "status=%d\n", wc->status); - smbd_disconnect_rdma_connection(sc); - } -} - -/* - * The work queue function that recovers MRs - * We need to call ib_dereg_mr() and ib_alloc_mr() before this MR can be used - * again. Both calls are slow, so finish them in a workqueue. This will not - * block I/O path. - * There is one workqueue that recovers MRs, there is no need to lock as the - * I/O requests calling smbd_register_mr will never update the links in the - * mr_list. - */ -static void smbd_mr_recovery_work(struct work_struct *work) -{ - struct smbdirect_socket *sc = - container_of(work, struct smbdirect_socket, mr_io.recovery_work); - struct smbdirect_socket_parameters *sp = &sc->parameters; - struct smbdirect_mr_io *smbdirect_mr; - int rc; - - list_for_each_entry(smbdirect_mr, &sc->mr_io.all.list, list) { - if (smbdirect_mr->state == SMBDIRECT_MR_ERROR) { - - /* recover this MR entry */ - rc = ib_dereg_mr(smbdirect_mr->mr); - if (rc) { - log_rdma_mr(ERR, - "ib_dereg_mr failed rc=%x\n", - rc); - smbd_disconnect_rdma_connection(sc); - continue; - } - - smbdirect_mr->mr = ib_alloc_mr( - sc->ib.pd, sc->mr_io.type, - sp->max_frmr_depth); - if (IS_ERR(smbdirect_mr->mr)) { - log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n", - sc->mr_io.type, - sp->max_frmr_depth); - smbd_disconnect_rdma_connection(sc); - continue; - } - } else - /* This MR is being used, don't recover it */ - continue; - - smbdirect_mr->state = SMBDIRECT_MR_READY; - - /* smbdirect_mr->state is updated by this function - * and is read and updated by I/O issuing CPUs trying - * to get a MR, the call to atomic_inc_return - * implicates a memory barrier and guarantees this - * value is updated before waking up any calls to - * get_mr() from the I/O issuing CPUs - */ - if (atomic_inc_return(&sc->mr_io.ready.count) == 1) - wake_up(&sc->mr_io.ready.wait_queue); - } -} - -static void smbd_mr_disable_locked(struct smbdirect_mr_io *mr) -{ - struct smbdirect_socket *sc = mr->socket; - - lockdep_assert_held(&mr->mutex); - - if (mr->state == SMBDIRECT_MR_DISABLED) - return; - - if (mr->mr) - ib_dereg_mr(mr->mr); - if (mr->sgt.nents) - ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir); - kfree(mr->sgt.sgl); - - mr->mr = NULL; - mr->sgt.sgl = NULL; - mr->sgt.nents = 0; - - mr->state = SMBDIRECT_MR_DISABLED; -} - -static void smbd_mr_free_locked(struct kref *kref) -{ - struct smbdirect_mr_io *mr = - container_of(kref, struct smbdirect_mr_io, kref); - - lockdep_assert_held(&mr->mutex); - - /* - * smbd_mr_disable_locked() should already be called! 
- */ - if (WARN_ON_ONCE(mr->state != SMBDIRECT_MR_DISABLED)) - smbd_mr_disable_locked(mr); - - mutex_unlock(&mr->mutex); - mutex_destroy(&mr->mutex); - kfree(mr); -} - -static void destroy_mr_list(struct smbdirect_socket *sc) -{ - struct smbdirect_mr_io *mr, *tmp; - LIST_HEAD(all_list); - unsigned long flags; - - disable_work_sync(&sc->mr_io.recovery_work); - - spin_lock_irqsave(&sc->mr_io.all.lock, flags); - list_splice_tail_init(&sc->mr_io.all.list, &all_list); - spin_unlock_irqrestore(&sc->mr_io.all.lock, flags); - - list_for_each_entry_safe(mr, tmp, &all_list, list) { - mutex_lock(&mr->mutex); - - smbd_mr_disable_locked(mr); - list_del(&mr->list); - mr->socket = NULL; - - /* - * No kref_put_mutex() as it's already locked. - * - * If smbd_mr_free_locked() is called - * and the mutex is unlocked and mr is gone, - * in that case kref_put() returned 1. - * - * If kref_put() returned 0 we know that - * smbd_mr_free_locked() didn't - * run. Not by us nor by anyone else, as we - * still hold the mutex, so we need to unlock. - * - * If the mr is still registered it will - * be dangling (detached from the connection - * waiting for smbd_deregister_mr() to be - * called in order to free the memory. - */ - if (!kref_put(&mr->kref, smbd_mr_free_locked)) - mutex_unlock(&mr->mutex); - } -} - -/* - * Allocate MRs used for RDMA read/write - * The number of MRs will not exceed hardware capability in responder_resources - * All MRs are kept in mr_list. The MR can be recovered after it's used - * Recovery is done in smbd_mr_recovery_work. The content of list entry changes - * as MRs are used and recovered for I/O, but the list links will not change - */ -static int allocate_mr_list(struct smbdirect_socket *sc) -{ - struct smbdirect_socket_parameters *sp = &sc->parameters; - struct smbdirect_mr_io *mr; - int ret; - u32 i; - - if (sp->responder_resources == 0) { - log_rdma_mr(ERR, "responder_resources negotiated as 0\n"); - return -EINVAL; - } - - /* Allocate more MRs (2x) than hardware responder_resources */ - for (i = 0; i < sp->responder_resources * 2; i++) { - mr = kzalloc_obj(*mr); - if (!mr) { - ret = -ENOMEM; - goto kzalloc_mr_failed; - } - - kref_init(&mr->kref); - mutex_init(&mr->mutex); - - mr->mr = ib_alloc_mr(sc->ib.pd, - sc->mr_io.type, - sp->max_frmr_depth); - if (IS_ERR(mr->mr)) { - ret = PTR_ERR(mr->mr); - log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n", - sc->mr_io.type, sp->max_frmr_depth); - goto ib_alloc_mr_failed; - } - - mr->sgt.sgl = kzalloc_objs(struct scatterlist, - sp->max_frmr_depth); - if (!mr->sgt.sgl) { - ret = -ENOMEM; - log_rdma_mr(ERR, "failed to allocate sgl\n"); - goto kcalloc_sgl_failed; - } - mr->state = SMBDIRECT_MR_READY; - mr->socket = sc; - - list_add_tail(&mr->list, &sc->mr_io.all.list); - atomic_inc(&sc->mr_io.ready.count); - } - - INIT_WORK(&sc->mr_io.recovery_work, smbd_mr_recovery_work); + if (unlikely(error)) + return error; return 0; - -kcalloc_sgl_failed: - ib_dereg_mr(mr->mr); -ib_alloc_mr_failed: - mutex_destroy(&mr->mutex); - kfree(mr); -kzalloc_mr_failed: - destroy_mr_list(sc); - return ret; -} - -/* - * Get a MR from mr_list. This function waits until there is at least one - * MR available in the list. It may access the list while the - * smbd_mr_recovery_work is recovering the MR list. This doesn't need a lock - * as they never modify the same places. However, there may be several CPUs - * issuing I/O trying to get MR at the same time, mr_list_lock is used to - * protect this situation. 
- */ -static struct smbdirect_mr_io *get_mr(struct smbdirect_socket *sc) -{ - struct smbdirect_mr_io *ret; - unsigned long flags; - int rc; -again: - rc = wait_event_interruptible(sc->mr_io.ready.wait_queue, - atomic_read(&sc->mr_io.ready.count) || - sc->status != SMBDIRECT_SOCKET_CONNECTED); - if (rc) { - log_rdma_mr(ERR, "wait_event_interruptible rc=%x\n", rc); - return NULL; - } - - if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { - log_rdma_mr(ERR, "sc->status=%x\n", sc->status); - return NULL; - } - - spin_lock_irqsave(&sc->mr_io.all.lock, flags); - list_for_each_entry(ret, &sc->mr_io.all.list, list) { - if (ret->state == SMBDIRECT_MR_READY) { - ret->state = SMBDIRECT_MR_REGISTERED; - kref_get(&ret->kref); - spin_unlock_irqrestore(&sc->mr_io.all.lock, flags); - atomic_dec(&sc->mr_io.ready.count); - atomic_inc(&sc->mr_io.used.count); - return ret; - } - } - - spin_unlock_irqrestore(&sc->mr_io.all.lock, flags); - /* - * It is possible that we could fail to get MR because other processes may - * try to acquire a MR at the same time. If this is the case, retry it. - */ - goto again; -} - -/* - * Transcribe the pages from an iterator into an MR scatterlist. - */ -static int smbd_iter_to_mr(struct iov_iter *iter, - struct sg_table *sgt, - unsigned int max_sg) -{ - int ret; - - memset(sgt->sgl, 0, max_sg * sizeof(struct scatterlist)); - - ret = extract_iter_to_sg(iter, iov_iter_count(iter), sgt, max_sg, 0); - WARN_ON(ret < 0); - if (sgt->nents > 0) - sg_mark_end(&sgt->sgl[sgt->nents - 1]); - return ret; } /* @@ -2917,132 +519,18 @@ struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info, struct iov_iter *iter, bool writing, bool need_invalidate) { - struct smbdirect_socket *sc = &info->socket; - struct smbdirect_socket_parameters *sp = &sc->parameters; - struct smbdirect_mr_io *mr; - int rc, num_pages; - struct ib_reg_wr *reg_wr; + struct smbdirect_socket *sc = info->socket; - num_pages = iov_iter_npages(iter, sp->max_frmr_depth + 1); - if (num_pages > sp->max_frmr_depth) { - log_rdma_mr(ERR, "num_pages=%d max_frmr_depth=%d\n", - num_pages, sp->max_frmr_depth); - WARN_ON_ONCE(1); + if (!smbdirect_connection_is_connected(sc)) return NULL; - } - mr = get_mr(sc); - if (!mr) { - log_rdma_mr(ERR, "get_mr returning NULL\n"); - return NULL; - } - - mutex_lock(&mr->mutex); - - mr->dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; - mr->need_invalidate = need_invalidate; - mr->sgt.nents = 0; - mr->sgt.orig_nents = 0; - - log_rdma_mr(INFO, "num_pages=0x%x count=0x%zx depth=%u\n", - num_pages, iov_iter_count(iter), sp->max_frmr_depth); - smbd_iter_to_mr(iter, &mr->sgt, sp->max_frmr_depth); - - rc = ib_dma_map_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir); - if (!rc) { - log_rdma_mr(ERR, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n", - num_pages, mr->dir, rc); - goto dma_map_error; - } - - rc = ib_map_mr_sg(mr->mr, mr->sgt.sgl, mr->sgt.nents, NULL, PAGE_SIZE); - if (rc != mr->sgt.nents) { - log_rdma_mr(ERR, - "ib_map_mr_sg failed rc = %d nents = %x\n", - rc, mr->sgt.nents); - goto map_mr_error; - } - - ib_update_fast_reg_key(mr->mr, ib_inc_rkey(mr->mr->rkey)); - reg_wr = &mr->wr; - reg_wr->wr.opcode = IB_WR_REG_MR; - mr->cqe.done = register_mr_done; - reg_wr->wr.wr_cqe = &mr->cqe; - reg_wr->wr.num_sge = 0; - reg_wr->wr.send_flags = IB_SEND_SIGNALED; - reg_wr->mr = mr->mr; - reg_wr->key = mr->mr->rkey; - reg_wr->access = writing ? 
- IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : - IB_ACCESS_REMOTE_READ; - - /* - * There is no need for waiting for complemtion on ib_post_send - * on IB_WR_REG_MR. Hardware enforces a barrier and order of execution - * on the next ib_post_send when we actually send I/O to remote peer - */ - rc = ib_post_send(sc->ib.qp, ®_wr->wr, NULL); - if (!rc) { - /* - * get_mr() gave us a reference - * via kref_get(&mr->kref), we keep that and let - * the caller use smbd_deregister_mr() - * to remove it again. - */ - mutex_unlock(&mr->mutex); - return mr; - } - - log_rdma_mr(ERR, "ib_post_send failed rc=%x reg_wr->key=%x\n", - rc, reg_wr->key); - - /* If all failed, attempt to recover this MR by setting it SMBDIRECT_MR_ERROR*/ -map_mr_error: - ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir); - -dma_map_error: - mr->sgt.nents = 0; - mr->state = SMBDIRECT_MR_ERROR; - if (atomic_dec_and_test(&sc->mr_io.used.count)) - wake_up(&sc->mr_io.cleanup.wait_queue); - - smbd_disconnect_rdma_connection(sc); - - /* - * get_mr() gave us a reference - * via kref_get(&mr->kref), we need to remove it again - * on error. - * - * No kref_put_mutex() as it's already locked. - * - * If smbd_mr_free_locked() is called - * and the mutex is unlocked and mr is gone, - * in that case kref_put() returned 1. - * - * If kref_put() returned 0 we know that - * smbd_mr_free_locked() didn't - * run. Not by us nor by anyone else, as we - * still hold the mutex, so we need to unlock. - */ - if (!kref_put(&mr->kref, smbd_mr_free_locked)) - mutex_unlock(&mr->mutex); - - return NULL; + return smbdirect_connection_register_mr_io(sc, iter, writing, need_invalidate); } -static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc) +void smbd_mr_fill_buffer_descriptor(struct smbdirect_mr_io *mr, + struct smbdirect_buffer_descriptor_v1 *v1) { - struct smbdirect_mr_io *smbdirect_mr; - struct ib_cqe *cqe; - - cqe = wc->wr_cqe; - smbdirect_mr = container_of(cqe, struct smbdirect_mr_io, cqe); - smbdirect_mr->state = SMBDIRECT_MR_INVALIDATED; - if (wc->status != IB_WC_SUCCESS) { - log_rdma_mr(ERR, "invalidate failed status=%x\n", wc->status); - smbdirect_mr->state = SMBDIRECT_MR_ERROR; - } - complete(&smbdirect_mr->invalidate_done); + smbdirect_mr_io_fill_buffer_descriptor(mr, v1); } /* @@ -3053,300 +541,20 @@ static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc) */ void smbd_deregister_mr(struct smbdirect_mr_io *mr) { - struct smbdirect_socket *sc = mr->socket; - - mutex_lock(&mr->mutex); - if (mr->state == SMBDIRECT_MR_DISABLED) - goto put_kref; - - if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { - smbd_mr_disable_locked(mr); - goto put_kref; - } - - if (mr->need_invalidate) { - struct ib_send_wr *wr = &mr->inv_wr; - int rc; - - /* Need to finish local invalidation before returning */ - wr->opcode = IB_WR_LOCAL_INV; - mr->cqe.done = local_inv_done; - wr->wr_cqe = &mr->cqe; - wr->num_sge = 0; - wr->ex.invalidate_rkey = mr->mr->rkey; - wr->send_flags = IB_SEND_SIGNALED; - - init_completion(&mr->invalidate_done); - rc = ib_post_send(sc->ib.qp, wr, NULL); - if (rc) { - log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc); - smbd_mr_disable_locked(mr); - smbd_disconnect_rdma_connection(sc); - goto done; - } - wait_for_completion(&mr->invalidate_done); - mr->need_invalidate = false; - } else - /* - * For remote invalidation, just set it to SMBDIRECT_MR_INVALIDATED - * and defer to mr_recovery_work to recover the MR for next use - */ - mr->state = SMBDIRECT_MR_INVALIDATED; - - if (mr->sgt.nents) { - 
ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir); - mr->sgt.nents = 0; - } - - if (mr->state == SMBDIRECT_MR_INVALIDATED) { - mr->state = SMBDIRECT_MR_READY; - if (atomic_inc_return(&sc->mr_io.ready.count) == 1) - wake_up(&sc->mr_io.ready.wait_queue); - } else - /* - * Schedule the work to do MR recovery for future I/Os MR - * recovery is slow and don't want it to block current I/O - */ - queue_work(sc->workqueue, &sc->mr_io.recovery_work); - -done: - if (atomic_dec_and_test(&sc->mr_io.used.count)) - wake_up(&sc->mr_io.cleanup.wait_queue); - -put_kref: - /* - * No kref_put_mutex() as it's already locked. - * - * If smbd_mr_free_locked() is called - * and the mutex is unlocked and mr is gone, - * in that case kref_put() returned 1. - * - * If kref_put() returned 0 we know that - * smbd_mr_free_locked() didn't - * run. Not by us nor by anyone else, as we - * still hold the mutex, so we need to unlock - * and keep the mr in SMBDIRECT_MR_READY or - * SMBDIRECT_MR_ERROR state. - */ - if (!kref_put(&mr->kref, smbd_mr_free_locked)) - mutex_unlock(&mr->mutex); + smbdirect_connection_deregister_mr_io(mr); } -static bool smb_set_sge(struct smb_extract_to_rdma *rdma, - struct page *lowest_page, size_t off, size_t len) +void smbd_debug_proc_show(struct TCP_Server_Info *server, struct seq_file *m) { - struct ib_sge *sge = &rdma->sge[rdma->nr_sge]; - u64 addr; + if (!server->rdma) + return; - addr = ib_dma_map_page(rdma->device, lowest_page, - off, len, rdma->direction); - if (ib_dma_mapping_error(rdma->device, addr)) - return false; - - sge->addr = addr; - sge->length = len; - sge->lkey = rdma->local_dma_lkey; - rdma->nr_sge++; - return true; -} - -/* - * Extract page fragments from a BVEC-class iterator and add them to an RDMA - * element list. The pages are not pinned. - */ -static ssize_t smb_extract_bvec_to_rdma(struct iov_iter *iter, - struct smb_extract_to_rdma *rdma, - ssize_t maxsize) -{ - const struct bio_vec *bv = iter->bvec; - unsigned long start = iter->iov_offset; - unsigned int i; - ssize_t ret = 0; - - for (i = 0; i < iter->nr_segs; i++) { - size_t off, len; - - len = bv[i].bv_len; - if (start >= len) { - start -= len; - continue; - } - - len = min_t(size_t, maxsize, len - start); - off = bv[i].bv_offset + start; - - if (!smb_set_sge(rdma, bv[i].bv_page, off, len)) - return -EIO; - - ret += len; - maxsize -= len; - if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0) - break; - start = 0; + if (!server->smbd_conn) { + seq_puts(m, "\nSMBDirect transport not available"); + return; } - if (ret > 0) - iov_iter_advance(iter, ret); - return ret; -} - -/* - * Extract fragments from a KVEC-class iterator and add them to an RDMA list. - * This can deal with vmalloc'd buffers as well as kmalloc'd or static buffers. - * The pages are not pinned. 
- */ -static ssize_t smb_extract_kvec_to_rdma(struct iov_iter *iter, - struct smb_extract_to_rdma *rdma, - ssize_t maxsize) -{ - const struct kvec *kv = iter->kvec; - unsigned long start = iter->iov_offset; - unsigned int i; - ssize_t ret = 0; - - for (i = 0; i < iter->nr_segs; i++) { - struct page *page; - unsigned long kaddr; - size_t off, len, seg; - - len = kv[i].iov_len; - if (start >= len) { - start -= len; - continue; - } - - kaddr = (unsigned long)kv[i].iov_base + start; - off = kaddr & ~PAGE_MASK; - len = min_t(size_t, maxsize, len - start); - kaddr &= PAGE_MASK; - - maxsize -= len; - do { - seg = min_t(size_t, len, PAGE_SIZE - off); - - if (is_vmalloc_or_module_addr((void *)kaddr)) - page = vmalloc_to_page((void *)kaddr); - else - page = virt_to_page((void *)kaddr); - - if (!smb_set_sge(rdma, page, off, seg)) - return -EIO; - - ret += seg; - len -= seg; - kaddr += PAGE_SIZE; - off = 0; - } while (len > 0 && rdma->nr_sge < rdma->max_sge); - - if (rdma->nr_sge >= rdma->max_sge || maxsize <= 0) - break; - start = 0; - } - - if (ret > 0) - iov_iter_advance(iter, ret); - return ret; -} - -/* - * Extract folio fragments from a FOLIOQ-class iterator and add them to an RDMA - * list. The folios are not pinned. - */ -static ssize_t smb_extract_folioq_to_rdma(struct iov_iter *iter, - struct smb_extract_to_rdma *rdma, - ssize_t maxsize) -{ - const struct folio_queue *folioq = iter->folioq; - unsigned int slot = iter->folioq_slot; - ssize_t ret = 0; - size_t offset = iter->iov_offset; - - BUG_ON(!folioq); - - if (slot >= folioq_nr_slots(folioq)) { - folioq = folioq->next; - if (WARN_ON_ONCE(!folioq)) - return -EIO; - slot = 0; - } - - do { - struct folio *folio = folioq_folio(folioq, slot); - size_t fsize = folioq_folio_size(folioq, slot); - - if (offset < fsize) { - size_t part = umin(maxsize, fsize - offset); - - if (!smb_set_sge(rdma, folio_page(folio, 0), offset, part)) - return -EIO; - - offset += part; - ret += part; - maxsize -= part; - } - - if (offset >= fsize) { - offset = 0; - slot++; - if (slot >= folioq_nr_slots(folioq)) { - if (!folioq->next) { - WARN_ON_ONCE(ret < iter->count); - break; - } - folioq = folioq->next; - slot = 0; - } - } - } while (rdma->nr_sge < rdma->max_sge && maxsize > 0); - - iter->folioq = folioq; - iter->folioq_slot = slot; - iter->iov_offset = offset; - iter->count -= ret; - return ret; -} - -/* - * Extract page fragments from up to the given amount of the source iterator - * and build up an RDMA list that refers to all of those bits. The RDMA list - * is appended to, up to the maximum number of elements set in the parameter - * block. - * - * The extracted page fragments are not pinned or ref'd in any way; if an - * IOVEC/UBUF-type iterator is to be used, it should be converted to a - * BVEC-type iterator and the pages pinned, ref'd or otherwise held in some - * way. 
- */ -static ssize_t smb_extract_iter_to_rdma(struct iov_iter *iter, size_t len, - struct smb_extract_to_rdma *rdma) -{ - ssize_t ret; - int before = rdma->nr_sge; - - switch (iov_iter_type(iter)) { - case ITER_BVEC: - ret = smb_extract_bvec_to_rdma(iter, rdma, len); - break; - case ITER_KVEC: - ret = smb_extract_kvec_to_rdma(iter, rdma, len); - break; - case ITER_FOLIOQ: - ret = smb_extract_folioq_to_rdma(iter, rdma, len); - break; - default: - WARN_ON_ONCE(1); - return -EIO; - } - - if (ret < 0) { - while (rdma->nr_sge > before) { - struct ib_sge *sge = &rdma->sge[rdma->nr_sge--]; - - ib_dma_unmap_single(rdma->device, sge->addr, sge->length, - rdma->direction); - sge->addr = 0; - } - } - - return ret; + smbdirect_connection_legacy_debug_proc_show(server->smbd_conn->socket, + server->rdma_readwrite_threshold, + m); } diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index 577d37dbeb8a..0017d5b2de44 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -11,12 +11,8 @@ #define cifs_rdma_enabled(server) ((server)->rdma) #include "cifsglob.h" -#include -#include -#include #include "../common/smbdirect/smbdirect.h" -#include "../common/smbdirect/smbdirect_socket.h" extern int rdma_readwrite_threshold; extern int smbd_max_frmr_depth; @@ -27,17 +23,8 @@ extern int smbd_max_send_size; extern int smbd_send_credit_target; extern int smbd_receive_credit_max; -/* - * The context for the SMBDirect transport - * Everything related to the transport is here. It has several logical parts - * 1. RDMA related structures - * 2. SMBDirect connection parameters - * 3. Memory registrations - * 4. Receive and reassembly queues for data receive path - * 5. mempools for allocating packets - */ struct smbd_connection { - struct smbdirect_socket socket; + struct smbdirect_socket *socket; }; /* Create a SMBDirect session */ @@ -60,8 +47,12 @@ int smbd_send(struct TCP_Server_Info *server, struct smbdirect_mr_io *smbd_register_mr( struct smbd_connection *info, struct iov_iter *iter, bool writing, bool need_invalidate); +void smbd_mr_fill_buffer_descriptor(struct smbdirect_mr_io *mr, + struct smbdirect_buffer_descriptor_v1 *v1); void smbd_deregister_mr(struct smbdirect_mr_io *mr); +void smbd_debug_proc_show(struct TCP_Server_Info *server, struct seq_file *m); + #else #define cifs_rdma_enabled(server) 0 struct smbd_connection {}; diff --git a/fs/smb/common/Makefile b/fs/smb/common/Makefile index 9e0730a385fb..e6ee65c31b5d 100644 --- a/fs/smb/common/Makefile +++ b/fs/smb/common/Makefile @@ -4,3 +4,4 @@ # obj-$(CONFIG_SMBFS) += cifs_md4.o +obj-$(CONFIG_SMB_COMMON_SMBDIRECT) += smbdirect/ diff --git a/fs/smb/common/smbdirect/Kconfig b/fs/smb/common/smbdirect/Kconfig new file mode 100644 index 000000000000..a46a2e6ec87a --- /dev/null +++ b/fs/smb/common/smbdirect/Kconfig @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# smbdirect configuration + +config SMB_COMMON_SMBDIRECT + def_tristate n + depends on INFINIBAND && INFINIBAND_ADDR_TRANS + depends on m || INFINIBAND=y + select SG_POOL diff --git a/fs/smb/common/smbdirect/Makefile b/fs/smb/common/smbdirect/Makefile new file mode 100644 index 000000000000..423f533e1002 --- /dev/null +++ b/fs/smb/common/smbdirect/Makefile @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# Makefile for smbdirect support +# + +obj-$(CONFIG_SMB_COMMON_SMBDIRECT) += smbdirect.o + +smbdirect-y := \ + smbdirect_socket.o \ + smbdirect_connection.o \ + smbdirect_mr.o \ + smbdirect_rw.o \ + smbdirect_debug.o \ + smbdirect_connect.o 
\ + smbdirect_listen.o \ + smbdirect_accept.o \ + smbdirect_devices.o \ + smbdirect_main.o diff --git a/fs/smb/common/smbdirect/smbdirect.h b/fs/smb/common/smbdirect/smbdirect.h index 821a34c4cc47..bbab5f7f7cc9 100644 --- a/fs/smb/common/smbdirect/smbdirect.h +++ b/fs/smb/common/smbdirect/smbdirect.h @@ -1,7 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ /* - * Copyright (C) 2017, Microsoft Corporation. - * Copyright (C) 2018, LG Electronics. + * Copyright (C) 2025 Stefan Metzmacher */ #ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_H__ @@ -25,12 +24,15 @@ struct smbdirect_buffer_descriptor_v1 { * Some values are important for the upper layer. */ struct smbdirect_socket_parameters { + __u64 flags; +#define SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB ((__u64)0x1) +#define SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW ((__u64)0x2) __u32 resolve_addr_timeout_msec; __u32 resolve_route_timeout_msec; __u32 rdma_connect_timeout_msec; __u32 negotiate_timeout_msec; - __u8 initiator_depth; - __u8 responder_resources; + __u16 initiator_depth; /* limited to U8_MAX */ + __u16 responder_resources; /* limited to U8_MAX */ __u16 recv_credit_max; __u16 send_credit_target; __u32 max_send_size; @@ -43,4 +45,8 @@ struct smbdirect_socket_parameters { __u32 keepalive_timeout_msec; } __packed; +#define SMBDIRECT_FLAG_PORT_RANGE_MASK ( \ + SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB | \ + SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW) + #endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_H__ */ diff --git a/fs/smb/common/smbdirect/smbdirect_accept.c b/fs/smb/common/smbdirect/smbdirect_accept.c new file mode 100644 index 000000000000..d6d5e6a3f5de --- /dev/null +++ b/fs/smb/common/smbdirect/smbdirect_accept.c @@ -0,0 +1,857 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017, Microsoft Corporation. + * Copyright (C) 2018, LG Electronics. 
+ * Copyright (c) 2025, Stefan Metzmacher + */ + +#include "smbdirect_internal.h" +#include +#include "../../common/smb2status.h" + +static int smbdirect_accept_rdma_event_handler(struct rdma_cm_id *id, + struct rdma_cm_event *event); +static int smbdirect_accept_init_params(struct smbdirect_socket *sc); +static void smbdirect_accept_negotiate_recv_done(struct ib_cq *cq, struct ib_wc *wc); +static void smbdirect_accept_negotiate_send_done(struct ib_cq *cq, struct ib_wc *wc); + +int smbdirect_accept_connect_request(struct smbdirect_socket *sc, + const struct rdma_conn_param *param) +{ + struct smbdirect_socket_parameters *sp = &sc->parameters; + struct smbdirect_recv_io *recv_io; + u8 peer_initiator_depth; + u8 peer_responder_resources; + struct rdma_conn_param conn_param; + __be32 ird_ord_hdr[2]; + int ret; + + if (SMBDIRECT_CHECK_STATUS_WARN(sc, SMBDIRECT_SOCKET_CREATED)) + return -EINVAL; + + /* + * First set what we as the server are able to support + */ + sp->initiator_depth = min_t(u8, sp->initiator_depth, + sc->ib.dev->attrs.max_qp_rd_atom); + + peer_initiator_depth = param->initiator_depth; + peer_responder_resources = param->responder_resources; + smbdirect_connection_negotiate_rdma_resources(sc, + peer_initiator_depth, + peer_responder_resources, + param); + + ret = smbdirect_accept_init_params(sc); + if (ret) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "smbdirect_accept_init_params() failed %1pe\n", + SMBDIRECT_DEBUG_ERR_PTR(ret)); + goto init_params_failed; + } + + ret = smbdirect_connection_create_qp(sc); + if (ret) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "smbdirect_connection_create_qp() failed %1pe\n", + SMBDIRECT_DEBUG_ERR_PTR(ret)); + goto create_qp_failed; + } + + ret = smbdirect_connection_create_mem_pools(sc); + if (ret) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "smbdirect_connection_create_mem_pools() failed %1pe\n", + SMBDIRECT_DEBUG_ERR_PTR(ret)); + goto create_mem_failed; + } + + recv_io = smbdirect_connection_get_recv_io(sc); + if (WARN_ON_ONCE(!recv_io)) { + ret = -EINVAL; + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "smbdirect_connection_get_recv_io() failed %1pe\n", + SMBDIRECT_DEBUG_ERR_PTR(ret)); + goto get_recv_io_failed; + } + recv_io->cqe.done = smbdirect_accept_negotiate_recv_done; + + /* + * Now post the recv_io buffer in order to get + * the negotiate request + */ + sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REQ; + ret = smbdirect_connection_post_recv_io(recv_io); + if (ret) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "smbdirect_connection_post_recv_io() failed %1pe\n", + SMBDIRECT_DEBUG_ERR_PTR(ret)); + goto post_recv_io_failed; + } + /* + * From here recv_io is known to the RDMA QP and needs ib_drain_qp and + * smbdirect_accept_negotiate_recv_done to clean up... + */ + recv_io = NULL; + + /* already checked with SMBDIRECT_CHECK_STATUS_WARN above */ + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED); + sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED; + + /* + * We already negotiated sp->initiator_depth + * and sp->responder_resources above.
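+ * + * As a rough illustrative sketch (the real logic lives in + * smbdirect_connection_negotiate_rdma_resources(), this is not a + * verbatim copy), IRD/ORD negotiation boils down to clamping our + * values against what the peer announced: + * + *	sp->responder_resources = + *		min_t(u8, sp->responder_resources, peer_initiator_depth); + *	sp->initiator_depth = + *		min_t(u8, sp->initiator_depth, peer_responder_resources); + * + * so both sides end up with values the other side can actually serve.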
+ */ + memset(&conn_param, 0, sizeof(conn_param)); + conn_param.initiator_depth = sp->initiator_depth; + conn_param.responder_resources = sp->responder_resources; + + if (sc->rdma.legacy_iwarp) { + ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources); + ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth); + conn_param.private_data = ird_ord_hdr; + conn_param.private_data_len = sizeof(ird_ord_hdr); + } else { + conn_param.private_data = NULL; + conn_param.private_data_len = 0; + } + conn_param.retry_count = SMBDIRECT_RDMA_CM_RETRY; + conn_param.rnr_retry_count = SMBDIRECT_RDMA_CM_RNR_RETRY; + conn_param.flow_control = 0; + + /* explicitly set above */ + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED); + sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING; + sc->rdma.expected_event = RDMA_CM_EVENT_ESTABLISHED; + sc->rdma.cm_id->event_handler = smbdirect_accept_rdma_event_handler; + ret = rdma_accept(sc->rdma.cm_id, &conn_param); + if (ret) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "rdma_accept() failed %1pe\n", + SMBDIRECT_DEBUG_ERR_PTR(ret)); + goto rdma_accept_failed; + } + + /* + * start with the negotiate timeout and SMBDIRECT_KEEPALIVE_PENDING + * so that the timer will cause a disconnect. + */ + INIT_DELAYED_WORK(&sc->idle.timer_work, smbdirect_connection_idle_timer_work); + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; + mod_delayed_work(sc->workqueues.idle, &sc->idle.timer_work, + msecs_to_jiffies(sp->negotiate_timeout_msec)); + + return 0; + +rdma_accept_failed: + /* + * smbdirect_connection_destroy_qp() calls ib_drain_qp(), + * so that smbdirect_accept_negotiate_recv_done() will + * call smbdirect_connection_put_recv_io() + */ +post_recv_io_failed: + if (recv_io) + smbdirect_connection_put_recv_io(recv_io); +get_recv_io_failed: + smbdirect_connection_destroy_mem_pools(sc); +create_mem_failed: + smbdirect_connection_destroy_qp(sc); +create_qp_failed: +init_params_failed: + return ret; +} + +static int smbdirect_accept_init_params(struct smbdirect_socket *sc) +{ + const struct smbdirect_socket_parameters *sp = &sc->parameters; + int max_send_sges; + unsigned int maxpages; + + /* need 3 more SGEs, because a SMB_DIRECT header, SMB2 header + * and SMB2 response could be mapped. + */ + max_send_sges = DIV_ROUND_UP(sp->max_send_size, PAGE_SIZE) + 3; + if (max_send_sges > SMBDIRECT_SEND_IO_MAX_SGE) { + pr_err("max_send_size %d is too large\n", sp->max_send_size); + return -EINVAL; + } + + /* + * There is only a single batch credit. + */ + atomic_set(&sc->send_io.bcredits.count, 1); + + /* + * Initialize the local credits to post + * IB_WR_SEND[_WITH_INV].
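+ * + * As a hedged usage sketch (only the lcredits counter below is real, + * the helper names here are made up for illustration): a sender would + * claim one local credit per send WR it posts and the send completion + * handler would return it, roughly: + * + *	if (atomic_dec_if_positive(&sc->send_io.lcredits.count) < 0) + *		wait_until_a_send_completes();	(hypothetical helper) + *	... post the IB_WR_SEND ... + *	atomic_inc(&sc->send_io.lcredits.count);	(from the send CQE)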
+ */ + atomic_set(&sc->send_io.lcredits.count, sp->send_credit_target); + + if (sp->max_read_write_size) { + maxpages = DIV_ROUND_UP(sp->max_read_write_size, PAGE_SIZE); + sc->rw_io.credits.max = rdma_rw_mr_factor(sc->ib.dev, + sc->rdma.cm_id->port_num, + maxpages); + sc->rw_io.credits.num_pages = DIV_ROUND_UP(maxpages, sc->rw_io.credits.max); + /* add one extra in order to handle unaligned pages */ + sc->rw_io.credits.max += 1; + } + + sc->recv_io.credits.target = 1; + + atomic_set(&sc->rw_io.credits.count, sc->rw_io.credits.max); + + return 0; +} + +static void smbdirect_accept_negotiate_recv_work(struct work_struct *work); + +static void smbdirect_accept_negotiate_recv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct smbdirect_recv_io *recv_io = + container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe); + struct smbdirect_socket *sc = recv_io->socket; + unsigned long flags; + + if (unlikely(wc->status != IB_WC_SUCCESS || WARN_ON_ONCE(wc->opcode != IB_WC_RECV))) { + if (wc->status != IB_WC_WR_FLUSH_ERR) + smbdirect_log_rdma_recv(sc, SMBDIRECT_LOG_ERR, + "wc->status=%s (%d) wc->opcode=%d\n", + ib_wc_status_msg(wc->status), wc->status, wc->opcode); + goto error; + } + + smbdirect_log_rdma_recv(sc, SMBDIRECT_LOG_INFO, + "smbdirect_recv_io completed. status='%s (%d)', opcode=%d\n", + ib_wc_status_msg(wc->status), wc->status, wc->opcode); + + /* + * This is an internal error! + */ + if (WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_NEGOTIATE_REQ)) + goto error; + + /* + * Don't reset the timer to the keepalive interval here, + * this will be done in smbdirect_accept_negotiate_recv_work(). + */ + + ib_dma_sync_single_for_cpu(sc->ib.dev, + recv_io->sge.addr, + recv_io->sge.length, + DMA_FROM_DEVICE); + + /* + * Only remember recv_io if it has enough bytes; this gives + * smbdirect_accept_negotiate_recv_work() enough + * information in order to disconnect if it was not + * valid. + */ + sc->recv_io.reassembly.full_packet_received = true; + if (wc->byte_len >= sizeof(struct smbdirect_negotiate_req)) + smbdirect_connection_reassembly_append_recv_io(sc, recv_io, 0); + else + smbdirect_connection_put_recv_io(recv_io); + + /* + * Some drivers (at least mlx5_ib and irdma) might post a + * recv completion before RDMA_CM_EVENT_ESTABLISHED; + * we need to adjust our expectation in that case. + * + * So we defer further processing of the negotiation + * to smbdirect_accept_negotiate_recv_work(). + * + * If we are already in SMBDIRECT_SOCKET_NEGOTIATE_NEEDED + * we queue the work directly, otherwise + * smbdirect_accept_rdma_event_handler() will do it when + * RDMA_CM_EVENT_ESTABLISHED has arrived. + */ + spin_lock_irqsave(&sc->connect.lock, flags); + if (!sc->first_error) { + INIT_WORK(&sc->connect.work, smbdirect_accept_negotiate_recv_work); + if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_NEEDED) + queue_work(sc->workqueues.accept, &sc->connect.work); + } + spin_unlock_irqrestore(&sc->connect.lock, flags); + + return; + +error: + /* + * recv_io.posted.refill_work is still disabled, + * so smbdirect_connection_put_recv_io() won't + * start it.
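+ * + * In other words the error path is intentionally minimal: we only + * return the buffer to the free list and schedule the cleanup work. + * Reposting receives here would just hand fresh buffers to a + * connection that is already being torn down.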
+ */ + smbdirect_connection_put_recv_io(recv_io); + smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED); +} + +static void smbdirect_accept_negotiate_recv_work(struct work_struct *work) +{ + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, connect.work); + struct smbdirect_socket_parameters *sp = &sc->parameters; + struct smbdirect_recv_io *recv_io; + struct smbdirect_negotiate_req *nreq; + unsigned long flags; + u16 min_version; + u16 max_version; + u16 credits_requested; + u32 preferred_send_size; + u32 max_receive_size; + u32 max_fragmented_size; + u32 ntstatus; + + if (sc->first_error) + return; + + /* + * make sure we won't start again... + */ + disable_work(work); + + /* + * Reset timer to the keepalive interval in + * order to trigger our next keepalive message. + */ + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE; + mod_delayed_work(sc->workqueues.idle, &sc->idle.timer_work, + msecs_to_jiffies(sp->keepalive_interval_msec)); + + /* + * If smbdirect_accept_negotiate_recv_done() detected an + * invalid request we want to disconnect. + */ + recv_io = smbdirect_connection_reassembly_first_recv_io(sc); + if (!recv_io) { + smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED); + return; + } + spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); + sc->recv_io.reassembly.queue_length--; + list_del(&recv_io->list); + spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); + smbdirect_connection_put_recv_io(recv_io); + + if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_NEGOTIATE_NEEDED)) + return; + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING; + + /* + * Note recv_io is already part of the free list, + * as we just called smbdirect_connection_put_recv_io(), + * but it won't be reused before we call + * smbdirect_connection_recv_io_refill() below. 
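+ * + * That is what makes the nreq cast below safe: the packet memory of + * a free-list entry stays intact and is only handed back to the QP + * once a refill reposts it.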
+ */ + + nreq = (struct smbdirect_negotiate_req *)recv_io->packet; + min_version = le16_to_cpu(nreq->min_version); + max_version = le16_to_cpu(nreq->max_version); + credits_requested = le16_to_cpu(nreq->credits_requested); + preferred_send_size = le32_to_cpu(nreq->preferred_send_size); + max_receive_size = le32_to_cpu(nreq->max_receive_size); + max_fragmented_size = le32_to_cpu(nreq->max_fragmented_size); + + smbdirect_log_negotiate(sc, SMBDIRECT_LOG_INFO, + "ReqIn: %s%x, %s%x, %s%u, %s%u, %s%u, %s%u\n", + "MinVersion=0x", + le16_to_cpu(nreq->min_version), + "MaxVersion=0x", + le16_to_cpu(nreq->max_version), + "CreditsRequested=", + le16_to_cpu(nreq->credits_requested), + "PreferredSendSize=", + le32_to_cpu(nreq->preferred_send_size), + "MaxRecvSize=", + le32_to_cpu(nreq->max_receive_size), + "MaxFragmentedSize=", + le32_to_cpu(nreq->max_fragmented_size)); + + if (!(min_version <= SMBDIRECT_V1 && max_version >= SMBDIRECT_V1)) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "invalid: min_version=0x%x max_version=0x%x\n", + min_version, max_version); + ntstatus = le32_to_cpu(STATUS_NOT_SUPPORTED); + goto not_supported; + } + + if (credits_requested == 0) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "invalid: credits_requested == 0\n"); + smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED); + return; + } + + if (max_receive_size < SMBDIRECT_MIN_RECEIVE_SIZE) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "invalid: max_receive_size=%u < %u\n", + max_receive_size, + SMBDIRECT_MIN_RECEIVE_SIZE); + smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED); + return; + } + + if (max_fragmented_size < SMBDIRECT_MIN_FRAGMENTED_SIZE) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "invalid: max_fragmented_size=%u < %u\n", + max_fragmented_size, + SMBDIRECT_MIN_FRAGMENTED_SIZE); + smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED); + return; + } + + /* + * At least the value of SMBDIRECT_MIN_RECEIVE_SIZE is used. + */ + sp->max_recv_size = min_t(u32, sp->max_recv_size, preferred_send_size); + sp->max_recv_size = max_t(u32, sp->max_recv_size, SMBDIRECT_MIN_RECEIVE_SIZE); + + /* + * The maximum fragmented upper-layer payload receive size supported + * + * Assume max_payload_per_credit is + * sp->max_recv_size - 24 = 1340 + * (with the historic receive size of 1364) + * + * The maximum number would be + * sp->recv_credit_max * max_payload_per_credit + * + * 1340 * 255 = 341700 (0x536C4) + * + * The minimum value from the spec is 131072 (0x20000) + * + * For now we use the logic we used in ksmbd before: + * (1364 * 255) / 2 = 173910 (0x2A756) + * + * We need to adjust this here in case the peer + * lowered sp->max_recv_size. + * + * TODO: instead of adjusting max_fragmented_recv_size + * we should adjust the number of available buffers, + * but for now we keep the logic as it was used + * in ksmbd before. + */ + sp->max_fragmented_recv_size = (sp->recv_credit_max * sp->max_recv_size) / 2; + + /* + * We take the value from the peer, which is checked to be higher than 0, + * but we limit it to the max value we support in order to keep + * the main logic simpler. + */ + sc->recv_io.credits.target = credits_requested; + sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, + sp->recv_credit_max); + + /* + * Note nreq->max_receive_size was already checked against + * SMBDIRECT_MIN_RECEIVE_SIZE above. + */ + sp->max_send_size = min_t(u32, sp->max_send_size, max_receive_size); + + /* + * Note nreq->max_fragmented_size was already checked against + * SMBDIRECT_MIN_FRAGMENTED_SIZE above.
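+ * + * A worked example with assumed (not mandated) defaults: if we start + * from max_recv_size=1364 and max_send_size=1364 and the peer sends + * preferred_send_size=8192, max_receive_size=8192 and + * max_fragmented_size=131072, then max_recv_size stays at + * min(1364, 8192) = 1364, max_send_size stays at + * min(1364, 8192) = 1364 and max_fragmented_send_size is taken + * from the peer as 131072.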
+ */ + sp->max_fragmented_send_size = max_fragmented_size; + + if (sc->accept.listener) { + struct smbdirect_socket *lsc = sc->accept.listener; + unsigned long flags; + + spin_lock_irqsave(&lsc->listen.lock, flags); + list_del(&sc->accept.list); + list_add_tail(&sc->accept.list, &lsc->listen.ready); + wake_up(&lsc->listen.wait_queue); + spin_unlock_irqrestore(&lsc->listen.lock, flags); + + /* + * smbdirect_socket_accept() will call + * smbdirect_accept_negotiate_finish(nsc, 0); + * + * So that we don't send the negotiation + * response that grants credits to the peer + * before the socket is accepted by the + * application. + */ + return; + } + + ntstatus = le32_to_cpu(STATUS_SUCCESS); + +not_supported: + smbdirect_accept_negotiate_finish(sc, ntstatus); +} + +void smbdirect_accept_negotiate_finish(struct smbdirect_socket *sc, u32 ntstatus) +{ + const struct smbdirect_socket_parameters *sp = &sc->parameters; + struct smbdirect_recv_io *recv_io; + struct smbdirect_send_io *send_io; + struct smbdirect_negotiate_resp *nrep; + int posted; + u16 new_credits; + int ret; + + if (ntstatus) + goto not_supported; + + /* + * Prepare for receiving data_transfer messages + */ + sc->recv_io.reassembly.full_packet_received = true; + sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER; + list_for_each_entry(recv_io, &sc->recv_io.free.list, list) + recv_io->cqe.done = smbdirect_connection_recv_io_done; + recv_io = NULL; + + /* + * We should at least post 1 smbdirect_recv_io! + */ + posted = smbdirect_connection_recv_io_refill(sc); + if (posted < 1) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "smbdirect_connection_recv_io_refill() failed %1pe\n", + SMBDIRECT_DEBUG_ERR_PTR(posted)); + smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED); + return; + } + + /* + * The response will grant credits for all posted + * smbdirect_recv_io messages. 
+ */ + new_credits = smbdirect_connection_grant_recv_credits(sc); + +not_supported: + send_io = smbdirect_connection_alloc_send_io(sc); + if (IS_ERR(send_io)) { + ret = PTR_ERR(send_io); + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "smbdirect_connection_alloc_send_io() failed %1pe\n", + SMBDIRECT_DEBUG_ERR_PTR(ret)); + smbdirect_socket_schedule_cleanup(sc, ret); + return; + } + send_io->cqe.done = smbdirect_accept_negotiate_send_done; + + nrep = (struct smbdirect_negotiate_resp *)send_io->packet; + nrep->min_version = cpu_to_le16(SMBDIRECT_V1); + nrep->max_version = cpu_to_le16(SMBDIRECT_V1); + if (ntstatus == 0) { + nrep->negotiated_version = cpu_to_le16(SMBDIRECT_V1); + nrep->reserved = 0; + nrep->credits_requested = cpu_to_le16(sp->send_credit_target); + nrep->credits_granted = cpu_to_le16(new_credits); + nrep->status = cpu_to_le32(ntstatus); + nrep->max_readwrite_size = cpu_to_le32(sp->max_read_write_size); + nrep->preferred_send_size = cpu_to_le32(sp->max_send_size); + nrep->max_receive_size = cpu_to_le32(sp->max_recv_size); + nrep->max_fragmented_size = cpu_to_le32(sp->max_fragmented_recv_size); + } else { + nrep->negotiated_version = 0; + nrep->reserved = 0; + nrep->credits_requested = 0; + nrep->credits_granted = 0; + nrep->status = cpu_to_le32(ntstatus); + nrep->max_readwrite_size = 0; + nrep->preferred_send_size = 0; + nrep->max_receive_size = 0; + nrep->max_fragmented_size = 0; + } + + smbdirect_log_negotiate(sc, SMBDIRECT_LOG_INFO, + "RepOut: %s%x, %s%x, %s%x, %s%u, %s%u, %s%x, %s%u, %s%u, %s%u, %s%u\n", + "MinVersion=0x", + le16_to_cpu(nrep->min_version), + "MaxVersion=0x", + le16_to_cpu(nrep->max_version), + "NegotiatedVersion=0x", + le16_to_cpu(nrep->negotiated_version), + "CreditsRequested=", + le16_to_cpu(nrep->credits_requested), + "CreditsGranted=", + le16_to_cpu(nrep->credits_granted), + "Status=0x", + le32_to_cpu(nrep->status), + "MaxReadWriteSize=", + le32_to_cpu(nrep->max_readwrite_size), + "PreferredSendSize=", + le32_to_cpu(nrep->preferred_send_size), + "MaxRecvSize=", + le32_to_cpu(nrep->max_receive_size), + "MaxFragmentedSize=", + le32_to_cpu(nrep->max_fragmented_size)); + + send_io->sge[0].addr = ib_dma_map_single(sc->ib.dev, + nrep, + sizeof(*nrep), + DMA_TO_DEVICE); + ret = ib_dma_mapping_error(sc->ib.dev, send_io->sge[0].addr); + if (ret) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "ib_dma_mapping_error() failed %1pe\n", + SMBDIRECT_DEBUG_ERR_PTR(ret)); + smbdirect_connection_free_send_io(send_io); + smbdirect_socket_schedule_cleanup(sc, ret); + return; + } + + send_io->sge[0].length = sizeof(*nrep); + send_io->sge[0].lkey = sc->ib.pd->local_dma_lkey; + send_io->num_sge = 1; + + ib_dma_sync_single_for_device(sc->ib.dev, + send_io->sge[0].addr, + send_io->sge[0].length, + DMA_TO_DEVICE); + + send_io->wr.next = NULL; + send_io->wr.wr_cqe = &send_io->cqe; + send_io->wr.sg_list = send_io->sge; + send_io->wr.num_sge = send_io->num_sge; + send_io->wr.opcode = IB_WR_SEND; + send_io->wr.send_flags = IB_SEND_SIGNALED; + + ret = smbdirect_connection_post_send_wr(sc, &send_io->wr); + if (ret) { + /* if we reach here, post send failed */ + smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_ERR, + "smbdirect_connection_post_send_wr() failed %1pe\n", + SMBDIRECT_DEBUG_ERR_PTR(ret)); + /* + * Note smbdirect_connection_free_send_io() + * does ib_dma_unmap_page() + */ + smbdirect_connection_free_send_io(send_io); + smbdirect_socket_schedule_cleanup(sc, ret); + return; + } + + /* + * smbdirect_accept_negotiate_send_done + * will do all remaining work... 
+	 */
+}
+
+static void smbdirect_accept_negotiate_send_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct smbdirect_send_io *send_io =
+		container_of(wc->wr_cqe, struct smbdirect_send_io, cqe);
+	struct smbdirect_socket *sc = send_io->socket;
+	struct smbdirect_negotiate_resp *nrep;
+	u32 ntstatus;
+
+	smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_INFO,
+				"smbdirect_send_io completed. status='%s (%d)', opcode=%d\n",
+				ib_wc_status_msg(wc->status), wc->status, wc->opcode);
+
+	nrep = (struct smbdirect_negotiate_resp *)send_io->packet;
+	ntstatus = le32_to_cpu(nrep->status);
+
+	/* Note this frees wc->wr_cqe, but not wc */
+	smbdirect_connection_free_send_io(send_io);
+	atomic_dec(&sc->send_io.pending.count);
+
+	if (unlikely(wc->status != IB_WC_SUCCESS || WARN_ON_ONCE(wc->opcode != IB_WC_SEND))) {
+		if (wc->status != IB_WC_WR_FLUSH_ERR)
+			smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_ERR,
+						"wc->status=%s (%d) wc->opcode=%d\n",
+						ib_wc_status_msg(wc->status), wc->status, wc->opcode);
+		smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+		return;
+	}
+
+	/*
+	 * If we send a smbdirect_negotiate_resp without NT_STATUS_OK (0)
+	 * we need to disconnect now.
+	 *
+	 * Otherwise smbdirect_connection_negotiation_done()
+	 * will setup all required things and wake up
+	 * the waiter.
+	 */
+	if (ntstatus)
+		smbdirect_socket_schedule_cleanup(sc, -EOPNOTSUPP);
+	else
+		smbdirect_connection_negotiation_done(sc);
+}
+
+static int smbdirect_accept_rdma_event_handler(struct rdma_cm_id *id,
+					       struct rdma_cm_event *event)
+{
+	struct smbdirect_socket *sc = id->context;
+	unsigned long flags;
+
+	/*
+	 * cma_cm_event_handler() has
+	 * lockdep_assert_held(&id_priv->handler_mutex);
+	 *
+	 * Mutexes are not allowed in interrupts,
+	 * and we rely on not being in an interrupt here,
+	 * as we might sleep.
+	 */
+	WARN_ON_ONCE(in_interrupt());
+
+	if (event->status || event->event != sc->rdma.expected_event) {
+		int ret = -ECONNABORTED;
+
+		if (event->event == RDMA_CM_EVENT_REJECTED)
+			ret = -ECONNREFUSED;
+		if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
+			ret = -ENETDOWN;
+		if (IS_ERR(SMBDIRECT_DEBUG_ERR_PTR(event->status)))
+			ret = event->status;
+
+		smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+			"%s (first_error=%1pe, expected=%s) => event=%s status=%d => ret=%1pe\n",
+			smbdirect_socket_status_string(sc->status),
+			SMBDIRECT_DEBUG_ERR_PTR(sc->first_error),
+			rdma_event_msg(sc->rdma.expected_event),
+			rdma_event_msg(event->event),
+			event->status,
+			SMBDIRECT_DEBUG_ERR_PTR(ret));
+
+		smbdirect_socket_schedule_cleanup(sc, ret);
+		return 0;
+	}
+
+	smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+				 "%s (first_error=%1pe) event=%s\n",
+				 smbdirect_socket_status_string(sc->status),
+				 SMBDIRECT_DEBUG_ERR_PTR(sc->first_error),
+				 rdma_event_msg(event->event));
+
+	if (sc->first_error)
+		return 0;
+
+	switch (event->event) {
+	case RDMA_CM_EVENT_ESTABLISHED:
+		smbdirect_connection_rdma_established(sc);
+
+		/*
+		 * Some drivers (at least mlx5_ib and irdma) might post a
+		 * recv completion before RDMA_CM_EVENT_ESTABLISHED,
+		 * so we need to adjust our expectation in that case.
+		 *
+		 * If smbdirect_accept_negotiate_recv_done() was called first,
+		 * it only initialized sc->connect.work and left queueing it
+		 * to us, so that we move to SMBDIRECT_SOCKET_NEGOTIATE_NEEDED
+		 * before smbdirect_accept_negotiate_recv_work() runs.
+		 *
+		 * If smbdirect_accept_negotiate_recv_done() didn't happen
+		 * yet, sc->connect.work is still disabled and
+		 * queue_work() is a no-op.
+		 */
+		if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING))
+			return 0;
+		sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED;
+		spin_lock_irqsave(&sc->connect.lock, flags);
+		if (!sc->first_error)
+			queue_work(sc->workqueues.accept, &sc->connect.work);
+		spin_unlock_irqrestore(&sc->connect.lock, flags);
+
+		/*
+		 * wait for smbdirect_accept_negotiate_recv_done()
+		 * to get the negotiate request.
+		 */
+		return 0;
+
+	default:
+		break;
+	}
+
+	/*
+	 * This is an internal error
+	 */
+	WARN_ON_ONCE(sc->rdma.expected_event != RDMA_CM_EVENT_ESTABLISHED);
+	smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+	return 0;
+}
+
+static long smbdirect_socket_wait_for_accept(struct smbdirect_socket *lsc, long timeo)
+{
+	long ret;
+
+	ret = wait_event_interruptible_timeout(lsc->listen.wait_queue,
+					       !list_empty_careful(&lsc->listen.ready) ||
+					       lsc->status != SMBDIRECT_SOCKET_LISTENING ||
+					       lsc->first_error,
+					       timeo);
+	if (lsc->status != SMBDIRECT_SOCKET_LISTENING)
+		return -EINVAL;
+	if (lsc->first_error)
+		return lsc->first_error;
+	if (!ret)
+		ret = -ETIMEDOUT;
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+struct smbdirect_socket *smbdirect_socket_accept(struct smbdirect_socket *lsc,
+						 long timeo,
+						 struct proto_accept_arg *arg)
+{
+	struct smbdirect_socket *nsc;
+	unsigned long flags;
+
+	if (lsc->status != SMBDIRECT_SOCKET_LISTENING) {
+		arg->err = -EINVAL;
+		return NULL;
+	}
+
+	if (lsc->first_error) {
+		arg->err = lsc->first_error;
+		return NULL;
+	}
+
+	if (list_empty_careful(&lsc->listen.ready)) {
+		int ret;
+
+		if (timeo == 0) {
+			arg->err = -EAGAIN;
+			return NULL;
+		}
+
+		ret = smbdirect_socket_wait_for_accept(lsc, timeo);
+		if (ret) {
+			arg->err = ret;
+			return NULL;
+		}
+	}
+
+	spin_lock_irqsave(&lsc->listen.lock, flags);
+	nsc = list_first_entry_or_null(&lsc->listen.ready,
+				       struct smbdirect_socket,
+				       accept.list);
+	if (nsc) {
+		nsc->accept.listener = NULL;
+		list_del_init_careful(&nsc->accept.list);
+		arg->is_empty = list_empty_careful(&lsc->listen.ready);
+	}
+	spin_unlock_irqrestore(&lsc->listen.lock, flags);
+	if (!nsc) {
+		arg->err = -EAGAIN;
+		return NULL;
+	}
+
+	/*
+	 * We did not send the negotiation response
+	 * yet, so we did not grant any credits to the client,
+	 * and it did not grant any credits to us.
+	 *
+	 * The caller expects a connected socket
+	 * now, as there are no credits anyway.
+	 *
+	 * Then we send the negotiation response in
+	 * order to grant credits to the peer.
+	 */
+	nsc->status = SMBDIRECT_SOCKET_CONNECTED;
+	smbdirect_accept_negotiate_finish(nsc, 0);
+
+	return nsc;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_accept);
diff --git a/fs/smb/common/smbdirect/smbdirect_connect.c b/fs/smb/common/smbdirect/smbdirect_connect.c
new file mode 100644
index 000000000000..2b54f79dba43
--- /dev/null
+++ b/fs/smb/common/smbdirect/smbdirect_connect.c
@@ -0,0 +1,925 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2012,2016,2017,2025 Stefan Metzmacher
+ */
+
+#include "smbdirect_internal.h"
+#include "../../common/smb2status.h"
+
+static int smbdirect_connect_setup_connection(struct smbdirect_socket *sc);
+static int smbdirect_connect_resolve_addr(struct smbdirect_socket *sc,
+					  const struct sockaddr *src,
+					  const struct sockaddr *dst);
+static int smbdirect_connect_rdma_event_handler(struct rdma_cm_id *id,
+						struct rdma_cm_event *event);
+static int smbdirect_connect_negotiate_start(struct smbdirect_socket *sc);
+static void smbdirect_connect_negotiate_send_done(struct ib_cq *cq, struct ib_wc *wc);
+static void smbdirect_connect_negotiate_recv_done(struct ib_cq *cq, struct ib_wc *wc);
+
+int smbdirect_connect(struct smbdirect_socket *sc, const struct sockaddr *dst)
+{
+	const struct sockaddr *src = NULL;
+	union {
+		struct sockaddr sa;
+		struct sockaddr_storage ss;
+	} src_addr = {
+		.sa = {
+			.sa_family = AF_UNSPEC,
+		},
+	};
+	int ret;
+
+	if (sc->first_error)
+		return -ENOTCONN;
+
+	if (sc->status != SMBDIRECT_SOCKET_CREATED)
+		return -EALREADY;
+
+	if (WARN_ON_ONCE(!sc->rdma.cm_id))
+		return -EINVAL;
+
+	src_addr.ss = sc->rdma.cm_id->route.addr.src_addr;
+	if (src_addr.sa.sa_family != AF_UNSPEC)
+		src = &src_addr.sa;
+
+	smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+				 "connect: src: %pISpsfc dst: %pISpsfc\n",
+				 src, dst);
+
+	ret = smbdirect_connect_setup_connection(sc);
+	if (ret)
+		return ret;
+
+	ret = smbdirect_connect_resolve_addr(sc, src, dst);
+	if (ret)
+		return ret;
+
+	/*
+	 * The rest happens async via smbdirect_connect_rdma_event_handler();
+	 * the caller will decide whether to wait.
+ */ + return 0; +} +__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connect); + +static int smbdirect_connect_setup_connection(struct smbdirect_socket *sc) +{ + rdma_lock_handler(sc->rdma.cm_id); + sc->rdma.cm_id->event_handler = smbdirect_connect_rdma_event_handler; + rdma_unlock_handler(sc->rdma.cm_id); + + if (SMBDIRECT_CHECK_STATUS_WARN(sc, SMBDIRECT_SOCKET_CREATED)) + return -EINVAL; + sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED; + + return 0; +} + +static int smbdirect_connect_resolve_addr(struct smbdirect_socket *sc, + const struct sockaddr *src, + const struct sockaddr *dst) +{ + const struct smbdirect_socket_parameters *sp = &sc->parameters; + struct sockaddr *src_addr = NULL; + struct sockaddr *dst_addr = NULL; + int ret; + + src_addr = (struct sockaddr *)src; + if (src_addr && src_addr->sa_family == AF_UNSPEC) + src_addr = NULL; + dst_addr = (struct sockaddr *)dst; + + if (SMBDIRECT_CHECK_STATUS_WARN(sc, SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED)) + return -EINVAL; + sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING; + sc->rdma.expected_event = RDMA_CM_EVENT_ADDR_RESOLVED; + ret = rdma_resolve_addr(sc->rdma.cm_id, src_addr, dst_addr, + sp->resolve_addr_timeout_msec); + if (ret) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "rdma_resolve_addr() failed %1pe\n", + SMBDIRECT_DEBUG_ERR_PTR(ret)); + return ret; + } + + return 0; +} + +static int smbdirect_connect_resolve_route(struct smbdirect_socket *sc) +{ + const struct smbdirect_socket_parameters *sp = &sc->parameters; + int ret; + + if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED)) + return sc->first_error; + sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING; + sc->rdma.expected_event = RDMA_CM_EVENT_ROUTE_RESOLVED; + ret = rdma_resolve_route(sc->rdma.cm_id, sp->resolve_route_timeout_msec); + if (ret) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "rdma_resolve_route() failed %1pe\n", + SMBDIRECT_DEBUG_ERR_PTR(ret)); + return ret; + } + + return 0; +} + +static int smbdirect_connect_rdma_connect(struct smbdirect_socket *sc) +{ + struct smbdirect_socket_parameters *sp = &sc->parameters; + struct rdma_conn_param conn_param; + __be32 ird_ord_hdr[2]; + int ret; + + sc->ib.dev = sc->rdma.cm_id->device; + + if (!smbdirect_frwr_is_supported(&sc->ib.dev->attrs)) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "Fast Registration Work Requests (FRWR) is not supported device %.*s\n", + IB_DEVICE_NAME_MAX, + sc->ib.dev->name); + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "Device capability flags = %llx max_fast_reg_page_list_len = %u\n", + sc->ib.dev->attrs.device_cap_flags, + sc->ib.dev->attrs.max_fast_reg_page_list_len); + return -EPROTONOSUPPORT; + } + + if (sp->flags & SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB && + !rdma_ib_or_roce(sc->ib.dev, sc->rdma.cm_id->port_num)) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "Not IB: device: %.*s IW:%u local: %pISpsfc remote: %pISpsfc\n", + IB_DEVICE_NAME_MAX, + sc->ib.dev->name, + rdma_protocol_iwarp(sc->ib.dev, sc->rdma.cm_id->port_num), + &sc->rdma.cm_id->route.addr.src_addr, + &sc->rdma.cm_id->route.addr.dst_addr); + return -EPROTONOSUPPORT; + } + if (sp->flags & SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW && + !rdma_protocol_iwarp(sc->ib.dev, sc->rdma.cm_id->port_num)) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "Not IW: device: %.*s IB:%u local: %pISpsfc remote: %pISpsfc\n", + IB_DEVICE_NAME_MAX, + sc->ib.dev->name, + rdma_ib_or_roce(sc->ib.dev, sc->rdma.cm_id->port_num), + &sc->rdma.cm_id->route.addr.src_addr, + 
&sc->rdma.cm_id->route.addr.dst_addr); + return -EPROTONOSUPPORT; + } + + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO, + "rdma connect: device: %.*s local: %pISpsfc remote: %pISpsfc\n", + IB_DEVICE_NAME_MAX, + sc->ib.dev->name, + &sc->rdma.cm_id->route.addr.src_addr, + &sc->rdma.cm_id->route.addr.dst_addr); + + sp->max_frmr_depth = min_t(u32, sp->max_frmr_depth, + sc->ib.dev->attrs.max_fast_reg_page_list_len); + sc->mr_io.type = IB_MR_TYPE_MEM_REG; + if (sc->ib.dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG) + sc->mr_io.type = IB_MR_TYPE_SG_GAPS; + + sp->responder_resources = min_t(u8, sp->responder_resources, + sc->ib.dev->attrs.max_qp_rd_atom); + smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_INFO, + "responder_resources=%d\n", + sp->responder_resources); + + ret = smbdirect_connection_create_qp(sc); + if (ret) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "smbdirect_connection_create_qp() failed %1pe\n", + SMBDIRECT_DEBUG_ERR_PTR(ret)); + return ret; + } + + memset(&conn_param, 0, sizeof(conn_param)); + conn_param.initiator_depth = sp->initiator_depth; + conn_param.responder_resources = sp->responder_resources; + + /* Need to send IRD/ORD in private data for iWARP */ + if (rdma_protocol_iwarp(sc->ib.dev, sc->rdma.cm_id->port_num)) { + ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources); + ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth); + conn_param.private_data = ird_ord_hdr; + conn_param.private_data_len = sizeof(ird_ord_hdr); + } else { + conn_param.private_data = NULL; + conn_param.private_data_len = 0; + } + + conn_param.retry_count = SMBDIRECT_RDMA_CM_RETRY; + conn_param.rnr_retry_count = SMBDIRECT_RDMA_CM_RNR_RETRY; + conn_param.flow_control = 0; + + if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED)) + return sc->first_error; + sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING; + sc->rdma.expected_event = RDMA_CM_EVENT_ESTABLISHED; + ret = rdma_connect_locked(sc->rdma.cm_id, &conn_param); + if (ret) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "rdma_connect_locked() failed %1pe\n", + SMBDIRECT_DEBUG_ERR_PTR(ret)); + return ret; + } + + /* + * start with the rdma connect timeout and SMBDIRECT_KEEPALIVE_PENDING + * so that the timer will cause a disconnect. + */ + INIT_DELAYED_WORK(&sc->idle.timer_work, smbdirect_connection_idle_timer_work); + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; + mod_delayed_work(sc->workqueues.idle, &sc->idle.timer_work, + msecs_to_jiffies(sp->rdma_connect_timeout_msec)); + + return 0; +} + +static int smbdirect_connect_rdma_event_handler(struct rdma_cm_id *id, + struct rdma_cm_event *event) +{ + struct smbdirect_socket *sc = id->context; + u8 peer_initiator_depth; + u8 peer_responder_resources; + int ret; + + /* + * cma_cm_event_handler() has + * lockdep_assert_held(&id_priv->handler_mutex); + * + * Mutexes are not allowed in interrupts, + * and we rely on not being in an interrupt here, + * as we might sleep. + * + * We didn't timeout so we cancel our idle timer, + * it will be scheduled again if needed. 
+ */ + WARN_ON_ONCE(in_interrupt()); + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE; + cancel_delayed_work_sync(&sc->idle.timer_work); + + if (event->status || event->event != sc->rdma.expected_event) { + int lvl = SMBDIRECT_LOG_ERR; + + ret = -ECONNABORTED; + + if (event->event == RDMA_CM_EVENT_REJECTED) + ret = -ECONNREFUSED; + if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) + ret = -ENETDOWN; + if (IS_ERR(SMBDIRECT_DEBUG_ERR_PTR(event->status))) + ret = event->status; + + if (ret == -ENODEV) + lvl = SMBDIRECT_LOG_INFO; + + smbdirect_log_rdma_event(sc, lvl, + "%s (first_error=%1pe, expected=%s) => event=%s status=%d => ret=%1pe\n", + smbdirect_socket_status_string(sc->status), + SMBDIRECT_DEBUG_ERR_PTR(sc->first_error), + rdma_event_msg(sc->rdma.expected_event), + rdma_event_msg(event->event), + event->status, + SMBDIRECT_DEBUG_ERR_PTR(ret)); + + smbdirect_socket_schedule_cleanup_lvl(sc, + lvl, + ret); + return 0; + } + + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO, + "%s (first_error=%1pe) event=%s\n", + smbdirect_socket_status_string(sc->status), + SMBDIRECT_DEBUG_ERR_PTR(sc->first_error), + rdma_event_msg(event->event)); + + if (sc->first_error) + return 0; + + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING)) + return 0; + sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED; + + ret = smbdirect_connect_resolve_route(sc); + if (ret) + smbdirect_socket_schedule_cleanup(sc, ret); + return 0; + + case RDMA_CM_EVENT_ROUTE_RESOLVED: + if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING)) + return 0; + sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED; + + ret = smbdirect_connect_rdma_connect(sc); + if (ret) + smbdirect_socket_schedule_cleanup(sc, ret); + return 0; + + case RDMA_CM_EVENT_ESTABLISHED: + smbdirect_connection_rdma_established(sc); + + if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING)) + return 0; + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED; + + /* + * Here we work around an inconsistency between + * iWarp and other devices (at least rxe and irdma using RoCEv2) + */ + if (rdma_protocol_iwarp(id->device, id->port_num)) { + /* + * iWarp devices report the peer's values + * with the perspective of the peer here. + * Tested with siw and irdma (in iwarp mode) + * We need to change to our perspective here, + * so we need to switch the values. + */ + peer_initiator_depth = event->param.conn.responder_resources; + peer_responder_resources = event->param.conn.initiator_depth; + } else { + /* + * Non iWarp devices report the peer's values + * already changed to our perspective here. + * Tested with rxe and irdma (in roce mode). 
+ */ + peer_initiator_depth = event->param.conn.initiator_depth; + peer_responder_resources = event->param.conn.responder_resources; + } + smbdirect_connection_negotiate_rdma_resources(sc, + peer_initiator_depth, + peer_responder_resources, + &event->param.conn); + + ret = smbdirect_connect_negotiate_start(sc); + if (ret) + smbdirect_socket_schedule_cleanup(sc, ret); + return 0; + + default: + break; + } + + /* + * This is an internal error + */ + WARN_ON_ONCE(sc->rdma.expected_event != RDMA_CM_EVENT_ESTABLISHED); + smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED); + return 0; +} + +static int smbdirect_connect_negotiate_start(struct smbdirect_socket *sc) +{ + const struct smbdirect_socket_parameters *sp = &sc->parameters; + struct smbdirect_recv_io *recv_io = NULL; + struct smbdirect_send_io *send_io = NULL; + struct smbdirect_negotiate_req *nreq = NULL; + int ret; + + if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_NEGOTIATE_NEEDED)) + return sc->first_error; + sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING; + + ret = smbdirect_connection_create_mem_pools(sc); + if (ret) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "smbdirect_connection_create_mem_pools() failed %1pe\n", + SMBDIRECT_DEBUG_ERR_PTR(ret)); + goto create_mem_pools_failed; + } + + /* + * There is only a single batch credit + */ + atomic_set(&sc->send_io.bcredits.count, 1); + + /* + * Initialize the local credits to post + * IB_WR_SEND[_WITH_INV]. + */ + atomic_set(&sc->send_io.lcredits.count, sp->send_credit_target); + + recv_io = smbdirect_connection_get_recv_io(sc); + if (WARN_ON_ONCE(!recv_io)) { + ret = -EINVAL; + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "smbdirect_connection_get_recv_io() failed %1pe\n", + SMBDIRECT_DEBUG_ERR_PTR(ret)); + goto get_recv_io_failed; + } + recv_io->cqe.done = smbdirect_connect_negotiate_recv_done; + + send_io = smbdirect_connection_alloc_send_io(sc); + if (IS_ERR(send_io)) { + ret = PTR_ERR(send_io); + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "smbdirect_connection_alloc_send_io() failed %1pe\n", + SMBDIRECT_DEBUG_ERR_PTR(ret)); + goto alloc_send_io_failed; + } + send_io->cqe.done = smbdirect_connect_negotiate_send_done; + + nreq = (struct smbdirect_negotiate_req *)send_io->packet; + nreq->min_version = cpu_to_le16(SMBDIRECT_V1); + nreq->max_version = cpu_to_le16(SMBDIRECT_V1); + nreq->reserved = 0; + nreq->credits_requested = cpu_to_le16(sp->send_credit_target); + nreq->preferred_send_size = cpu_to_le32(sp->max_send_size); + nreq->max_receive_size = cpu_to_le32(sp->max_recv_size); + nreq->max_fragmented_size = cpu_to_le32(sp->max_fragmented_recv_size); + + smbdirect_log_negotiate(sc, SMBDIRECT_LOG_INFO, + "ReqOut: %s%x, %s%x, %s%u, %s%u, %s%u, %s%u\n", + "MinVersion=0x", + le16_to_cpu(nreq->min_version), + "MaxVersion=0x", + le16_to_cpu(nreq->max_version), + "CreditsRequested=", + le16_to_cpu(nreq->credits_requested), + "PreferredSendSize=", + le32_to_cpu(nreq->preferred_send_size), + "MaxRecvSize=", + le32_to_cpu(nreq->max_receive_size), + "MaxFragmentedSize=", + le32_to_cpu(nreq->max_fragmented_size)); + + send_io->sge[0].addr = ib_dma_map_single(sc->ib.dev, + nreq, + sizeof(*nreq), + DMA_TO_DEVICE); + ret = ib_dma_mapping_error(sc->ib.dev, send_io->sge[0].addr); + if (ret) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "ib_dma_mapping_error() failed %1pe\n", + SMBDIRECT_DEBUG_ERR_PTR(ret)); + goto dma_mapping_failed; + } + + send_io->sge[0].length = sizeof(*nreq); + send_io->sge[0].lkey = sc->ib.pd->local_dma_lkey; + send_io->num_sge = 1; 
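+
+	/*
+	 * The negotiate request was filled in by the CPU above;
+	 * hand the mapped buffer over to the device before the
+	 * send WR is posted below.
+	 */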
+ + ib_dma_sync_single_for_device(sc->ib.dev, + send_io->sge[0].addr, + send_io->sge[0].length, + DMA_TO_DEVICE); + + smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_INFO, + "sge addr=0x%llx length=%u lkey=0x%x\n", + send_io->sge[0].addr, + send_io->sge[0].length, + send_io->sge[0].lkey); + + /* + * Now post the recv_io buffer in order to get + * the negotiate response + */ + sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REP; + ret = smbdirect_connection_post_recv_io(recv_io); + if (ret) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "smbdirect_connection_post_recv_io() failed %1pe\n", + SMBDIRECT_DEBUG_ERR_PTR(ret)); + goto post_recv_io_failed; + } + + send_io->wr.next = NULL; + send_io->wr.wr_cqe = &send_io->cqe; + send_io->wr.sg_list = send_io->sge; + send_io->wr.num_sge = send_io->num_sge; + send_io->wr.opcode = IB_WR_SEND; + send_io->wr.send_flags = IB_SEND_SIGNALED; + + ret = smbdirect_connection_post_send_wr(sc, &send_io->wr); + if (ret) { + /* if we reach here, post send failed */ + smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_ERR, + "smbdirect_connection_post_send_wr() failed %1pe\n", + SMBDIRECT_DEBUG_ERR_PTR(ret)); + goto post_send_wr_failed; + } + + /* + * start with the negotiate timeout and SMBDIRECT_KEEPALIVE_PENDING + * so that the timer will cause a disconnect. + */ + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; + mod_delayed_work(sc->workqueues.idle, &sc->idle.timer_work, + msecs_to_jiffies(sp->negotiate_timeout_msec)); + + return 0; + +post_send_wr_failed: + /* + * ib_dma_unmap_single is called in + * smbdirect_connection_free_send_io() + */ + smbdirect_connection_free_send_io(send_io); + /* + * recv_io is given to the rdma layer, + * we should not put it even on error + * nor call smbdirect_connection_destroy_mem_pools() + * it will be cleaned up during disconnect. + */ + return ret; + +post_recv_io_failed: + /* + * ib_dma_unmap_single is called in + * smbdirect_connection_free_send_io() + */ +dma_mapping_failed: + smbdirect_connection_free_send_io(send_io); + +alloc_send_io_failed: + smbdirect_connection_put_recv_io(recv_io); + +get_recv_io_failed: + smbdirect_connection_destroy_mem_pools(sc); + +create_mem_pools_failed: + return ret; +} + +static void smbdirect_connect_negotiate_send_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct smbdirect_send_io *send_io = + container_of(wc->wr_cqe, struct smbdirect_send_io, cqe); + struct smbdirect_socket *sc = send_io->socket; + + smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_INFO, + "smbdirect_send_io completed. 
status='%s (%d)', opcode=%d\n",
+				ib_wc_status_msg(wc->status), wc->status, wc->opcode);
+
+	/* Note this frees wc->wr_cqe, but not wc */
+	smbdirect_connection_free_send_io(send_io);
+	atomic_dec(&sc->send_io.pending.count);
+
+	if (unlikely(wc->status != IB_WC_SUCCESS || WARN_ON_ONCE(wc->opcode != IB_WC_SEND))) {
+		if (wc->status != IB_WC_WR_FLUSH_ERR)
+			smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_ERR,
+						"wc->status=%s (%d) wc->opcode=%d\n",
+						ib_wc_status_msg(wc->status), wc->status, wc->opcode);
+		smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+		return;
+	}
+}
+
+static void smbdirect_connect_negotiate_recv_work(struct work_struct *work);
+
+static void smbdirect_connect_negotiate_recv_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct smbdirect_recv_io *recv_io =
+		container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe);
+	struct smbdirect_socket *sc = recv_io->socket;
+	unsigned long flags;
+
+	if (unlikely(wc->status != IB_WC_SUCCESS || WARN_ON_ONCE(wc->opcode != IB_WC_RECV))) {
+		if (wc->status != IB_WC_WR_FLUSH_ERR)
+			smbdirect_log_rdma_recv(sc, SMBDIRECT_LOG_ERR,
+						"wc->status=%s (%d) wc->opcode=%d\n",
+						ib_wc_status_msg(wc->status), wc->status, wc->opcode);
+		goto error;
+	}
+
+	smbdirect_log_rdma_recv(sc, SMBDIRECT_LOG_INFO,
+				"smbdirect_recv_io completed. status='%s (%d)', opcode=%d\n",
+				ib_wc_status_msg(wc->status), wc->status, wc->opcode);
+
+	/*
+	 * This is an internal error!
+	 */
+	if (WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_NEGOTIATE_REP))
+		goto error;
+
+	/*
+	 * Don't reset timer to the keepalive interval here;
+	 * this will be done in smbdirect_connect_negotiate_recv_work().
+	 */
+
+	ib_dma_sync_single_for_cpu(sc->ib.dev,
+				   recv_io->sge.addr,
+				   recv_io->sge.length,
+				   DMA_FROM_DEVICE);
+
+	/*
+	 * Only remember recv_io if it has enough bytes;
+	 * this gives smbdirect_connect_negotiate_recv_work() enough
+	 * information in order to disconnect if it was not
+	 * valid.
+	 */
+	sc->recv_io.reassembly.full_packet_received = true;
+	if (wc->byte_len >= sizeof(struct smbdirect_negotiate_resp))
+		smbdirect_connection_reassembly_append_recv_io(sc, recv_io, 0);
+	else
+		smbdirect_connection_put_recv_io(recv_io);
+
+	/*
+	 * We continue via the workqueue as we may have
+	 * complex work that might sleep.
+	 *
+	 * So we defer further processing of the negotiation
+	 * to smbdirect_connect_negotiate_recv_work().
+	 */
+	spin_lock_irqsave(&sc->connect.lock, flags);
+	if (!sc->first_error) {
+		INIT_WORK(&sc->connect.work, smbdirect_connect_negotiate_recv_work);
+		if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_RUNNING)
+			queue_work(sc->workqueues.connect, &sc->connect.work);
+	}
+	spin_unlock_irqrestore(&sc->connect.lock, flags);
+
+	return;
+
+error:
+	/*
+	 * recv_io.posted.refill_work is still disabled,
+	 * so smbdirect_connection_put_recv_io() won't
+	 * start it.
+	 */
+	smbdirect_connection_put_recv_io(recv_io);
+	smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+}
+
+static void smbdirect_connect_negotiate_recv_work(struct work_struct *work)
+{
+	struct smbdirect_socket *sc =
+		container_of(work, struct smbdirect_socket, connect.work);
+	struct smbdirect_socket_parameters *sp = &sc->parameters;
+	struct smbdirect_recv_io *recv_io;
+	struct smbdirect_negotiate_resp *nrep;
+	unsigned long flags;
+	u16 negotiated_version;
+	u16 credits_requested;
+	u16 credits_granted;
+	u32 status;
+	u32 max_readwrite_size;
+	u32 preferred_send_size;
+	u32 max_receive_size;
+	u32 max_fragmented_size;
+	int posted;
+	int ret;
+
+	if (sc->first_error)
+		return;
+
+	/*
+	 * make sure we won't start again...
+	 */
+	disable_work(work);
+
+	/*
+	 * Reset timer to the keepalive interval in
+	 * order to trigger our next keepalive message.
+	 */
+	sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
+	mod_delayed_work(sc->workqueues.idle, &sc->idle.timer_work,
+			 msecs_to_jiffies(sp->keepalive_interval_msec));
+
+	/*
+	 * If smbdirect_connect_negotiate_recv_done() detected an
+	 * invalid response we want to disconnect.
+	 */
+	recv_io = smbdirect_connection_reassembly_first_recv_io(sc);
+	if (!recv_io) {
+		smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+		return;
+	}
+	spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
+	sc->recv_io.reassembly.queue_length--;
+	list_del(&recv_io->list);
+	spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
+	smbdirect_connection_put_recv_io(recv_io);
+
+	if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_NEGOTIATE_RUNNING))
+		return;
+
+	/*
+	 * Note recv_io is already part of the free list,
+	 * as we just called smbdirect_connection_put_recv_io(),
+	 * but it won't be reused before we call
+	 * smbdirect_connection_recv_io_refill() below.
+ */ + + nrep = (struct smbdirect_negotiate_resp *)recv_io->packet; + negotiated_version = le16_to_cpu(nrep->negotiated_version); + credits_requested = le16_to_cpu(nrep->credits_requested); + credits_granted = le16_to_cpu(nrep->credits_granted); + status = le32_to_cpu(nrep->status); + max_readwrite_size = le32_to_cpu(nrep->max_readwrite_size); + preferred_send_size = le32_to_cpu(nrep->preferred_send_size); + max_receive_size = le32_to_cpu(nrep->max_receive_size); + max_fragmented_size = le32_to_cpu(nrep->max_fragmented_size); + + smbdirect_log_negotiate(sc, SMBDIRECT_LOG_INFO, + "RepIn: %s%x, %s%x, %s%x, %s%u, %s%u, %s%x, %s%u, %s%u, %s%u, %s%u\n", + "MinVersion=0x", + le16_to_cpu(nrep->min_version), + "MaxVersion=0x", + le16_to_cpu(nrep->max_version), + "NegotiatedVersion=0x", + le16_to_cpu(nrep->negotiated_version), + "CreditsRequested=", + le16_to_cpu(nrep->credits_requested), + "CreditsGranted=", + le16_to_cpu(nrep->credits_granted), + "Status=0x", + le32_to_cpu(nrep->status), + "MaxReadWriteSize=", + le32_to_cpu(nrep->max_readwrite_size), + "PreferredSendSize=", + le32_to_cpu(nrep->preferred_send_size), + "MaxRecvSize=", + le32_to_cpu(nrep->max_receive_size), + "MaxFragmentedSize=", + le32_to_cpu(nrep->max_fragmented_size)); + + if (negotiated_version != SMBDIRECT_V1) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "invalid: negotiated_version=0x%x\n", + negotiated_version); + smbdirect_socket_schedule_cleanup(sc, -ECONNREFUSED); + return; + } + + if (status != le32_to_cpu(STATUS_SUCCESS)) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "invalid: status=0x%x != 0x0\n", + status); + smbdirect_socket_schedule_cleanup(sc, -ECONNREFUSED); + return; + } + + if (max_receive_size < SMBDIRECT_MIN_RECEIVE_SIZE) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "invalid: max_receive_size=%u < %u\n", + max_receive_size, + SMBDIRECT_MIN_RECEIVE_SIZE); + smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED); + return; + } + + if (max_fragmented_size < SMBDIRECT_MIN_FRAGMENTED_SIZE) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "invalid: max_fragmented_size=%u < %u\n", + max_fragmented_size, + SMBDIRECT_MIN_FRAGMENTED_SIZE); + smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED); + return; + } + + if (credits_granted == 0) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "invalid: credits_granted == 0\n"); + smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED); + return; + } + + if (credits_requested == 0) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "invalid: credits_requested == 0\n"); + smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED); + return; + } + + if (preferred_send_size > sp->max_recv_size) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "invalid: preferred_send_size=%u > max_recv_size=%u\n", + preferred_send_size, + sp->max_recv_size); + smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED); + return; + } + + /* + * We take the value from the peer, which is checked to be higher than 0, + * but we limit it to the max value we support in order to have + * the main logic simpler. + */ + sc->recv_io.credits.target = credits_requested; + sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, + sp->recv_credit_max); + + /* + * At least the value of SMBDIRECT_MIN_RECEIVE_SIZE is used. 
+	 */
+	sp->max_recv_size = min_t(u32, sp->max_recv_size, preferred_send_size);
+	sp->max_recv_size = max_t(u32, sp->max_recv_size, SMBDIRECT_MIN_RECEIVE_SIZE);
+
+	/*
+	 * We already sent our sp->max_fragmented_recv_size
+	 * to the peer, so we can't lower it here any more.
+	 *
+	 * TODO: but if the peer lowered sp->max_recv_size
+	 * we will have to adjust our number of buffers.
+	 *
+	 * But for now we keep it as it was in the
+	 * cifs.ko code before.
+	 */
+
+	/*
+	 * Note nrep->max_receive_size was already checked against
+	 * SMBDIRECT_MIN_RECEIVE_SIZE above.
+	 */
+	sp->max_send_size = min_t(u32, sp->max_send_size, max_receive_size);
+
+	/*
+	 * Make sure the resulting max_frmr_depth is at least 1,
+	 * which means max_read_write_size needs to be at least PAGE_SIZE.
+	 */
+	sp->max_read_write_size = min_t(u32, sp->max_frmr_depth * PAGE_SIZE,
+					max_readwrite_size);
+	if (sp->max_read_write_size < PAGE_SIZE) {
+		smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+					 "invalid: max_readwrite_size=%u < PAGE_SIZE(%lu)\n",
+					 max_readwrite_size,
+					 PAGE_SIZE);
+		smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+		return;
+	}
+	sp->max_frmr_depth = sp->max_read_write_size / PAGE_SIZE;
+
+	/*
+	 * Note nrep->credits_granted was already checked against 0 above.
+	 */
+	atomic_set(&sc->send_io.credits.count, credits_granted);
+
+	/*
+	 * Note nrep->max_fragmented_size was already checked against
+	 * SMBDIRECT_MIN_FRAGMENTED_SIZE above.
+	 */
+	sp->max_fragmented_send_size = max_fragmented_size;
+
+	ret = smbdirect_connection_create_mr_list(sc);
+	if (ret) {
+		smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
+				      "smbdirect_connection_create_mr_list() failed %1pe\n",
+				      SMBDIRECT_DEBUG_ERR_PTR(ret));
+		smbdirect_socket_schedule_cleanup(sc, ret);
+		return;
+	}
+
+	/*
+	 * Prepare for receiving data_transfer messages
+	 */
+	sc->recv_io.reassembly.full_packet_received = true;
+	sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER;
+	list_for_each_entry(recv_io, &sc->recv_io.free.list, list)
+		recv_io->cqe.done = smbdirect_connection_recv_io_done;
+	recv_io = NULL;
+
+	/*
+	 * We should at least post 1 smbdirect_recv_io!
+	 */
+	posted = smbdirect_connection_recv_io_refill(sc);
+	if (posted < 1) {
+		smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+					 "smbdirect_connection_recv_io_refill() failed %1pe\n",
+					 SMBDIRECT_DEBUG_ERR_PTR(posted));
+		smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+		return;
+	}
+
+	/*
+	 * smbdirect_connection_negotiation_done()
+	 * will setup all required things and wake up
+	 * the waiter.
+ */ + smbdirect_connection_negotiation_done(sc); +} + +int smbdirect_connect_sync(struct smbdirect_socket *sc, + const struct sockaddr *dst) +{ + int ret; + + ret = smbdirect_connect(sc, dst); + if (ret) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "smbdirect_connect(%pISpsfc) failed %1pe\n", + dst, SMBDIRECT_DEBUG_ERR_PTR(ret)); + return ret; + } + + ret = smbdirect_connection_wait_for_connected(sc); + if (ret) { + int lvl = SMBDIRECT_LOG_ERR; + + if (ret == -ENODEV) + lvl = SMBDIRECT_LOG_INFO; + + smbdirect_log_rdma_event(sc, lvl, + "wait for smbdirect_connect(%pISpsfc) failed %1pe\n", + dst, SMBDIRECT_DEBUG_ERR_PTR(ret)); + return ret; + } + + return 0; +} +__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connect_sync); diff --git a/fs/smb/common/smbdirect/smbdirect_connection.c b/fs/smb/common/smbdirect/smbdirect_connection.c new file mode 100644 index 000000000000..7e4921b9538c --- /dev/null +++ b/fs/smb/common/smbdirect/smbdirect_connection.c @@ -0,0 +1,2181 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017, Microsoft Corporation. + * Copyright (c) 2025, Stefan Metzmacher + */ + +#include "smbdirect_internal.h" +#include + +struct smbdirect_map_sges { + struct ib_sge *sge; + size_t num_sge; + size_t max_sge; + struct ib_device *device; + u32 local_dma_lkey; + enum dma_data_direction direction; +}; + +static ssize_t smbdirect_map_sges_from_iter(struct iov_iter *iter, size_t len, + struct smbdirect_map_sges *state); + +static void smbdirect_connection_recv_io_refill_work(struct work_struct *work); +static void smbdirect_connection_send_immediate_work(struct work_struct *work); + +static void smbdirect_connection_qp_event_handler(struct ib_event *event, void *context) +{ + struct smbdirect_socket *sc = context; + + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "%s on device %.*s socket %p (cm_id=%p) status %s first_error %1pe\n", + ib_event_msg(event->event), + IB_DEVICE_NAME_MAX, + event->device->name, + sc, sc->rdma.cm_id, + smbdirect_socket_status_string(sc->status), + SMBDIRECT_DEBUG_ERR_PTR(sc->first_error)); + + switch (event->event) { + case IB_EVENT_CQ_ERR: + case IB_EVENT_QP_FATAL: + smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED); + break; + + default: + break; + } +} + +static int smbdirect_connection_rdma_event_handler(struct rdma_cm_id *id, + struct rdma_cm_event *event) +{ + struct smbdirect_socket *sc = id->context; + int ret = -ECONNRESET; + + if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) + ret = -ENETDOWN; + if (IS_ERR(SMBDIRECT_DEBUG_ERR_PTR(event->status))) + ret = event->status; + + /* + * cma_cm_event_handler() has + * lockdep_assert_held(&id_priv->handler_mutex); + * + * Mutexes are not allowed in interrupts, + * and we rely on not being in an interrupt here. + */ + WARN_ON_ONCE(in_interrupt()); + + if (event->event != sc->rdma.expected_event) { + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "%s (first_error=%1pe, expected=%s) => event=%s status=%d => ret=%1pe\n", + smbdirect_socket_status_string(sc->status), + SMBDIRECT_DEBUG_ERR_PTR(sc->first_error), + rdma_event_msg(sc->rdma.expected_event), + rdma_event_msg(event->event), + event->status, + SMBDIRECT_DEBUG_ERR_PTR(ret)); + + /* + * If we get RDMA_CM_EVENT_DEVICE_REMOVAL, + * we should change to SMBDIRECT_SOCKET_DISCONNECTED, + * so that rdma_disconnect() is avoided later via + * smbdirect_socket_schedule_cleanup[_status]() => + * smbdirect_socket_cleanup_work(). 
+ * + * As otherwise we'd set SMBDIRECT_SOCKET_DISCONNECTING, + * but never ever get RDMA_CM_EVENT_DISCONNECTED and + * never reach SMBDIRECT_SOCKET_DISCONNECTED. + */ + if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) + smbdirect_socket_schedule_cleanup_status(sc, + SMBDIRECT_LOG_ERR, + ret, + SMBDIRECT_SOCKET_DISCONNECTED); + else + smbdirect_socket_schedule_cleanup(sc, ret); + if (sc->ib.qp) + ib_drain_qp(sc->ib.qp); + return 0; + } + + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO, + "%s (first_error=%1pe) event=%s\n", + smbdirect_socket_status_string(sc->status), + SMBDIRECT_DEBUG_ERR_PTR(sc->first_error), + rdma_event_msg(event->event)); + + switch (event->event) { + case RDMA_CM_EVENT_DISCONNECTED: + /* + * We need to change to SMBDIRECT_SOCKET_DISCONNECTED, + * so that rdma_disconnect() is avoided later via + * smbdirect_socket_schedule_cleanup_status() => + * smbdirect_socket_cleanup_work(). + * + * As otherwise we'd set SMBDIRECT_SOCKET_DISCONNECTING, + * but never ever get RDMA_CM_EVENT_DISCONNECTED and + * never reach SMBDIRECT_SOCKET_DISCONNECTED. + * + * This is also a normal disconnect so + * SMBDIRECT_LOG_INFO should be good enough + * and avoids spamming the default logs. + */ + smbdirect_socket_schedule_cleanup_status(sc, + SMBDIRECT_LOG_INFO, + ret, + SMBDIRECT_SOCKET_DISCONNECTED); + if (sc->ib.qp) + ib_drain_qp(sc->ib.qp); + return 0; + + default: + break; + } + + /* + * This is an internal error, should be handled above via + * event->event != sc->rdma.expected_event already. + */ + WARN_ON_ONCE(sc->rdma.expected_event != RDMA_CM_EVENT_DISCONNECTED); + smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED); + return 0; +} + +void smbdirect_connection_rdma_established(struct smbdirect_socket *sc) +{ + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO, + "rdma established: device: %.*s local: %pISpsfc remote: %pISpsfc\n", + IB_DEVICE_NAME_MAX, + sc->ib.dev->name, + &sc->rdma.cm_id->route.addr.src_addr, + &sc->rdma.cm_id->route.addr.dst_addr); + + sc->rdma.cm_id->event_handler = smbdirect_connection_rdma_event_handler; + sc->rdma.expected_event = RDMA_CM_EVENT_DISCONNECTED; +} + +void smbdirect_connection_negotiation_done(struct smbdirect_socket *sc) +{ + if (unlikely(sc->first_error)) + return; + + if (sc->status == SMBDIRECT_SOCKET_CONNECTED) + /* + * This is the accept case where + * smbdirect_socket_accept() already sets + * SMBDIRECT_SOCKET_CONNECTED + */ + goto done; + + if (sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING) { + /* + * Something went wrong... + */ + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR, + "status=%s first_error=%1pe local: %pISpsfc remote: %pISpsfc\n", + smbdirect_socket_status_string(sc->status), + SMBDIRECT_DEBUG_ERR_PTR(sc->first_error), + &sc->rdma.cm_id->route.addr.src_addr, + &sc->rdma.cm_id->route.addr.dst_addr); + return; + } + + /* + * We are done, so we can wake up the waiter. + */ + WARN_ONCE(sc->status == SMBDIRECT_SOCKET_CONNECTED, + "status=%s first_error=%1pe", + smbdirect_socket_status_string(sc->status), + SMBDIRECT_DEBUG_ERR_PTR(sc->first_error)); + sc->status = SMBDIRECT_SOCKET_CONNECTED; + + /* + * We need to setup the refill and send immediate work + * in order to get a working connection. 
+	 */
+done:
+	INIT_WORK(&sc->recv_io.posted.refill_work, smbdirect_connection_recv_io_refill_work);
+	INIT_WORK(&sc->idle.immediate_work, smbdirect_connection_send_immediate_work);
+
+	smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+				 "negotiated: local: %pISpsfc remote: %pISpsfc\n",
+				 &sc->rdma.cm_id->route.addr.src_addr,
+				 &sc->rdma.cm_id->route.addr.dst_addr);
+
+	wake_up(&sc->status_wait);
+}
+
+static u32 smbdirect_rdma_rw_send_wrs(struct ib_device *dev,
+				      const struct ib_qp_init_attr *attr)
+{
+	/*
+	 * This could be split out of rdma_rw_init_qp()
+	 * and be a helper function next to rdma_rw_mr_factor()
+	 *
+	 * We can't check unlikely(rdma_rw_force_mr) here,
+	 * but that is most likely 0 anyway.
+	 */
+	u32 factor;
+
+	WARN_ON_ONCE(attr->port_num == 0);
+
+	/*
+	 * Each context needs at least one RDMA READ or WRITE WR.
+	 *
+	 * For some hardware we might need more, eventually we should ask the
+	 * HCA driver for a multiplier here.
+	 */
+	factor = 1;
+
+	/*
+	 * If the device needs MRs to perform RDMA READ or WRITE operations,
+	 * we'll need two additional MRs for the registrations and the
+	 * invalidation.
+	 */
+	if (rdma_protocol_iwarp(dev, attr->port_num) || dev->attrs.max_sgl_rd)
+		factor += 2;	/* inv + reg */
+
+	return factor * attr->cap.max_rdma_ctxs;
+}
+
+int smbdirect_connection_create_qp(struct smbdirect_socket *sc)
+{
+	const struct smbdirect_socket_parameters *sp = &sc->parameters;
+	struct ib_qp_init_attr qp_attr;
+	struct ib_qp_cap qp_cap;
+	u32 rdma_send_wr;
+	u32 max_send_wr;
+	int ret;
+
+	/*
+	 * Note that {rdma,ib}_create_qp() will call
+	 * rdma_rw_init_qp() if max_rdma_ctxs is not 0.
+	 * It will adjust max_send_wr to the required
+	 * number of additional WRs for the RDMA RW operations.
+	 * It will cap max_send_wr to the device limit.
+	 *
+	 * We allocate sp->responder_resources * 2 MRs
+	 * and each MR needs WRs for REG and INV, so
+	 * we use '* 4'.
+	 *
+	 * +1 for ib_drain_qp()
+	 */
+	memset(&qp_cap, 0, sizeof(qp_cap));
+	qp_cap.max_send_wr = sp->send_credit_target + sp->responder_resources * 4 + 1;
+	qp_cap.max_recv_wr = sp->recv_credit_max + 1;
+	qp_cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE;
+	qp_cap.max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE;
+	qp_cap.max_inline_data = 0;
+	qp_cap.max_rdma_ctxs = sc->rw_io.credits.max;
+
+	/*
+	 * Find out what max_send_wr will be
+	 * after rdma_rw_init_qp() adjusted it.
+	 *
+	 * We only do it on a temporary variable,
+	 * as rdma_create_qp() will trigger
+	 * rdma_rw_init_qp() again.
+ */ + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.cap = qp_cap; + qp_attr.port_num = sc->rdma.cm_id->port_num; + rdma_send_wr = smbdirect_rdma_rw_send_wrs(sc->ib.dev, &qp_attr); + max_send_wr = qp_cap.max_send_wr + rdma_send_wr; + + if (qp_cap.max_send_wr > sc->ib.dev->attrs.max_cqe || + qp_cap.max_send_wr > sc->ib.dev->attrs.max_qp_wr) { + pr_err("Possible CQE overrun: max_send_wr %d\n", + qp_cap.max_send_wr); + pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n", + IB_DEVICE_NAME_MAX, + sc->ib.dev->name, + sc->ib.dev->attrs.max_cqe, + sc->ib.dev->attrs.max_qp_wr); + pr_err("consider lowering send_credit_target = %d\n", + sp->send_credit_target); + return -EINVAL; + } + + if (qp_cap.max_rdma_ctxs && + (max_send_wr >= sc->ib.dev->attrs.max_cqe || + max_send_wr >= sc->ib.dev->attrs.max_qp_wr)) { + pr_err("Possible CQE overrun: rdma_send_wr %d + max_send_wr %d = %d\n", + rdma_send_wr, qp_cap.max_send_wr, max_send_wr); + pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n", + IB_DEVICE_NAME_MAX, + sc->ib.dev->name, + sc->ib.dev->attrs.max_cqe, + sc->ib.dev->attrs.max_qp_wr); + pr_err("consider lowering send_credit_target = %d, max_rdma_ctxs = %d\n", + sp->send_credit_target, qp_cap.max_rdma_ctxs); + return -EINVAL; + } + + if (qp_cap.max_recv_wr > sc->ib.dev->attrs.max_cqe || + qp_cap.max_recv_wr > sc->ib.dev->attrs.max_qp_wr) { + pr_err("Possible CQE overrun: max_recv_wr %d\n", + qp_cap.max_recv_wr); + pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n", + IB_DEVICE_NAME_MAX, + sc->ib.dev->name, + sc->ib.dev->attrs.max_cqe, + sc->ib.dev->attrs.max_qp_wr); + pr_err("consider lowering receive_credit_max = %d\n", + sp->recv_credit_max); + return -EINVAL; + } + + if (qp_cap.max_send_sge > sc->ib.dev->attrs.max_send_sge || + qp_cap.max_recv_sge > sc->ib.dev->attrs.max_recv_sge) { + pr_err("device %.*s max_send_sge/max_recv_sge = %d/%d too small\n", + IB_DEVICE_NAME_MAX, + sc->ib.dev->name, + sc->ib.dev->attrs.max_send_sge, + sc->ib.dev->attrs.max_recv_sge); + return -EINVAL; + } + + sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0); + if (IS_ERR(sc->ib.pd)) { + pr_err("Can't create RDMA PD: %1pe\n", sc->ib.pd); + ret = PTR_ERR(sc->ib.pd); + sc->ib.pd = NULL; + return ret; + } + + sc->ib.send_cq = ib_alloc_cq_any(sc->ib.dev, sc, + max_send_wr, + sc->ib.poll_ctx); + if (IS_ERR(sc->ib.send_cq)) { + pr_err("Can't create RDMA send CQ: %1pe\n", sc->ib.send_cq); + ret = PTR_ERR(sc->ib.send_cq); + sc->ib.send_cq = NULL; + goto err; + } + + sc->ib.recv_cq = ib_alloc_cq_any(sc->ib.dev, sc, + qp_cap.max_recv_wr, + sc->ib.poll_ctx); + if (IS_ERR(sc->ib.recv_cq)) { + pr_err("Can't create RDMA recv CQ: %1pe\n", sc->ib.recv_cq); + ret = PTR_ERR(sc->ib.recv_cq); + sc->ib.recv_cq = NULL; + goto err; + } + + /* + * We reset completely here! + * As the above use was just temporary + * to calc max_send_wr and rdma_send_wr. + * + * rdma_create_qp() will trigger rdma_rw_init_qp() + * again if max_rdma_ctxs is not 0. 
+	 */
+	memset(&qp_attr, 0, sizeof(qp_attr));
+	qp_attr.event_handler = smbdirect_connection_qp_event_handler;
+	qp_attr.qp_context = sc;
+	qp_attr.cap = qp_cap;
+	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+	qp_attr.qp_type = IB_QPT_RC;
+	qp_attr.send_cq = sc->ib.send_cq;
+	qp_attr.recv_cq = sc->ib.recv_cq;
+	qp_attr.port_num = ~0;
+
+	ret = rdma_create_qp(sc->rdma.cm_id, sc->ib.pd, &qp_attr);
+	if (ret) {
+		pr_err("Can't create RDMA QP: %1pe\n",
+		       SMBDIRECT_DEBUG_ERR_PTR(ret));
+		goto err;
+	}
+	sc->ib.qp = sc->rdma.cm_id->qp;
+
+	return 0;
+err:
+	smbdirect_connection_destroy_qp(sc);
+	return ret;
+}
+
+void smbdirect_connection_destroy_qp(struct smbdirect_socket *sc)
+{
+	if (sc->ib.qp) {
+		ib_drain_qp(sc->ib.qp);
+		sc->ib.qp = NULL;
+		rdma_destroy_qp(sc->rdma.cm_id);
+	}
+	if (sc->ib.recv_cq) {
+		ib_destroy_cq(sc->ib.recv_cq);
+		sc->ib.recv_cq = NULL;
+	}
+	if (sc->ib.send_cq) {
+		ib_destroy_cq(sc->ib.send_cq);
+		sc->ib.send_cq = NULL;
+	}
+	if (sc->ib.pd) {
+		ib_dealloc_pd(sc->ib.pd);
+		sc->ib.pd = NULL;
+	}
+}
+
+int smbdirect_connection_create_mem_pools(struct smbdirect_socket *sc)
+{
+	const struct smbdirect_socket_parameters *sp = &sc->parameters;
+	char name[80];
+	size_t i;
+
+	/*
+	 * We use sizeof(struct smbdirect_negotiate_resp) for the
+	 * payload size as it is larger than
+	 * sizeof(struct smbdirect_data_transfer).
+	 *
+	 * This will fit client and server usage for now.
+	 */
+	snprintf(name, sizeof(name), "smbdirect_send_io_cache_%p", sc);
+	struct kmem_cache_args send_io_args = {
+		.align = __alignof__(struct smbdirect_send_io),
+	};
+	sc->send_io.mem.cache = kmem_cache_create(name,
+						  sizeof(struct smbdirect_send_io) +
+						  sizeof(struct smbdirect_negotiate_resp),
+						  &send_io_args,
+						  SLAB_HWCACHE_ALIGN);
+	if (!sc->send_io.mem.cache)
+		goto err;
+
+	sc->send_io.mem.pool = mempool_create_slab_pool(sp->send_credit_target,
+							sc->send_io.mem.cache);
+	if (!sc->send_io.mem.pool)
+		goto err;
+
+	/*
+	 * A payload size of sp->max_recv_size should fit
+	 * any message.
+	 *
+	 * For smbdirect_data_transfer messages the whole
+	 * buffer might be exposed to userspace
+	 * (currently on the client side...)
+	 * The documentation says data_offset = 0 would be
+	 * strange but valid.
+ */ + snprintf(name, sizeof(name), "smbdirect_recv_io_cache_%p", sc); + struct kmem_cache_args recv_io_args = { + .align = __alignof__(struct smbdirect_recv_io), + .useroffset = sizeof(struct smbdirect_recv_io), + .usersize = sp->max_recv_size, + }; + sc->recv_io.mem.cache = kmem_cache_create(name, + sizeof(struct smbdirect_recv_io) + + sp->max_recv_size, + &recv_io_args, + SLAB_HWCACHE_ALIGN); + if (!sc->recv_io.mem.cache) + goto err; + + sc->recv_io.mem.pool = mempool_create_slab_pool(sp->recv_credit_max, + sc->recv_io.mem.cache); + if (!sc->recv_io.mem.pool) + goto err; + + for (i = 0; i < sp->recv_credit_max; i++) { + struct smbdirect_recv_io *recv_io; + + recv_io = mempool_alloc(sc->recv_io.mem.pool, + sc->recv_io.mem.gfp_mask); + if (!recv_io) + goto err; + recv_io->socket = sc; + recv_io->sge.length = 0; + list_add_tail(&recv_io->list, &sc->recv_io.free.list); + } + + return 0; +err: + smbdirect_connection_destroy_mem_pools(sc); + return -ENOMEM; +} + +void smbdirect_connection_destroy_mem_pools(struct smbdirect_socket *sc) +{ + struct smbdirect_recv_io *recv_io, *next_io; + + list_for_each_entry_safe(recv_io, next_io, &sc->recv_io.free.list, list) { + list_del(&recv_io->list); + mempool_free(recv_io, sc->recv_io.mem.pool); + } + + /* + * Note mempool_destroy() and kmem_cache_destroy() + * work fine with a NULL pointer + */ + + mempool_destroy(sc->recv_io.mem.pool); + sc->recv_io.mem.pool = NULL; + + kmem_cache_destroy(sc->recv_io.mem.cache); + sc->recv_io.mem.cache = NULL; + + mempool_destroy(sc->send_io.mem.pool); + sc->send_io.mem.pool = NULL; + + kmem_cache_destroy(sc->send_io.mem.cache); + sc->send_io.mem.cache = NULL; +} + +struct smbdirect_send_io *smbdirect_connection_alloc_send_io(struct smbdirect_socket *sc) +{ + struct smbdirect_send_io *msg; + + msg = mempool_alloc(sc->send_io.mem.pool, sc->send_io.mem.gfp_mask); + if (!msg) + return ERR_PTR(-ENOMEM); + msg->socket = sc; + INIT_LIST_HEAD(&msg->sibling_list); + msg->num_sge = 0; + + return msg; +} + +void smbdirect_connection_free_send_io(struct smbdirect_send_io *msg) +{ + struct smbdirect_socket *sc = msg->socket; + size_t i; + + /* + * The list needs to be empty! + * The caller should take care of it. + */ + WARN_ON_ONCE(!list_empty(&msg->sibling_list)); + + /* + * Note we call ib_dma_unmap_page(), even if some sges are mapped using + * ib_dma_map_single(). + * + * The difference between _single() and _page() only matters for the + * ib_dma_map_*() case. + * + * For the ib_dma_unmap_*() case it does not matter as both take the + * dma_addr_t and dma_unmap_single_attrs() is just an alias to + * dma_unmap_page_attrs(). 
+ */ + for (i = 0; i < msg->num_sge; i++) + ib_dma_unmap_page(sc->ib.dev, + msg->sge[i].addr, + msg->sge[i].length, + DMA_TO_DEVICE); + + mempool_free(msg, sc->send_io.mem.pool); +} + +struct smbdirect_recv_io *smbdirect_connection_get_recv_io(struct smbdirect_socket *sc) +{ + struct smbdirect_recv_io *msg = NULL; + unsigned long flags; + + spin_lock_irqsave(&sc->recv_io.free.lock, flags); + if (likely(!sc->first_error)) + msg = list_first_entry_or_null(&sc->recv_io.free.list, + struct smbdirect_recv_io, + list); + if (likely(msg)) { + list_del(&msg->list); + sc->statistics.get_receive_buffer++; + } + spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); + + return msg; +} + +void smbdirect_connection_put_recv_io(struct smbdirect_recv_io *msg) +{ + struct smbdirect_socket *sc = msg->socket; + unsigned long flags; + + if (likely(msg->sge.length != 0)) { + ib_dma_unmap_single(sc->ib.dev, + msg->sge.addr, + msg->sge.length, + DMA_FROM_DEVICE); + msg->sge.length = 0; + } + + spin_lock_irqsave(&sc->recv_io.free.lock, flags); + list_add_tail(&msg->list, &sc->recv_io.free.list); + sc->statistics.put_receive_buffer++; + spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); + + queue_work(sc->workqueues.refill, &sc->recv_io.posted.refill_work); +} + +void smbdirect_connection_reassembly_append_recv_io(struct smbdirect_socket *sc, + struct smbdirect_recv_io *msg, + u32 data_length) +{ + unsigned long flags; + + spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); + list_add_tail(&msg->list, &sc->recv_io.reassembly.list); + sc->recv_io.reassembly.queue_length++; + /* + * Make sure reassembly_data_length is updated after list and + * reassembly_queue_length are updated. On the dequeue side + * reassembly_data_length is checked without a lock to determine + * if reassembly_queue_length and list is up to date + */ + virt_wmb(); + sc->recv_io.reassembly.data_length += data_length; + spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); + sc->statistics.enqueue_reassembly_queue++; +} + +struct smbdirect_recv_io * +smbdirect_connection_reassembly_first_recv_io(struct smbdirect_socket *sc) +{ + struct smbdirect_recv_io *msg; + + msg = list_first_entry_or_null(&sc->recv_io.reassembly.list, + struct smbdirect_recv_io, + list); + + return msg; +} + +void smbdirect_connection_negotiate_rdma_resources(struct smbdirect_socket *sc, + u8 peer_initiator_depth, + u8 peer_responder_resources, + const struct rdma_conn_param *param) +{ + struct smbdirect_socket_parameters *sp = &sc->parameters; + + if (rdma_protocol_iwarp(sc->ib.dev, sc->rdma.cm_id->port_num) && + param->private_data_len == 8) { + /* + * Legacy clients with only iWarp MPA v1 support + * need a private blob in order to negotiate + * the IRD/ORD values. + */ + const __be32 *ird_ord_hdr = param->private_data; + u32 ird32 = be32_to_cpu(ird_ord_hdr[0]); + u32 ord32 = be32_to_cpu(ird_ord_hdr[1]); + + /* + * cifs.ko sends the legacy IRD/ORD negotiation + * event if iWarp MPA v2 was used. + * + * Here we check that the values match and only + * mark the client as legacy if they don't match. + */ + if ((u32)param->initiator_depth != ird32 || + (u32)param->responder_resources != ord32) { + /* + * There are broken clients (old cifs.ko) + * using little endian and also + * struct rdma_conn_param only uses u8 + * for initiator_depth and responder_resources, + * so we truncate the value to U8_MAX. + * + * smb_direct_accept_client() will then + * do the real negotiation in order to + * select the minimum between client and + * server. 
+			 */
+			ird32 = min_t(u32, ird32, U8_MAX);
+			ord32 = min_t(u32, ord32, U8_MAX);
+
+			sc->rdma.legacy_iwarp = true;
+			peer_initiator_depth = (u8)ird32;
+			peer_responder_resources = (u8)ord32;
+		}
+	}
+
+	/*
+	 * Negotiate the value by using the minimum
+	 * between client and server if the client provided
+	 * non-zero values.
+	 */
+	if (peer_initiator_depth != 0)
+		sp->initiator_depth = min_t(u8, sp->initiator_depth,
+					    peer_initiator_depth);
+	if (peer_responder_resources != 0)
+		sp->responder_resources = min_t(u8, sp->responder_resources,
+						peer_responder_resources);
+}
+
+bool smbdirect_connection_is_connected(struct smbdirect_socket *sc)
+{
+	if (unlikely(!sc || sc->first_error || sc->status != SMBDIRECT_SOCKET_CONNECTED))
+		return false;
+	return true;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_is_connected);
+
+int smbdirect_connection_wait_for_connected(struct smbdirect_socket *sc)
+{
+	const struct smbdirect_socket_parameters *sp = &sc->parameters;
+	union {
+		struct sockaddr sa;
+		struct sockaddr_storage ss;
+	} src_addr, dst_addr;
+	const struct sockaddr *src = NULL;
+	const struct sockaddr *dst = NULL;
+	char _devname[IB_DEVICE_NAME_MAX] = { 0, };
+	const char *devname = NULL;
+	int ret;
+
+	if (sc->rdma.cm_id) {
+		src_addr.ss = sc->rdma.cm_id->route.addr.src_addr;
+		if (src_addr.sa.sa_family != AF_UNSPEC)
+			src = &src_addr.sa;
+		dst_addr.ss = sc->rdma.cm_id->route.addr.dst_addr;
+		if (dst_addr.sa.sa_family != AF_UNSPEC)
+			dst = &dst_addr.sa;
+
+		if (sc->ib.dev) {
+			memcpy(_devname, sc->ib.dev->name, IB_DEVICE_NAME_MAX);
+			devname = _devname;
+		}
+	}
+
+	smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+				 "waiting for connection: device: %.*s local: %pISpsfc remote: %pISpsfc\n",
+				 IB_DEVICE_NAME_MAX, devname, src, dst);
+
+	ret = wait_event_interruptible_timeout(sc->status_wait,
+					       sc->status == SMBDIRECT_SOCKET_CONNECTED ||
+					       sc->first_error,
+					       msecs_to_jiffies(sp->negotiate_timeout_msec));
+	if (sc->rdma.cm_id) {
+		/*
+		 * src, dst and devname may have been
+		 * updated in the meantime.
+		 */
+		src_addr.ss = sc->rdma.cm_id->route.addr.src_addr;
+		if (src_addr.sa.sa_family != AF_UNSPEC)
+			src = &src_addr.sa;
+		dst_addr.ss = sc->rdma.cm_id->route.addr.dst_addr;
+		if (dst_addr.sa.sa_family != AF_UNSPEC)
+			dst = &dst_addr.sa;
+
+		if (sc->ib.dev) {
+			memcpy(_devname, sc->ib.dev->name, IB_DEVICE_NAME_MAX);
+			devname = _devname;
+		}
+	}
+	if (ret == 0)
+		ret = -ETIMEDOUT;
+	if (ret < 0)
+		smbdirect_socket_schedule_cleanup(sc, ret);
+	if (sc->first_error) {
+		int lvl = SMBDIRECT_LOG_ERR;
+
+		ret = sc->first_error;
+		if (ret == -ENODEV)
+			lvl = SMBDIRECT_LOG_INFO;
+
+		smbdirect_log_rdma_event(sc, lvl,
+					 "connection failed %1pe device: %.*s local: %pISpsfc remote: %pISpsfc\n",
+					 SMBDIRECT_DEBUG_ERR_PTR(ret),
+					 IB_DEVICE_NAME_MAX, devname, src, dst);
+		return ret;
+	}
+
+	return 0;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_wait_for_connected);
+
+void smbdirect_connection_idle_timer_work(struct work_struct *work)
+{
+	struct smbdirect_socket *sc =
+		container_of(work, struct smbdirect_socket, idle.timer_work.work);
+	const struct smbdirect_socket_parameters *sp = &sc->parameters;
+
+	if (sc->idle.keepalive != SMBDIRECT_KEEPALIVE_NONE) {
+		smbdirect_log_keep_alive(sc, SMBDIRECT_LOG_ERR,
+					 "%s => timeout sc->idle.keepalive=%s\n",
+					 smbdirect_socket_status_string(sc->status),
+					 sc->idle.keepalive == SMBDIRECT_KEEPALIVE_SENT ?
+ "SENT" : "PENDING"); + smbdirect_socket_schedule_cleanup(sc, -ETIMEDOUT); + return; + } + + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) + return; + + /* + * Now use the keepalive timeout (instead of keepalive interval) + * in order to wait for a response + */ + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; + mod_delayed_work(sc->workqueues.idle, &sc->idle.timer_work, + msecs_to_jiffies(sp->keepalive_timeout_msec)); + smbdirect_log_keep_alive(sc, SMBDIRECT_LOG_INFO, + "schedule send of empty idle message\n"); + queue_work(sc->workqueues.immediate, &sc->idle.immediate_work); +} + +u16 smbdirect_connection_grant_recv_credits(struct smbdirect_socket *sc) +{ + int missing; + int available; + int new_credits; + + if (atomic_read(&sc->recv_io.credits.count) >= sc->recv_io.credits.target) + return 0; + + missing = (int)sc->recv_io.credits.target - atomic_read(&sc->recv_io.credits.count); + available = atomic_xchg(&sc->recv_io.credits.available, 0); + new_credits = min3((int)U16_MAX, missing, available); + if (new_credits <= 0) { + /* + * If credits are available, but not granted + * we need to re-add them again. + */ + if (available) + atomic_add(available, &sc->recv_io.credits.available); + return 0; + } + + if (new_credits < available) { + /* + * Readd the remaining available again. + */ + available -= new_credits; + atomic_add(available, &sc->recv_io.credits.available); + } + + /* + * Remember we granted the credits + */ + atomic_add(new_credits, &sc->recv_io.credits.count); + return new_credits; +} + +static bool smbdirect_connection_request_keep_alive(struct smbdirect_socket *sc) +{ + const struct smbdirect_socket_parameters *sp = &sc->parameters; + + if (sc->idle.keepalive == SMBDIRECT_KEEPALIVE_PENDING) { + sc->idle.keepalive = SMBDIRECT_KEEPALIVE_SENT; + /* + * Now use the keepalive timeout (instead of keepalive interval) + * in order to wait for a response + */ + mod_delayed_work(sc->workqueues.idle, &sc->idle.timer_work, + msecs_to_jiffies(sp->keepalive_timeout_msec)); + return true; + } + + return false; +} + +int smbdirect_connection_post_send_wr(struct smbdirect_socket *sc, + struct ib_send_wr *wr) +{ + int ret; + + if (unlikely(sc->first_error)) + return sc->first_error; + + atomic_inc(&sc->send_io.pending.count); + ret = ib_post_send(sc->ib.qp, wr, NULL); + if (ret) { + atomic_dec(&sc->send_io.pending.count); + smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_ERR, + "ib_post_send() failed %1pe\n", + SMBDIRECT_DEBUG_ERR_PTR(ret)); + smbdirect_socket_schedule_cleanup(sc, ret); + } + + return ret; +} + +static void smbdirect_connection_send_batch_init(struct smbdirect_send_batch *batch, + bool need_invalidate_rkey, + unsigned int remote_key) +{ + INIT_LIST_HEAD(&batch->msg_list); + batch->wr_cnt = 0; + batch->need_invalidate_rkey = need_invalidate_rkey; + batch->remote_key = remote_key; + batch->credit = 0; +} + +int smbdirect_connection_send_batch_flush(struct smbdirect_socket *sc, + struct smbdirect_send_batch *batch, + bool is_last) +{ + struct smbdirect_send_io *first, *last; + int ret = 0; + + if (list_empty(&batch->msg_list)) + goto release_credit; + + first = list_first_entry(&batch->msg_list, + struct smbdirect_send_io, + sibling_list); + last = list_last_entry(&batch->msg_list, + struct smbdirect_send_io, + sibling_list); + + if (batch->need_invalidate_rkey) { + first->wr.opcode = IB_WR_SEND_WITH_INV; + first->wr.ex.invalidate_rkey = batch->remote_key; + batch->need_invalidate_rkey = false; + batch->remote_key = 0; + } + + last->wr.send_flags = IB_SEND_SIGNALED; + 
	last->wr.wr_cqe = &last->cqe;
+
+	/*
+	 * Remove last from batch->msg_list
+	 * and splice the rest of batch->msg_list
+	 * to last->sibling_list.
+	 *
+	 * batch->msg_list is a valid empty list
+	 * at the end.
+	 */
+	list_del_init(&last->sibling_list);
+	list_splice_tail_init(&batch->msg_list, &last->sibling_list);
+	batch->wr_cnt = 0;
+
+	ret = smbdirect_connection_post_send_wr(sc, &first->wr);
+	if (ret) {
+		struct smbdirect_send_io *sibling, *next;
+
+		list_for_each_entry_safe(sibling, next, &last->sibling_list, sibling_list) {
+			list_del_init(&sibling->sibling_list);
+			smbdirect_connection_free_send_io(sibling);
+		}
+		smbdirect_connection_free_send_io(last);
+	}
+
+release_credit:
+	if (is_last && !ret && batch->credit) {
+		atomic_add(batch->credit, &sc->send_io.bcredits.count);
+		batch->credit = 0;
+		wake_up(&sc->send_io.bcredits.wait_queue);
+	}
+
+	return ret;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_send_batch_flush);
+
+struct smbdirect_send_batch *
+smbdirect_init_send_batch_storage(struct smbdirect_send_batch_storage *storage,
+				  bool need_invalidate_rkey,
+				  unsigned int remote_key)
+{
+	struct smbdirect_send_batch *batch = (struct smbdirect_send_batch *)storage;
+
+	memset(storage, 0, sizeof(*storage));
+	BUILD_BUG_ON(sizeof(*batch) > sizeof(*storage));
+
+	smbdirect_connection_send_batch_init(batch,
+					     need_invalidate_rkey,
+					     remote_key);
+
+	return batch;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_init_send_batch_storage);
+
+static int smbdirect_connection_wait_for_send_bcredit(struct smbdirect_socket *sc,
+						      struct smbdirect_send_batch *batch)
+{
+	int ret;
+
+	if (batch->credit)
+		return 0;
+
+	ret = smbdirect_socket_wait_for_credits(sc,
+						SMBDIRECT_SOCKET_CONNECTED,
+						-ENOTCONN,
+						&sc->send_io.bcredits.wait_queue,
+						&sc->send_io.bcredits.count,
+						1);
+	if (ret)
+		return ret;
+
+	batch->credit = 1;
+	return 0;
+}
+
+static int smbdirect_connection_wait_for_send_lcredit(struct smbdirect_socket *sc,
+						      struct smbdirect_send_batch *batch)
+{
+	if (batch && atomic_read(&sc->send_io.lcredits.count) <= 1) {
+		int ret;
+
+		ret = smbdirect_connection_send_batch_flush(sc, batch, false);
+		if (ret)
+			return ret;
+	}
+
+	return smbdirect_socket_wait_for_credits(sc,
+						 SMBDIRECT_SOCKET_CONNECTED,
+						 -ENOTCONN,
+						 &sc->send_io.lcredits.wait_queue,
+						 &sc->send_io.lcredits.count,
+						 1);
+}
+
+static int smbdirect_connection_wait_for_send_credits(struct smbdirect_socket *sc,
+						      struct smbdirect_send_batch *batch)
+{
+	if (batch && (batch->wr_cnt >= 16 || atomic_read(&sc->send_io.credits.count) <= 1)) {
+		int ret;
+
+		ret = smbdirect_connection_send_batch_flush(sc, batch, false);
+		if (ret)
+			return ret;
+	}
+
+	return smbdirect_socket_wait_for_credits(sc,
+						 SMBDIRECT_SOCKET_CONNECTED,
+						 -ENOTCONN,
+						 &sc->send_io.credits.wait_queue,
+						 &sc->send_io.credits.count,
+						 1);
+}
+
+static void smbdirect_connection_send_io_done(struct ib_cq *cq, struct ib_wc *wc);
+
+static int smbdirect_connection_post_send_io(struct smbdirect_socket *sc,
+					     struct smbdirect_send_batch *batch,
+					     struct smbdirect_send_io *msg)
+{
+	int i;
+
+	for (i = 0; i < msg->num_sge; i++)
+		ib_dma_sync_single_for_device(sc->ib.dev,
+					      msg->sge[i].addr, msg->sge[i].length,
+					      DMA_TO_DEVICE);
+
+	msg->cqe.done = smbdirect_connection_send_io_done;
+	msg->wr.wr_cqe = &msg->cqe;
+	msg->wr.opcode = IB_WR_SEND;
+	msg->wr.sg_list = &msg->sge[0];
+	msg->wr.num_sge = msg->num_sge;
+	msg->wr.next = NULL;
+
+	if (batch) {
+		msg->wr.send_flags = 0;
+		if (!list_empty(&batch->msg_list)) {
+			struct smbdirect_send_io *last;
+
+			last =
list_last_entry(&batch->msg_list, + struct smbdirect_send_io, + sibling_list); + last->wr.next = &msg->wr; + } + list_add_tail(&msg->sibling_list, &batch->msg_list); + batch->wr_cnt++; + return 0; + } + + msg->wr.send_flags = IB_SEND_SIGNALED; + return smbdirect_connection_post_send_wr(sc, &msg->wr); +} + +int smbdirect_connection_send_single_iter(struct smbdirect_socket *sc, + struct smbdirect_send_batch *batch, + struct iov_iter *iter, + unsigned int flags, + u32 remaining_data_length) +{ + const struct smbdirect_socket_parameters *sp = &sc->parameters; + struct smbdirect_send_batch _batch; + struct smbdirect_send_io *msg; + struct smbdirect_data_transfer *packet; + size_t header_length; + u16 new_credits = 0; + u32 data_length = 0; + int ret; + + if (WARN_ON_ONCE(flags)) + return -EINVAL; /* no flags support for now */ + + if (iter) { + if (WARN_ON_ONCE(iov_iter_rw(iter) != ITER_SOURCE)) + return -EINVAL; /* It's a bug in upper layer to get there */ + + header_length = sizeof(struct smbdirect_data_transfer); + if (WARN_ON_ONCE(remaining_data_length == 0 || + iov_iter_count(iter) > remaining_data_length)) + return -EINVAL; + } else { + /* If this is a packet without payload, don't send padding */ + header_length = offsetof(struct smbdirect_data_transfer, padding); + if (WARN_ON_ONCE(remaining_data_length)) + return -EINVAL; + } + + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { + smbdirect_log_write(sc, SMBDIRECT_LOG_ERR, + "status=%s first_error=%1pe => %1pe\n", + smbdirect_socket_status_string(sc->status), + SMBDIRECT_DEBUG_ERR_PTR(sc->first_error), + SMBDIRECT_DEBUG_ERR_PTR(-ENOTCONN)); + return -ENOTCONN; + } + + if (!batch) { + smbdirect_connection_send_batch_init(&_batch, false, 0); + batch = &_batch; + } + + ret = smbdirect_connection_wait_for_send_bcredit(sc, batch); + if (ret) + goto bcredit_failed; + + ret = smbdirect_connection_wait_for_send_lcredit(sc, batch); + if (ret) + goto lcredit_failed; + + ret = smbdirect_connection_wait_for_send_credits(sc, batch); + if (ret) + goto credit_failed; + + new_credits = smbdirect_connection_grant_recv_credits(sc); + if (new_credits == 0 && + atomic_read(&sc->send_io.credits.count) == 0 && + atomic_read(&sc->recv_io.credits.count) == 0) { + /* + * queue the refill work in order to + * get some new recv credits we can grant to + * the peer. 
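+		 *
+		 * Otherwise both sides could get stuck: we cannot
+		 * send without send credits and the peer cannot
+		 * grant them without receiving anything from us.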
+		 */
+		queue_work(sc->workqueues.refill, &sc->recv_io.posted.refill_work);
+
+		/*
+		 * Wait until either the refill work or the peer
+		 * has granted new credits.
+		 */
+		ret = wait_event_interruptible(sc->send_io.credits.wait_queue,
+					       atomic_read(&sc->send_io.credits.count) >= 1 ||
+					       atomic_read(&sc->recv_io.credits.available) >= 1 ||
+					       sc->status != SMBDIRECT_SOCKET_CONNECTED);
+		if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
+			ret = -ENOTCONN;
+		if (ret < 0)
+			goto credit_failed;
+
+		new_credits = smbdirect_connection_grant_recv_credits(sc);
+	}
+
+	msg = smbdirect_connection_alloc_send_io(sc);
+	if (IS_ERR(msg)) {
+		ret = PTR_ERR(msg);
+		goto alloc_failed;
+	}
+
+	/* Map the packet to DMA */
+	msg->sge[0].addr = ib_dma_map_single(sc->ib.dev,
+					     msg->packet,
+					     header_length,
+					     DMA_TO_DEVICE);
+	ret = ib_dma_mapping_error(sc->ib.dev, msg->sge[0].addr);
+	if (ret)
+		goto err;
+
+	msg->sge[0].length = header_length;
+	msg->sge[0].lkey = sc->ib.pd->local_dma_lkey;
+	msg->num_sge = 1;
+
+	if (iter) {
+		struct smbdirect_map_sges extract = {
+			.num_sge = msg->num_sge,
+			.max_sge = ARRAY_SIZE(msg->sge),
+			.sge = msg->sge,
+			.device = sc->ib.dev,
+			.local_dma_lkey = sc->ib.pd->local_dma_lkey,
+			.direction = DMA_TO_DEVICE,
+		};
+		size_t payload_len = umin(iov_iter_count(iter),
+					  sp->max_send_size - sizeof(*packet));
+
+		ret = smbdirect_map_sges_from_iter(iter, payload_len, &extract);
+		if (ret < 0)
+			goto err;
+		data_length = ret;
+		remaining_data_length -= data_length;
+		msg->num_sge = extract.num_sge;
+	}
+
+	/* Fill in the packet header */
+	packet = (struct smbdirect_data_transfer *)msg->packet;
+	packet->credits_requested = cpu_to_le16(sp->send_credit_target);
+	packet->credits_granted = cpu_to_le16(new_credits);
+
+	packet->flags = 0;
+	if (smbdirect_connection_request_keep_alive(sc))
+		packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED);
+
+	packet->reserved = 0;
+	if (!data_length)
+		packet->data_offset = 0;
+	else
+		packet->data_offset = cpu_to_le32(24);
+	packet->data_length = cpu_to_le32(data_length);
+	packet->remaining_data_length = cpu_to_le32(remaining_data_length);
+	packet->padding = 0;
+
+	smbdirect_log_outgoing(sc, SMBDIRECT_LOG_INFO,
+			       "DataOut: %s=%u, %s=%u, %s=0x%x, %s=%u, %s=%u, %s=%u\n",
+			       "CreditsRequested",
+			       le16_to_cpu(packet->credits_requested),
+			       "CreditsGranted",
+			       le16_to_cpu(packet->credits_granted),
+			       "Flags",
+			       le16_to_cpu(packet->flags),
+			       "RemainingDataLength",
+			       le32_to_cpu(packet->remaining_data_length),
+			       "DataOffset",
+			       le32_to_cpu(packet->data_offset),
+			       "DataLength",
+			       le32_to_cpu(packet->data_length));
+
+	ret = smbdirect_connection_post_send_io(sc, batch, msg);
+	if (ret)
+		goto err;
+
+	/*
+	 * From here msg is owned by the batch
+	 * and we should not free it explicitly.
+	 */
+
+	if (batch == &_batch) {
+		ret = smbdirect_connection_send_batch_flush(sc, batch, true);
+		if (ret)
+			goto flush_failed;
+	}
+
+	return data_length;
+err:
+	smbdirect_connection_free_send_io(msg);
+flush_failed:
+alloc_failed:
+	atomic_inc(&sc->send_io.credits.count);
+credit_failed:
+	atomic_inc(&sc->send_io.lcredits.count);
+lcredit_failed:
+	atomic_add(batch->credit, &sc->send_io.bcredits.count);
+	batch->credit = 0;
+bcredit_failed:
+	return ret;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_send_single_iter);
+
+int smbdirect_connection_send_wait_zero_pending(struct smbdirect_socket *sc)
+{
+	/*
+	 * As an optimization, we don't wait for individual I/O to finish
+	 * before sending the next one.
+	 * Send them all and wait for the pending send count to reach 0;
+	 * that means all the I/Os have been sent out and we are good to
+	 * return.
+	 */
+
+	wait_event(sc->send_io.pending.zero_wait_queue,
+		   atomic_read(&sc->send_io.pending.count) == 0 ||
+		   sc->status != SMBDIRECT_SOCKET_CONNECTED);
+	if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
+		smbdirect_log_write(sc, SMBDIRECT_LOG_ERR,
+				    "status=%s first_error=%1pe => %1pe\n",
+				    smbdirect_socket_status_string(sc->status),
+				    SMBDIRECT_DEBUG_ERR_PTR(sc->first_error),
+				    SMBDIRECT_DEBUG_ERR_PTR(-ENOTCONN));
+		return -ENOTCONN;
+	}
+
+	return 0;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_send_wait_zero_pending);
+
+int smbdirect_connection_send_iter(struct smbdirect_socket *sc,
+				   struct iov_iter *iter,
+				   unsigned int flags,
+				   bool need_invalidate,
+				   unsigned int remote_key)
+{
+	const struct smbdirect_socket_parameters *sp = &sc->parameters;
+	struct smbdirect_send_batch batch;
+	int total_count = iov_iter_count(iter);
+	int ret;
+	int error = 0;
+	__be32 hdr;
+
+	if (WARN_ONCE(flags, "unexpected flags=0x%x\n", flags))
+		return -EINVAL; /* no flags support for now */
+
+	if (WARN_ON_ONCE(iov_iter_rw(iter) != ITER_SOURCE))
+		return -EINVAL; /* It's a bug in upper layer to get there */
+
+	if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
+		smbdirect_log_write(sc, SMBDIRECT_LOG_INFO,
+				    "status=%s first_error=%1pe => %1pe\n",
+				    smbdirect_socket_status_string(sc->status),
+				    SMBDIRECT_DEBUG_ERR_PTR(sc->first_error),
+				    SMBDIRECT_DEBUG_ERR_PTR(-ENOTCONN));
+		return -ENOTCONN;
+	}
+
+	/*
+	 * For now we expect the iter to have the full
+	 * message, including a 4 byte length header.
+	 */
+	if (iov_iter_count(iter) <= 4)
+		return -EINVAL;
+	if (!copy_from_iter_full(&hdr, sizeof(hdr), iter))
+		return -EFAULT;
+	if (iov_iter_count(iter) != be32_to_cpu(hdr))
+		return -EINVAL;
+
+	/*
+	 * The size must fit into the negotiated
+	 * fragmented send size.
+	 */
+	if (iov_iter_count(iter) > sp->max_fragmented_send_size)
+		return -EMSGSIZE;
+
+	smbdirect_log_write(sc, SMBDIRECT_LOG_INFO,
+			    "Sending (RDMA): length=%zu\n",
+			    iov_iter_count(iter));
+
+	smbdirect_connection_send_batch_init(&batch, need_invalidate, remote_key);
+	while (iov_iter_count(iter)) {
+		ret = smbdirect_connection_send_single_iter(sc,
+							    &batch,
+							    iter,
+							    flags,
+							    iov_iter_count(iter));
+		if (unlikely(ret < 0)) {
+			error = ret;
+			break;
+		}
+	}
+
+	ret = smbdirect_connection_send_batch_flush(sc, &batch, true);
+	if (unlikely(ret && !error))
+		error = ret;
+
+	/*
+	 * As an optimization, we don't wait for individual I/O to finish
+	 * before sending the next one.
+	 * Send them all and wait for the pending send count to reach 0;
+	 * that means all the I/Os have been sent out and we are good to
+	 * return.
+	 */
+
+	ret = smbdirect_connection_send_wait_zero_pending(sc);
+	if (unlikely(ret && !error))
+		error = ret;
+
+	if (unlikely(error))
+		return error;
+
+	return total_count;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_send_iter);
+
+static void smbdirect_connection_send_io_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct smbdirect_send_io *msg =
+		container_of(wc->wr_cqe, struct smbdirect_send_io, cqe);
+	struct smbdirect_socket *sc = msg->socket;
+	struct smbdirect_send_io *sibling, *next;
+	int lcredits = 0;
+
+	smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_INFO,
status='%s (%d)', opcode=%d\n", + ib_wc_status_msg(wc->status), wc->status, wc->opcode); + + if (unlikely(!(msg->wr.send_flags & IB_SEND_SIGNALED))) { + /* + * This happens when smbdirect_send_io is a sibling + * before the final message, it is signaled on + * error anyway, so we need to skip + * smbdirect_connection_free_send_io here, + * otherwise is will destroy the memory + * of the siblings too, which will cause + * use after free problems for the others + * triggered from ib_drain_qp(). + */ + if (wc->status != IB_WC_SUCCESS) + goto skip_free; + + /* + * This should not happen! + * But we better just close the + * connection... + */ + smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_ERR, + "unexpected send completion wc->status=%s (%d) wc->opcode=%d\n", + ib_wc_status_msg(wc->status), wc->status, wc->opcode); + smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED); + return; + } + + /* + * Free possible siblings and then the main send_io + */ + list_for_each_entry_safe(sibling, next, &msg->sibling_list, sibling_list) { + list_del_init(&sibling->sibling_list); + smbdirect_connection_free_send_io(sibling); + lcredits += 1; + } + /* Note this frees wc->wr_cqe, but not wc */ + smbdirect_connection_free_send_io(msg); + lcredits += 1; + + if (unlikely(wc->status != IB_WC_SUCCESS || WARN_ON_ONCE(wc->opcode != IB_WC_SEND))) { +skip_free: + if (wc->status != IB_WC_WR_FLUSH_ERR) + smbdirect_log_rdma_send(sc, SMBDIRECT_LOG_ERR, + "wc->status=%s (%d) wc->opcode=%d\n", + ib_wc_status_msg(wc->status), wc->status, wc->opcode); + smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED); + return; + } + + atomic_add(lcredits, &sc->send_io.lcredits.count); + wake_up(&sc->send_io.lcredits.wait_queue); + + if (atomic_dec_and_test(&sc->send_io.pending.count)) + wake_up(&sc->send_io.pending.zero_wait_queue); +} + +static void smbdirect_connection_send_immediate_work(struct work_struct *work) +{ + struct smbdirect_socket *sc = + container_of(work, struct smbdirect_socket, idle.immediate_work); + int ret; + + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) + return; + + smbdirect_log_keep_alive(sc, SMBDIRECT_LOG_INFO, + "send an empty message\n"); + sc->statistics.send_empty++; + ret = smbdirect_connection_send_single_iter(sc, NULL, NULL, 0, 0); + if (ret < 0) { + smbdirect_log_write(sc, SMBDIRECT_LOG_ERR, + "smbdirect_connection_send_single_iter ret=%1pe\n", + SMBDIRECT_DEBUG_ERR_PTR(ret)); + smbdirect_socket_schedule_cleanup(sc, ret); + } +} + +int smbdirect_connection_post_recv_io(struct smbdirect_recv_io *msg) +{ + struct smbdirect_socket *sc = msg->socket; + const struct smbdirect_socket_parameters *sp = &sc->parameters; + struct ib_recv_wr recv_wr = { + .wr_cqe = &msg->cqe, + .sg_list = &msg->sge, + .num_sge = 1, + }; + int ret; + + if (unlikely(sc->first_error)) + return sc->first_error; + + msg->sge.addr = ib_dma_map_single(sc->ib.dev, + msg->packet, + sp->max_recv_size, + DMA_FROM_DEVICE); + ret = ib_dma_mapping_error(sc->ib.dev, msg->sge.addr); + if (ret) + return ret; + + msg->sge.length = sp->max_recv_size; + msg->sge.lkey = sc->ib.pd->local_dma_lkey; + + ret = ib_post_recv(sc->ib.qp, &recv_wr, NULL); + if (ret) { + smbdirect_log_rdma_recv(sc, SMBDIRECT_LOG_ERR, + "ib_post_recv failed ret=%d (%1pe)\n", + ret, SMBDIRECT_DEBUG_ERR_PTR(ret)); + ib_dma_unmap_single(sc->ib.dev, + msg->sge.addr, + msg->sge.length, + DMA_FROM_DEVICE); + msg->sge.length = 0; + smbdirect_socket_schedule_cleanup(sc, ret); + } + + return ret; +} + +void smbdirect_connection_recv_io_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct 
+	struct smbdirect_recv_io *recv_io =
+		container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe);
+	struct smbdirect_socket *sc = recv_io->socket;
+	const struct smbdirect_socket_parameters *sp = &sc->parameters;
+	struct smbdirect_data_transfer *data_transfer;
+	int current_recv_credits;
+	u16 old_recv_credit_target;
+	u16 credits_requested;
+	u16 credits_granted;
+	u16 flags;
+	u32 data_offset;
+	u32 data_length;
+	u32 remaining_data_length;
+
+	if (unlikely(wc->status != IB_WC_SUCCESS || WARN_ON_ONCE(wc->opcode != IB_WC_RECV))) {
+		if (wc->status != IB_WC_WR_FLUSH_ERR)
+			smbdirect_log_rdma_recv(sc, SMBDIRECT_LOG_ERR,
+						"wc->status=%s (%d) wc->opcode=%d\n",
+						ib_wc_status_msg(wc->status), wc->status, wc->opcode);
+		goto error;
+	}
+
+	smbdirect_log_rdma_recv(sc, SMBDIRECT_LOG_INFO,
+				"recv_io=0x%p type=%d wc status=%s wc opcode %d byte_len=%d pkey_index=%u\n",
+				recv_io, sc->recv_io.expected,
+				ib_wc_status_msg(wc->status), wc->opcode,
+				wc->byte_len, wc->pkey_index);
+
+	/*
+	 * Reset timer to the keepalive interval in
+	 * order to trigger our next keepalive message.
+	 */
+	sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
+	mod_delayed_work(sc->workqueues.idle, &sc->idle.timer_work,
+			 msecs_to_jiffies(sp->keepalive_interval_msec));
+
+	ib_dma_sync_single_for_cpu(sc->ib.dev,
+				   recv_io->sge.addr,
+				   recv_io->sge.length,
+				   DMA_FROM_DEVICE);
+
+	if (unlikely(wc->byte_len <
+		     offsetof(struct smbdirect_data_transfer, padding))) {
+		smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+					 "wc->byte_len=%u < %zu\n",
+					 wc->byte_len,
+					 offsetof(struct smbdirect_data_transfer, padding));
+		goto error;
+	}
+
+	data_transfer = (struct smbdirect_data_transfer *)recv_io->packet;
+	credits_requested = le16_to_cpu(data_transfer->credits_requested);
+	credits_granted = le16_to_cpu(data_transfer->credits_granted);
+	flags = le16_to_cpu(data_transfer->flags);
+	remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length);
+	data_offset = le32_to_cpu(data_transfer->data_offset);
+	data_length = le32_to_cpu(data_transfer->data_length);
+
+	smbdirect_log_incoming(sc, SMBDIRECT_LOG_INFO,
+			       "DataIn: %s=%u, %s=%u, %s=0x%x, %s=%u, %s=%u, %s=%u\n",
+			       "CreditsRequested",
+			       credits_requested,
+			       "CreditsGranted",
+			       credits_granted,
+			       "Flags",
+			       flags,
+			       "RemainingDataLength",
+			       remaining_data_length,
+			       "DataOffset",
+			       data_offset,
+			       "DataLength",
+			       data_length);
+
+	if (unlikely(credits_requested == 0)) {
+		smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+					 "invalid: credits_requested == 0\n");
+		goto error;
+	}
+
+	if (unlikely(data_offset % 8 != 0)) {
+		smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+					 "invalid: data_offset=%u (0x%x) not aligned to 8\n",
+					 data_offset, data_offset);
+		goto error;
+	}
+
+	if (unlikely(wc->byte_len < data_offset ||
+		     (u64)wc->byte_len < (u64)data_offset + data_length)) {
+		smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+					 "wc->byte_len=%u < data_offset=%u + data_length=%u\n",
+					 wc->byte_len, data_offset, data_length);
+		goto error;
+	}
+
+	if (unlikely(remaining_data_length > sp->max_fragmented_recv_size ||
+		     data_length > sp->max_fragmented_recv_size ||
+		     (u64)remaining_data_length + (u64)data_length > (u64)sp->max_fragmented_recv_size)) {
+		smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+					 "remaining_data_length=%u + data_length=%u > max_fragmented=%u\n",
+					 remaining_data_length, data_length, sp->max_fragmented_recv_size);
+		goto error;
+	}
+
+	if (data_length) {
+		if (sc->recv_io.reassembly.full_packet_received)
+			recv_io->first_segment = true;
+
+		if (remaining_data_length)
			sc->recv_io.reassembly.full_packet_received = false;
+		else
+			sc->recv_io.reassembly.full_packet_received = true;
+	}
+
+	atomic_dec(&sc->recv_io.posted.count);
+	current_recv_credits = atomic_dec_return(&sc->recv_io.credits.count);
+
+	/*
+	 * We take the value from the peer, which is checked to be higher than 0,
+	 * but we limit it to the max value we support in order to keep
+	 * the main logic simple.
+	 */
+	old_recv_credit_target = sc->recv_io.credits.target;
+	sc->recv_io.credits.target = credits_requested;
+	sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target,
+					   sp->recv_credit_max);
+	if (credits_granted) {
+		atomic_add(credits_granted, &sc->send_io.credits.count);
+		/*
+		 * We have new send credits granted from the remote peer.
+		 * If any sender is waiting for credits, unblock it.
+		 */
+		wake_up(&sc->send_io.credits.wait_queue);
+	}
+
+	/* Send an immediate response right away if requested */
+	if (flags & SMBDIRECT_FLAG_RESPONSE_REQUESTED) {
+		smbdirect_log_keep_alive(sc, SMBDIRECT_LOG_INFO,
+					 "schedule send of immediate response\n");
+		queue_work(sc->workqueues.immediate, &sc->idle.immediate_work);
+	}
+
+	/*
+	 * If this is a packet with data payload place the data in
+	 * reassembly queue and wake up the reading thread
+	 */
+	if (data_length) {
+		if (current_recv_credits <= (sc->recv_io.credits.target / 4) ||
+		    sc->recv_io.credits.target > old_recv_credit_target)
+			queue_work(sc->workqueues.refill, &sc->recv_io.posted.refill_work);
+
+		smbdirect_connection_reassembly_append_recv_io(sc, recv_io, data_length);
+		wake_up(&sc->recv_io.reassembly.wait_queue);
+	} else
+		smbdirect_connection_put_recv_io(recv_io);
+
+	return;
+
+error:
+	/*
+	 * Make sure smbdirect_connection_put_recv_io() does not
+	 * start recv_io.posted.refill_work.
+	 */
+	disable_work(&sc->recv_io.posted.refill_work);
+	smbdirect_connection_put_recv_io(recv_io);
+	smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+}
+
+int smbdirect_connection_recv_io_refill(struct smbdirect_socket *sc)
+{
+	int missing;
+	int posted = 0;
+
+	if (unlikely(sc->first_error))
+		return sc->first_error;
+
+	/*
+	 * Find out how many smbdirect_recv_io buffers we should post.
+	 *
+	 * Note that sc->recv_io.credits.target is the value
+	 * from the peer and it can in theory change over time,
+	 * but it is forced to be at least 1 and at max
+	 * sp->recv_credit_max.
+	 *
+	 * So it can happen that missing will be lower than 0,
+	 * which means the peer has recently lowered its desired
+	 * target, while we already granted a higher number of credits.
+	 *
+	 * Note 'posted' is the number of smbdirect_recv_io buffers
+	 * posted within this function, while sc->recv_io.posted.count
+	 * is the overall value of posted smbdirect_recv_io buffers.
+	 *
+	 * We try to post as many buffers as are missing, but
+	 * this is limited if a lot of smbdirect_recv_io buffers
+	 * are still in the sc->recv_io.reassembly.list instead of
+	 * the sc->recv_io.free.list.
+	 */
+	missing = (int)sc->recv_io.credits.target - atomic_read(&sc->recv_io.posted.count);
+	while (posted < missing) {
+		struct smbdirect_recv_io *recv_io;
+		int ret;
+
+		/*
+		 * It's ok if smbdirect_connection_get_recv_io()
+		 * returns NULL, it means smbdirect_recv_io structures
+		 * are still in the reassembly.list.
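+		 * Once the reader drains them,
+		 * smbdirect_connection_put_recv_io() will queue
+		 * refill_work again.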
+		 */
+		recv_io = smbdirect_connection_get_recv_io(sc);
+		if (!recv_io)
+			break;
+
+		recv_io->first_segment = false;
+
+		ret = smbdirect_connection_post_recv_io(recv_io);
+		if (ret) {
+			smbdirect_log_rdma_recv(sc, SMBDIRECT_LOG_ERR,
+						"smbdirect_connection_post_recv_io failed rc=%d (%1pe)\n",
+						ret, SMBDIRECT_DEBUG_ERR_PTR(ret));
+			smbdirect_connection_put_recv_io(recv_io);
+			return ret;
+		}
+
+		atomic_inc(&sc->recv_io.posted.count);
+		posted += 1;
+	}
+
+	/* If nothing was posted we're done */
+	if (posted == 0)
+		return 0;
+
+	atomic_add(posted, &sc->recv_io.credits.available);
+
+	/*
+	 * If a sender holding the last batch credit is waiting
+	 * for credits it can grant, we need to wake it up.
+	 */
+	if (atomic_read(&sc->send_io.bcredits.count) == 0 &&
+	    atomic_read(&sc->send_io.credits.count) == 0)
+		wake_up(&sc->send_io.credits.wait_queue);
+
+	/*
+	 * If we posted at least one smbdirect_recv_io buffer,
+	 * we need to inform the peer about it and grant
+	 * additional credits.
+	 *
+	 * However there is one case where we don't want to
+	 * do that.
+	 *
+	 * If only a single credit was missing before
+	 * reaching the requested target, we should not
+	 * post an immediate send, as that would cause
+	 * endless ping-pong once a keepalive exchange
+	 * is started.
+	 *
+	 * However if sc->recv_io.credits.target is only 1,
+	 * the peer has no credit left and we need to
+	 * grant the credit anyway.
+	 */
+	if (missing == 1 && sc->recv_io.credits.target != 1)
+		return 0;
+
+	return posted;
+}
+
+static void smbdirect_connection_recv_io_refill_work(struct work_struct *work)
+{
+	struct smbdirect_socket *sc =
+		container_of(work, struct smbdirect_socket, recv_io.posted.refill_work);
+	int posted;
+
+	posted = smbdirect_connection_recv_io_refill(sc);
+	if (unlikely(posted < 0)) {
+		smbdirect_socket_schedule_cleanup(sc, posted);
+		return;
+	}
+	if (posted > 0) {
+		smbdirect_log_keep_alive(sc, SMBDIRECT_LOG_INFO,
+					 "schedule send of an empty message\n");
+		queue_work(sc->workqueues.immediate, &sc->idle.immediate_work);
+	}
+}
+
+int smbdirect_connection_recvmsg(struct smbdirect_socket *sc,
+				 struct msghdr *msg,
+				 unsigned int flags)
+{
+	struct smbdirect_recv_io *response;
+	struct smbdirect_data_transfer *data_transfer;
+	size_t size = iov_iter_count(&msg->msg_iter);
+	int to_copy, to_read, data_read, offset;
+	u32 data_length, remaining_data_length, data_offset;
+	int ret;
+
+	if (WARN_ONCE(flags, "unexpected flags=0x%x\n", flags))
+		return -EINVAL; /* no flags support for now */
+
+	if (WARN_ON_ONCE(iov_iter_rw(&msg->msg_iter) != ITER_DEST))
+		return -EINVAL; /* It's a bug in upper layer to get there */
+
+again:
+	if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
+		smbdirect_log_read(sc, SMBDIRECT_LOG_INFO,
+				   "status=%s first_error=%1pe => %1pe\n",
+				   smbdirect_socket_status_string(sc->status),
+				   SMBDIRECT_DEBUG_ERR_PTR(sc->first_error),
+				   SMBDIRECT_DEBUG_ERR_PTR(-ENOTCONN));
+		return -ENOTCONN;
+	}
+
+	/*
+	 * No need to hold the reassembly queue lock all the time as we are
+	 * the only one reading from the front of the queue.
+	 * The transport may add more entries to the back of the queue
+	 * at the same time.
+	 */
+	smbdirect_log_read(sc, SMBDIRECT_LOG_INFO,
+			   "size=%zd sc->recv_io.reassembly.data_length=%d\n",
+			   size, sc->recv_io.reassembly.data_length);
+	if (sc->recv_io.reassembly.data_length >= size) {
+		int queue_length;
+		int queue_removed = 0;
+		unsigned long flags;
+
+		/*
+		 * Need to make sure reassembly_data_length is read before
+		 * reading reassembly_queue_length and calling
+		 * smbdirect_connection_reassembly_first_recv_io. This call
+		 * is lock free, as we never read the end of the queue,
+		 * which is being updated in SOFTIRQ context as more data
+		 * is received.
+		 */
+		virt_rmb();
+		queue_length = sc->recv_io.reassembly.queue_length;
+		data_read = 0;
+		to_read = size;
+		offset = sc->recv_io.reassembly.first_entry_offset;
+		while (data_read < size) {
+			response = smbdirect_connection_reassembly_first_recv_io(sc);
+			data_transfer = (void *)response->packet;
+			data_length = le32_to_cpu(data_transfer->data_length);
+			remaining_data_length =
+				le32_to_cpu(
+					data_transfer->remaining_data_length);
+			data_offset = le32_to_cpu(data_transfer->data_offset);
+
+			/*
+			 * The upper layer expects RFC1002 length at the
+			 * beginning of the payload. Return it to indicate
+			 * the total length of the packet. This minimizes the
+			 * change to the upper layer packet processing logic.
+			 * This will eventually be removed when an
+			 * intermediate transport layer is added.
+			 */
+			if (response->first_segment && size == 4) {
+				unsigned int rfc1002_len =
+					data_length + remaining_data_length;
+				__be32 rfc1002_hdr = cpu_to_be32(rfc1002_len);
+
+				if (copy_to_iter(&rfc1002_hdr, sizeof(rfc1002_hdr),
+						 &msg->msg_iter) != sizeof(rfc1002_hdr))
+					return -EFAULT;
+				data_read = 4;
+				response->first_segment = false;
+				smbdirect_log_read(sc, SMBDIRECT_LOG_INFO,
+						   "returning rfc1002 length %d\n",
+						   rfc1002_len);
+				goto read_rfc1002_done;
+			}
+
+			to_copy = min_t(int, data_length - offset, to_read);
+			if (copy_to_iter((u8 *)data_transfer + data_offset + offset,
+					 to_copy, &msg->msg_iter) != to_copy)
+				return -EFAULT;
+
+			/* move on to the next buffer?
*/ + if (to_copy == data_length - offset) { + queue_length--; + /* + * No need to lock if we are not at the + * end of the queue + */ + if (queue_length) + list_del(&response->list); + else { + spin_lock_irqsave( + &sc->recv_io.reassembly.lock, flags); + list_del(&response->list); + spin_unlock_irqrestore( + &sc->recv_io.reassembly.lock, flags); + } + queue_removed++; + sc->statistics.dequeue_reassembly_queue++; + smbdirect_connection_put_recv_io(response); + offset = 0; + smbdirect_log_read(sc, SMBDIRECT_LOG_INFO, + "smbdirect_connection_put_recv_io offset=0\n"); + } else + offset += to_copy; + + to_read -= to_copy; + data_read += to_copy; + + smbdirect_log_read(sc, SMBDIRECT_LOG_INFO, + "memcpy %d bytes len-ofs=%u => todo=%u done=%u ofs=%u\n", + to_copy, data_length - offset, + to_read, data_read, offset); + } + + spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); + sc->recv_io.reassembly.data_length -= data_read; + sc->recv_io.reassembly.queue_length -= queue_removed; + spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); + + sc->recv_io.reassembly.first_entry_offset = offset; + smbdirect_log_read(sc, SMBDIRECT_LOG_INFO, + "returning data_read=%d reassembly_length=%d first_ofs=%u\n", + data_read, sc->recv_io.reassembly.data_length, + sc->recv_io.reassembly.first_entry_offset); +read_rfc1002_done: + return data_read; + } + + smbdirect_log_read(sc, SMBDIRECT_LOG_INFO, + "wait_event on more data\n"); + ret = wait_event_interruptible(sc->recv_io.reassembly.wait_queue, + sc->recv_io.reassembly.data_length >= size || + sc->status != SMBDIRECT_SOCKET_CONNECTED); + /* Don't return any data if interrupted */ + if (ret) + return ret; + + goto again; +} +__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_recvmsg); + +static bool smbdirect_map_sges_single_page(struct smbdirect_map_sges *state, + struct page *page, size_t off, size_t len) +{ + struct ib_sge *sge; + u64 addr; + + if (state->num_sge >= state->max_sge) + return false; + + addr = ib_dma_map_page(state->device, page, + off, len, state->direction); + if (ib_dma_mapping_error(state->device, addr)) + return false; + + sge = &state->sge[state->num_sge++]; + sge->addr = addr; + sge->length = len; + sge->lkey = state->local_dma_lkey; + + return true; +} + +/* + * Extract page fragments from a BVEC-class iterator and add them to an ib_sge + * list. The pages are not pinned. + */ +static ssize_t smbdirect_map_sges_from_bvec(struct iov_iter *iter, + struct smbdirect_map_sges *state, + ssize_t maxsize) +{ + const struct bio_vec *bv = iter->bvec; + unsigned long start = iter->iov_offset; + unsigned int i; + ssize_t ret = 0; + + for (i = 0; i < iter->nr_segs; i++) { + size_t off, len; + bool ok; + + len = bv[i].bv_len; + if (start >= len) { + start -= len; + continue; + } + + len = min_t(size_t, maxsize, len - start); + off = bv[i].bv_offset + start; + + ok = smbdirect_map_sges_single_page(state, + bv[i].bv_page, + off, + len); + if (!ok) + return -EIO; + + ret += len; + maxsize -= len; + if (state->num_sge >= state->max_sge || maxsize <= 0) + break; + start = 0; + } + + if (ret > 0) + iov_iter_advance(iter, ret); + return ret; +} + +/* + * Extract fragments from a KVEC-class iterator and add them to an ib_sge list. + * This can deal with vmalloc'd buffers as well as kmalloc'd or static buffers. + * The pages are not pinned. 
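+ * vmalloc'd memory is not physically contiguous, so it is translated
+ * page by page with vmalloc_to_page(), while kmalloc'd and static
+ * buffers can use virt_to_page() directly.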
+ */ +static ssize_t smbdirect_map_sges_from_kvec(struct iov_iter *iter, + struct smbdirect_map_sges *state, + ssize_t maxsize) +{ + const struct kvec *kv = iter->kvec; + unsigned long start = iter->iov_offset; + unsigned int i; + ssize_t ret = 0; + + for (i = 0; i < iter->nr_segs; i++) { + struct page *page; + unsigned long kaddr; + size_t off, len, seg; + + len = kv[i].iov_len; + if (start >= len) { + start -= len; + continue; + } + + kaddr = (unsigned long)kv[i].iov_base + start; + off = kaddr & ~PAGE_MASK; + len = min_t(size_t, maxsize, len - start); + kaddr &= PAGE_MASK; + + maxsize -= len; + do { + bool ok; + + seg = min_t(size_t, len, PAGE_SIZE - off); + + if (is_vmalloc_or_module_addr((void *)kaddr)) + page = vmalloc_to_page((void *)kaddr); + else + page = virt_to_page((void *)kaddr); + + ok = smbdirect_map_sges_single_page(state, page, off, seg); + if (!ok) + return -EIO; + + ret += seg; + len -= seg; + kaddr += PAGE_SIZE; + off = 0; + } while (len > 0 && state->num_sge < state->max_sge); + + if (state->num_sge >= state->max_sge || maxsize <= 0) + break; + start = 0; + } + + if (ret > 0) + iov_iter_advance(iter, ret); + return ret; +} + +/* + * Extract folio fragments from a FOLIOQ-class iterator and add them to an + * ib_sge list. The folios are not pinned. + */ +static ssize_t smbdirect_map_sges_from_folioq(struct iov_iter *iter, + struct smbdirect_map_sges *state, + ssize_t maxsize) +{ + const struct folio_queue *folioq = iter->folioq; + unsigned int slot = iter->folioq_slot; + ssize_t ret = 0; + size_t offset = iter->iov_offset; + + if (WARN_ON_ONCE(!folioq)) + return -EIO; + + if (slot >= folioq_nr_slots(folioq)) { + folioq = folioq->next; + if (WARN_ON_ONCE(!folioq)) + return -EIO; + slot = 0; + } + + do { + struct folio *folio = folioq_folio(folioq, slot); + size_t fsize = folioq_folio_size(folioq, slot); + + if (offset < fsize) { + size_t part = umin(maxsize, fsize - offset); + bool ok; + + ok = smbdirect_map_sges_single_page(state, + folio_page(folio, 0), + offset, + part); + if (!ok) + return -EIO; + + offset += part; + ret += part; + maxsize -= part; + } + + if (offset >= fsize) { + offset = 0; + slot++; + if (slot >= folioq_nr_slots(folioq)) { + if (!folioq->next) { + WARN_ON_ONCE(ret < iter->count); + break; + } + folioq = folioq->next; + slot = 0; + } + } + } while (state->num_sge < state->max_sge && maxsize > 0); + + iter->folioq = folioq; + iter->folioq_slot = slot; + iter->iov_offset = offset; + iter->count -= ret; + return ret; +} + +/* + * Extract page fragments from up to the given amount of the source iterator + * and build up an ib_sge list that refers to all of those bits. The ib_sge list + * is appended to, up to the maximum number of elements set in the parameter + * block. + * + * The extracted page fragments are not pinned or ref'd in any way; if an + * IOVEC/UBUF-type iterator is to be used, it should be converted to a + * BVEC-type iterator and the pages pinned, ref'd or otherwise held in some + * way. 
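+ *
+ * On failure, any fragments that this call already added to the list
+ * are unmapped again, so the caller only needs to clean up the
+ * elements that were present before.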
+ */
+static ssize_t smbdirect_map_sges_from_iter(struct iov_iter *iter, size_t len,
+					    struct smbdirect_map_sges *state)
+{
+	ssize_t ret;
+	size_t before = state->num_sge;
+
+	if (WARN_ON_ONCE(iov_iter_rw(iter) != ITER_SOURCE))
+		return -EIO;
+
+	switch (iov_iter_type(iter)) {
+	case ITER_BVEC:
+		ret = smbdirect_map_sges_from_bvec(iter, state, len);
+		break;
+	case ITER_KVEC:
+		ret = smbdirect_map_sges_from_kvec(iter, state, len);
+		break;
+	case ITER_FOLIOQ:
+		ret = smbdirect_map_sges_from_folioq(iter, state, len);
+		break;
+	default:
+		WARN_ONCE(1, "iov_iter_type[%u]\n", iov_iter_type(iter));
+		return -EIO;
+	}
+
+	if (ret < 0) {
+		while (state->num_sge > before) {
+			struct ib_sge *sge = &state->sge[--state->num_sge];
+
+			ib_dma_unmap_page(state->device,
+					  sge->addr,
+					  sge->length,
+					  state->direction);
+		}
+	}
+
+	return ret;
+}
diff --git a/fs/smb/common/smbdirect/smbdirect_debug.c b/fs/smb/common/smbdirect/smbdirect_debug.c
new file mode 100644
index 000000000000..d8664fd7f71a
--- /dev/null
+++ b/fs/smb/common/smbdirect/smbdirect_debug.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2017, Microsoft Corporation.
+ * Copyright (c) 2025, Stefan Metzmacher
+ */
+
+#include "smbdirect_internal.h"
+#include <linux/seq_file.h>
+
+void smbdirect_connection_legacy_debug_proc_show(struct smbdirect_socket *sc,
+						 unsigned int rdma_readwrite_threshold,
+						 struct seq_file *m)
+{
+	const struct smbdirect_socket_parameters *sp;
+
+	if (!sc)
+		return;
+	sp = &sc->parameters;
+
+	seq_puts(m, "\n");
+	seq_printf(m, "SMBDirect protocol version: 0x%x ",
+		   SMBDIRECT_V1);
+	seq_printf(m, "transport status: %s (%u)",
+		   smbdirect_socket_status_string(sc->status),
+		   sc->status);
+
+	seq_puts(m, "\n");
+	seq_printf(m, "Conn receive_credit_max: %u ",
+		   sp->recv_credit_max);
+	seq_printf(m, "send_credit_target: %u max_send_size: %u",
+		   sp->send_credit_target,
+		   sp->max_send_size);
+
+	seq_puts(m, "\n");
+	seq_printf(m, "Conn max_fragmented_recv_size: %u ",
+		   sp->max_fragmented_recv_size);
+	seq_printf(m, "max_fragmented_send_size: %u max_receive_size:%u",
+		   sp->max_fragmented_send_size,
+		   sp->max_recv_size);
+
+	seq_puts(m, "\n");
+	seq_printf(m, "Conn keep_alive_interval: %u ",
+		   sp->keepalive_interval_msec * 1000);
+	seq_printf(m, "max_readwrite_size: %u rdma_readwrite_threshold: %u",
+		   sp->max_read_write_size,
+		   rdma_readwrite_threshold);
+
+	seq_puts(m, "\n");
+	seq_printf(m, "Debug count_get_receive_buffer: %llu ",
+		   sc->statistics.get_receive_buffer);
+	seq_printf(m, "count_put_receive_buffer: %llu count_send_empty: %llu",
+		   sc->statistics.put_receive_buffer,
+		   sc->statistics.send_empty);
+
+	seq_puts(m, "\n");
+	seq_printf(m, "Read Queue count_enqueue_reassembly_queue: %llu ",
+		   sc->statistics.enqueue_reassembly_queue);
+	seq_printf(m, "count_dequeue_reassembly_queue: %llu ",
+		   sc->statistics.dequeue_reassembly_queue);
+	seq_printf(m, "reassembly_data_length: %u ",
+		   sc->recv_io.reassembly.data_length);
+	seq_printf(m, "reassembly_queue_length: %u",
+		   sc->recv_io.reassembly.queue_length);
+
+	seq_puts(m, "\n");
+	seq_printf(m, "Current Credits send_credits: %u ",
+		   atomic_read(&sc->send_io.credits.count));
+	seq_printf(m, "receive_credits: %u receive_credit_target: %u",
+		   atomic_read(&sc->recv_io.credits.count),
+		   sc->recv_io.credits.target);
+
+	seq_puts(m, "\n");
+	seq_printf(m, "Pending send_pending: %u ",
+		   atomic_read(&sc->send_io.pending.count));
+
+	seq_puts(m, "\n");
+	seq_printf(m, "MR responder_resources: %u ",
+		   sp->responder_resources);
+	seq_printf(m, "max_frmr_depth: %u 
mr_type: 0x%x", + sp->max_frmr_depth, + sc->mr_io.type); + + seq_puts(m, "\n"); + seq_printf(m, "MR mr_ready_count: %u mr_used_count: %u", + atomic_read(&sc->mr_io.ready.count), + atomic_read(&sc->mr_io.used.count)); +} +__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_legacy_debug_proc_show); diff --git a/fs/smb/common/smbdirect/smbdirect_devices.c b/fs/smb/common/smbdirect/smbdirect_devices.c new file mode 100644 index 000000000000..aaab99e9c045 --- /dev/null +++ b/fs/smb/common/smbdirect/smbdirect_devices.c @@ -0,0 +1,277 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017, Microsoft Corporation. + * Copyright (C) 2018, LG Electronics. + * Copyright (c) 2025 Stefan Metzmacher + */ + +#include "smbdirect_internal.h" + +static u8 smbdirect_ib_device_rdma_capable_node_type(struct ib_device *ib_dev) +{ + if (!smbdirect_frwr_is_supported(&ib_dev->attrs)) + return RDMA_NODE_UNSPECIFIED; + + switch (ib_dev->node_type) { + case RDMA_NODE_IB_CA: /* Infiniband, RoCE v1 and v2 */ + case RDMA_NODE_RNIC: /* iWarp */ + return ib_dev->node_type; + } + + return RDMA_NODE_UNSPECIFIED; +} + +static int smbdirect_ib_client_add(struct ib_device *ib_dev) +{ + u8 node_type = smbdirect_ib_device_rdma_capable_node_type(ib_dev); + struct smbdirect_device *sdev; + const char *node_str; + const char *action; + u32 pidx; + + switch (node_type) { + case RDMA_NODE_IB_CA: + node_str = "IB_CA"; + action = "added"; + break; + case RDMA_NODE_RNIC: + node_str = "RNIC"; + action = "added"; + break; + case RDMA_NODE_UNSPECIFIED: + node_str = "UNSPECIFIED"; + action = "ignored"; + break; + default: + node_str = "UNKNOWN"; + action = "ignored"; + node_type = RDMA_NODE_UNSPECIFIED; + break; + } + + pr_info("ib_dev[%.*s]: %s: %s %s=%u %s=0x%llx %s=0x%llx %s=0x%llx\n", + IB_DEVICE_NAME_MAX, + ib_dev->name, + action, + node_str, + "max_fast_reg_page_list_len", + ib_dev->attrs.max_fast_reg_page_list_len, + "device_cap_flags", + ib_dev->attrs.device_cap_flags, + "kernel_cap_flags", + ib_dev->attrs.kernel_cap_flags, + "page_size_cap", + ib_dev->attrs.page_size_cap); + + if (node_type == RDMA_NODE_UNSPECIFIED) + return 0; + + pr_info("ib_dev[%.*s]: %s=%u %s=%u %s=%u %s=%u %s=%u %s=%u %s=%u %s=%u %s=%u\n", + IB_DEVICE_NAME_MAX, + ib_dev->name, + "num_ports", + rdma_end_port(ib_dev), + "max_qp_rd_atom", + ib_dev->attrs.max_qp_rd_atom, + "max_qp_init_rd_atom", + ib_dev->attrs.max_qp_init_rd_atom, + "max_sgl_rd", + ib_dev->attrs.max_sgl_rd, + "max_sge_rd", + ib_dev->attrs.max_sge_rd, + "max_cqe", + ib_dev->attrs.max_cqe, + "max_qp_wr", + ib_dev->attrs.max_qp_wr, + "max_send_sge", + ib_dev->attrs.max_send_sge, + "max_recv_sge", + ib_dev->attrs.max_recv_sge); + + rdma_for_each_port(ib_dev, pidx) { + const struct ib_port_immutable *ib_pi = + ib_port_immutable_read(ib_dev, pidx); + u32 core_cap_flags = ib_pi ? 
ib_pi->core_cap_flags : 0; + + pr_info("ib_dev[%.*s]PORT[%u]: %s=%u %s=%u %s=%u %s=%u %s=%u %s=0x%x\n", + IB_DEVICE_NAME_MAX, + ib_dev->name, + pidx, + "iwarp", + rdma_protocol_iwarp(ib_dev, pidx), + "ib", + rdma_protocol_ib(ib_dev, pidx), + "roce", + rdma_protocol_roce(ib_dev, pidx), + "v1", + rdma_protocol_roce_eth_encap(ib_dev, pidx), + "v2", + rdma_protocol_roce_udp_encap(ib_dev, pidx), + "core_cap_flags", + core_cap_flags); + } + + sdev = kzalloc_obj(*sdev); + if (!sdev) + return -ENOMEM; + sdev->ib_dev = ib_dev; + snprintf(sdev->ib_name, ARRAY_SIZE(sdev->ib_name), "%.*s", + IB_DEVICE_NAME_MAX, ib_dev->name); + + write_lock(&smbdirect_globals.devices.lock); + list_add(&sdev->list, &smbdirect_globals.devices.list); + write_unlock(&smbdirect_globals.devices.lock); + + return 0; +} + +static void smbdirect_ib_client_remove(struct ib_device *ib_dev, void *client_data) +{ + struct smbdirect_device *sdev, *tmp; + + write_lock(&smbdirect_globals.devices.lock); + list_for_each_entry_safe(sdev, tmp, &smbdirect_globals.devices.list, list) { + if (sdev->ib_dev == ib_dev) { + list_del(&sdev->list); + pr_info("ib_dev[%.*s] removed\n", + IB_DEVICE_NAME_MAX, sdev->ib_name); + kfree(sdev); + break; + } + } + write_unlock(&smbdirect_globals.devices.lock); +} + +static void smbdirect_ib_client_rename(struct ib_device *ib_dev, void *client_data) +{ + struct smbdirect_device *sdev; + + write_lock(&smbdirect_globals.devices.lock); + list_for_each_entry(sdev, &smbdirect_globals.devices.list, list) { + if (sdev->ib_dev == ib_dev) { + pr_info("ib_dev[%.*s] renamed to [%.*s]\n", + IB_DEVICE_NAME_MAX, sdev->ib_name, + IB_DEVICE_NAME_MAX, ib_dev->name); + snprintf(sdev->ib_name, ARRAY_SIZE(sdev->ib_name), "%.*s", + IB_DEVICE_NAME_MAX, ib_dev->name); + break; + } + } + write_unlock(&smbdirect_globals.devices.lock); +} + +static struct ib_client smbdirect_ib_client = { + .name = "smbdirect_ib_client", + .add = smbdirect_ib_client_add, + .remove = smbdirect_ib_client_remove, + .rename = smbdirect_ib_client_rename, +}; + +static u8 smbdirect_netdev_find_rdma_capable_node_type(struct net_device *netdev) +{ + struct smbdirect_device *sdev; + u8 node_type = RDMA_NODE_UNSPECIFIED; + + read_lock(&smbdirect_globals.devices.lock); + list_for_each_entry(sdev, &smbdirect_globals.devices.list, list) { + u32 pi; + + rdma_for_each_port(sdev->ib_dev, pi) { + struct net_device *ndev; + + ndev = ib_device_get_netdev(sdev->ib_dev, pi); + if (!ndev) + continue; + + if (ndev == netdev) { + dev_put(ndev); + node_type = sdev->ib_dev->node_type; + goto out; + } + dev_put(ndev); + } + } +out: + read_unlock(&smbdirect_globals.devices.lock); + + if (node_type == RDMA_NODE_UNSPECIFIED) { + struct ib_device *ibdev; + + ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_UNKNOWN); + if (ibdev) { + node_type = smbdirect_ib_device_rdma_capable_node_type(ibdev); + ib_device_put(ibdev); + } + } + + return node_type; +} + +/* + * Returns RDMA_NODE_UNSPECIFIED when the netdev has + * no support for smbdirect capable rdma. 
+ *
+ * Otherwise RDMA_NODE_RNIC is returned for iWarp devices
+ * and RDMA_NODE_IB_CA for Infiniband and RoCE (v1 and v2).
+ */
+u8 smbdirect_netdev_rdma_capable_node_type(struct net_device *netdev)
+{
+	struct net_device *lower_dev;
+	struct list_head *iter;
+	u8 node_type = RDMA_NODE_UNSPECIFIED;
+
+	node_type = smbdirect_netdev_find_rdma_capable_node_type(netdev);
+	if (node_type != RDMA_NODE_UNSPECIFIED)
+		return node_type;
+
+	/* check if netdev is bridge or VLAN */
+	if (netif_is_bridge_master(netdev) || netdev->priv_flags & IFF_802_1Q_VLAN)
+		netdev_for_each_lower_dev(netdev, lower_dev, iter) {
+			node_type = smbdirect_netdev_find_rdma_capable_node_type(lower_dev);
+			if (node_type != RDMA_NODE_UNSPECIFIED)
+				return node_type;
+		}
+
+	/* check if netdev is IPoIB safely without layer violation */
+	if (netdev->type == ARPHRD_INFINIBAND)
+		return RDMA_NODE_IB_CA;
+
+	return RDMA_NODE_UNSPECIFIED;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_netdev_rdma_capable_node_type);
+
+__init int smbdirect_devices_init(void)
+{
+	int ret;
+
+	rwlock_init(&smbdirect_globals.devices.lock);
+	INIT_LIST_HEAD(&smbdirect_globals.devices.list);
+
+	ret = ib_register_client(&smbdirect_ib_client);
+	if (ret) {
+		pr_crit("failed to ib_register_client: %d %1pe\n",
+			ret, SMBDIRECT_DEBUG_ERR_PTR(ret));
+		return ret;
+	}
+
+	return 0;
+}
+
+__exit void smbdirect_devices_exit(void)
+{
+	struct smbdirect_device *sdev, *tmp;
+
+	/*
+	 * On exit we just clean up, so that
+	 * smbdirect_ib_client_remove() won't
+	 * print removals of devices.
+	 */
+	write_lock(&smbdirect_globals.devices.lock);
+	list_for_each_entry_safe(sdev, tmp, &smbdirect_globals.devices.list, list) {
+		list_del(&sdev->list);
+		kfree(sdev);
+	}
+	write_unlock(&smbdirect_globals.devices.lock);
+
+	ib_unregister_client(&smbdirect_ib_client);
+}
diff --git a/fs/smb/common/smbdirect/smbdirect_internal.h b/fs/smb/common/smbdirect/smbdirect_internal.h
new file mode 100644
index 000000000000..30a1b8643657
--- /dev/null
+++ b/fs/smb/common/smbdirect/smbdirect_internal.h
@@ -0,0 +1,141 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2025, Stefan Metzmacher
+ */
+
+#ifndef __FS_SMB_COMMON_SMBDIRECT_INTERNAL_H__
+#define __FS_SMB_COMMON_SMBDIRECT_INTERNAL_H__
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "smbdirect.h"
+#include "smbdirect_pdu.h"
+#include "smbdirect_public.h"
+
+#include <rdma/rdma_cm.h>
+
+struct smbdirect_module_state {
+	struct mutex mutex;
+
+	struct {
+		struct workqueue_struct *accept;
+		struct workqueue_struct *connect;
+		struct workqueue_struct *idle;
+		struct workqueue_struct *refill;
+		struct workqueue_struct *immediate;
+		struct workqueue_struct *cleanup;
+	} workqueues;
+
+	struct {
+		rwlock_t lock;
+		struct list_head list;
+	} devices;
+};
+
+extern struct smbdirect_module_state smbdirect_globals;
+
+#include "smbdirect_socket.h"
+
+struct smbdirect_device {
+	struct list_head list;
+	struct ib_device *ib_dev;
+	/*
+	 * copy of ib_dev->name,
+	 * in order to print renames
+	 */
+	char ib_name[IB_DEVICE_NAME_MAX];
+};
+
+int smbdirect_socket_init_new(struct net *net, struct smbdirect_socket *sc);
+
+int smbdirect_socket_init_accepting(struct rdma_cm_id *id, struct smbdirect_socket *sc);
+
+void __smbdirect_socket_schedule_cleanup(struct smbdirect_socket *sc,
+					 const char *macro_name,
+					 unsigned int lvl,
+					 const char *func,
+					 unsigned int line,
+					 int error,
+					 enum smbdirect_socket_status *force_status);
+#define smbdirect_socket_schedule_cleanup(__sc, __error) \
+	__smbdirect_socket_schedule_cleanup(__sc, \
"smbdirect_socket_schedule_cleanup", SMBDIRECT_LOG_ERR, \ + __func__, __LINE__, __error, NULL) +#define smbdirect_socket_schedule_cleanup_lvl(__sc, __lvl, __error) \ + __smbdirect_socket_schedule_cleanup(__sc, \ + "smbdirect_socket_schedule_cleanup_lvl", __lvl, \ + __func__, __LINE__, __error, NULL) +#define smbdirect_socket_schedule_cleanup_status(__sc, __lvl, __error, __status) do { \ + enum smbdirect_socket_status __force_status = __status; \ + __smbdirect_socket_schedule_cleanup(__sc, \ + "smbdirect_socket_schedule_cleanup_status", __lvl, \ + __func__, __LINE__, __error, &__force_status); \ +} while (0) + +void smbdirect_socket_destroy_sync(struct smbdirect_socket *sc); + +int smbdirect_socket_wait_for_credits(struct smbdirect_socket *sc, + enum smbdirect_socket_status expected_status, + int unexpected_errno, + wait_queue_head_t *waitq, + atomic_t *total_credits, + int needed); + +void smbdirect_connection_rdma_established(struct smbdirect_socket *sc); + +void smbdirect_connection_negotiation_done(struct smbdirect_socket *sc); + +int smbdirect_connection_create_qp(struct smbdirect_socket *sc); + +void smbdirect_connection_destroy_qp(struct smbdirect_socket *sc); + +int smbdirect_connection_create_mem_pools(struct smbdirect_socket *sc); + +void smbdirect_connection_destroy_mem_pools(struct smbdirect_socket *sc); + +struct smbdirect_send_io *smbdirect_connection_alloc_send_io(struct smbdirect_socket *sc); + +void smbdirect_connection_free_send_io(struct smbdirect_send_io *msg); + +struct smbdirect_recv_io *smbdirect_connection_get_recv_io(struct smbdirect_socket *sc); + +void smbdirect_connection_put_recv_io(struct smbdirect_recv_io *msg); + +void smbdirect_connection_reassembly_append_recv_io(struct smbdirect_socket *sc, + struct smbdirect_recv_io *msg, + u32 data_length); + +struct smbdirect_recv_io * +smbdirect_connection_reassembly_first_recv_io(struct smbdirect_socket *sc); + +void smbdirect_connection_negotiate_rdma_resources(struct smbdirect_socket *sc, + u8 peer_initiator_depth, + u8 peer_responder_resources, + const struct rdma_conn_param *param); + +void smbdirect_connection_idle_timer_work(struct work_struct *work); + +u16 smbdirect_connection_grant_recv_credits(struct smbdirect_socket *sc); + +int smbdirect_connection_post_send_wr(struct smbdirect_socket *sc, + struct ib_send_wr *wr); + +int smbdirect_connection_post_recv_io(struct smbdirect_recv_io *msg); + +void smbdirect_connection_recv_io_done(struct ib_cq *cq, struct ib_wc *wc); + +int smbdirect_connection_recv_io_refill(struct smbdirect_socket *sc); + +int smbdirect_connection_create_mr_list(struct smbdirect_socket *sc); + +void smbdirect_connection_destroy_mr_list(struct smbdirect_socket *sc); + +int smbdirect_accept_connect_request(struct smbdirect_socket *sc, + const struct rdma_conn_param *param); + +void smbdirect_accept_negotiate_finish(struct smbdirect_socket *sc, u32 ntstatus); + +__init int smbdirect_devices_init(void); +__exit void smbdirect_devices_exit(void); + +#endif /* __FS_SMB_COMMON_SMBDIRECT_INTERNAL_H__ */ diff --git a/fs/smb/common/smbdirect/smbdirect_listen.c b/fs/smb/common/smbdirect/smbdirect_listen.c new file mode 100644 index 000000000000..05c7902e7020 --- /dev/null +++ b/fs/smb/common/smbdirect/smbdirect_listen.c @@ -0,0 +1,308 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017, Microsoft Corporation. + * Copyright (C) 2018, LG Electronics. 
+ * Copyright (c) 2025, Stefan Metzmacher + */ + +#include "smbdirect_internal.h" + +static int smbdirect_listen_rdma_event_handler(struct rdma_cm_id *id, + struct rdma_cm_event *event); + +int smbdirect_socket_listen(struct smbdirect_socket *sc, int backlog) +{ + int ret; + + if (backlog < 0) + return -EINVAL; + if (!backlog) + backlog = 1; /* use 1 as default for now */ + + if (sc->first_error) + return -EINVAL; + + if (sc->status != SMBDIRECT_SOCKET_CREATED) + return -EINVAL; + + if (WARN_ON_ONCE(!sc->rdma.cm_id)) + return -EINVAL; + + if (sc->rdma.cm_id->device) + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO, + "try to listen on addr: %pISpsfc dev: %.*s\n", + &sc->rdma.cm_id->route.addr.src_addr, + IB_DEVICE_NAME_MAX, + sc->rdma.cm_id->device->name); + else + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO, + "try to listen on addr: %pISpsfc\n", + &sc->rdma.cm_id->route.addr.src_addr); + + /* already checked above */ + WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED); + sc->status = SMBDIRECT_SOCKET_LISTENING; + sc->rdma.expected_event = RDMA_CM_EVENT_CONNECT_REQUEST; + rdma_lock_handler(sc->rdma.cm_id); + sc->rdma.cm_id->event_handler = smbdirect_listen_rdma_event_handler; + rdma_unlock_handler(sc->rdma.cm_id); + + ret = rdma_listen(sc->rdma.cm_id, backlog); + if (ret) { + sc->first_error = ret; + sc->status = SMBDIRECT_SOCKET_DISCONNECTED; + if (sc->rdma.cm_id->device) + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO, + "listening failed %1pe on addr: %pISpsfc dev: %.*s\n", + SMBDIRECT_DEBUG_ERR_PTR(ret), + &sc->rdma.cm_id->route.addr.src_addr, + IB_DEVICE_NAME_MAX, + sc->rdma.cm_id->device->name); + else + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO, + "listening failed %1pe on addr: %pISpsfc\n", + SMBDIRECT_DEBUG_ERR_PTR(ret), + &sc->rdma.cm_id->route.addr.src_addr); + return ret; + } + + /* + * This is a value > 0, checked above, + * so we are able to use sc->listen.backlog == -1, + * as indication that the socket was never + * a listener. + */ + sc->listen.backlog = backlog; + + if (sc->rdma.cm_id->device) + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO, + "listening on addr: %pISpsfc dev: %.*s\n", + &sc->rdma.cm_id->route.addr.src_addr, + IB_DEVICE_NAME_MAX, + sc->rdma.cm_id->device->name); + else + smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO, + "listening on addr: %pISpsfc\n", + &sc->rdma.cm_id->route.addr.src_addr); + + /* + * The rest happens async via smbdirect_listen_rdma_event_handler() + */ + return 0; +} +__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_listen); + +static int smbdirect_new_rdma_event_handler(struct rdma_cm_id *new_id, + struct rdma_cm_event *event) +{ + int ret = -ESTALE; + + /* + * This should be replaced before any real work + * starts! So it should never be called! + */ + + if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) + ret = -ENETDOWN; + if (IS_ERR(SMBDIRECT_DEBUG_ERR_PTR(event->status))) + ret = event->status; + WARN_ONCE(1, + "%s should not be called! 
event=%s status=%d => ret=%1pe\n", + __func__, + rdma_event_msg(event->event), + event->status, + SMBDIRECT_DEBUG_ERR_PTR(ret)); + return -ESTALE; +} + +static int smbdirect_listen_connect_request(struct smbdirect_socket *lsc, + struct rdma_cm_id *new_id, + const struct rdma_cm_event *event); + +static int smbdirect_listen_rdma_event_handler(struct rdma_cm_id *new_id, + struct rdma_cm_event *event) +{ + struct smbdirect_socket *lsc = new_id->context; + int ret; + + if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) { + new_id->context = NULL; + new_id->event_handler = smbdirect_new_rdma_event_handler; + } else + new_id = NULL; + + /* + * cma_cm_event_handler() has + * lockdep_assert_held(&id_priv->handler_mutex); + * + * Mutexes are not allowed in interrupts, + * and we rely on not being in an interrupt here, + * as we might sleep. + */ + WARN_ON_ONCE(in_interrupt()); + + if (event->status || event->event != lsc->rdma.expected_event) { + ret = -ECONNABORTED; + + if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) + ret = -ENETDOWN; + if (IS_ERR(SMBDIRECT_DEBUG_ERR_PTR(event->status))) + ret = event->status; + + smbdirect_log_rdma_event(lsc, SMBDIRECT_LOG_ERR, + "%s (first_error=%1pe, expected=%s) => event=%s status=%d => ret=%1pe\n", + smbdirect_socket_status_string(lsc->status), + SMBDIRECT_DEBUG_ERR_PTR(lsc->first_error), + rdma_event_msg(lsc->rdma.expected_event), + rdma_event_msg(event->event), + event->status, + SMBDIRECT_DEBUG_ERR_PTR(ret)); + + /* + * In case of error return it and let the caller + * destroy new_id + */ + smbdirect_socket_schedule_cleanup(lsc, ret); + return new_id ? ret : 0; + } + + smbdirect_log_rdma_event(lsc, SMBDIRECT_LOG_INFO, + "%s (first_error=%1pe) event=%s\n", + smbdirect_socket_status_string(lsc->status), + SMBDIRECT_DEBUG_ERR_PTR(lsc->first_error), + rdma_event_msg(event->event)); + + /* + * In case of error return it and let the caller + * destroy new_id + */ + if (lsc->first_error) + return new_id ? 
lsc->first_error : 0; + + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + WARN_ON_ONCE(lsc->status != SMBDIRECT_SOCKET_LISTENING); + + /* + * In case of error return it and let the caller + * destroy new_id + */ + ret = smbdirect_listen_connect_request(lsc, new_id, event); + if (ret) + return ret; + return 0; + + default: + break; + } + + /* + * This is an internal error + */ + WARN_ON_ONCE(lsc->rdma.expected_event != RDMA_CM_EVENT_CONNECT_REQUEST); + smbdirect_socket_schedule_cleanup(lsc, -EINVAL); + return 0; +} + +static int smbdirect_listen_connect_request(struct smbdirect_socket *lsc, + struct rdma_cm_id *new_id, + const struct rdma_cm_event *event) +{ + const struct smbdirect_socket_parameters *lsp = &lsc->parameters; + struct smbdirect_socket *nsc; + unsigned long flags; + size_t backlog = max_t(size_t, 1, lsc->listen.backlog); + size_t psockets; + size_t rsockets; + int ret; + + if (!smbdirect_frwr_is_supported(&new_id->device->attrs)) { + smbdirect_log_rdma_event(lsc, SMBDIRECT_LOG_ERR, + "Fast Registration Work Requests (FRWR) is not supported device %.*s\n", + IB_DEVICE_NAME_MAX, + new_id->device->name); + smbdirect_log_rdma_event(lsc, SMBDIRECT_LOG_ERR, + "Device capability flags = %llx max_fast_reg_page_list_len = %u\n", + new_id->device->attrs.device_cap_flags, + new_id->device->attrs.max_fast_reg_page_list_len); + return -EPROTONOSUPPORT; + } + + if (lsp->flags & SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB && + !rdma_ib_or_roce(new_id->device, new_id->port_num)) { + smbdirect_log_rdma_event(lsc, SMBDIRECT_LOG_ERR, + "Not IB: device: %.*s IW:%u local: %pISpsfc remote: %pISpsfc\n", + IB_DEVICE_NAME_MAX, + new_id->device->name, + rdma_protocol_iwarp(new_id->device, new_id->port_num), + &new_id->route.addr.src_addr, + &new_id->route.addr.dst_addr); + return -EPROTONOSUPPORT; + } + if (lsp->flags & SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW && + !rdma_protocol_iwarp(new_id->device, new_id->port_num)) { + smbdirect_log_rdma_event(lsc, SMBDIRECT_LOG_ERR, + "Not IW: device: %.*s IB:%u local: %pISpsfc remote: %pISpsfc\n", + IB_DEVICE_NAME_MAX, + new_id->device->name, + rdma_ib_or_roce(new_id->device, new_id->port_num), + &new_id->route.addr.src_addr, + &new_id->route.addr.dst_addr); + return -EPROTONOSUPPORT; + } + + spin_lock_irqsave(&lsc->listen.lock, flags); + psockets = list_count_nodes(&lsc->listen.pending); + rsockets = list_count_nodes(&lsc->listen.ready); + spin_unlock_irqrestore(&lsc->listen.lock, flags); + + if (psockets > backlog || + rsockets > backlog || + (psockets + rsockets) > backlog) { + smbdirect_log_rdma_event(lsc, SMBDIRECT_LOG_ERR, + "Backlog[%d][%zu] full pending[%zu] ready[%zu]\n", + lsc->listen.backlog, backlog, psockets, rsockets); + return -EBUSY; + } + + ret = smbdirect_socket_create_accepting(new_id, &nsc); + if (ret) + goto socket_init_failed; + + nsc->logging = lsc->logging; + ret = smbdirect_socket_set_initial_parameters(nsc, &lsc->parameters); + if (ret) + goto set_params_failed; + ret = smbdirect_socket_set_kernel_settings(nsc, + lsc->ib.poll_ctx, + lsc->send_io.mem.gfp_mask); + if (ret) + goto set_settings_failed; + + spin_lock_irqsave(&lsc->listen.lock, flags); + list_add_tail(&nsc->accept.list, &lsc->listen.pending); + nsc->accept.listener = lsc; + spin_unlock_irqrestore(&lsc->listen.lock, flags); + + ret = smbdirect_accept_connect_request(nsc, &event->param.conn); + if (ret) + goto accept_connect_failed; + + return 0; + +accept_connect_failed: + spin_lock_irqsave(&lsc->listen.lock, flags); + list_del_init(&nsc->accept.list); + nsc->accept.listener = 
NULL; + spin_unlock_irqrestore(&lsc->listen.lock, flags); +set_settings_failed: +set_params_failed: + /* + * The caller will destroy new_id + */ + nsc->ib.dev = NULL; + nsc->rdma.cm_id = NULL; + smbdirect_socket_release(nsc); +socket_init_failed: + return ret; +} diff --git a/fs/smb/common/smbdirect/smbdirect_main.c b/fs/smb/common/smbdirect/smbdirect_main.c new file mode 100644 index 000000000000..fe6e8d93c34c --- /dev/null +++ b/fs/smb/common/smbdirect/smbdirect_main.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2025, Stefan Metzmacher + */ + +#include "smbdirect_internal.h" +#include + +struct smbdirect_module_state smbdirect_globals = { + .mutex = __MUTEX_INITIALIZER(smbdirect_globals.mutex), +}; + +static __init int smbdirect_module_init(void) +{ + int ret = -ENOMEM; + + pr_notice("subsystem loading...\n"); + mutex_lock(&smbdirect_globals.mutex); + + smbdirect_globals.workqueues.accept = alloc_workqueue("smbdirect-accept", + WQ_SYSFS | + WQ_PERCPU | + WQ_POWER_EFFICIENT, + 0); + if (smbdirect_globals.workqueues.accept == NULL) + goto alloc_accept_wq_failed; + + smbdirect_globals.workqueues.connect = alloc_workqueue("smbdirect-connect", + WQ_SYSFS | + WQ_PERCPU | + WQ_POWER_EFFICIENT, + 0); + if (smbdirect_globals.workqueues.connect == NULL) + goto alloc_connect_wq_failed; + + smbdirect_globals.workqueues.idle = alloc_workqueue("smbdirect-idle", + WQ_SYSFS | + WQ_PERCPU | + WQ_POWER_EFFICIENT, + 0); + if (smbdirect_globals.workqueues.idle == NULL) + goto alloc_idle_wq_failed; + + smbdirect_globals.workqueues.refill = alloc_workqueue("smbdirect-refill", + WQ_HIGHPRI | + WQ_SYSFS | + WQ_PERCPU | + WQ_POWER_EFFICIENT, + 0); + if (smbdirect_globals.workqueues.refill == NULL) + goto alloc_refill_wq_failed; + + smbdirect_globals.workqueues.immediate = alloc_workqueue("smbdirect-immediate", + WQ_HIGHPRI | + WQ_SYSFS | + WQ_PERCPU | + WQ_POWER_EFFICIENT, + 0); + if (smbdirect_globals.workqueues.immediate == NULL) + goto alloc_immediate_wq_failed; + + smbdirect_globals.workqueues.cleanup = alloc_workqueue("smbdirect-cleanup", + WQ_MEM_RECLAIM | + WQ_HIGHPRI | + WQ_SYSFS | + WQ_PERCPU | + WQ_POWER_EFFICIENT, + 0); + if (smbdirect_globals.workqueues.cleanup == NULL) + goto alloc_cleanup_wq_failed; + + ret = smbdirect_devices_init(); + if (ret) + goto devices_init_failed; + + mutex_unlock(&smbdirect_globals.mutex); + pr_notice("subsystem loaded\n"); + return 0; + +devices_init_failed: + destroy_workqueue(smbdirect_globals.workqueues.cleanup); +alloc_cleanup_wq_failed: + destroy_workqueue(smbdirect_globals.workqueues.immediate); +alloc_immediate_wq_failed: + destroy_workqueue(smbdirect_globals.workqueues.refill); +alloc_refill_wq_failed: + destroy_workqueue(smbdirect_globals.workqueues.idle); +alloc_idle_wq_failed: + destroy_workqueue(smbdirect_globals.workqueues.connect); +alloc_connect_wq_failed: + destroy_workqueue(smbdirect_globals.workqueues.accept); +alloc_accept_wq_failed: + mutex_unlock(&smbdirect_globals.mutex); + pr_crit("failed to loaded: %d (%1pe)\n", + ret, SMBDIRECT_DEBUG_ERR_PTR(ret)); + return ret; +} + +static __exit void smbdirect_module_exit(void) +{ + pr_notice("subsystem unloading...\n"); + mutex_lock(&smbdirect_globals.mutex); + + smbdirect_devices_exit(); + + destroy_workqueue(smbdirect_globals.workqueues.accept); + destroy_workqueue(smbdirect_globals.workqueues.connect); + destroy_workqueue(smbdirect_globals.workqueues.idle); + destroy_workqueue(smbdirect_globals.workqueues.refill); + 
destroy_workqueue(smbdirect_globals.workqueues.immediate); + destroy_workqueue(smbdirect_globals.workqueues.cleanup); + + mutex_unlock(&smbdirect_globals.mutex); + pr_notice("subsystem unloaded\n"); +} + +module_init(smbdirect_module_init); +module_exit(smbdirect_module_exit); + +MODULE_DESCRIPTION("smbdirect subsystem"); +MODULE_LICENSE("GPL"); diff --git a/fs/smb/common/smbdirect/smbdirect_mr.c b/fs/smb/common/smbdirect/smbdirect_mr.c new file mode 100644 index 000000000000..fa9be8089925 --- /dev/null +++ b/fs/smb/common/smbdirect/smbdirect_mr.c @@ -0,0 +1,493 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017, Microsoft Corporation. + * Copyright (c) 2025, Stefan Metzmacher + */ + +#include "smbdirect_internal.h" + +/* + * Allocate MRs used for RDMA read/write + * The number of MRs will not exceed hardware capability in responder_resources + * All MRs are kept in mr_list. The MR can be recovered after it's used + * Recovery is done in smbd_mr_recovery_work. The content of list entry changes + * as MRs are used and recovered for I/O, but the list links will not change + */ +int smbdirect_connection_create_mr_list(struct smbdirect_socket *sc) +{ + const struct smbdirect_socket_parameters *sp = &sc->parameters; + struct smbdirect_mr_io *mr; + int ret; + u32 i; + + if (sp->responder_resources == 0) { + smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR, + "responder_resources negotiated as 0\n"); + return -EINVAL; + } + + /* Allocate more MRs (2x) than hardware responder_resources */ + for (i = 0; i < sp->responder_resources * 2; i++) { + mr = kzalloc_obj(*mr); + if (!mr) { + ret = -ENOMEM; + goto kzalloc_mr_failed; + } + + kref_init(&mr->kref); + mutex_init(&mr->mutex); + + mr->mr = ib_alloc_mr(sc->ib.pd, + sc->mr_io.type, + sp->max_frmr_depth); + if (IS_ERR(mr->mr)) { + ret = PTR_ERR(mr->mr); + smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR, + "ib_alloc_mr failed ret=%d (%1pe) type=0x%x max_frmr_depth=%u\n", + ret, SMBDIRECT_DEBUG_ERR_PTR(ret), + sc->mr_io.type, sp->max_frmr_depth); + goto ib_alloc_mr_failed; + } + mr->sgt.sgl = kzalloc_objs(struct scatterlist, sp->max_frmr_depth); + if (!mr->sgt.sgl) { + ret = -ENOMEM; + smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR, + "failed to allocate sgl, max_frmr_depth=%u\n", + sp->max_frmr_depth); + goto kcalloc_sgl_failed; + } + mr->state = SMBDIRECT_MR_READY; + mr->socket = sc; + + list_add_tail(&mr->list, &sc->mr_io.all.list); + atomic_inc(&sc->mr_io.ready.count); + } + + return 0; + +kcalloc_sgl_failed: + ib_dereg_mr(mr->mr); +ib_alloc_mr_failed: + mutex_destroy(&mr->mutex); + kfree(mr); +kzalloc_mr_failed: + smbdirect_connection_destroy_mr_list(sc); + return ret; +} + +static void smbdirect_mr_io_disable_locked(struct smbdirect_mr_io *mr) +{ + struct smbdirect_socket *sc = mr->socket; + + lockdep_assert_held(&mr->mutex); + + if (mr->state == SMBDIRECT_MR_DISABLED) + return; + + if (mr->mr) + ib_dereg_mr(mr->mr); + if (mr->sgt.nents) + ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir); + kfree(mr->sgt.sgl); + + mr->mr = NULL; + mr->sgt.sgl = NULL; + mr->sgt.nents = 0; + + mr->state = SMBDIRECT_MR_DISABLED; +} + +static void smbdirect_mr_io_free_locked(struct kref *kref) +{ + struct smbdirect_mr_io *mr = + container_of(kref, struct smbdirect_mr_io, kref); + + lockdep_assert_held(&mr->mutex); + + /* + * smbdirect_mr_io_disable_locked() should already be called! 
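+	 *
+	 * The release convention used throughout this file is
+	 * roughly (a sketch of the pattern, not additional API):
+	 *
+	 *	mutex_lock(&mr->mutex);
+	 *	smbdirect_mr_io_disable_locked(mr);
+	 *	if (!kref_put(&mr->kref, smbdirect_mr_io_free_locked))
+	 *		mutex_unlock(&mr->mutex);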
+	 */
+	if (WARN_ON_ONCE(mr->state != SMBDIRECT_MR_DISABLED))
+		smbdirect_mr_io_disable_locked(mr);
+
+	mutex_unlock(&mr->mutex);
+	mutex_destroy(&mr->mutex);
+	kfree(mr);
+}
+
+void smbdirect_connection_destroy_mr_list(struct smbdirect_socket *sc)
+{
+	struct smbdirect_mr_io *mr, *tmp;
+	LIST_HEAD(all_list);
+	unsigned long flags;
+
+	spin_lock_irqsave(&sc->mr_io.all.lock, flags);
+	list_splice_tail_init(&sc->mr_io.all.list, &all_list);
+	spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
+
+	list_for_each_entry_safe(mr, tmp, &all_list, list) {
+		mutex_lock(&mr->mutex);
+
+		smbdirect_mr_io_disable_locked(mr);
+		list_del(&mr->list);
+		mr->socket = NULL;
+
+		/*
+		 * No kref_put_mutex() as it's already locked.
+		 *
+		 * If kref_put() returned 1,
+		 * smbdirect_mr_io_free_locked() ran,
+		 * the mutex was unlocked and mr is gone.
+		 *
+		 * If kref_put() returned 0 we know that
+		 * smbdirect_mr_io_free_locked() didn't
+		 * run. Not by us nor by anyone else, as we
+		 * still hold the mutex, so we need to unlock.
+		 *
+		 * If the mr is still registered it will
+		 * be dangling (detached from the connection),
+		 * waiting for smbdirect_connection_deregister_mr_io()
+		 * to be called in order to free the memory.
+		 */
+		if (!kref_put(&mr->kref, smbdirect_mr_io_free_locked))
+			mutex_unlock(&mr->mutex);
+	}
+}
+
+/*
+ * Get an MR from mr_io.all.list. This function waits until there is at
+ * least one MR available in the list. Several CPUs may be issuing I/O
+ * and trying to get an MR at the same time; sc->mr_io.all.lock is used
+ * to protect against this.
+ */
+static struct smbdirect_mr_io *
+smbdirect_connection_get_mr_io(struct smbdirect_socket *sc)
+{
+	struct smbdirect_mr_io *mr;
+	unsigned long flags;
+	int ret;
+
+again:
+	ret = wait_event_interruptible(sc->mr_io.ready.wait_queue,
+				       atomic_read(&sc->mr_io.ready.count) ||
+				       sc->status != SMBDIRECT_SOCKET_CONNECTED);
+	if (ret) {
+		smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
+				      "wait_event_interruptible ret=%d (%1pe)\n",
+				      ret, SMBDIRECT_DEBUG_ERR_PTR(ret));
+		return NULL;
+	}
+
+	if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
+		smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
+				      "sc->status=%s sc->first_error=%1pe\n",
+				      smbdirect_socket_status_string(sc->status),
+				      SMBDIRECT_DEBUG_ERR_PTR(sc->first_error));
+		return NULL;
+	}
+
+	spin_lock_irqsave(&sc->mr_io.all.lock, flags);
+	list_for_each_entry(mr, &sc->mr_io.all.list, list) {
+		if (mr->state == SMBDIRECT_MR_READY) {
+			mr->state = SMBDIRECT_MR_REGISTERED;
+			kref_get(&mr->kref);
+			spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
+			atomic_dec(&sc->mr_io.ready.count);
+			atomic_inc(&sc->mr_io.used.count);
+			return mr;
+		}
+	}
+
+	spin_unlock_irqrestore(&sc->mr_io.all.lock, flags);
+	/*
+	 * It is possible that we fail to get an MR because other
+	 * processes may be trying to acquire one at the same time.
+	 * If this is the case, retry.
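+	 *
+	 * Note the ready count is only a hint: the wakeup, the
+	 * atomic_read() and the list scan above are not a single
+	 * atomic step, so losing the race simply means another
+	 * round through wait_event_interruptible().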
+	 */
+	goto again;
+}
+
+static void smbdirect_connection_mr_io_register_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct smbdirect_mr_io *mr =
+		container_of(wc->wr_cqe, struct smbdirect_mr_io, cqe);
+	struct smbdirect_socket *sc = mr->socket;
+
+	if (wc->status != IB_WC_SUCCESS) {
+		smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
+				      "wc->status=%s opcode=%d\n",
+				      ib_wc_status_msg(wc->status), wc->opcode);
+		smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+	}
+}
+
+static void smbdirect_connection_mr_io_local_inv_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+	struct smbdirect_mr_io *mr =
+		container_of(wc->wr_cqe, struct smbdirect_mr_io, cqe);
+	struct smbdirect_socket *sc = mr->socket;
+
+	mr->state = SMBDIRECT_MR_INVALIDATED;
+	if (wc->status != IB_WC_SUCCESS) {
+		smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
+				      "invalidate failed status=%s\n",
+				      ib_wc_status_msg(wc->status));
+		smbdirect_socket_schedule_cleanup(sc, -ECONNABORTED);
+	}
+	complete(&mr->invalidate_done);
+}
+
+/*
+ * Transcribe the pages from an iterator into an MR scatterlist.
+ */
+static int smbdirect_iter_to_sgt(struct iov_iter *iter,
+				 struct sg_table *sgt,
+				 unsigned int max_sg)
+{
+	int ret;
+
+	memset(sgt->sgl, 0, max_sg * sizeof(struct scatterlist));
+
+	ret = extract_iter_to_sg(iter, iov_iter_count(iter), sgt, max_sg, 0);
+	WARN_ON(ret < 0);
+	if (sgt->nents > 0)
+		sg_mark_end(&sgt->sgl[sgt->nents - 1]);
+
+	return ret;
+}
+
+/*
+ * Register memory for RDMA read/write
+ * iter: the buffer to register memory with
+ * writing: true if this is an RDMA write (SMB read), false for an RDMA read
+ * need_invalidate: true if this MR needs to be locally invalidated after I/O
+ * return value: the registered MR, NULL if it failed.
+ */
+struct smbdirect_mr_io *
+smbdirect_connection_register_mr_io(struct smbdirect_socket *sc,
+				    struct iov_iter *iter,
+				    bool writing,
+				    bool need_invalidate)
+{
+	const struct smbdirect_socket_parameters *sp = &sc->parameters;
+	struct smbdirect_mr_io *mr;
+	int ret, num_pages;
+	struct ib_reg_wr *reg_wr;
+
+	num_pages = iov_iter_npages(iter, sp->max_frmr_depth + 1);
+	if (num_pages > sp->max_frmr_depth) {
+		smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
+				      "num_pages=%d max_frmr_depth=%d\n",
+				      num_pages, sp->max_frmr_depth);
+		WARN_ON_ONCE(1);
+		return NULL;
+	}
+
+	mr = smbdirect_connection_get_mr_io(sc);
+	if (!mr) {
+		smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
+				      "smbdirect_connection_get_mr_io returning NULL\n");
+		return NULL;
+	}
+
+	mutex_lock(&mr->mutex);
+
+	mr->dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+	mr->need_invalidate = need_invalidate;
+	mr->sgt.nents = 0;
+	mr->sgt.orig_nents = 0;
+
+	smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_INFO,
+			      "num_pages=%u count=%zu depth=%u\n",
+			      num_pages, iov_iter_count(iter), sp->max_frmr_depth);
+	smbdirect_iter_to_sgt(iter, &mr->sgt, sp->max_frmr_depth);
+
+	ret = ib_dma_map_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
+	if (!ret) {
+		smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
+				      "ib_dma_map_sg num_pages=%u dir=%x ret=%d (%1pe)\n",
+				      num_pages, mr->dir, ret, SMBDIRECT_DEBUG_ERR_PTR(ret));
+		goto dma_map_error;
+	}
+
+	ret = ib_map_mr_sg(mr->mr, mr->sgt.sgl, mr->sgt.nents, NULL, PAGE_SIZE);
+	if (ret != mr->sgt.nents) {
+		smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
+				      "ib_map_mr_sg failed ret = %d nents = %u\n",
+				      ret, mr->sgt.nents);
+		goto map_mr_error;
+	}
+
+	ib_update_fast_reg_key(mr->mr, ib_inc_rkey(mr->mr->rkey));
+	reg_wr = &mr->wr;
+	reg_wr->wr.opcode = IB_WR_REG_MR;
+	mr->cqe.done = smbdirect_connection_mr_io_register_done;
+	reg_wr->wr.wr_cqe = &mr->cqe;
+	reg_wr->wr.num_sge = 0;
+	reg_wr->wr.send_flags = IB_SEND_SIGNALED;
+	reg_wr->mr = mr->mr;
+	reg_wr->key = mr->mr->rkey;
+	reg_wr->access = writing ?
+			 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
+			 IB_ACCESS_REMOTE_READ;
+
+	/*
+	 * There is no need to wait for completion of ib_post_send
+	 * on IB_WR_REG_MR. The hardware enforces a barrier and order
+	 * of execution on the next ib_post_send when we actually send
+	 * I/O to the remote peer
+	 */
+	ret = ib_post_send(sc->ib.qp, &reg_wr->wr, NULL);
+	if (!ret) {
+		/*
+		 * smbdirect_connection_get_mr_io() gave us a reference
+		 * via kref_get(&mr->kref), we keep that and let
+		 * the caller use smbdirect_connection_deregister_mr_io()
+		 * to remove it again.
+		 */
+		mutex_unlock(&mr->mutex);
+		return mr;
+	}
+
+	smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
+			      "ib_post_send failed ret=%d (%1pe) reg_wr->key=0x%x\n",
+			      ret, SMBDIRECT_DEBUG_ERR_PTR(ret), reg_wr->key);
+
+map_mr_error:
+	ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
+
+dma_map_error:
+	mr->sgt.nents = 0;
+	mr->state = SMBDIRECT_MR_ERROR;
+	atomic_dec(&sc->mr_io.used.count);
+
+	smbdirect_socket_schedule_cleanup(sc, ret);
+
+	/*
+	 * smbdirect_connection_get_mr_io() gave us a reference
+	 * via kref_get(&mr->kref), we need to remove it again
+	 * on error.
+	 *
+	 * No kref_put_mutex() as it's already locked.
+	 *
+	 * If kref_put() returned 1,
+	 * smbdirect_mr_io_free_locked() ran,
+	 * the mutex was unlocked and mr is gone.
+	 *
+	 * If kref_put() returned 0 we know that
+	 * smbdirect_mr_io_free_locked() didn't
+	 * run. Not by us nor by anyone else, as we
+	 * still hold the mutex, so we need to unlock.
+	 */
+	if (!kref_put(&mr->kref, smbdirect_mr_io_free_locked))
+		mutex_unlock(&mr->mutex);
+	return NULL;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_register_mr_io);
+
+void smbdirect_mr_io_fill_buffer_descriptor(struct smbdirect_mr_io *mr,
+					    struct smbdirect_buffer_descriptor_v1 *v1)
+{
+	mutex_lock(&mr->mutex);
+	if (mr->state == SMBDIRECT_MR_REGISTERED) {
+		v1->offset = cpu_to_le64(mr->mr->iova);
+		v1->token = cpu_to_le32(mr->mr->rkey);
+		v1->length = cpu_to_le32(mr->mr->length);
+	} else {
+		v1->offset = cpu_to_le64(U64_MAX);
+		v1->token = cpu_to_le32(U32_MAX);
+		v1->length = cpu_to_le32(U32_MAX);
+	}
+	mutex_unlock(&mr->mutex);
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_mr_io_fill_buffer_descriptor);
+
+/*
+ * Deregister an MR after I/O is done.
+ * This function may wait if remote invalidation is not used
+ * and we have to locally invalidate the buffer to prevent data from
+ * being modified by the remote peer after the upper layer consumes it.
+ */
+void smbdirect_connection_deregister_mr_io(struct smbdirect_mr_io *mr)
+{
+	struct smbdirect_socket *sc = mr->socket;
+	int ret = 0;
+
+lock_again:
+	mutex_lock(&mr->mutex);
+	if (mr->state == SMBDIRECT_MR_DISABLED)
+		goto put_kref;
+
+	if (sc->status != SMBDIRECT_SOCKET_CONNECTED) {
+		smbdirect_mr_io_disable_locked(mr);
+		goto put_kref;
+	}
+
+	if (mr->need_invalidate) {
+		struct ib_send_wr *wr = &mr->inv_wr;
+
+		/* Need to finish local invalidation before returning */
+		wr->opcode = IB_WR_LOCAL_INV;
+		mr->cqe.done = smbdirect_connection_mr_io_local_inv_done;
+		wr->wr_cqe = &mr->cqe;
+		wr->num_sge = 0;
+		wr->ex.invalidate_rkey = mr->mr->rkey;
+		wr->send_flags = IB_SEND_SIGNALED;
+
+		init_completion(&mr->invalidate_done);
+		ret = ib_post_send(sc->ib.qp, wr, NULL);
+		if (ret) {
+			smbdirect_log_rdma_mr(sc, SMBDIRECT_LOG_ERR,
+					      "ib_post_send failed ret=%d (%1pe)\n",
+					      ret, SMBDIRECT_DEBUG_ERR_PTR(ret));
+			smbdirect_mr_io_disable_locked(mr);
+			smbdirect_socket_schedule_cleanup(sc, ret);
+			goto done;
+		}
+
+		/*
+		 * We still hold the reference to mr
+		 * so we can unlock while waiting.
+		 */
+		mutex_unlock(&mr->mutex);
+		wait_for_completion(&mr->invalidate_done);
+		mr->need_invalidate = false;
+		goto lock_again;
+	} else
+		/*
+		 * For remote invalidation, just set it to
+		 * SMBDIRECT_MR_INVALIDATED; it is recovered
+		 * for the next use below.
+		 */
+		mr->state = SMBDIRECT_MR_INVALIDATED;
+
+	if (mr->sgt.nents) {
+		ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir);
+		mr->sgt.nents = 0;
+	}
+
+	WARN_ONCE(mr->state != SMBDIRECT_MR_INVALIDATED,
+		  "mr->state[%u] != SMBDIRECT_MR_INVALIDATED[%u]\n",
+		  mr->state, SMBDIRECT_MR_INVALIDATED);
+	mr->state = SMBDIRECT_MR_READY;
+	if (atomic_inc_return(&sc->mr_io.ready.count) == 1)
+		wake_up(&sc->mr_io.ready.wait_queue);
+
+done:
+	atomic_dec(&sc->mr_io.used.count);
+
+put_kref:
+	/*
+	 * No kref_put_mutex() as it's already locked.
+	 *
+	 * If kref_put() returned 1,
+	 * smbdirect_mr_io_free_locked() ran,
+	 * the mutex was unlocked and mr is gone.
+	 *
+	 * If kref_put() returned 0 we know that
+	 * smbdirect_mr_io_free_locked() didn't
+	 * run. Not by us nor by anyone else, as we
+	 * still hold the mutex, so we need to unlock
+	 * and keep the mr in SMBDIRECT_MR_READY or
+	 * SMBDIRECT_MR_ERROR state.
+	 */
+	if (!kref_put(&mr->kref, smbdirect_mr_io_free_locked))
+		mutex_unlock(&mr->mutex);
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_deregister_mr_io);
diff --git a/fs/smb/common/smbdirect/smbdirect_pdu.h b/fs/smb/common/smbdirect/smbdirect_pdu.h
index ae9fdb05ce23..7693ba337873 100644
--- a/fs/smb/common/smbdirect/smbdirect_pdu.h
+++ b/fs/smb/common/smbdirect/smbdirect_pdu.h
@@ -8,6 +8,10 @@
 
 #define SMBDIRECT_V1 0x0100
 
+/* SMBD minimum receive size and fragmented size, as defined in [MS-SMBD] */
+#define SMBDIRECT_MIN_RECEIVE_SIZE 128
+#define SMBDIRECT_MIN_FRAGMENTED_SIZE 131072
+
 /* SMBD negotiation request packet [MS-SMBD] 2.2.1 */
 struct smbdirect_negotiate_req {
 	__le16 min_version;
diff --git a/fs/smb/common/smbdirect/smbdirect_public.h b/fs/smb/common/smbdirect/smbdirect_public.h
new file mode 100644
index 000000000000..50088155e7c3
--- /dev/null
+++ b/fs/smb/common/smbdirect/smbdirect_public.h
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2025, Stefan Metzmacher
+ */
+
+#ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PUBLIC_H__
+#define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PUBLIC_H__
+
+struct smbdirect_buffer_descriptor_v1;
+struct smbdirect_socket_parameters;
+
+struct smbdirect_socket;
+struct smbdirect_send_batch;
+struct smbdirect_mr_io;
+
+#define __SMBDIRECT_EXPORT_SYMBOL__(__sym) EXPORT_SYMBOL_FOR_MODULES(__sym, "cifs,ksmbd")
+
+#include
+
+u8 smbdirect_netdev_rdma_capable_node_type(struct net_device *netdev);
+
+bool smbdirect_frwr_is_supported(const struct ib_device_attr *attrs);
+
+int smbdirect_socket_create_kern(struct net *net, struct smbdirect_socket **_sc);
+
+int smbdirect_socket_create_accepting(struct rdma_cm_id *id, struct smbdirect_socket **_sc);
+
+int smbdirect_socket_set_initial_parameters(struct smbdirect_socket *sc,
+					    const struct smbdirect_socket_parameters *sp);
+
+const struct smbdirect_socket_parameters *
+smbdirect_socket_get_current_parameters(struct smbdirect_socket *sc);
+
+int smbdirect_socket_set_kernel_settings(struct smbdirect_socket *sc,
+					 enum ib_poll_context poll_ctx,
+					 gfp_t gfp_mask);
+
+#define SMBDIRECT_LOG_ERR 0x0
+#define SMBDIRECT_LOG_INFO 0x1
+
+#define SMBDIRECT_LOG_OUTGOING 0x1
+#define SMBDIRECT_LOG_INCOMING 0x2
+#define SMBDIRECT_LOG_READ 0x4
+#define SMBDIRECT_LOG_WRITE 0x8
+#define SMBDIRECT_LOG_RDMA_SEND 0x10
+#define SMBDIRECT_LOG_RDMA_RECV 0x20
+#define SMBDIRECT_LOG_KEEP_ALIVE 0x40
+#define SMBDIRECT_LOG_RDMA_EVENT 0x80
+#define SMBDIRECT_LOG_RDMA_MR 0x100
+#define SMBDIRECT_LOG_RDMA_RW 0x200
+#define SMBDIRECT_LOG_NEGOTIATE 0x400
+void smbdirect_socket_set_logging(struct smbdirect_socket *sc,
+				  void *private_ptr,
+				  bool (*needed)(struct smbdirect_socket *sc,
+						 void *private_ptr,
+						 unsigned int lvl,
+						 unsigned int cls),
+				  void (*vaprintf)(struct smbdirect_socket *sc,
+						   const char *func,
+						   unsigned int line,
+						   void *private_ptr,
+						   unsigned int lvl,
+						   unsigned int cls,
+						   struct va_format *vaf));
+
+bool smbdirect_connection_is_connected(struct smbdirect_socket *sc);
+
+int smbdirect_connection_wait_for_connected(struct smbdirect_socket *sc);
+
+int smbdirect_socket_bind(struct smbdirect_socket *sc, struct sockaddr *addr);
+
+void smbdirect_socket_shutdown(struct smbdirect_socket *sc);
+
+void smbdirect_socket_release(struct smbdirect_socket *sc);
+
+int smbdirect_connection_send_batch_flush(struct smbdirect_socket *sc,
+					  struct smbdirect_send_batch *batch,
+					  bool is_last);
+
+/*
+ * This is only temporary and only needed
+ * as long as the client still needs
+ * to use smbdirect_connection_send_single_iter()
+ */
+struct smbdirect_send_batch_storage {
+	union {
+		struct list_head __msg_list;
+		__aligned_u64 __space[5];
+	};
+};
+
+struct smbdirect_send_batch *
+smbdirect_init_send_batch_storage(struct smbdirect_send_batch_storage *storage,
+				  bool need_invalidate_rkey,
+				  unsigned int remote_key);
+
+int smbdirect_connection_send_single_iter(struct smbdirect_socket *sc,
+					  struct smbdirect_send_batch *batch,
+					  struct iov_iter *iter,
+					  unsigned int flags,
+					  u32 remaining_data_length);
+
+int smbdirect_connection_send_wait_zero_pending(struct smbdirect_socket *sc);
+
+int smbdirect_connection_send_iter(struct smbdirect_socket *sc,
+				   struct iov_iter *iter,
+				   unsigned int flags,
+				   bool need_invalidate,
+				   unsigned int remote_key);
+
+int smbdirect_connection_recvmsg(struct smbdirect_socket *sc,
+				 struct msghdr *msg,
+				 unsigned int flags);
+
+int smbdirect_connect(struct smbdirect_socket *sc,
+		      const struct sockaddr *dst);
+
+int smbdirect_connect_sync(struct smbdirect_socket *sc,
+			   const struct sockaddr *dst);
+
+int smbdirect_socket_listen(struct smbdirect_socket *sc, int backlog);
+
+struct smbdirect_socket *smbdirect_socket_accept(struct smbdirect_socket *lsc,
+						 long timeo,
+						 struct proto_accept_arg *arg);
+
+int smbdirect_connection_rdma_xmit(struct smbdirect_socket *sc,
+				   void *buf, size_t buf_len,
+				   struct smbdirect_buffer_descriptor_v1 *desc,
+				   size_t desc_len,
+				   bool is_read);
+
+struct smbdirect_mr_io *
+smbdirect_connection_register_mr_io(struct smbdirect_socket *sc,
+				    struct iov_iter *iter,
+				    bool writing,
+				    bool need_invalidate);
+
+void smbdirect_mr_io_fill_buffer_descriptor(struct smbdirect_mr_io *mr,
+					    struct smbdirect_buffer_descriptor_v1 *v1);
+
+void smbdirect_connection_deregister_mr_io(struct smbdirect_mr_io *mr);
+
+void smbdirect_connection_legacy_debug_proc_show(struct smbdirect_socket *sc,
+						 unsigned int rdma_readwrite_threshold,
+						 struct seq_file *m);
+
+#endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_PUBLIC_H__ */
diff --git a/fs/smb/common/smbdirect/smbdirect_rw.c b/fs/smb/common/smbdirect/smbdirect_rw.c
new file mode 100644
index 000000000000..3b2eb8c48efc
--- /dev/null
+++ b/fs/smb/common/smbdirect/smbdirect_rw.c
@@ -0,0 +1,255 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2017, Microsoft Corporation.
+ * Copyright (C) 2018, LG Electronics.
+ * Copyright (c) 2025, Stefan Metzmacher
+ */
+
+#include "smbdirect_internal.h"
+
+static int smbdirect_connection_wait_for_rw_credits(struct smbdirect_socket *sc,
+						    int credits)
+{
+	return smbdirect_socket_wait_for_credits(sc,
+						 SMBDIRECT_SOCKET_CONNECTED,
+						 -ENOTCONN,
+						 &sc->rw_io.credits.wait_queue,
+						 &sc->rw_io.credits.count,
+						 credits);
+}
+
+static int smbdirect_connection_calc_rw_credits(struct smbdirect_socket *sc,
+						const void *buf,
+						size_t len)
+{
+	return DIV_ROUND_UP(smbdirect_get_buf_page_count(buf, len),
+			    sc->rw_io.credits.num_pages);
+}
+
+static int smbdirect_connection_rdma_get_sg_list(void *buf,
+						 size_t size,
+						 struct scatterlist *sg_list,
+						 size_t nentries)
+{
+	bool high = is_vmalloc_addr(buf);
+	struct page *page;
+	size_t offset, len;
+	int i = 0;
+
+	if (size == 0 || nentries < smbdirect_get_buf_page_count(buf, size))
+		return -EINVAL;
+
+	offset = offset_in_page(buf);
+	buf -= offset;
+	while (size > 0) {
+		len = min_t(size_t, PAGE_SIZE - offset, size);
+		if (high)
+			page = vmalloc_to_page(buf);
+		else
+			page = kmap_to_page(buf);
+
+		if (!sg_list)
+			return -EINVAL;
+		sg_set_page(sg_list, page, len, offset);
+		sg_list = sg_next(sg_list);
+
+		buf += PAGE_SIZE;
+		size -= len;
+		offset = 0;
+		i++;
+	}
+
+	return i;
+}
+
+static void smbdirect_connection_rw_io_free(struct smbdirect_rw_io *msg,
+					    enum dma_data_direction dir)
+{
+	struct smbdirect_socket *sc = msg->socket;
+
+	rdma_rw_ctx_destroy(&msg->rdma_ctx,
+			    sc->ib.qp,
+			    sc->ib.qp->port,
+			    msg->sgt.sgl,
+			    msg->sgt.nents,
+			    dir);
+	sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
+	kfree(msg);
+}
+
+static void smbdirect_connection_rdma_rw_done(struct ib_cq *cq, struct ib_wc *wc,
+					      enum dma_data_direction dir)
+{
+	struct smbdirect_rw_io *msg =
+		container_of(wc->wr_cqe, struct smbdirect_rw_io, cqe);
+	struct smbdirect_socket *sc = msg->socket;
+
+	if (wc->status != IB_WC_SUCCESS) {
+		msg->error = -EIO;
+		pr_err("read/write error. opcode = %d, status = %s(%d)\n",
+		       wc->opcode, ib_wc_status_msg(wc->status), wc->status);
+		if (wc->status != IB_WC_WR_FLUSH_ERR)
+			smbdirect_socket_schedule_cleanup(sc, msg->error);
+	}
+
+	complete(msg->completion);
+}
+
+static void smbdirect_connection_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+	smbdirect_connection_rdma_rw_done(cq, wc, DMA_FROM_DEVICE);
+}
+
+static void smbdirect_connection_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+	smbdirect_connection_rdma_rw_done(cq, wc, DMA_TO_DEVICE);
+}
+
+int smbdirect_connection_rdma_xmit(struct smbdirect_socket *sc,
+				   void *buf, size_t buf_len,
+				   struct smbdirect_buffer_descriptor_v1 *desc,
+				   size_t desc_len,
+				   bool is_read)
+{
+	const struct smbdirect_socket_parameters *sp = &sc->parameters;
+	enum dma_data_direction direction = is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+	struct smbdirect_rw_io *msg, *next_msg;
+	size_t i;
+	int ret;
+	DECLARE_COMPLETION_ONSTACK(completion);
+	struct ib_send_wr *first_wr;
+	LIST_HEAD(msg_list);
+	u8 *desc_buf;
+	int credits_needed;
+	size_t desc_buf_len, desc_num = 0;
+
+	if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
+		return -ENOTCONN;
+
+	if (buf_len > sp->max_read_write_size)
+		return -EINVAL;
+
+	/* calculate needed credits */
+	credits_needed = 0;
+	desc_buf = buf;
+	for (i = 0; i < desc_len / sizeof(*desc); i++) {
+		if (!buf_len)
+			break;
+
+		desc_buf_len = le32_to_cpu(desc[i].length);
+		if (!desc_buf_len)
+			return -EINVAL;
+
+		if (desc_buf_len > buf_len) {
+			desc_buf_len = buf_len;
+			desc[i].length = cpu_to_le32(desc_buf_len);
+			buf_len = 0;
+		}
+
+		credits_needed += smbdirect_connection_calc_rw_credits(sc,
+								       desc_buf,
+								       desc_buf_len);
+		desc_buf += desc_buf_len;
+		buf_len -= desc_buf_len;
+		desc_num++;
+	}
+
+	smbdirect_log_rdma_rw(sc, SMBDIRECT_LOG_INFO,
+			      "RDMA %s, len %zu, needed credits %d\n",
+			      str_read_write(is_read), buf_len, credits_needed);
+
+	ret = smbdirect_connection_wait_for_rw_credits(sc, credits_needed);
+	if (ret < 0)
+		return ret;
+
+	/* build rdma_rw_ctx for each descriptor */
+	desc_buf = buf;
+	for (i = 0; i < desc_num; i++) {
+		size_t page_count;
+
+		msg = kzalloc_flex(*msg, sg_list, SG_CHUNK_SIZE,
+				   sc->rw_io.mem.gfp_mask);
+		if (!msg) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		desc_buf_len = le32_to_cpu(desc[i].length);
+		page_count = smbdirect_get_buf_page_count(desc_buf, desc_buf_len);
+
+		msg->socket = sc;
+		msg->cqe.done = is_read ?
+				smbdirect_connection_rdma_read_done :
+				smbdirect_connection_rdma_write_done;
+		msg->completion = &completion;
+
+		msg->sgt.sgl = &msg->sg_list[0];
+		ret = sg_alloc_table_chained(&msg->sgt,
+					     page_count,
+					     msg->sg_list,
+					     SG_CHUNK_SIZE);
+		if (ret) {
+			ret = -ENOMEM;
+			goto free_msg;
+		}
+
+		ret = smbdirect_connection_rdma_get_sg_list(desc_buf,
							    desc_buf_len,
							    msg->sgt.sgl,
							    msg->sgt.orig_nents);
+		if (ret < 0)
+			goto free_table;
+
+		ret = rdma_rw_ctx_init(&msg->rdma_ctx,
+				       sc->ib.qp,
+				       sc->ib.qp->port,
+				       msg->sgt.sgl,
+				       page_count,
+				       0,
+				       le64_to_cpu(desc[i].offset),
+				       le32_to_cpu(desc[i].token),
+				       direction);
+		if (ret < 0) {
+			pr_err("failed to init rdma_rw_ctx: %d\n", ret);
+			goto free_table;
+		}
+
+		list_add_tail(&msg->list, &msg_list);
+		desc_buf += desc_buf_len;
+	}
+
+	/* concatenate work requests of rdma_rw_ctxs */
+	first_wr = NULL;
+	list_for_each_entry_reverse(msg, &msg_list, list) {
+		first_wr = rdma_rw_ctx_wrs(&msg->rdma_ctx,
+					   sc->ib.qp,
+					   sc->ib.qp->port,
+					   &msg->cqe,
+					   first_wr);
+	}
+
+	ret = ib_post_send(sc->ib.qp, first_wr, NULL);
+	if (ret) {
+		pr_err("failed to post send wr for RDMA R/W: %d\n", ret);
+		goto out;
+	}
+
+	msg = list_last_entry(&msg_list, struct smbdirect_rw_io, list);
+	wait_for_completion(&completion);
+	ret = msg->error;
+out:
+	list_for_each_entry_safe(msg, next_msg, &msg_list, list) {
+		list_del(&msg->list);
+		smbdirect_connection_rw_io_free(msg, direction);
+	}
+	atomic_add(credits_needed, &sc->rw_io.credits.count);
+	wake_up(&sc->rw_io.credits.wait_queue);
+	return ret;
+
+free_table:
+	sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
+free_msg:
+	kfree(msg);
+	goto out;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_connection_rdma_xmit);
diff --git a/fs/smb/common/smbdirect/smbdirect_socket.c b/fs/smb/common/smbdirect/smbdirect_socket.c
new file mode 100644
index 000000000000..9153e1dbf53d
--- /dev/null
+++ b/fs/smb/common/smbdirect/smbdirect_socket.c
@@ -0,0 +1,743 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2017, Microsoft Corporation.
+ * Copyright (c) 2025, Stefan Metzmacher
+ */
+
+#include "smbdirect_internal.h"
+
+bool smbdirect_frwr_is_supported(const struct ib_device_attr *attrs)
+{
+	/*
+	 * Test if FRWR (Fast Registration Work Requests) is supported on
+	 * the device. This implementation requires FRWR for RDMA
+	 * read/write. Return value: true if it is supported.
+	 */
+
+	if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
+		return false;
+	if (attrs->max_fast_reg_page_list_len == 0)
+		return false;
+	return true;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_frwr_is_supported);
+
+static void smbdirect_socket_cleanup_work(struct work_struct *work);
+
+static int smbdirect_socket_rdma_event_handler(struct rdma_cm_id *id,
+					       struct rdma_cm_event *event)
+{
+	struct smbdirect_socket *sc = id->context;
+	int ret = -ESTALE;
+
+	/*
+	 * This should be replaced before any real work
+	 * starts! So it should never be called!
+	 */
+
+	if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
+		ret = -ENETDOWN;
+	if (IS_ERR(SMBDIRECT_DEBUG_ERR_PTR(event->status)))
+		ret = event->status;
+	pr_err("%s (first_error=%1pe, expected=%s) => event=%s status=%d => ret=%1pe\n",
+	       smbdirect_socket_status_string(sc->status),
+	       SMBDIRECT_DEBUG_ERR_PTR(sc->first_error),
+	       rdma_event_msg(sc->rdma.expected_event),
+	       rdma_event_msg(event->event),
+	       event->status,
+	       SMBDIRECT_DEBUG_ERR_PTR(ret));
+	WARN_ONCE(1, "%s should not be called!\n", __func__);
+	sc->rdma.cm_id = NULL;
+	return -ESTALE;
+}
+
+int smbdirect_socket_init_new(struct net *net, struct smbdirect_socket *sc)
+{
+	struct rdma_cm_id *id;
+	int ret;
+
+	smbdirect_socket_init(sc);
+
+	id = rdma_create_id(net,
+			    smbdirect_socket_rdma_event_handler,
+			    sc,
+			    RDMA_PS_TCP,
+			    IB_QPT_RC);
+	if (IS_ERR(id)) {
+		pr_err("%s: rdma_create_id() failed %1pe\n", __func__, id);
+		return PTR_ERR(id);
+	}
+
+	ret = rdma_set_afonly(id, 1);
+	if (ret) {
+		rdma_destroy_id(id);
+		pr_err("%s: rdma_set_afonly() failed %1pe\n",
+		       __func__, SMBDIRECT_DEBUG_ERR_PTR(ret));
+		return ret;
+	}
+
+	sc->rdma.cm_id = id;
+
+	INIT_WORK(&sc->disconnect_work, smbdirect_socket_cleanup_work);
+
+	return 0;
+}
+
+int smbdirect_socket_create_kern(struct net *net, struct smbdirect_socket **_sc)
+{
+	struct smbdirect_socket *sc;
+	int ret;
+
+	ret = -ENOMEM;
+	sc = kzalloc_obj(*sc);
+	if (!sc)
+		goto alloc_failed;
+
+	ret = smbdirect_socket_init_new(net, sc);
+	if (ret)
+		goto init_failed;
+
+	kref_init(&sc->refs.destroy);
+
+	*_sc = sc;
+	return 0;
+
+init_failed:
+	kfree(sc);
+alloc_failed:
+	return ret;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_create_kern);
+
+int smbdirect_socket_init_accepting(struct rdma_cm_id *id, struct smbdirect_socket *sc)
+{
+	smbdirect_socket_init(sc);
+
+	sc->rdma.cm_id = id;
+	sc->rdma.cm_id->context = sc;
+	sc->rdma.cm_id->event_handler = smbdirect_socket_rdma_event_handler;
+
+	sc->ib.dev = sc->rdma.cm_id->device;
+
+	INIT_WORK(&sc->disconnect_work, smbdirect_socket_cleanup_work);
+
+	return 0;
+}
+
+int smbdirect_socket_create_accepting(struct rdma_cm_id *id, struct smbdirect_socket **_sc)
+{
+	struct smbdirect_socket *sc;
+	int ret;
+
+	ret = -ENOMEM;
+	sc = kzalloc_obj(*sc);
+	if (!sc)
+		goto alloc_failed;
+
+	ret = smbdirect_socket_init_accepting(id, sc);
+	if (ret)
+		goto init_failed;
+
+	kref_init(&sc->refs.destroy);
+
+	*_sc = sc;
+	return 0;
+
+init_failed:
+	kfree(sc);
+alloc_failed:
+	return ret;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_create_accepting);
+
+int smbdirect_socket_set_initial_parameters(struct smbdirect_socket *sc,
+					    const struct smbdirect_socket_parameters *sp)
+{
+	/*
+	 * This is only allowed before connect or accept
+	 */
+	WARN_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED,
+		  "status=%s first_error=%1pe",
+		  smbdirect_socket_status_string(sc->status),
+		  SMBDIRECT_DEBUG_ERR_PTR(sc->first_error));
+	if (sc->status != SMBDIRECT_SOCKET_CREATED)
+		return -EINVAL;
+
+	if (sp->flags & ~SMBDIRECT_FLAG_PORT_RANGE_MASK)
+		return -EINVAL;
+
+	if (sp->initiator_depth > U8_MAX)
+		return -EINVAL;
+	if (sp->responder_resources > U8_MAX)
+		return -EINVAL;
+
+	if (sp->flags & SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB &&
+	    sp->flags & SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW)
+		return -EINVAL;
+	else if (sp->flags & SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB)
+		rdma_restrict_node_type(sc->rdma.cm_id, RDMA_NODE_IB_CA);
+	else if (sp->flags & SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW)
+		rdma_restrict_node_type(sc->rdma.cm_id, RDMA_NODE_RNIC);
+
+	/*
+	 * Make a copy of the caller's parameters;
+	 * from here on we only work on the copy
+	 *
+	 * TODO: do we want consistency checking?
+	 */
+	sc->parameters = *sp;
+
+	return 0;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_set_initial_parameters);
+
+const struct smbdirect_socket_parameters *
+smbdirect_socket_get_current_parameters(struct smbdirect_socket *sc)
+{
+	return &sc->parameters;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_get_current_parameters);
+
+int smbdirect_socket_set_kernel_settings(struct smbdirect_socket *sc,
+					 enum ib_poll_context poll_ctx,
+					 gfp_t gfp_mask)
+{
+	/*
+	 * This is only allowed before connect or accept
+	 */
+	WARN_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED,
+		  "status=%s first_error=%1pe",
+		  smbdirect_socket_status_string(sc->status),
+		  SMBDIRECT_DEBUG_ERR_PTR(sc->first_error));
+	if (sc->status != SMBDIRECT_SOCKET_CREATED)
+		return -EINVAL;
+
+	sc->ib.poll_ctx = poll_ctx;
+
+	sc->send_io.mem.gfp_mask = gfp_mask;
+	sc->recv_io.mem.gfp_mask = gfp_mask;
+	sc->rw_io.mem.gfp_mask = gfp_mask;
+
+	return 0;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_set_kernel_settings);
+
+void smbdirect_socket_set_logging(struct smbdirect_socket *sc,
+				  void *private_ptr,
+				  bool (*needed)(struct smbdirect_socket *sc,
+						 void *private_ptr,
+						 unsigned int lvl,
+						 unsigned int cls),
+				  void (*vaprintf)(struct smbdirect_socket *sc,
+						   const char *func,
+						   unsigned int line,
+						   void *private_ptr,
+						   unsigned int lvl,
+						   unsigned int cls,
+						   struct va_format *vaf))
+{
+	sc->logging.private_ptr = private_ptr;
+	sc->logging.needed = needed;
+	sc->logging.vaprintf = vaprintf;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_set_logging);
+
+static void smbdirect_socket_wake_up_all(struct smbdirect_socket *sc)
+{
+	/*
+	 * Wake up all waiters in all wait queues
+	 * in order to notice the broken connection.
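+	 *
+	 * Every waiter is expected to re-check the socket state
+	 * after wake-up, along the lines of the pattern used by
+	 * the credit and MR waiters (sketch only):
+	 *
+	 *	wait_event_interruptible(waitq,
+	 *		<condition> ||
+	 *		sc->status != SMBDIRECT_SOCKET_CONNECTED);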
+	 */
+	wake_up_all(&sc->status_wait);
+	wake_up_all(&sc->listen.wait_queue);
+	wake_up_all(&sc->send_io.bcredits.wait_queue);
+	wake_up_all(&sc->send_io.lcredits.wait_queue);
+	wake_up_all(&sc->send_io.credits.wait_queue);
+	wake_up_all(&sc->send_io.pending.zero_wait_queue);
+	wake_up_all(&sc->recv_io.reassembly.wait_queue);
+	wake_up_all(&sc->rw_io.credits.wait_queue);
+	wake_up_all(&sc->mr_io.ready.wait_queue);
+}
+
+void __smbdirect_socket_schedule_cleanup(struct smbdirect_socket *sc,
+					 const char *macro_name,
+					 unsigned int lvl,
+					 const char *func,
+					 unsigned int line,
+					 int error,
+					 enum smbdirect_socket_status *force_status)
+{
+	struct smbdirect_socket *psc, *tsc;
+	unsigned long flags;
+	bool was_first = false;
+
+	if (!sc->first_error) {
+		___smbdirect_log_generic(sc, func, line,
+					 lvl,
+					 SMBDIRECT_LOG_RDMA_EVENT,
+					 "%s(%1pe%s%s) called from %s in line=%u status=%s\n",
+					 macro_name,
+					 SMBDIRECT_DEBUG_ERR_PTR(error),
+					 force_status ? ", " : "",
+					 force_status ? smbdirect_socket_status_string(*force_status) : "",
+					 func, line,
+					 smbdirect_socket_status_string(sc->status));
+		if (error)
+			sc->first_error = error;
+		else
+			sc->first_error = -ECONNABORTED;
+		was_first = true;
+	}
+
+	/*
+	 * make sure work other than disconnect_work
+	 * is not queued again; here we don't block,
+	 * so we avoid disable[_delayed]_work_sync()
+	 */
+	disable_work(&sc->connect.work);
+	disable_work(&sc->recv_io.posted.refill_work);
+	disable_work(&sc->idle.immediate_work);
+	sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
+	disable_delayed_work(&sc->idle.timer_work);
+
+	/*
+	 * In case we were a listener we need to
+	 * disconnect all pending and ready sockets
+	 *
+	 * First we move ready sockets to pending again.
+	 */
+	spin_lock_irqsave(&sc->listen.lock, flags);
+	list_splice_init(&sc->listen.ready, &sc->listen.pending);
+	list_for_each_entry_safe(psc, tsc, &sc->listen.pending, accept.list)
+		smbdirect_socket_schedule_cleanup(psc, sc->first_error);
+	spin_unlock_irqrestore(&sc->listen.lock, flags);
+
+	switch (sc->status) {
+	case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
+	case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
+	case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
+	case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
+	case SMBDIRECT_SOCKET_ERROR:
+	case SMBDIRECT_SOCKET_DISCONNECTING:
+	case SMBDIRECT_SOCKET_DISCONNECTED:
+	case SMBDIRECT_SOCKET_DESTROYED:
+		/*
+		 * Keep the current error status
+		 */
+		break;
+
+	case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
+	case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
+		sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED;
+		break;
+
+	case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
+	case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
+		sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED;
+		break;
+
+	case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
+	case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
+		sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED;
+		break;
+
+	case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
+	case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
+		sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
+		break;
+
+	case SMBDIRECT_SOCKET_CREATED:
+	case SMBDIRECT_SOCKET_LISTENING:
+		sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
+		break;
+
+	case SMBDIRECT_SOCKET_CONNECTED:
+		sc->status = SMBDIRECT_SOCKET_ERROR;
+		break;
+	}
+
+	if (force_status && (was_first || *force_status > sc->status))
+		sc->status = *force_status;
+
+	/*
+	 * Wake up all waiters in all wait queues
+	 * in order to notice the broken connection.
+	 */
+	smbdirect_socket_wake_up_all(sc);
+
+	queue_work(sc->workqueues.cleanup, &sc->disconnect_work);
+}
+
+static void smbdirect_socket_cleanup_work(struct work_struct *work)
+{
+	struct smbdirect_socket *sc =
+		container_of(work, struct smbdirect_socket, disconnect_work);
+	struct smbdirect_socket *psc, *tsc;
+	unsigned long flags;
+
+	/*
+	 * This should never be called in an interrupt!
+	 */
+	WARN_ON_ONCE(in_interrupt());
+
+	if (!sc->first_error) {
+		smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_ERR,
+					 "%s called with first_error==0\n",
+					 smbdirect_socket_status_string(sc->status));
+
+		sc->first_error = -ECONNABORTED;
+	}
+
+	/*
+	 * make sure this and other work is not queued again;
+	 * here we don't block, so we avoid
+	 * disable[_delayed]_work_sync()
+	 */
+	disable_work(&sc->disconnect_work);
+	disable_work(&sc->connect.work);
+	disable_work(&sc->recv_io.posted.refill_work);
+	disable_work(&sc->idle.immediate_work);
+	sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE;
+	disable_delayed_work(&sc->idle.timer_work);
+
+	/*
+	 * In case we were a listener we need to
+	 * disconnect all pending and ready sockets
+	 *
+	 * First we move ready sockets to pending again.
+	 */
+	spin_lock_irqsave(&sc->listen.lock, flags);
+	list_splice_init(&sc->listen.ready, &sc->listen.pending);
+	list_for_each_entry_safe(psc, tsc, &sc->listen.pending, accept.list)
+		smbdirect_socket_schedule_cleanup(psc, sc->first_error);
+	spin_unlock_irqrestore(&sc->listen.lock, flags);
+
+	switch (sc->status) {
+	case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED:
+	case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING:
+	case SMBDIRECT_SOCKET_NEGOTIATE_FAILED:
+	case SMBDIRECT_SOCKET_CONNECTED:
+	case SMBDIRECT_SOCKET_ERROR:
+		sc->status = SMBDIRECT_SOCKET_DISCONNECTING;
+		/*
+		 * Make sure we hold the callback lock
+		 * in order to coordinate with the
+		 * rdma_event handlers, typically
+		 * smbdirect_connection_rdma_event_handler(),
+		 * and smbdirect_socket_destroy().
+		 *
+		 * So that the order of ib_drain_qp()
+		 * and rdma_disconnect() is controlled
+		 * by the mutex.
+		 */
+		rdma_lock_handler(sc->rdma.cm_id);
+		rdma_disconnect(sc->rdma.cm_id);
+		rdma_unlock_handler(sc->rdma.cm_id);
+		break;
+
+	case SMBDIRECT_SOCKET_CREATED:
+	case SMBDIRECT_SOCKET_LISTENING:
+	case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED:
+	case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING:
+	case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED:
+	case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED:
+	case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING:
+	case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED:
+	case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED:
+	case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING:
+	case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED:
+		/*
+		 * rdma_{accept,connect}() never reached
+		 * RDMA_CM_EVENT_ESTABLISHED
		 */
+		sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
+		break;
+
+	case SMBDIRECT_SOCKET_DISCONNECTING:
+	case SMBDIRECT_SOCKET_DISCONNECTED:
+	case SMBDIRECT_SOCKET_DESTROYED:
+		break;
+	}
+
+	/*
+	 * Wake up all waiters in all wait queues
+	 * in order to notice the broken connection.
+	 */
+	smbdirect_socket_wake_up_all(sc);
+}
+
+static void smbdirect_socket_destroy(struct smbdirect_socket *sc)
+{
+	struct smbdirect_socket *psc, *tsc;
+	size_t psockets;
+	struct smbdirect_recv_io *recv_io;
+	struct smbdirect_recv_io *recv_tmp;
+	LIST_HEAD(all_list);
+	unsigned long flags;
+
+	smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+				 "status=%s first_error=%1pe",
+				 smbdirect_socket_status_string(sc->status),
+				 SMBDIRECT_DEBUG_ERR_PTR(sc->first_error));
+
+	/*
+	 * This should never be called in an interrupt!
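+	 *
+	 * Everything below may sleep: disable_work_sync(),
+	 * ib_drain_qp() and rdma_destroy_id() all block until
+	 * pending work and posted work requests have finished.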
+	 */
+	WARN_ON_ONCE(in_interrupt());
+
+	if (sc->status == SMBDIRECT_SOCKET_DESTROYED)
+		return;
+
+	WARN_ONCE(sc->status != SMBDIRECT_SOCKET_DISCONNECTED,
+		  "status=%s first_error=%1pe",
+		  smbdirect_socket_status_string(sc->status),
+		  SMBDIRECT_DEBUG_ERR_PTR(sc->first_error));
+
+	/*
+	 * The listener should have cleared this before we reach this point
+	 */
+	WARN_ONCE(sc->accept.listener,
+		  "status=%s first_error=%1pe",
+		  smbdirect_socket_status_string(sc->status),
+		  SMBDIRECT_DEBUG_ERR_PTR(sc->first_error));
+
+	/*
+	 * Wake up all waiters in all wait queues
+	 * in order to notice the broken connection.
+	 *
+	 * Most likely this was already called via
+	 * smbdirect_socket_cleanup_work(), but call it again...
+	 */
+	smbdirect_socket_wake_up_all(sc);
+
+	disable_work_sync(&sc->disconnect_work);
+	disable_work_sync(&sc->connect.work);
+	disable_work_sync(&sc->recv_io.posted.refill_work);
+	disable_work_sync(&sc->idle.immediate_work);
+	disable_delayed_work_sync(&sc->idle.timer_work);
+
+	if (sc->rdma.cm_id)
+		rdma_lock_handler(sc->rdma.cm_id);
+
+	if (sc->ib.qp) {
+		smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+					 "drain qp\n");
+		ib_drain_qp(sc->ib.qp);
+	}
+
+	/*
+	 * In case we were a listener we need to
+	 * disconnect all pending and ready sockets
+	 *
+	 * We move ready sockets to pending again.
+	 */
+	spin_lock_irqsave(&sc->listen.lock, flags);
+	list_splice_tail_init(&sc->listen.ready, &all_list);
+	list_splice_tail_init(&sc->listen.pending, &all_list);
+	spin_unlock_irqrestore(&sc->listen.lock, flags);
+	psockets = list_count_nodes(&all_list);
+	if (sc->listen.backlog != -1) /* was a listener */
+		smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+					 "release %zu pending sockets\n", psockets);
+	list_for_each_entry_safe(psc, tsc, &all_list, accept.list) {
+		list_del_init(&psc->accept.list);
+		psc->accept.listener = NULL;
+		smbdirect_socket_release(psc);
+	}
+	if (sc->listen.backlog != -1) /* was a listener */
+		smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+					 "released %zu pending sockets\n", psockets);
+	INIT_LIST_HEAD(&all_list);
+
+	/* It's not possible for the upper layer to get to reassembly */
+	if (sc->listen.backlog == -1) /* was not a listener */
+		smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+					 "drain the reassembly queue\n");
+	spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
+	list_splice_tail_init(&sc->recv_io.reassembly.list, &all_list);
+	spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
+	list_for_each_entry_safe(recv_io, recv_tmp, &all_list, list)
+		smbdirect_connection_put_recv_io(recv_io);
+	sc->recv_io.reassembly.data_length = 0;
+
+	if (sc->listen.backlog == -1) /* was not a listener */
+		smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+					 "freeing mr list\n");
+	smbdirect_connection_destroy_mr_list(sc);
+
+	if (sc->listen.backlog == -1) /* was not a listener */
+		smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+					 "destroying qp\n");
+	smbdirect_connection_destroy_qp(sc);
+	if (sc->rdma.cm_id) {
+		rdma_unlock_handler(sc->rdma.cm_id);
+		smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+					 "destroying cm_id\n");
+		rdma_destroy_id(sc->rdma.cm_id);
+		sc->rdma.cm_id = NULL;
+	}
+
+	if (sc->listen.backlog == -1) /* was not a listener */
+		smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+					 "destroying mem pools\n");
+	smbdirect_connection_destroy_mem_pools(sc);
+
+	sc->status = SMBDIRECT_SOCKET_DESTROYED;
+
+	smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+				 "rdma session destroyed\n");
+}
+
+void smbdirect_socket_destroy_sync(struct smbdirect_socket *sc)
+{
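+	/*
+	 * Overview of the teardown sequence below:
+	 * stop disconnect_work from being requeued, schedule the
+	 * cleanup with -ESHUTDOWN if nothing failed before, wait
+	 * until the socket reached DISCONNECTED and finally tear
+	 * down the RDMA resources via smbdirect_socket_destroy().
+	 */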
+	smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+				 "status=%s first_error=%1pe",
+				 smbdirect_socket_status_string(sc->status),
+				 SMBDIRECT_DEBUG_ERR_PTR(sc->first_error));
+
+	/*
+	 * This should never be called in an interrupt!
+	 */
+	WARN_ON_ONCE(in_interrupt());
+
+	/*
+	 * First we try to disable the work
+	 * without disable_work_sync() in a
+	 * non-blocking way; if it's already
+	 * running it will be handled by
+	 * disable_work_sync() below.
+	 *
+	 * Here we just want to make sure queue_work() in
+	 * smbdirect_socket_schedule_cleanup_lvl()
+	 * is a no-op.
+	 */
+	disable_work(&sc->disconnect_work);
+
+	if (!sc->first_error)
+		/*
+		 * SMBDIRECT_LOG_INFO is enough here
+		 * as this is the typical case where
+		 * we terminate the connection ourselves.
+		 */
+		smbdirect_socket_schedule_cleanup_lvl(sc,
+						      SMBDIRECT_LOG_INFO,
+						      -ESHUTDOWN);
+
+	smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+				 "cancelling and disabling disconnect_work\n");
+	disable_work_sync(&sc->disconnect_work);
+
+	smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+				 "destroying rdma session\n");
+	if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING)
+		smbdirect_socket_cleanup_work(&sc->disconnect_work);
+	if (sc->status < SMBDIRECT_SOCKET_DISCONNECTED) {
+		smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+					 "waiting for the transport to disconnect\n");
+		wait_event(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED);
+		smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+					 "waited for the transport to disconnect\n");
+	}
+
+	/*
+	 * Once we reached SMBDIRECT_SOCKET_DISCONNECTED,
+	 * we should call smbdirect_socket_destroy()
+	 */
+	smbdirect_socket_destroy(sc);
+	smbdirect_log_rdma_event(sc, SMBDIRECT_LOG_INFO,
+				 "status=%s first_error=%1pe",
+				 smbdirect_socket_status_string(sc->status),
+				 SMBDIRECT_DEBUG_ERR_PTR(sc->first_error));
+}
+
+int smbdirect_socket_bind(struct smbdirect_socket *sc, struct sockaddr *addr)
+{
+	int ret;
+
+	if (sc->status != SMBDIRECT_SOCKET_CREATED)
+		return -EINVAL;
+
+	ret = rdma_bind_addr(sc->rdma.cm_id, addr);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_bind);
+
+void smbdirect_socket_shutdown(struct smbdirect_socket *sc)
+{
+	smbdirect_socket_schedule_cleanup_lvl(sc, SMBDIRECT_LOG_INFO, -ESHUTDOWN);
+}
+__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_shutdown);
+
+static void smbdirect_socket_release_disconnect(struct kref *kref)
+{
+	struct smbdirect_socket *sc =
+		container_of(kref, struct smbdirect_socket, refs.disconnect);
+
+	/*
+	 * For now do a sync disconnect/destroy
+	 */
+	smbdirect_socket_destroy_sync(sc);
+}
+
+static void smbdirect_socket_release_destroy(struct kref *kref)
+{
+	struct smbdirect_socket *sc =
+		container_of(kref, struct smbdirect_socket, refs.destroy);
+
+	/*
+	 * Do a sync disconnect/destroy...
+	 * hopefully a no-op, as it should already be
+	 * in the DESTROYED state before we free the memory.
+	 */
+	smbdirect_socket_destroy_sync(sc);
+	kfree(sc);
+}
+
+void smbdirect_socket_release(struct smbdirect_socket *sc)
+{
+	/*
+	 * We expect only 1 disconnect reference
+	 * and if it is already 0, it's a use after free!
+	 */
+	WARN_ON_ONCE(kref_read(&sc->refs.disconnect) != 1);
+	WARN_ON(!kref_put(&sc->refs.disconnect, smbdirect_socket_release_disconnect));
+
+	/*
+	 * This may not trigger smbdirect_socket_release_destroy(),
+	 * if struct smbdirect_socket is embedded in another structure,
+	 * as indicated by REFCOUNT_MAX.
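+	 *
+	 * Heap-allocated sockets get kref_init(&sc->refs.destroy)
+	 * in smbdirect_socket_create_kern(), so the final kref_put()
+	 * frees them here; embedded sockets keep the
+	 * KREF_INIT(REFCOUNT_MAX) from smbdirect_socket_init() and
+	 * are never freed via this path.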
+ */ + kref_put(&sc->refs.destroy, smbdirect_socket_release_destroy); +} +__SMBDIRECT_EXPORT_SYMBOL__(smbdirect_socket_release); + +int smbdirect_socket_wait_for_credits(struct smbdirect_socket *sc, + enum smbdirect_socket_status expected_status, + int unexpected_errno, + wait_queue_head_t *waitq, + atomic_t *total_credits, + int needed) +{ + int ret; + + if (WARN_ON_ONCE(needed < 0)) + return -EINVAL; + + do { + if (atomic_sub_return(needed, total_credits) >= 0) + return 0; + + atomic_add(needed, total_credits); + ret = wait_event_interruptible(*waitq, + atomic_read(total_credits) >= needed || + sc->status != expected_status); + + if (sc->status != expected_status) + return unexpected_errno; + else if (ret < 0) + return ret; + } while (true); +} diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h index 22184e53d445..c09eddd8ad16 100644 --- a/fs/smb/common/smbdirect/smbdirect_socket.h +++ b/fs/smb/common/smbdirect/smbdirect_socket.h @@ -6,10 +6,18 @@ #ifndef __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__ #define __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__ +#include +#include +#include +#include +#include +#include +#include #include enum smbdirect_socket_status { SMBDIRECT_SOCKET_CREATED, + SMBDIRECT_SOCKET_LISTENING, SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED, SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING, SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED, @@ -35,6 +43,8 @@ const char *smbdirect_socket_status_string(enum smbdirect_socket_status status) switch (status) { case SMBDIRECT_SOCKET_CREATED: return "CREATED"; + case SMBDIRECT_SOCKET_LISTENING: + return "LISTENING"; case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED: return "RESOLVE_ADDR_NEEDED"; case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING: @@ -99,18 +109,59 @@ struct smbdirect_socket { int first_error; /* - * This points to the workqueue to + * This points to the workqueues to * be used for this socket. - * It can be per socket (on the client) - * or point to a global workqueue (on the server) */ - struct workqueue_struct *workqueue; + struct { + struct workqueue_struct *accept; + struct workqueue_struct *connect; + struct workqueue_struct *idle; + struct workqueue_struct *refill; + struct workqueue_struct *immediate; + struct workqueue_struct *cleanup; + } workqueues; struct work_struct disconnect_work; + /* + * The reference counts. + */ + struct { + /* + * This holds the references held by the + * frontend, typically the smb layer. + * + * It is typically 1 and a disconnect + * will happen if it reaches 0. + */ + struct kref disconnect; + + /* + * This holds the reference held by the + * backend, the code that manages + * the lifetime of the whole + * struct smbdirect_socket; + * if this reaches 0 it will + * be freed. + * + * Can be REFCOUNT_MAX if this socket is part + * of another structure. + * + * This is equal to or higher than + * the disconnect refcount. + */ + struct kref destroy; + } refs; + /* RDMA related */ struct { struct rdma_cm_id *cm_id; + /* + * The expected event in our current + * cm_id->event_handler; all other events + * are treated as errors.
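+ * + * smbdirect_socket_init() initializes this to + * RDMA_CM_EVENT_INTERNAL, meaning no specific + * event is expected yet.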
+ */ + enum rdma_cm_event_type expected_event; /* * This is for iWarp MPA v1 */ @@ -120,6 +171,7 @@ struct smbdirect_socket { /* IB verbs related */ struct { struct ib_pd *pd; + enum ib_poll_context poll_ctx; struct ib_cq *send_cq; struct ib_cq *recv_cq; @@ -149,6 +201,35 @@ struct smbdirect_socket { struct delayed_work timer_work; } idle; + /* + * The state for listen sockets + */ + struct { + spinlock_t lock; + struct list_head pending; + struct list_head ready; + wait_queue_head_t wait_queue; + /* + * This starts as -1 and a value != -1 + * means this socket was in LISTENING state + * before. Note the valid backlog can + * only be > 0. + */ + int backlog; + } listen; + + /* + * The state for sockets waiting + * for accept, either still waiting + * for the negotiation to finish + * or already ready with a usable + * connection. + */ + struct { + struct smbdirect_socket *listener; + struct list_head list; + } accept; + /* * The state for posted send buffers */ @@ -158,8 +239,9 @@ struct smbdirect_socket { * smbdirect_send_io buffers */ struct { - struct kmem_cache *cache; - mempool_t *pool; + struct kmem_cache *cache; + mempool_t *pool; + gfp_t gfp_mask; } mem; /* @@ -194,10 +276,6 @@ struct smbdirect_socket { */ struct { atomic_t count; - /* - * woken when count is decremented - */ - wait_queue_head_t dec_wait_queue; /* * woken when count reached zero */ @@ -223,8 +301,9 @@ struct smbdirect_socket { * smbdirect_recv_io buffers */ struct { - struct kmem_cache *cache; - mempool_t *pool; + struct kmem_cache *cache; + mempool_t *pool; + gfp_t gfp_mask; } mem; /* @@ -310,19 +389,20 @@ struct smbdirect_socket { struct { atomic_t count; } used; - - struct work_struct recovery_work; - - /* Used by transport to wait until all MRs are returned */ - struct { - wait_queue_head_t wait_queue; - } cleanup; } mr_io; /* * The state for RDMA read/write requests on the server */ struct { + /* + * Memory hints for + * smbdirect_rw_io structs + */ + struct { + gfp_t gfp_mask; + } mem; + /* * The credit state for the send side */ @@ -352,20 +432,6 @@ struct smbdirect_socket { } statistics; struct { -#define SMBDIRECT_LOG_ERR 0x0 -#define SMBDIRECT_LOG_INFO 0x1 - -#define SMBDIRECT_LOG_OUTGOING 0x1 -#define SMBDIRECT_LOG_INCOMING 0x2 -#define SMBDIRECT_LOG_READ 0x4 -#define SMBDIRECT_LOG_WRITE 0x8 -#define SMBDIRECT_LOG_RDMA_SEND 0x10 -#define SMBDIRECT_LOG_RDMA_RECV 0x20 -#define SMBDIRECT_LOG_KEEP_ALIVE 0x40 -#define SMBDIRECT_LOG_RDMA_EVENT 0x80 -#define SMBDIRECT_LOG_RDMA_MR 0x100 -#define SMBDIRECT_LOG_RDMA_RW 0x200 -#define SMBDIRECT_LOG_NEGOTIATE 0x400 void *private_ptr; bool (*needed)(struct smbdirect_socket *sc, void *private_ptr, @@ -493,9 +559,23 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc) init_waitqueue_head(&sc->status_wait); + sc->workqueues.accept = smbdirect_globals.workqueues.accept; + sc->workqueues.connect = smbdirect_globals.workqueues.connect; + sc->workqueues.idle = smbdirect_globals.workqueues.idle; + sc->workqueues.refill = smbdirect_globals.workqueues.refill; + sc->workqueues.immediate = smbdirect_globals.workqueues.immediate; + sc->workqueues.cleanup = smbdirect_globals.workqueues.cleanup; + INIT_WORK(&sc->disconnect_work, __smbdirect_socket_disabled_work); disable_work_sync(&sc->disconnect_work); + kref_init(&sc->refs.disconnect); + sc->refs.destroy = (struct kref) KREF_INIT(REFCOUNT_MAX); + + sc->rdma.expected_event = RDMA_CM_EVENT_INTERNAL; + + sc->ib.poll_ctx = IB_POLL_UNBOUND_WORKQUEUE; + spin_lock_init(&sc->connect.lock); 
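+ /* + * All work items are initialized with + * __smbdirect_socket_disabled_work and immediately + * disabled, so stray queue_work() calls stay no-ops + * until a real handler is installed and the work is + * enabled again. + */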
INIT_WORK(&sc->connect.work, __smbdirect_socket_disabled_work); disable_work_sync(&sc->connect.work); @@ -505,6 +585,16 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc) INIT_DELAYED_WORK(&sc->idle.timer_work, __smbdirect_socket_disabled_work); disable_delayed_work_sync(&sc->idle.timer_work); + spin_lock_init(&sc->listen.lock); + INIT_LIST_HEAD(&sc->listen.pending); + INIT_LIST_HEAD(&sc->listen.ready); + sc->listen.backlog = -1; /* not a listener */ + init_waitqueue_head(&sc->listen.wait_queue); + + INIT_LIST_HEAD(&sc->accept.list); + + sc->send_io.mem.gfp_mask = GFP_KERNEL; + atomic_set(&sc->send_io.bcredits.count, 0); init_waitqueue_head(&sc->send_io.bcredits.wait_queue); @@ -515,9 +605,10 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc) init_waitqueue_head(&sc->send_io.credits.wait_queue); atomic_set(&sc->send_io.pending.count, 0); - init_waitqueue_head(&sc->send_io.pending.dec_wait_queue); init_waitqueue_head(&sc->send_io.pending.zero_wait_queue); + sc->recv_io.mem.gfp_mask = GFP_KERNEL; + INIT_LIST_HEAD(&sc->recv_io.free.list); spin_lock_init(&sc->recv_io.free.lock); @@ -532,6 +623,7 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc) spin_lock_init(&sc->recv_io.reassembly.lock); init_waitqueue_head(&sc->recv_io.reassembly.wait_queue); + sc->rw_io.mem.gfp_mask = GFP_KERNEL; atomic_set(&sc->rw_io.credits.count, 0); init_waitqueue_head(&sc->rw_io.credits.wait_queue); @@ -540,9 +632,6 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc) atomic_set(&sc->mr_io.ready.count, 0); init_waitqueue_head(&sc->mr_io.ready.wait_queue); atomic_set(&sc->mr_io.used.count, 0); - INIT_WORK(&sc->mr_io.recovery_work, __smbdirect_socket_disabled_work); - disable_work_sync(&sc->mr_io.recovery_work); - init_waitqueue_head(&sc->mr_io.cleanup.wait_queue); sc->logging.private_ptr = NULL; sc->logging.needed = __smbdirect_log_needed; @@ -602,6 +691,11 @@ static __always_inline void smbdirect_socket_init(struct smbdirect_socket *sc) #define SMBDIRECT_CHECK_STATUS_WARN(__sc, __expected_status) \ __SMBDIRECT_CHECK_STATUS_WARN(__sc, __expected_status, /* nothing */) +#ifndef __SMBDIRECT_SOCKET_DISCONNECT +#define __SMBDIRECT_SOCKET_DISCONNECT(__sc) \ + smbdirect_socket_schedule_cleanup(__sc, -ECONNABORTED) +#endif /* ! 
__SMBDIRECT_SOCKET_DISCONNECT */ + #define SMBDIRECT_CHECK_STATUS_DISCONNECT(__sc, __expected_status) \ __SMBDIRECT_CHECK_STATUS_WARN(__sc, __expected_status, \ __SMBDIRECT_SOCKET_DISCONNECT(__sc);) @@ -720,4 +814,19 @@ struct smbdirect_rw_io { struct scatterlist sg_list[]; }; +static inline size_t smbdirect_get_buf_page_count(const void *buf, size_t size) +{ + return DIV_ROUND_UP((uintptr_t)buf + size, PAGE_SIZE) - + (uintptr_t)buf / PAGE_SIZE; +} + +/* + * Maximum number of retries on data transfer operations + */ +#define SMBDIRECT_RDMA_CM_RETRY 6 +/* + * No need to retry on Receiver Not Ready since SMB_DIRECT manages credits + */ +#define SMBDIRECT_RDMA_CM_RNR_RETRY 0 + #endif /* __FS_SMB_COMMON_SMBDIRECT_SMBDIRECT_SOCKET_H__ */ diff --git a/fs/smb/server/Kconfig b/fs/smb/server/Kconfig index 96aa8e2a8770..37387410e5bb 100644 --- a/fs/smb/server/Kconfig +++ b/fs/smb/server/Kconfig @@ -47,8 +47,9 @@ if SMB_SERVER config SMB_SERVER_SMBDIRECT bool "Support for SMB Direct protocol" - depends on SMB_SERVER=m && INFINIBAND && INFINIBAND_ADDR_TRANS || SMB_SERVER=y && INFINIBAND=y && INFINIBAND_ADDR_TRANS=y - select SG_POOL + depends on SMB_SERVER && INFINIBAND && INFINIBAND_ADDR_TRANS + depends on SMB_SERVER=m || INFINIBAND=y + select SMB_COMMON_SMBDIRECT default n help diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index 26cfce344861..a26899d12df1 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -376,9 +376,6 @@ int ksmbd_conn_handler_loop(void *p) mutex_init(&conn->srv_mutex); __module_get(THIS_MODULE); - if (t->ops->prepare && t->ops->prepare(t)) - goto out; - max_req = server_conf.max_inflight_req; conn->last_active = jiffies; set_freezable(); @@ -470,7 +467,6 @@ recheck: } } -out: ksmbd_conn_set_releasing(conn); /* Wait till all reference dropped to the Server object*/ ksmbd_debug(CONN, "Wait for all pending requests(%d)\n", atomic_read(&conn->r_count)); @@ -566,6 +562,5 @@ void ksmbd_conn_transport_destroy(void) ksmbd_tcp_destroy(); ksmbd_rdma_stop_listening(); stop_sessions(); - ksmbd_rdma_destroy(); mutex_unlock(&init_lock); } diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index 1e2587036bca..ae21a1bd4c70 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -127,7 +127,6 @@ struct ksmbd_conn_ops { }; struct ksmbd_transport_ops { - int (*prepare)(struct ksmbd_transport *t); void (*disconnect)(struct ksmbd_transport *t); void (*shutdown)(struct ksmbd_transport *t); int (*read)(struct ksmbd_transport *t, char *buf, diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 10ae77dae5a1..ee32e61b6d3c 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -24,7 +24,6 @@ #include "asn1.h" #include "connection.h" #include "transport_ipc.h" -#include "../common/smbdirect/smbdirect.h" #include "transport_rdma.h" #include "vfs.h" #include "vfs_cache.h" diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index dbc8dedb85dc..706a2c897948 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -11,30 +11,19 @@ #include #include -#include -#include -#include #include -#include -#include -#include - -#define __SMBDIRECT_SOCKET_DISCONNECT(__sc) smb_direct_disconnect_rdma_connection(__sc) #include "glob.h" #include "connection.h" #include "smb_common.h" #include "../common/smb2status.h" -#include "../common/smbdirect/smbdirect.h" -#include "../common/smbdirect/smbdirect_pdu.h" -#include "../common/smbdirect/smbdirect_socket.h" 
#include "transport_rdma.h" +#include "../common/smbdirect/smbdirect_public.h" + #define SMB_DIRECT_PORT_IWARP 5445 #define SMB_DIRECT_PORT_INFINIBAND 445 -#define SMB_DIRECT_VERSION_LE cpu_to_le16(SMBDIRECT_V1) - /* SMB_DIRECT negotiation timeout (for the server) in seconds */ #define SMB_DIRECT_NEGOTIATE_TIMEOUT 5 @@ -50,11 +39,6 @@ */ #define SMB_DIRECT_CM_INITIATOR_DEPTH 8 -/* Maximum number of retries on data transfer operations */ -#define SMB_DIRECT_CM_RETRY 6 -/* No need to retry on Receiver Not Ready since SMB_DIRECT manages credits */ -#define SMB_DIRECT_CM_RNR_RETRY 0 - /* * User configurable initial values per SMB_DIRECT transport connection * as defined in [MS-SMBD] 3.1.1.1 @@ -93,27 +77,79 @@ static int smb_direct_max_receive_size = 1364; static int smb_direct_max_read_write_size = SMBD_DEFAULT_IOSIZE; -static LIST_HEAD(smb_direct_device_list); -static DEFINE_RWLOCK(smb_direct_device_lock); - -struct smb_direct_device { - struct ib_device *ib_dev; - struct list_head list; -}; - static struct smb_direct_listener { int port; - struct rdma_cm_id *cm_id; -} smb_direct_ib_listener, smb_direct_iw_listener; -static struct workqueue_struct *smb_direct_wq; + struct task_struct *thread; + + struct smbdirect_socket *socket; +} smb_direct_ib_listener, smb_direct_iw_listener; struct smb_direct_transport { struct ksmbd_transport transport; - struct smbdirect_socket socket; + struct smbdirect_socket *socket; }; +static bool smb_direct_logging_needed(struct smbdirect_socket *sc, + void *private_ptr, + unsigned int lvl, + unsigned int cls) +{ + if (lvl <= SMBDIRECT_LOG_ERR) + return true; + + if (lvl > SMBDIRECT_LOG_INFO) + return false; + + switch (cls) { + /* + * These were more or less also logged before + * the move to common code. + * + * SMBDIRECT_LOG_RDMA_MR was not used, but + * that's client only code and we should + * notice if it's used on the server... + */ + case SMBDIRECT_LOG_RDMA_EVENT: + case SMBDIRECT_LOG_RDMA_SEND: + case SMBDIRECT_LOG_RDMA_RECV: + case SMBDIRECT_LOG_WRITE: + case SMBDIRECT_LOG_READ: + case SMBDIRECT_LOG_NEGOTIATE: + case SMBDIRECT_LOG_OUTGOING: + case SMBDIRECT_LOG_RDMA_RW: + case SMBDIRECT_LOG_RDMA_MR: + return true; + /* + * These were not logged before the move + * to common code. 
+ */ + case SMBDIRECT_LOG_KEEP_ALIVE: + case SMBDIRECT_LOG_INCOMING: + return false; + } + + /* + * Log all unknown messages + */ + return true; +} + +static void smb_direct_logging_vaprintf(struct smbdirect_socket *sc, + const char *func, + unsigned int line, + void *private_ptr, + unsigned int lvl, + unsigned int cls, + struct va_format *vaf) +{ + if (lvl <= SMBDIRECT_LOG_ERR) + pr_err("%pV", vaf); + else + ksmbd_debug(RDMA, "%pV", vaf); +} + #define KSMBD_TRANS(t) (&(t)->transport) #define SMBD_TRANS(t) (container_of(t, \ struct smb_direct_transport, transport)) @@ -129,321 +165,30 @@ void init_smbd_max_io_size(unsigned int sz) unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt) { struct smb_direct_transport *t; - struct smbdirect_socket *sc; - struct smbdirect_socket_parameters *sp; + const struct smbdirect_socket_parameters *sp; if (kt->ops != &ksmbd_smb_direct_transport_ops) return 0; t = SMBD_TRANS(kt); - sc = &t->socket; - sp = &sc->parameters; + sp = smbdirect_socket_get_current_parameters(t->socket); return sp->max_read_write_size; } -static inline int get_buf_page_count(void *buf, int size) -{ - return DIV_ROUND_UP((uintptr_t)buf + size, PAGE_SIZE) - - (uintptr_t)buf / PAGE_SIZE; -} - -static void smb_direct_destroy_pools(struct smbdirect_socket *sc); -static void smb_direct_post_recv_credits(struct work_struct *work); -static int smb_direct_post_send_data(struct smbdirect_socket *sc, - struct smbdirect_send_batch *send_ctx, - struct kvec *iov, int niov, - int remaining_data_length); - -static inline void -*smbdirect_recv_io_payload(struct smbdirect_recv_io *recvmsg) -{ - return (void *)recvmsg->packet; -} - -static struct -smbdirect_recv_io *get_free_recvmsg(struct smbdirect_socket *sc) -{ - struct smbdirect_recv_io *recvmsg = NULL; - unsigned long flags; - - spin_lock_irqsave(&sc->recv_io.free.lock, flags); - if (!list_empty(&sc->recv_io.free.list)) { - recvmsg = list_first_entry(&sc->recv_io.free.list, - struct smbdirect_recv_io, - list); - list_del(&recvmsg->list); - } - spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); - return recvmsg; -} - -static void put_recvmsg(struct smbdirect_socket *sc, - struct smbdirect_recv_io *recvmsg) -{ - unsigned long flags; - - if (likely(recvmsg->sge.length != 0)) { - ib_dma_unmap_single(sc->ib.dev, - recvmsg->sge.addr, - recvmsg->sge.length, - DMA_FROM_DEVICE); - recvmsg->sge.length = 0; - } - - spin_lock_irqsave(&sc->recv_io.free.lock, flags); - list_add(&recvmsg->list, &sc->recv_io.free.list); - spin_unlock_irqrestore(&sc->recv_io.free.lock, flags); - - queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); -} - -static void enqueue_reassembly(struct smbdirect_socket *sc, - struct smbdirect_recv_io *recvmsg, - int data_length) -{ - unsigned long flags; - - spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); - list_add_tail(&recvmsg->list, &sc->recv_io.reassembly.list); - sc->recv_io.reassembly.queue_length++; - /* - * Make sure reassembly_data_length is updated after list and - * reassembly_queue_length are updated. 
On the dequeue side - * reassembly_data_length is checked without a lock to determine - * if reassembly_queue_length and list is up to date - */ - virt_wmb(); - sc->recv_io.reassembly.data_length += data_length; - spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); -} - -static struct smbdirect_recv_io *get_first_reassembly(struct smbdirect_socket *sc) -{ - if (!list_empty(&sc->recv_io.reassembly.list)) - return list_first_entry(&sc->recv_io.reassembly.list, - struct smbdirect_recv_io, list); - else - return NULL; -} - -static void smb_direct_disconnect_wake_up_all(struct smbdirect_socket *sc) -{ - /* - * Wake up all waiters in all wait queues - * in order to notice the broken connection. - */ - wake_up_all(&sc->status_wait); - wake_up_all(&sc->send_io.bcredits.wait_queue); - wake_up_all(&sc->send_io.lcredits.wait_queue); - wake_up_all(&sc->send_io.credits.wait_queue); - wake_up_all(&sc->send_io.pending.zero_wait_queue); - wake_up_all(&sc->recv_io.reassembly.wait_queue); - wake_up_all(&sc->rw_io.credits.wait_queue); -} - -static void smb_direct_disconnect_rdma_work(struct work_struct *work) -{ - struct smbdirect_socket *sc = - container_of(work, struct smbdirect_socket, disconnect_work); - - if (sc->first_error == 0) - sc->first_error = -ECONNABORTED; - - /* - * make sure this and other work is not queued again - * but here we don't block and avoid - * disable[_delayed]_work_sync() - */ - disable_work(&sc->disconnect_work); - disable_work(&sc->connect.work); - disable_work(&sc->recv_io.posted.refill_work); - disable_delayed_work(&sc->idle.timer_work); - disable_work(&sc->idle.immediate_work); - - switch (sc->status) { - case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: - case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING: - case SMBDIRECT_SOCKET_NEGOTIATE_FAILED: - case SMBDIRECT_SOCKET_CONNECTED: - case SMBDIRECT_SOCKET_ERROR: - sc->status = SMBDIRECT_SOCKET_DISCONNECTING; - rdma_disconnect(sc->rdma.cm_id); - break; - - case SMBDIRECT_SOCKET_CREATED: - case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED: - case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING: - case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED: - case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED: - case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING: - case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED: - case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED: - case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING: - case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED: - /* - * rdma_accept() never reached - * RDMA_CM_EVENT_ESTABLISHED - */ - sc->status = SMBDIRECT_SOCKET_DISCONNECTED; - break; - - case SMBDIRECT_SOCKET_DISCONNECTING: - case SMBDIRECT_SOCKET_DISCONNECTED: - case SMBDIRECT_SOCKET_DESTROYED: - break; - } - - /* - * Wake up all waiters in all wait queues - * in order to notice the broken connection. 
- */ - smb_direct_disconnect_wake_up_all(sc); -} - -static void -smb_direct_disconnect_rdma_connection(struct smbdirect_socket *sc) -{ - if (sc->first_error == 0) - sc->first_error = -ECONNABORTED; - - /* - * make sure other work (than disconnect_work) is - * not queued again but here we don't block and avoid - * disable[_delayed]_work_sync() - */ - disable_work(&sc->connect.work); - disable_work(&sc->recv_io.posted.refill_work); - disable_work(&sc->idle.immediate_work); - disable_delayed_work(&sc->idle.timer_work); - - switch (sc->status) { - case SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED: - case SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED: - case SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED: - case SMBDIRECT_SOCKET_NEGOTIATE_FAILED: - case SMBDIRECT_SOCKET_ERROR: - case SMBDIRECT_SOCKET_DISCONNECTING: - case SMBDIRECT_SOCKET_DISCONNECTED: - case SMBDIRECT_SOCKET_DESTROYED: - /* - * Keep the current error status - */ - break; - - case SMBDIRECT_SOCKET_RESOLVE_ADDR_NEEDED: - case SMBDIRECT_SOCKET_RESOLVE_ADDR_RUNNING: - sc->status = SMBDIRECT_SOCKET_RESOLVE_ADDR_FAILED; - break; - - case SMBDIRECT_SOCKET_RESOLVE_ROUTE_NEEDED: - case SMBDIRECT_SOCKET_RESOLVE_ROUTE_RUNNING: - sc->status = SMBDIRECT_SOCKET_RESOLVE_ROUTE_FAILED; - break; - - case SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED: - case SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING: - sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_FAILED; - break; - - case SMBDIRECT_SOCKET_NEGOTIATE_NEEDED: - case SMBDIRECT_SOCKET_NEGOTIATE_RUNNING: - sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED; - break; - - case SMBDIRECT_SOCKET_CREATED: - sc->status = SMBDIRECT_SOCKET_DISCONNECTED; - break; - - case SMBDIRECT_SOCKET_CONNECTED: - sc->status = SMBDIRECT_SOCKET_ERROR; - break; - } - - /* - * Wake up all waiters in all wait queues - * in order to notice the broken connection. 
- */ - smb_direct_disconnect_wake_up_all(sc); - - queue_work(sc->workqueue, &sc->disconnect_work); -} - -static void smb_direct_send_immediate_work(struct work_struct *work) -{ - struct smbdirect_socket *sc = - container_of(work, struct smbdirect_socket, idle.immediate_work); - - if (sc->status != SMBDIRECT_SOCKET_CONNECTED) - return; - - smb_direct_post_send_data(sc, NULL, NULL, 0, 0); -} - -static void smb_direct_idle_connection_timer(struct work_struct *work) -{ - struct smbdirect_socket *sc = - container_of(work, struct smbdirect_socket, idle.timer_work.work); - struct smbdirect_socket_parameters *sp = &sc->parameters; - - if (sc->idle.keepalive != SMBDIRECT_KEEPALIVE_NONE) { - smb_direct_disconnect_rdma_connection(sc); - return; - } - - if (sc->status != SMBDIRECT_SOCKET_CONNECTED) - return; - - /* - * Now use the keepalive timeout (instead of keepalive interval) - * in order to wait for a response - */ - sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING; - mod_delayed_work(sc->workqueue, &sc->idle.timer_work, - msecs_to_jiffies(sp->keepalive_timeout_msec)); - queue_work(sc->workqueue, &sc->idle.immediate_work); -} - -static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) +static struct smb_direct_transport *alloc_transport(struct smbdirect_socket *sc) { struct smb_direct_transport *t; - struct smbdirect_socket *sc; - struct smbdirect_socket_parameters *sp; struct ksmbd_conn *conn; t = kzalloc_obj(*t, KSMBD_DEFAULT_GFP); if (!t) return NULL; - sc = &t->socket; - smbdirect_socket_init(sc); - sp = &sc->parameters; - - sc->workqueue = smb_direct_wq; - - INIT_WORK(&sc->disconnect_work, smb_direct_disconnect_rdma_work); - - sp->negotiate_timeout_msec = SMB_DIRECT_NEGOTIATE_TIMEOUT * 1000; - sp->initiator_depth = SMB_DIRECT_CM_INITIATOR_DEPTH; - sp->responder_resources = 1; - sp->recv_credit_max = smb_direct_receive_credit_max; - sp->send_credit_target = smb_direct_send_credit_target; - sp->max_send_size = smb_direct_max_send_size; - sp->max_fragmented_recv_size = smb_direct_max_fragmented_recv_size; - sp->max_recv_size = smb_direct_max_receive_size; - sp->max_read_write_size = smb_direct_max_read_write_size; - sp->keepalive_interval_msec = SMB_DIRECT_KEEPALIVE_SEND_INTERVAL * 1000; - sp->keepalive_timeout_msec = SMB_DIRECT_KEEPALIVE_RECV_TIMEOUT * 1000; - - sc->rdma.cm_id = cm_id; - cm_id->context = sc; - - sc->ib.dev = sc->rdma.cm_id->device; - - INIT_DELAYED_WORK(&sc->idle.timer_work, smb_direct_idle_connection_timer); + t->socket = sc; conn = ksmbd_conn_alloc(); if (!conn) - goto err; + goto conn_alloc_failed; down_write(&conn_list_lock); hash_add(conn_list, &conn->hlist, 0); @@ -452,1165 +197,45 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) conn->transport = KSMBD_TRANS(t); KSMBD_TRANS(t)->conn = conn; KSMBD_TRANS(t)->ops = &ksmbd_smb_direct_transport_ops; + return t; -err: + +conn_alloc_failed: kfree(t); return NULL; } static void smb_direct_free_transport(struct ksmbd_transport *kt) { - kfree(SMBD_TRANS(kt)); + struct smb_direct_transport *t = SMBD_TRANS(kt); + + smbdirect_socket_release(t->socket); + kfree(t); } static void free_transport(struct smb_direct_transport *t) { - struct smbdirect_socket *sc = &t->socket; - struct smbdirect_recv_io *recvmsg; - - disable_work_sync(&sc->disconnect_work); - if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING) - smb_direct_disconnect_rdma_work(&sc->disconnect_work); - if (sc->status < SMBDIRECT_SOCKET_DISCONNECTED) - wait_event(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED); - 
- /* - * Wake up all waiters in all wait queues - * in order to notice the broken connection. - * - * Most likely this was already called via - * smb_direct_disconnect_rdma_work(), but call it again... - */ - smb_direct_disconnect_wake_up_all(sc); - - disable_work_sync(&sc->connect.work); - disable_work_sync(&sc->recv_io.posted.refill_work); - disable_delayed_work_sync(&sc->idle.timer_work); - disable_work_sync(&sc->idle.immediate_work); - - if (sc->rdma.cm_id) - rdma_lock_handler(sc->rdma.cm_id); - - if (sc->ib.qp) { - ib_drain_qp(sc->ib.qp); - sc->ib.qp = NULL; - rdma_destroy_qp(sc->rdma.cm_id); - } - - ksmbd_debug(RDMA, "drain the reassembly queue\n"); - do { - unsigned long flags; - - spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); - recvmsg = get_first_reassembly(sc); - if (recvmsg) { - list_del(&recvmsg->list); - spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); - put_recvmsg(sc, recvmsg); - } else { - spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); - } - } while (recvmsg); - sc->recv_io.reassembly.data_length = 0; - - if (sc->ib.send_cq) - ib_free_cq(sc->ib.send_cq); - if (sc->ib.recv_cq) - ib_free_cq(sc->ib.recv_cq); - if (sc->ib.pd) - ib_dealloc_pd(sc->ib.pd); - if (sc->rdma.cm_id) { - rdma_unlock_handler(sc->rdma.cm_id); - rdma_destroy_id(sc->rdma.cm_id); - } - - smb_direct_destroy_pools(sc); + smbdirect_socket_shutdown(t->socket); ksmbd_conn_free(KSMBD_TRANS(t)->conn); } -static struct smbdirect_send_io -*smb_direct_alloc_sendmsg(struct smbdirect_socket *sc) -{ - struct smbdirect_send_io *msg; - - msg = mempool_alloc(sc->send_io.mem.pool, KSMBD_DEFAULT_GFP); - if (!msg) - return ERR_PTR(-ENOMEM); - msg->socket = sc; - INIT_LIST_HEAD(&msg->sibling_list); - msg->num_sge = 0; - return msg; -} - -static void smb_direct_free_sendmsg(struct smbdirect_socket *sc, - struct smbdirect_send_io *msg) -{ - int i; - - /* - * The list needs to be empty! - * The caller should take care of it. 
- */ - WARN_ON_ONCE(!list_empty(&msg->sibling_list)); - - if (msg->num_sge > 0) { - ib_dma_unmap_single(sc->ib.dev, - msg->sge[0].addr, msg->sge[0].length, - DMA_TO_DEVICE); - for (i = 1; i < msg->num_sge; i++) - ib_dma_unmap_page(sc->ib.dev, - msg->sge[i].addr, msg->sge[i].length, - DMA_TO_DEVICE); - } - mempool_free(msg, sc->send_io.mem.pool); -} - -static int smb_direct_check_recvmsg(struct smbdirect_recv_io *recvmsg) -{ - struct smbdirect_socket *sc = recvmsg->socket; - - switch (sc->recv_io.expected) { - case SMBDIRECT_EXPECT_DATA_TRANSFER: { - struct smbdirect_data_transfer *req = - (struct smbdirect_data_transfer *)recvmsg->packet; - struct smb2_hdr *hdr = (struct smb2_hdr *)(recvmsg->packet - + le32_to_cpu(req->data_offset)); - ksmbd_debug(RDMA, - "CreditGranted: %u, CreditRequested: %u, DataLength: %u, RemainingDataLength: %u, SMB: %x, Command: %u\n", - le16_to_cpu(req->credits_granted), - le16_to_cpu(req->credits_requested), - req->data_length, req->remaining_data_length, - hdr->ProtocolId, hdr->Command); - return 0; - } - case SMBDIRECT_EXPECT_NEGOTIATE_REQ: { - struct smbdirect_negotiate_req *req = - (struct smbdirect_negotiate_req *)recvmsg->packet; - ksmbd_debug(RDMA, - "MinVersion: %u, MaxVersion: %u, CreditRequested: %u, MaxSendSize: %u, MaxRecvSize: %u, MaxFragmentedSize: %u\n", - le16_to_cpu(req->min_version), - le16_to_cpu(req->max_version), - le16_to_cpu(req->credits_requested), - le32_to_cpu(req->preferred_send_size), - le32_to_cpu(req->max_receive_size), - le32_to_cpu(req->max_fragmented_size)); - if (le16_to_cpu(req->min_version) > 0x0100 || - le16_to_cpu(req->max_version) < 0x0100) - return -EOPNOTSUPP; - if (le16_to_cpu(req->credits_requested) <= 0 || - le32_to_cpu(req->max_receive_size) <= 128 || - le32_to_cpu(req->max_fragmented_size) <= - 128 * 1024) - return -ECONNABORTED; - - return 0; - } - case SMBDIRECT_EXPECT_NEGOTIATE_REP: - /* client only */ - break; - } - - /* This is an internal error */ - return -EINVAL; -} - -static void recv_done(struct ib_cq *cq, struct ib_wc *wc) -{ - struct smbdirect_recv_io *recvmsg; - struct smbdirect_socket *sc; - struct smbdirect_socket_parameters *sp; - - recvmsg = container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe); - sc = recvmsg->socket; - sp = &sc->parameters; - - if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) { - put_recvmsg(sc, recvmsg); - if (wc->status != IB_WC_WR_FLUSH_ERR) { - pr_err("Recv error. status='%s (%d)' opcode=%d\n", - ib_wc_status_msg(wc->status), wc->status, - wc->opcode); - smb_direct_disconnect_rdma_connection(sc); - } - return; - } - - ksmbd_debug(RDMA, "Recv completed. status='%s (%d)', opcode=%d\n", - ib_wc_status_msg(wc->status), wc->status, - wc->opcode); - - ib_dma_sync_single_for_cpu(wc->qp->device, recvmsg->sge.addr, - recvmsg->sge.length, DMA_FROM_DEVICE); - - /* - * Reset timer to the keepalive interval in - * order to trigger our next keepalive message. 
- */ - sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE; - mod_delayed_work(sc->workqueue, &sc->idle.timer_work, - msecs_to_jiffies(sp->keepalive_interval_msec)); - - switch (sc->recv_io.expected) { - case SMBDIRECT_EXPECT_NEGOTIATE_REQ: - /* see smb_direct_negotiate_recv_done */ - break; - case SMBDIRECT_EXPECT_DATA_TRANSFER: { - struct smbdirect_data_transfer *data_transfer = - (struct smbdirect_data_transfer *)recvmsg->packet; - u32 remaining_data_length, data_offset, data_length; - int current_recv_credits; - u16 old_recv_credit_target; - - if (wc->byte_len < - offsetof(struct smbdirect_data_transfer, padding)) { - put_recvmsg(sc, recvmsg); - smb_direct_disconnect_rdma_connection(sc); - return; - } - - remaining_data_length = le32_to_cpu(data_transfer->remaining_data_length); - data_length = le32_to_cpu(data_transfer->data_length); - data_offset = le32_to_cpu(data_transfer->data_offset); - if (wc->byte_len < data_offset || - wc->byte_len < (u64)data_offset + data_length) { - put_recvmsg(sc, recvmsg); - smb_direct_disconnect_rdma_connection(sc); - return; - } - if (remaining_data_length > sp->max_fragmented_recv_size || - data_length > sp->max_fragmented_recv_size || - (u64)remaining_data_length + (u64)data_length > - (u64)sp->max_fragmented_recv_size) { - put_recvmsg(sc, recvmsg); - smb_direct_disconnect_rdma_connection(sc); - return; - } - - if (data_length) { - if (sc->recv_io.reassembly.full_packet_received) - recvmsg->first_segment = true; - - if (le32_to_cpu(data_transfer->remaining_data_length)) - sc->recv_io.reassembly.full_packet_received = false; - else - sc->recv_io.reassembly.full_packet_received = true; - } - - atomic_dec(&sc->recv_io.posted.count); - current_recv_credits = atomic_dec_return(&sc->recv_io.credits.count); - - old_recv_credit_target = sc->recv_io.credits.target; - sc->recv_io.credits.target = - le16_to_cpu(data_transfer->credits_requested); - sc->recv_io.credits.target = - min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max); - sc->recv_io.credits.target = - max_t(u16, sc->recv_io.credits.target, 1); - atomic_add(le16_to_cpu(data_transfer->credits_granted), - &sc->send_io.credits.count); - - if (le16_to_cpu(data_transfer->flags) & - SMBDIRECT_FLAG_RESPONSE_REQUESTED) - queue_work(sc->workqueue, &sc->idle.immediate_work); - - if (atomic_read(&sc->send_io.credits.count) > 0) - wake_up(&sc->send_io.credits.wait_queue); - - if (data_length) { - if (current_recv_credits <= (sc->recv_io.credits.target / 4) || - sc->recv_io.credits.target > old_recv_credit_target) - queue_work(sc->workqueue, &sc->recv_io.posted.refill_work); - - enqueue_reassembly(sc, recvmsg, (int)data_length); - wake_up(&sc->recv_io.reassembly.wait_queue); - } else - put_recvmsg(sc, recvmsg); - - return; - } - case SMBDIRECT_EXPECT_NEGOTIATE_REP: - /* client only */ - break; - } - - /* - * This is an internal error! - */ - WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_DATA_TRANSFER); - put_recvmsg(sc, recvmsg); - smb_direct_disconnect_rdma_connection(sc); -} - -static void smb_direct_negotiate_recv_work(struct work_struct *work); - -static void smb_direct_negotiate_recv_done(struct ib_cq *cq, struct ib_wc *wc) -{ - struct smbdirect_recv_io *recv_io = - container_of(wc->wr_cqe, struct smbdirect_recv_io, cqe); - struct smbdirect_socket *sc = recv_io->socket; - unsigned long flags; - - /* - * reset the common recv_done for later reuse. 
- */ - recv_io->cqe.done = recv_done; - - if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) { - put_recvmsg(sc, recv_io); - if (wc->status != IB_WC_WR_FLUSH_ERR) { - pr_err("Negotiate Recv error. status='%s (%d)' opcode=%d\n", - ib_wc_status_msg(wc->status), wc->status, - wc->opcode); - smb_direct_disconnect_rdma_connection(sc); - } - return; - } - - ksmbd_debug(RDMA, "Negotiate Recv completed. status='%s (%d)', opcode=%d\n", - ib_wc_status_msg(wc->status), wc->status, - wc->opcode); - - ib_dma_sync_single_for_cpu(sc->ib.dev, - recv_io->sge.addr, - recv_io->sge.length, - DMA_FROM_DEVICE); - - /* - * This is an internal error! - */ - if (WARN_ON_ONCE(sc->recv_io.expected != SMBDIRECT_EXPECT_NEGOTIATE_REQ)) { - put_recvmsg(sc, recv_io); - smb_direct_disconnect_rdma_connection(sc); - return; - } - - /* - * Don't reset timer to the keepalive interval in - * this will be done in smb_direct_negotiate_recv_work. - */ - - /* - * Only remember the recv_io if it has enough bytes, - * this gives smb_direct_negotiate_recv_work enough - * information in order to disconnect if it was not - * valid. - */ - sc->recv_io.reassembly.full_packet_received = true; - if (wc->byte_len >= sizeof(struct smbdirect_negotiate_req)) - enqueue_reassembly(sc, recv_io, 0); - else - put_recvmsg(sc, recv_io); - - /* - * Some drivers (at least mlx5_ib and irdma in roce mode) - * might post a recv completion before RDMA_CM_EVENT_ESTABLISHED, - * we need to adjust our expectation in that case. - * - * So we defer further processing of the negotiation - * to smb_direct_negotiate_recv_work(). - * - * If we are already in SMBDIRECT_SOCKET_NEGOTIATE_NEEDED - * we queue the work directly otherwise - * smb_direct_cm_handler() will do it, when - * RDMA_CM_EVENT_ESTABLISHED arrived. - */ - spin_lock_irqsave(&sc->connect.lock, flags); - if (!sc->first_error) { - INIT_WORK(&sc->connect.work, smb_direct_negotiate_recv_work); - if (sc->status == SMBDIRECT_SOCKET_NEGOTIATE_NEEDED) - queue_work(sc->workqueue, &sc->connect.work); - } - spin_unlock_irqrestore(&sc->connect.lock, flags); -} - -static void smb_direct_negotiate_recv_work(struct work_struct *work) -{ - struct smbdirect_socket *sc = - container_of(work, struct smbdirect_socket, connect.work); - const struct smbdirect_socket_parameters *sp = &sc->parameters; - struct smbdirect_recv_io *recv_io; - - if (sc->first_error) - return; - - ksmbd_debug(RDMA, "Negotiate Recv Work running\n"); - - /* - * Reset timer to the keepalive interval in - * order to trigger our next keepalive message. - */ - sc->idle.keepalive = SMBDIRECT_KEEPALIVE_NONE; - mod_delayed_work(sc->workqueue, &sc->idle.timer_work, - msecs_to_jiffies(sp->keepalive_interval_msec)); - - /* - * If smb_direct_negotiate_recv_done() detected an - * invalid request we want to disconnect. 
- */ - recv_io = get_first_reassembly(sc); - if (!recv_io) { - smb_direct_disconnect_rdma_connection(sc); - return; - } - - if (SMBDIRECT_CHECK_STATUS_WARN(sc, SMBDIRECT_SOCKET_NEGOTIATE_NEEDED)) { - smb_direct_disconnect_rdma_connection(sc); - return; - } - sc->status = SMBDIRECT_SOCKET_NEGOTIATE_RUNNING; - wake_up(&sc->status_wait); -} - -static int smb_direct_post_recv(struct smbdirect_socket *sc, - struct smbdirect_recv_io *recvmsg) -{ - struct smbdirect_socket_parameters *sp = &sc->parameters; - struct ib_recv_wr wr; - int ret; - - recvmsg->sge.addr = ib_dma_map_single(sc->ib.dev, - recvmsg->packet, - sp->max_recv_size, - DMA_FROM_DEVICE); - ret = ib_dma_mapping_error(sc->ib.dev, recvmsg->sge.addr); - if (ret) - return ret; - recvmsg->sge.length = sp->max_recv_size; - recvmsg->sge.lkey = sc->ib.pd->local_dma_lkey; - - wr.wr_cqe = &recvmsg->cqe; - wr.next = NULL; - wr.sg_list = &recvmsg->sge; - wr.num_sge = 1; - - ret = ib_post_recv(sc->ib.qp, &wr, NULL); - if (ret) { - pr_err("Can't post recv: %d\n", ret); - ib_dma_unmap_single(sc->ib.dev, - recvmsg->sge.addr, recvmsg->sge.length, - DMA_FROM_DEVICE); - recvmsg->sge.length = 0; - smb_direct_disconnect_rdma_connection(sc); - return ret; - } - return ret; -} - static int smb_direct_read(struct ksmbd_transport *t, char *buf, unsigned int size, int unused) { - struct smbdirect_recv_io *recvmsg; - struct smbdirect_data_transfer *data_transfer; - int to_copy, to_read, data_read, offset; - u32 data_length, remaining_data_length, data_offset; - int rc; struct smb_direct_transport *st = SMBD_TRANS(t); - struct smbdirect_socket *sc = &st->socket; - -again: - if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { - pr_err("disconnected\n"); - return -ENOTCONN; - } - - /* - * No need to hold the reassembly queue lock all the time as we are - * the only one reading from the front of the queue. The transport - * may add more entries to the back of the queue at the same time - */ - if (sc->recv_io.reassembly.data_length >= size) { - int queue_length; - int queue_removed = 0; - unsigned long flags; - - /* - * Need to make sure reassembly_data_length is read before - * reading reassembly_queue_length and calling - * get_first_reassembly. This call is lock free - * as we never read at the end of the queue which are being - * updated in SOFTIRQ as more data is received - */ - virt_rmb(); - queue_length = sc->recv_io.reassembly.queue_length; - data_read = 0; - to_read = size; - offset = sc->recv_io.reassembly.first_entry_offset; - while (data_read < size) { - recvmsg = get_first_reassembly(sc); - data_transfer = smbdirect_recv_io_payload(recvmsg); - data_length = le32_to_cpu(data_transfer->data_length); - remaining_data_length = - le32_to_cpu(data_transfer->remaining_data_length); - data_offset = le32_to_cpu(data_transfer->data_offset); - - /* - * The upper layer expects RFC1002 length at the - * beginning of the payload. Return it to indicate - * the total length of the packet. This minimize the - * change to upper layer packet processing logic. 
This - * will be eventually remove when an intermediate - * transport layer is added - */ - if (recvmsg->first_segment && size == 4) { - unsigned int rfc1002_len = - data_length + remaining_data_length; - *((__be32 *)buf) = cpu_to_be32(rfc1002_len); - data_read = 4; - recvmsg->first_segment = false; - ksmbd_debug(RDMA, - "returning rfc1002 length %d\n", - rfc1002_len); - goto read_rfc1002_done; - } - - to_copy = min_t(int, data_length - offset, to_read); - memcpy(buf + data_read, (char *)data_transfer + data_offset + offset, - to_copy); - - /* move on to the next buffer? */ - if (to_copy == data_length - offset) { - queue_length--; - /* - * No need to lock if we are not at the - * end of the queue - */ - if (queue_length) { - list_del(&recvmsg->list); - } else { - spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); - list_del(&recvmsg->list); - spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); - } - queue_removed++; - put_recvmsg(sc, recvmsg); - offset = 0; - } else { - offset += to_copy; - } - - to_read -= to_copy; - data_read += to_copy; - } - - spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags); - sc->recv_io.reassembly.data_length -= data_read; - sc->recv_io.reassembly.queue_length -= queue_removed; - spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags); - - sc->recv_io.reassembly.first_entry_offset = offset; - ksmbd_debug(RDMA, - "returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n", - data_read, sc->recv_io.reassembly.data_length, - sc->recv_io.reassembly.first_entry_offset); -read_rfc1002_done: - return data_read; - } - - ksmbd_debug(RDMA, "wait_event on more data\n"); - rc = wait_event_interruptible(sc->recv_io.reassembly.wait_queue, - sc->recv_io.reassembly.data_length >= size || - sc->status != SMBDIRECT_SOCKET_CONNECTED); - if (rc) - return -EINTR; - - goto again; -} - -static void smb_direct_post_recv_credits(struct work_struct *work) -{ - struct smbdirect_socket *sc = - container_of(work, struct smbdirect_socket, recv_io.posted.refill_work); - struct smbdirect_recv_io *recvmsg; - int credits = 0; + struct smbdirect_socket *sc = st->socket; + struct msghdr msg = { .msg_flags = 0, }; + struct kvec iov = { + .iov_base = buf, + .iov_len = size, + }; int ret; - if (atomic_read(&sc->recv_io.credits.count) < sc->recv_io.credits.target) { - while (true) { - recvmsg = get_free_recvmsg(sc); - if (!recvmsg) - break; + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, size); - recvmsg->first_segment = false; - - ret = smb_direct_post_recv(sc, recvmsg); - if (ret) { - pr_err("Can't post recv: %d\n", ret); - put_recvmsg(sc, recvmsg); - break; - } - credits++; - - atomic_inc(&sc->recv_io.posted.count); - } - } - - atomic_add(credits, &sc->recv_io.credits.available); - - /* - * If the last send credit is waiting for credits - * it can grant we need to wake it up - */ - if (credits && - atomic_read(&sc->send_io.bcredits.count) == 0 && - atomic_read(&sc->send_io.credits.count) == 0) - wake_up(&sc->send_io.credits.wait_queue); - - if (credits) - queue_work(sc->workqueue, &sc->idle.immediate_work); -} - -static void send_done(struct ib_cq *cq, struct ib_wc *wc) -{ - struct smbdirect_send_io *sendmsg, *sibling, *next; - struct smbdirect_socket *sc; - int lcredits = 0; - - sendmsg = container_of(wc->wr_cqe, struct smbdirect_send_io, cqe); - sc = sendmsg->socket; - - ksmbd_debug(RDMA, "Send completed. 
status='%s (%d)', opcode=%d\n", - ib_wc_status_msg(wc->status), wc->status, - wc->opcode); - - if (unlikely(!(sendmsg->wr.send_flags & IB_SEND_SIGNALED))) { - /* - * This happens when smbdirect_send_io is a sibling - * before the final message, it is signaled on - * error anyway, so we need to skip - * smbdirect_connection_free_send_io here, - * otherwise is will destroy the memory - * of the siblings too, which will cause - * use after free problems for the others - * triggered from ib_drain_qp(). - */ - if (wc->status != IB_WC_SUCCESS) - goto skip_free; - - /* - * This should not happen! - * But we better just close the - * connection... - */ - pr_err("unexpected send completion wc->status=%s (%d) wc->opcode=%d\n", - ib_wc_status_msg(wc->status), wc->status, wc->opcode); - smb_direct_disconnect_rdma_connection(sc); - return; - } - - /* - * Free possible siblings and then the main send_io - */ - list_for_each_entry_safe(sibling, next, &sendmsg->sibling_list, sibling_list) { - list_del_init(&sibling->sibling_list); - smb_direct_free_sendmsg(sc, sibling); - lcredits += 1; - } - /* Note this frees wc->wr_cqe, but not wc */ - smb_direct_free_sendmsg(sc, sendmsg); - lcredits += 1; - - if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) { -skip_free: - pr_err("Send error. status='%s (%d)', opcode=%d\n", - ib_wc_status_msg(wc->status), wc->status, - wc->opcode); - smb_direct_disconnect_rdma_connection(sc); - return; - } - - atomic_add(lcredits, &sc->send_io.lcredits.count); - wake_up(&sc->send_io.lcredits.wait_queue); - - if (atomic_dec_and_test(&sc->send_io.pending.count)) - wake_up(&sc->send_io.pending.zero_wait_queue); -} - -static int manage_credits_prior_sending(struct smbdirect_socket *sc) -{ - int missing; - int available; - int new_credits; - - if (atomic_read(&sc->recv_io.credits.count) >= sc->recv_io.credits.target) - return 0; - - missing = (int)sc->recv_io.credits.target - atomic_read(&sc->recv_io.credits.count); - available = atomic_xchg(&sc->recv_io.credits.available, 0); - new_credits = (u16)min3(U16_MAX, missing, available); - if (new_credits <= 0) { - /* - * If credits are available, but not granted - * we need to re-add them again. - */ - if (available) - atomic_add(available, &sc->recv_io.credits.available); - return 0; - } - - if (new_credits < available) { - /* - * Readd the remaining available again. 
- */ - available -= new_credits; - atomic_add(available, &sc->recv_io.credits.available); - } - - /* - * Remember we granted the credits - */ - atomic_add(new_credits, &sc->recv_io.credits.count); - return new_credits; -} - -static int manage_keep_alive_before_sending(struct smbdirect_socket *sc) -{ - struct smbdirect_socket_parameters *sp = &sc->parameters; - - if (sc->idle.keepalive == SMBDIRECT_KEEPALIVE_PENDING) { - sc->idle.keepalive = SMBDIRECT_KEEPALIVE_SENT; - /* - * Now use the keepalive timeout (instead of keepalive interval) - * in order to wait for a response - */ - mod_delayed_work(sc->workqueue, &sc->idle.timer_work, - msecs_to_jiffies(sp->keepalive_timeout_msec)); - return 1; - } - return 0; -} - -static int smb_direct_post_send(struct smbdirect_socket *sc, - struct ib_send_wr *wr) -{ - int ret; - - atomic_inc(&sc->send_io.pending.count); - ret = ib_post_send(sc->ib.qp, wr, NULL); - if (ret) { - pr_err("failed to post send: %d\n", ret); - smb_direct_disconnect_rdma_connection(sc); - } - return ret; -} - -static void smb_direct_send_ctx_init(struct smbdirect_send_batch *send_ctx, - bool need_invalidate_rkey, - unsigned int remote_key) -{ - INIT_LIST_HEAD(&send_ctx->msg_list); - send_ctx->wr_cnt = 0; - send_ctx->need_invalidate_rkey = need_invalidate_rkey; - send_ctx->remote_key = remote_key; - send_ctx->credit = 0; -} - -static int smb_direct_flush_send_list(struct smbdirect_socket *sc, - struct smbdirect_send_batch *send_ctx, - bool is_last) -{ - struct smbdirect_send_io *first, *last; - int ret = 0; - - if (list_empty(&send_ctx->msg_list)) - goto release_credit; - - first = list_first_entry(&send_ctx->msg_list, - struct smbdirect_send_io, - sibling_list); - last = list_last_entry(&send_ctx->msg_list, - struct smbdirect_send_io, - sibling_list); - - if (send_ctx->need_invalidate_rkey) { - first->wr.opcode = IB_WR_SEND_WITH_INV; - first->wr.ex.invalidate_rkey = send_ctx->remote_key; - send_ctx->need_invalidate_rkey = false; - send_ctx->remote_key = 0; - } - - last->wr.send_flags = IB_SEND_SIGNALED; - last->wr.wr_cqe = &last->cqe; - - /* - * Remove last from send_ctx->msg_list - * and splice the rest of send_ctx->msg_list - * to last->sibling_list. - * - * send_ctx->msg_list is a valid empty list - * at the end. 
- */ - list_del_init(&last->sibling_list); - list_splice_tail_init(&send_ctx->msg_list, &last->sibling_list); - send_ctx->wr_cnt = 0; - - ret = smb_direct_post_send(sc, &first->wr); - if (ret) { - struct smbdirect_send_io *sibling, *next; - - list_for_each_entry_safe(sibling, next, &last->sibling_list, sibling_list) { - list_del_init(&sibling->sibling_list); - smb_direct_free_sendmsg(sc, sibling); - } - smb_direct_free_sendmsg(sc, last); - } - -release_credit: - if (is_last && !ret && send_ctx->credit) { - atomic_add(send_ctx->credit, &sc->send_io.bcredits.count); - send_ctx->credit = 0; - wake_up(&sc->send_io.bcredits.wait_queue); - } - - return ret; -} - -static int wait_for_credits(struct smbdirect_socket *sc, - wait_queue_head_t *waitq, atomic_t *total_credits, - int needed) -{ - int ret; - - do { - if (atomic_sub_return(needed, total_credits) >= 0) - return 0; - - atomic_add(needed, total_credits); - ret = wait_event_interruptible(*waitq, - atomic_read(total_credits) >= needed || - sc->status != SMBDIRECT_SOCKET_CONNECTED); - - if (sc->status != SMBDIRECT_SOCKET_CONNECTED) - return -ENOTCONN; - else if (ret < 0) - return ret; - } while (true); -} - -static int wait_for_send_bcredit(struct smbdirect_socket *sc, - struct smbdirect_send_batch *send_ctx) -{ - int ret; - - if (send_ctx->credit) - return 0; - - ret = wait_for_credits(sc, - &sc->send_io.bcredits.wait_queue, - &sc->send_io.bcredits.count, - 1); - if (ret) - return ret; - - send_ctx->credit = 1; - return 0; -} - -static int wait_for_send_lcredit(struct smbdirect_socket *sc, - struct smbdirect_send_batch *send_ctx) -{ - if (send_ctx && (atomic_read(&sc->send_io.lcredits.count) <= 1)) { - int ret; - - ret = smb_direct_flush_send_list(sc, send_ctx, false); - if (ret) - return ret; - } - - return wait_for_credits(sc, - &sc->send_io.lcredits.wait_queue, - &sc->send_io.lcredits.count, - 1); -} - -static int wait_for_send_credits(struct smbdirect_socket *sc, - struct smbdirect_send_batch *send_ctx) -{ - int ret; - - if (send_ctx && - (send_ctx->wr_cnt >= 16 || atomic_read(&sc->send_io.credits.count) <= 1)) { - ret = smb_direct_flush_send_list(sc, send_ctx, false); - if (ret) - return ret; - } - - return wait_for_credits(sc, &sc->send_io.credits.wait_queue, &sc->send_io.credits.count, 1); -} - -static int wait_for_rw_credits(struct smbdirect_socket *sc, int credits) -{ - return wait_for_credits(sc, - &sc->rw_io.credits.wait_queue, - &sc->rw_io.credits.count, - credits); -} - -static int calc_rw_credits(struct smbdirect_socket *sc, - char *buf, unsigned int len) -{ - return DIV_ROUND_UP(get_buf_page_count(buf, len), - sc->rw_io.credits.num_pages); -} - -static int smb_direct_create_header(struct smbdirect_socket *sc, - int size, int remaining_data_length, - int new_credits, - struct smbdirect_send_io **sendmsg_out) -{ - struct smbdirect_socket_parameters *sp = &sc->parameters; - struct smbdirect_send_io *sendmsg; - struct smbdirect_data_transfer *packet; - int header_length; - int ret; - - sendmsg = smb_direct_alloc_sendmsg(sc); - if (IS_ERR(sendmsg)) - return PTR_ERR(sendmsg); - - /* Fill in the packet header */ - packet = (struct smbdirect_data_transfer *)sendmsg->packet; - packet->credits_requested = cpu_to_le16(sp->send_credit_target); - packet->credits_granted = cpu_to_le16(new_credits); - - packet->flags = 0; - if (manage_keep_alive_before_sending(sc)) - packet->flags |= cpu_to_le16(SMBDIRECT_FLAG_RESPONSE_REQUESTED); - - packet->reserved = 0; - if (!size) - packet->data_offset = 0; - else - packet->data_offset = cpu_to_le32(24); 
-	packet->data_length = cpu_to_le32(size);
-	packet->remaining_data_length = cpu_to_le32(remaining_data_length);
-	packet->padding = 0;
-
-	ksmbd_debug(RDMA,
-		    "credits_requested=%d credits_granted=%d data_offset=%d data_length=%d remaining_data_length=%d\n",
-		    le16_to_cpu(packet->credits_requested),
-		    le16_to_cpu(packet->credits_granted),
-		    le32_to_cpu(packet->data_offset),
-		    le32_to_cpu(packet->data_length),
-		    le32_to_cpu(packet->remaining_data_length));
-
-	/* Map the packet to DMA */
-	header_length = sizeof(struct smbdirect_data_transfer);
-	/* If this is a packet without payload, don't send padding */
-	if (!size)
-		header_length =
-			offsetof(struct smbdirect_data_transfer, padding);
-
-	sendmsg->sge[0].addr = ib_dma_map_single(sc->ib.dev,
-						 (void *)packet,
-						 header_length,
-						 DMA_TO_DEVICE);
-	ret = ib_dma_mapping_error(sc->ib.dev, sendmsg->sge[0].addr);
-	if (ret) {
-		smb_direct_free_sendmsg(sc, sendmsg);
-		return ret;
-	}
-
-	sendmsg->num_sge = 1;
-	sendmsg->sge[0].length = header_length;
-	sendmsg->sge[0].lkey = sc->ib.pd->local_dma_lkey;
-
-	*sendmsg_out = sendmsg;
-	return 0;
-}
-
-static int get_sg_list(void *buf, int size, struct scatterlist *sg_list, int nentries)
-{
-	bool high = is_vmalloc_addr(buf);
-	struct page *page;
-	int offset, len;
-	int i = 0;
-
-	if (size <= 0 || nentries < get_buf_page_count(buf, size))
-		return -EINVAL;
-
-	offset = offset_in_page(buf);
-	buf -= offset;
-	while (size > 0) {
-		len = min_t(int, PAGE_SIZE - offset, size);
-		if (high)
-			page = vmalloc_to_page(buf);
-		else
-			page = kmap_to_page(buf);
-
-		if (!sg_list)
-			return -EINVAL;
-		sg_set_page(sg_list, page, len, offset);
-		sg_list = sg_next(sg_list);
-
-		buf += PAGE_SIZE;
-		size -= len;
-		offset = 0;
-		i++;
-	}
-	return i;
-}
-
-static int get_mapped_sg_list(struct ib_device *device, void *buf, int size,
-			      struct scatterlist *sg_list, int nentries,
-			      enum dma_data_direction dir, int *npages)
-{
-	*npages = get_sg_list(buf, size, sg_list, nentries);
-	if (*npages < 0)
-		return -EINVAL;
-	return ib_dma_map_sg(device, sg_list, *npages, dir);
-}
-
-static int post_sendmsg(struct smbdirect_socket *sc,
-			struct smbdirect_send_batch *send_ctx,
-			struct smbdirect_send_io *msg)
-{
-	int i;
-
-	for (i = 0; i < msg->num_sge; i++)
-		ib_dma_sync_single_for_device(sc->ib.dev,
-					      msg->sge[i].addr, msg->sge[i].length,
-					      DMA_TO_DEVICE);
-
-	msg->cqe.done = send_done;
-	msg->wr.opcode = IB_WR_SEND;
-	msg->wr.sg_list = &msg->sge[0];
-	msg->wr.num_sge = msg->num_sge;
-	msg->wr.next = NULL;
-
-	if (send_ctx) {
-		msg->wr.wr_cqe = NULL;
-		msg->wr.send_flags = 0;
-		if (!list_empty(&send_ctx->msg_list)) {
-			struct smbdirect_send_io *last;
-
-			last = list_last_entry(&send_ctx->msg_list,
-					       struct smbdirect_send_io,
-					       sibling_list);
-			last->wr.next = &msg->wr;
-		}
-		list_add_tail(&msg->sibling_list, &send_ctx->msg_list);
-		send_ctx->wr_cnt++;
-		return 0;
-	}
-
-	msg->wr.wr_cqe = &msg->cqe;
-	msg->wr.send_flags = IB_SEND_SIGNALED;
-	return smb_direct_post_send(sc, &msg->wr);
-}
-
-static int smb_direct_post_send_data(struct smbdirect_socket *sc,
-				     struct smbdirect_send_batch *send_ctx,
-				     struct kvec *iov, int niov,
-				     int remaining_data_length)
-{
-	int i, j, ret;
-	struct smbdirect_send_io *msg;
-	int data_length;
-	struct scatterlist sg[SMBDIRECT_SEND_IO_MAX_SGE - 1];
-	struct smbdirect_send_batch _send_ctx;
-	int new_credits;
-
-	if (!send_ctx) {
-		smb_direct_send_ctx_init(&_send_ctx, false, 0);
-		send_ctx = &_send_ctx;
-	}
-
-	ret = wait_for_send_bcredit(sc, send_ctx);
-	if (ret)
-		goto bcredit_failed;
-
-	ret = wait_for_send_lcredit(sc, send_ctx);
-	if (ret)
-		goto lcredit_failed;
-
-	ret = wait_for_send_credits(sc, send_ctx);
-	if (ret)
-		goto credit_failed;
-
-	new_credits = manage_credits_prior_sending(sc);
-	if (new_credits == 0 &&
-	    atomic_read(&sc->send_io.credits.count) == 0 &&
-	    atomic_read(&sc->recv_io.credits.count) == 0) {
-		queue_work(sc->workqueue, &sc->recv_io.posted.refill_work);
-		ret = wait_event_interruptible(sc->send_io.credits.wait_queue,
-			atomic_read(&sc->send_io.credits.count) >= 1 ||
-			atomic_read(&sc->recv_io.credits.available) >= 1 ||
-			sc->status != SMBDIRECT_SOCKET_CONNECTED);
-		if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
-			ret = -ENOTCONN;
-		if (ret < 0)
-			goto credit_failed;
-
-		new_credits = manage_credits_prior_sending(sc);
-	}
-
-	data_length = 0;
-	for (i = 0; i < niov; i++)
-		data_length += iov[i].iov_len;
-
-	ret = smb_direct_create_header(sc, data_length, remaining_data_length,
-				       new_credits, &msg);
-	if (ret)
-		goto header_failed;
-
-	for (i = 0; i < niov; i++) {
-		struct ib_sge *sge;
-		int sg_cnt;
-		int npages;
-
-		sg_init_table(sg, SMBDIRECT_SEND_IO_MAX_SGE - 1);
-		sg_cnt = get_mapped_sg_list(sc->ib.dev,
-					    iov[i].iov_base, iov[i].iov_len,
-					    sg, SMBDIRECT_SEND_IO_MAX_SGE - 1,
-					    DMA_TO_DEVICE, &npages);
-		if (sg_cnt <= 0) {
-			pr_err("failed to map buffer\n");
-			ret = -ENOMEM;
-			goto err;
-		} else if (sg_cnt + msg->num_sge > SMBDIRECT_SEND_IO_MAX_SGE) {
-			pr_err("buffer not fitted into sges\n");
-			ret = -E2BIG;
-			ib_dma_unmap_sg(sc->ib.dev, sg, npages,
-					DMA_TO_DEVICE);
-			goto err;
-		}
-
-		for (j = 0; j < sg_cnt; j++) {
-			sge = &msg->sge[msg->num_sge];
-			sge->addr = sg_dma_address(&sg[j]);
-			sge->length = sg_dma_len(&sg[j]);
-			sge->lkey = sc->ib.pd->local_dma_lkey;
-			msg->num_sge++;
-		}
-	}
-
-	ret = post_sendmsg(sc, send_ctx, msg);
-	if (ret)
-		goto err;
-
-	/*
-	 * From here msg is moved to send_ctx
-	 * and we should not free it explicitly.
-	 */
-
-	if (send_ctx == &_send_ctx) {
-		ret = smb_direct_flush_send_list(sc, send_ctx, true);
-		if (ret)
-			goto flush_failed;
-	}
-
-	return 0;
-err:
-	smb_direct_free_sendmsg(sc, msg);
-flush_failed:
-header_failed:
-	atomic_inc(&sc->send_io.credits.count);
-credit_failed:
-	atomic_inc(&sc->send_io.lcredits.count);
-lcredit_failed:
-	atomic_add(send_ctx->credit, &sc->send_io.bcredits.count);
-	send_ctx->credit = 0;
-bcredit_failed:
+	ret = smbdirect_connection_recvmsg(sc, &msg, 0);
+	if (ret == -ERESTARTSYS)
+		ret = -EINTR;
 	return ret;
 }
 
@@ -1619,319 +244,13 @@ static int smb_direct_writev(struct ksmbd_transport *t,
 			     bool need_invalidate, unsigned int remote_key)
 {
 	struct smb_direct_transport *st = SMBD_TRANS(t);
-	struct smbdirect_socket *sc = &st->socket;
-	struct smbdirect_socket_parameters *sp = &sc->parameters;
-	size_t remaining_data_length;
-	size_t iov_idx;
-	size_t iov_ofs;
-	size_t max_iov_size = sp->max_send_size -
-			sizeof(struct smbdirect_data_transfer);
-	int ret;
-	struct smbdirect_send_batch send_ctx;
-	int error = 0;
+	struct smbdirect_socket *sc = st->socket;
+	struct iov_iter iter;
 
-	if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
-		return -ENOTCONN;
+	iov_iter_kvec(&iter, ITER_SOURCE, iov, niovs, buflen);
 
-	//FIXME: skip RFC1002 header..
-	if (WARN_ON_ONCE(niovs <= 1 || iov[0].iov_len != 4))
-		return -EINVAL;
-	buflen -= 4;
-	iov_idx = 1;
-	iov_ofs = 0;
-
-	remaining_data_length = buflen;
-	ksmbd_debug(RDMA, "Sending smb (RDMA): smb_len=%u\n", buflen);
-
-	smb_direct_send_ctx_init(&send_ctx, need_invalidate, remote_key);
-	while (remaining_data_length) {
-		struct kvec vecs[SMBDIRECT_SEND_IO_MAX_SGE - 1]; /* minus smbdirect hdr */
-		size_t possible_bytes = max_iov_size;
-		size_t possible_vecs;
-		size_t bytes = 0;
-		size_t nvecs = 0;
-
-		/*
-		 * For the last message remaining_data_length should be
-		 * have been 0 already!
-		 */
-		if (WARN_ON_ONCE(iov_idx >= niovs)) {
-			error = -EINVAL;
-			goto done;
-		}
-
-		/*
-		 * We have 2 factors which limit the arguments we pass
-		 * to smb_direct_post_send_data():
-		 *
-		 * 1. The number of supported sges for the send,
-		 *    while one is reserved for the smbdirect header.
-		 *    And we currently need one SGE per page.
-		 * 2. The number of negotiated payload bytes per send.
-		 */
-		possible_vecs = min_t(size_t, ARRAY_SIZE(vecs), niovs - iov_idx);
-
-		while (iov_idx < niovs && possible_vecs && possible_bytes) {
-			struct kvec *v = &vecs[nvecs];
-			int page_count;
-
-			v->iov_base = ((u8 *)iov[iov_idx].iov_base) + iov_ofs;
-			v->iov_len = min_t(size_t,
-					   iov[iov_idx].iov_len - iov_ofs,
-					   possible_bytes);
-			page_count = get_buf_page_count(v->iov_base, v->iov_len);
-			if (page_count > possible_vecs) {
-				/*
-				 * If the number of pages in the buffer
-				 * is to much (because we currently require
-				 * one SGE per page), we need to limit the
-				 * length.
-				 *
-				 * We know possible_vecs is at least 1,
-				 * so we always keep the first page.
-				 *
-				 * We need to calculate the number extra
-				 * pages (epages) we can also keep.
-				 *
-				 * We calculate the number of bytes in the
-				 * first page (fplen), this should never be
-				 * larger than v->iov_len because page_count is
-				 * at least 2, but adding a limitation feels
-				 * better.
-				 *
-				 * Then we calculate the number of bytes (elen)
-				 * we can keep for the extra pages.
-				 */
-				size_t epages = possible_vecs - 1;
-				size_t fpofs = offset_in_page(v->iov_base);
-				size_t fplen = min_t(size_t, PAGE_SIZE - fpofs, v->iov_len);
-				size_t elen = min_t(size_t, v->iov_len - fplen, epages*PAGE_SIZE);
-
-				v->iov_len = fplen + elen;
-				page_count = get_buf_page_count(v->iov_base, v->iov_len);
-				if (WARN_ON_ONCE(page_count > possible_vecs)) {
-					/*
-					 * Something went wrong in the above
-					 * logic...
-					 */
-					error = -EINVAL;
-					goto done;
-				}
-			}
-			possible_vecs -= page_count;
-			nvecs += 1;
-			possible_bytes -= v->iov_len;
-			bytes += v->iov_len;
-
-			iov_ofs += v->iov_len;
-			if (iov_ofs >= iov[iov_idx].iov_len) {
-				iov_idx += 1;
-				iov_ofs = 0;
-			}
-		}
-
-		remaining_data_length -= bytes;
-
-		ret = smb_direct_post_send_data(sc, &send_ctx,
-						vecs, nvecs,
-						remaining_data_length);
-		if (unlikely(ret)) {
-			error = ret;
-			goto done;
-		}
-	}
-
-done:
-	ret = smb_direct_flush_send_list(sc, &send_ctx, true);
-	if (unlikely(!ret && error))
-		ret = error;
-
-	/*
-	 * As an optimization, we don't wait for individual I/O to finish
-	 * before sending the next one.
-	 * Send them all and wait for pending send count to get to 0
-	 * that means all the I/Os have been out and we are good to return
-	 */
-
-	wait_event(sc->send_io.pending.zero_wait_queue,
-		   atomic_read(&sc->send_io.pending.count) == 0 ||
-		   sc->status != SMBDIRECT_SOCKET_CONNECTED);
-	if (sc->status != SMBDIRECT_SOCKET_CONNECTED && ret == 0)
-		ret = -ENOTCONN;
-
-	return ret;
-}
-
-static void smb_direct_free_rdma_rw_msg(struct smb_direct_transport *t,
-					struct smbdirect_rw_io *msg,
-					enum dma_data_direction dir)
-{
-	struct smbdirect_socket *sc = &t->socket;
-
-	rdma_rw_ctx_destroy(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port,
-			    msg->sgt.sgl, msg->sgt.nents, dir);
-	sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
-	kfree(msg);
-}
-
-static void read_write_done(struct ib_cq *cq, struct ib_wc *wc,
-			    enum dma_data_direction dir)
-{
-	struct smbdirect_rw_io *msg =
-		container_of(wc->wr_cqe, struct smbdirect_rw_io, cqe);
-	struct smbdirect_socket *sc = msg->socket;
-
-	if (wc->status != IB_WC_SUCCESS) {
-		msg->error = -EIO;
-		pr_err("read/write error. opcode = %d, status = %s(%d)\n",
-		       wc->opcode, ib_wc_status_msg(wc->status), wc->status);
-		if (wc->status != IB_WC_WR_FLUSH_ERR)
-			smb_direct_disconnect_rdma_connection(sc);
-	}
-
-	complete(msg->completion);
-}
-
-static void read_done(struct ib_cq *cq, struct ib_wc *wc)
-{
-	read_write_done(cq, wc, DMA_FROM_DEVICE);
-}
-
-static void write_done(struct ib_cq *cq, struct ib_wc *wc)
-{
-	read_write_done(cq, wc, DMA_TO_DEVICE);
-}
-
-static int smb_direct_rdma_xmit(struct smb_direct_transport *t,
-				void *buf, int buf_len,
-				struct smbdirect_buffer_descriptor_v1 *desc,
-				unsigned int desc_len,
-				bool is_read)
-{
-	struct smbdirect_socket *sc = &t->socket;
-	struct smbdirect_socket_parameters *sp = &sc->parameters;
-	struct smbdirect_rw_io *msg, *next_msg;
-	int i, ret;
-	DECLARE_COMPLETION_ONSTACK(completion);
-	struct ib_send_wr *first_wr;
-	LIST_HEAD(msg_list);
-	char *desc_buf;
-	int credits_needed;
-	unsigned int desc_buf_len, desc_num = 0;
-
-	if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
-		return -ENOTCONN;
-
-	if (buf_len > sp->max_read_write_size)
-		return -EINVAL;
-
-	/* calculate needed credits */
-	credits_needed = 0;
-	desc_buf = buf;
-	for (i = 0; i < desc_len / sizeof(*desc); i++) {
-		if (!buf_len)
-			break;
-
-		desc_buf_len = le32_to_cpu(desc[i].length);
-		if (!desc_buf_len)
-			return -EINVAL;
-
-		if (desc_buf_len > buf_len) {
-			desc_buf_len = buf_len;
-			desc[i].length = cpu_to_le32(desc_buf_len);
-			buf_len = 0;
-		}
-
-		credits_needed += calc_rw_credits(sc, desc_buf, desc_buf_len);
-		desc_buf += desc_buf_len;
-		buf_len -= desc_buf_len;
-		desc_num++;
-	}
-
-	ksmbd_debug(RDMA, "RDMA %s, len %#x, needed credits %#x\n",
-		    str_read_write(is_read), buf_len, credits_needed);
-
-	ret = wait_for_rw_credits(sc, credits_needed);
-	if (ret < 0)
-		return ret;
-
-	/* build rdma_rw_ctx for each descriptor */
-	desc_buf = buf;
-	for (i = 0; i < desc_num; i++) {
-		msg = kzalloc_flex(*msg, sg_list, SG_CHUNK_SIZE,
-				   KSMBD_DEFAULT_GFP);
-		if (!msg) {
-			ret = -ENOMEM;
-			goto out;
-		}
-
-		desc_buf_len = le32_to_cpu(desc[i].length);
-
-		msg->socket = sc;
-		msg->cqe.done = is_read ? read_done : write_done;
-		msg->completion = &completion;
-
-		msg->sgt.sgl = &msg->sg_list[0];
-		ret = sg_alloc_table_chained(&msg->sgt,
-					     get_buf_page_count(desc_buf, desc_buf_len),
-					     msg->sg_list, SG_CHUNK_SIZE);
-		if (ret) {
-			ret = -ENOMEM;
-			goto free_msg;
-		}
-
-		ret = get_sg_list(desc_buf, desc_buf_len,
-				  msg->sgt.sgl, msg->sgt.orig_nents);
-		if (ret < 0)
-			goto free_table;
-
-		ret = rdma_rw_ctx_init(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port,
-				       msg->sgt.sgl,
-				       get_buf_page_count(desc_buf, desc_buf_len),
-				       0,
-				       le64_to_cpu(desc[i].offset),
-				       le32_to_cpu(desc[i].token),
-				       is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
-		if (ret < 0) {
-			pr_err("failed to init rdma_rw_ctx: %d\n", ret);
-			goto free_table;
-		}
-
-		list_add_tail(&msg->list, &msg_list);
-		desc_buf += desc_buf_len;
-	}
-
-	/* concatenate work requests of rdma_rw_ctxs */
-	first_wr = NULL;
-	list_for_each_entry_reverse(msg, &msg_list, list) {
-		first_wr = rdma_rw_ctx_wrs(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port,
-					   &msg->cqe, first_wr);
-	}
-
-	ret = ib_post_send(sc->ib.qp, first_wr, NULL);
-	if (ret) {
-		pr_err("failed to post send wr for RDMA R/W: %d\n", ret);
-		goto out;
-	}
-
-	msg = list_last_entry(&msg_list, struct smbdirect_rw_io, list);
-	wait_for_completion(&completion);
-	ret = msg->error;
-out:
-	list_for_each_entry_safe(msg, next_msg, &msg_list, list) {
-		list_del(&msg->list);
-		smb_direct_free_rdma_rw_msg(t, msg,
-					    is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
-	}
-	atomic_add(credits_needed, &sc->rw_io.credits.count);
-	wake_up(&sc->rw_io.credits.wait_queue);
-	return ret;
-
-free_table:
-	sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE);
-free_msg:
-	kfree(msg);
-	goto out;
+	return smbdirect_connection_send_iter(sc, &iter, 0,
+					      need_invalidate, remote_key);
 }
 
 static int smb_direct_rdma_write(struct ksmbd_transport *t,
@@ -1939,8 +258,11 @@ static int smb_direct_rdma_write(struct ksmbd_transport *t,
 				 struct smbdirect_buffer_descriptor_v1 *desc,
 				 unsigned int desc_len)
 {
-	return smb_direct_rdma_xmit(SMBD_TRANS(t), buf, buflen,
-				    desc, desc_len, false);
+	struct smb_direct_transport *st = SMBD_TRANS(t);
+	struct smbdirect_socket *sc = st->socket;
+
+	return smbdirect_connection_rdma_xmit(sc, buf, buflen,
+					      desc, desc_len, false);
 }
 
 static int smb_direct_rdma_read(struct ksmbd_transport *t,
@@ -1948,16 +270,19 @@ static int smb_direct_rdma_read(struct ksmbd_transport *t,
 				struct smbdirect_buffer_descriptor_v1 *desc,
 				unsigned int desc_len)
 {
-	return smb_direct_rdma_xmit(SMBD_TRANS(t), buf, buflen,
-				    desc, desc_len, true);
+	struct smb_direct_transport *st = SMBD_TRANS(t);
+	struct smbdirect_socket *sc = st->socket;
+
+	return smbdirect_connection_rdma_xmit(sc, buf, buflen,
+					      desc, desc_len, true);
 }
 
 static void smb_direct_disconnect(struct ksmbd_transport *t)
 {
 	struct smb_direct_transport *st = SMBD_TRANS(t);
-	struct smbdirect_socket *sc = &st->socket;
+	struct smbdirect_socket *sc = st->socket;
 
-	ksmbd_debug(RDMA, "Disconnecting cm_id=%p\n", sc->rdma.cm_id);
+	ksmbd_debug(RDMA, "Disconnecting sc=%p\n", sc);
 
 	free_transport(st);
 }
@@ -1965,790 +290,26 @@ static void smb_direct_shutdown(struct ksmbd_transport *t)
 {
 	struct smb_direct_transport *st = SMBD_TRANS(t);
-	struct smbdirect_socket *sc = &st->socket;
+	struct smbdirect_socket *sc = st->socket;
 
-	ksmbd_debug(RDMA, "smb-direct shutdown cm_id=%p\n", sc->rdma.cm_id);
+	ksmbd_debug(RDMA, "smb-direct shutdown sc=%p\n", sc);
 
-	smb_direct_disconnect_rdma_work(&sc->disconnect_work);
+	smbdirect_socket_shutdown(sc);
 }
 
-static int smb_direct_cm_handler(struct rdma_cm_id *cm_id,
-				 struct rdma_cm_event *event)
+static int smb_direct_new_connection(struct smb_direct_listener *listener,
+				     struct smbdirect_socket *client_sc)
 {
-	struct smbdirect_socket *sc = cm_id->context;
-	unsigned long flags;
-
-	ksmbd_debug(RDMA, "RDMA CM event. cm_id=%p event=%s (%d)\n",
-		    cm_id, rdma_event_msg(event->event), event->event);
-
-	switch (event->event) {
-	case RDMA_CM_EVENT_ESTABLISHED: {
-		/*
-		 * Some drivers (at least mlx5_ib and irdma in roce mode)
-		 * might post a recv completion before RDMA_CM_EVENT_ESTABLISHED,
-		 * we need to adjust our expectation in that case.
-		 *
-		 * If smb_direct_negotiate_recv_done was called first
-		 * it initialized sc->connect.work only for us to
-		 * start, so that we turned into
-		 * SMBDIRECT_SOCKET_NEGOTIATE_NEEDED, before
-		 * smb_direct_negotiate_recv_work() runs.
-		 *
-		 * If smb_direct_negotiate_recv_done didn't happen
-		 * yet. sc->connect.work is still be disabled and
-		 * queue_work() is a no-op.
-		 */
-		if (SMBDIRECT_CHECK_STATUS_DISCONNECT(sc, SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING))
-			break;
-		sc->status = SMBDIRECT_SOCKET_NEGOTIATE_NEEDED;
-		spin_lock_irqsave(&sc->connect.lock, flags);
-		if (!sc->first_error)
-			queue_work(sc->workqueue, &sc->connect.work);
-		spin_unlock_irqrestore(&sc->connect.lock, flags);
-		wake_up(&sc->status_wait);
-		break;
-	}
-	case RDMA_CM_EVENT_DEVICE_REMOVAL:
-	case RDMA_CM_EVENT_DISCONNECTED: {
-		sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
-		smb_direct_disconnect_rdma_work(&sc->disconnect_work);
-		if (sc->ib.qp)
-			ib_drain_qp(sc->ib.qp);
-		break;
-	}
-	case RDMA_CM_EVENT_CONNECT_ERROR: {
-		sc->status = SMBDIRECT_SOCKET_DISCONNECTED;
-		smb_direct_disconnect_rdma_work(&sc->disconnect_work);
-		break;
-	}
-	default:
-		pr_err("Unexpected RDMA CM event. cm_id=%p, event=%s (%d)\n",
-		       cm_id, rdma_event_msg(event->event),
-		       event->event);
-		break;
-	}
-	return 0;
-}
-
-static void smb_direct_qpair_handler(struct ib_event *event, void *context)
-{
-	struct smbdirect_socket *sc = context;
-
-	ksmbd_debug(RDMA, "Received QP event. cm_id=%p, event=%s (%d)\n",
-		    sc->rdma.cm_id, ib_event_msg(event->event), event->event);
-
-	switch (event->event) {
-	case IB_EVENT_CQ_ERR:
-	case IB_EVENT_QP_FATAL:
-		smb_direct_disconnect_rdma_connection(sc);
-		break;
-	default:
-		break;
-	}
-}
-
-static int smb_direct_send_negotiate_response(struct smbdirect_socket *sc,
-					      int failed)
-{
-	struct smbdirect_socket_parameters *sp = &sc->parameters;
-	struct smbdirect_send_io *sendmsg;
-	struct smbdirect_negotiate_resp *resp;
-	int ret;
-
-	sendmsg = smb_direct_alloc_sendmsg(sc);
-	if (IS_ERR(sendmsg))
-		return -ENOMEM;
-
-	resp = (struct smbdirect_negotiate_resp *)sendmsg->packet;
-	if (failed) {
-		memset(resp, 0, sizeof(*resp));
-		resp->min_version = SMB_DIRECT_VERSION_LE;
-		resp->max_version = SMB_DIRECT_VERSION_LE;
-		resp->status = STATUS_NOT_SUPPORTED;
-
-		sc->status = SMBDIRECT_SOCKET_NEGOTIATE_FAILED;
-	} else {
-		resp->status = STATUS_SUCCESS;
-		resp->min_version = SMB_DIRECT_VERSION_LE;
-		resp->max_version = SMB_DIRECT_VERSION_LE;
-		resp->negotiated_version = SMB_DIRECT_VERSION_LE;
-		resp->reserved = 0;
-		resp->credits_requested =
-				cpu_to_le16(sp->send_credit_target);
-		resp->credits_granted = cpu_to_le16(manage_credits_prior_sending(sc));
-		resp->max_readwrite_size = cpu_to_le32(sp->max_read_write_size);
-		resp->preferred_send_size = cpu_to_le32(sp->max_send_size);
-		resp->max_receive_size = cpu_to_le32(sp->max_recv_size);
-		resp->max_fragmented_size =
-				cpu_to_le32(sp->max_fragmented_recv_size);
-
-		atomic_set(&sc->send_io.bcredits.count, 1);
-		sc->recv_io.expected = SMBDIRECT_EXPECT_DATA_TRANSFER;
-		sc->status = SMBDIRECT_SOCKET_CONNECTED;
-	}
-
-	sendmsg->sge[0].addr = ib_dma_map_single(sc->ib.dev,
-						 (void *)resp, sizeof(*resp),
-						 DMA_TO_DEVICE);
-	ret = ib_dma_mapping_error(sc->ib.dev, sendmsg->sge[0].addr);
-	if (ret) {
-		smb_direct_free_sendmsg(sc, sendmsg);
-		return ret;
-	}
-
-	sendmsg->num_sge = 1;
-	sendmsg->sge[0].length = sizeof(*resp);
-	sendmsg->sge[0].lkey = sc->ib.pd->local_dma_lkey;
-
-	ret = post_sendmsg(sc, NULL, sendmsg);
-	if (ret) {
-		smb_direct_free_sendmsg(sc, sendmsg);
-		return ret;
-	}
-
-	wait_event(sc->send_io.pending.zero_wait_queue,
-		   atomic_read(&sc->send_io.pending.count) == 0 ||
-		   sc->status != SMBDIRECT_SOCKET_CONNECTED);
-	if (sc->status != SMBDIRECT_SOCKET_CONNECTED)
-		return -ENOTCONN;
-
-	return 0;
-}
-
-static int smb_direct_accept_client(struct smbdirect_socket *sc)
-{
-	struct smbdirect_socket_parameters *sp = &sc->parameters;
-	struct rdma_conn_param conn_param;
-	__be32 ird_ord_hdr[2];
-	int ret;
-
-	/*
-	 * smb_direct_handle_connect_request()
-	 * already negotiated sp->initiator_depth
-	 * and sp->responder_resources
-	 */
-	memset(&conn_param, 0, sizeof(conn_param));
-	conn_param.initiator_depth = sp->initiator_depth;
-	conn_param.responder_resources = sp->responder_resources;
-
-	if (sc->rdma.legacy_iwarp) {
-		ird_ord_hdr[0] = cpu_to_be32(conn_param.responder_resources);
-		ird_ord_hdr[1] = cpu_to_be32(conn_param.initiator_depth);
-		conn_param.private_data = ird_ord_hdr;
-		conn_param.private_data_len = sizeof(ird_ord_hdr);
-	} else {
-		conn_param.private_data = NULL;
-		conn_param.private_data_len = 0;
-	}
-	conn_param.retry_count = SMB_DIRECT_CM_RETRY;
-	conn_param.rnr_retry_count = SMB_DIRECT_CM_RNR_RETRY;
-	conn_param.flow_control = 0;
-
-	/*
-	 * start with the negotiate timeout and SMBDIRECT_KEEPALIVE_PENDING
-	 * so that the timer will cause a disconnect.
-	 */
-	sc->idle.keepalive = SMBDIRECT_KEEPALIVE_PENDING;
-	mod_delayed_work(sc->workqueue, &sc->idle.timer_work,
-			 msecs_to_jiffies(sp->negotiate_timeout_msec));
-
-	WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED);
-	sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING;
-	ret = rdma_accept(sc->rdma.cm_id, &conn_param);
-	if (ret) {
-		pr_err("error at rdma_accept: %d\n", ret);
-		return ret;
-	}
-	return 0;
-}
-
-static int smb_direct_prepare_negotiation(struct smbdirect_socket *sc)
-{
-	struct smbdirect_recv_io *recvmsg;
-	bool recv_posted = false;
-	int ret;
-
-	WARN_ON_ONCE(sc->status != SMBDIRECT_SOCKET_CREATED);
-	sc->status = SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED;
-
-	sc->recv_io.expected = SMBDIRECT_EXPECT_NEGOTIATE_REQ;
-
-	recvmsg = get_free_recvmsg(sc);
-	if (!recvmsg)
-		return -ENOMEM;
-	recvmsg->cqe.done = smb_direct_negotiate_recv_done;
-
-	ret = smb_direct_post_recv(sc, recvmsg);
-	if (ret) {
-		pr_err("Can't post recv: %d\n", ret);
-		goto out_err;
-	}
-	recv_posted = true;
-
-	ret = smb_direct_accept_client(sc);
-	if (ret) {
-		pr_err("Can't accept client\n");
-		goto out_err;
-	}
-
-	return 0;
-out_err:
-	/*
-	 * If the recv was never posted, return it to the free list.
-	 * If it was posted, leave it alone so disconnect teardown can
-	 * drain the QP and complete it (flush) and the completion path
-	 * will unmap it exactly once.
-	 */
-	if (!recv_posted)
-		put_recvmsg(sc, recvmsg);
-	return ret;
-}
-
-static int smb_direct_init_params(struct smbdirect_socket *sc)
-{
-	struct smbdirect_socket_parameters *sp = &sc->parameters;
-	int max_send_sges;
-	unsigned int maxpages;
-
-	/* need 3 more sge. because a SMB_DIRECT header, SMB2 header,
-	 * SMB2 response could be mapped.
-	 */
-	max_send_sges = DIV_ROUND_UP(sp->max_send_size, PAGE_SIZE) + 3;
-	if (max_send_sges > SMBDIRECT_SEND_IO_MAX_SGE) {
-		pr_err("max_send_size %d is too large\n", sp->max_send_size);
-		return -EINVAL;
-	}
-
-	atomic_set(&sc->send_io.lcredits.count, sp->send_credit_target);
-
-	maxpages = DIV_ROUND_UP(sp->max_read_write_size, PAGE_SIZE);
-	sc->rw_io.credits.max = rdma_rw_mr_factor(sc->ib.dev,
-						  sc->rdma.cm_id->port_num,
-						  maxpages);
-	sc->rw_io.credits.num_pages = DIV_ROUND_UP(maxpages, sc->rw_io.credits.max);
-	/* add one extra in order to handle unaligned pages */
-	sc->rw_io.credits.max += 1;
-
-	sc->recv_io.credits.target = 1;
-
-	atomic_set(&sc->rw_io.credits.count, sc->rw_io.credits.max);
-
-	return 0;
-}
-
-static void smb_direct_destroy_pools(struct smbdirect_socket *sc)
-{
-	struct smbdirect_recv_io *recvmsg;
-
-	while ((recvmsg = get_free_recvmsg(sc)))
-		mempool_free(recvmsg, sc->recv_io.mem.pool);
-
-	mempool_destroy(sc->recv_io.mem.pool);
-	sc->recv_io.mem.pool = NULL;
-
-	kmem_cache_destroy(sc->recv_io.mem.cache);
-	sc->recv_io.mem.cache = NULL;
-
-	mempool_destroy(sc->send_io.mem.pool);
-	sc->send_io.mem.pool = NULL;
-
-	kmem_cache_destroy(sc->send_io.mem.cache);
-	sc->send_io.mem.cache = NULL;
-}
-
-static int smb_direct_create_pools(struct smbdirect_socket *sc)
-{
-	struct smbdirect_socket_parameters *sp = &sc->parameters;
-	char name[80];
-	int i;
-	struct smbdirect_recv_io *recvmsg;
-
-	snprintf(name, sizeof(name), "smbdirect_send_io_pool_%p", sc);
-	sc->send_io.mem.cache = kmem_cache_create(name,
-						  sizeof(struct smbdirect_send_io) +
-						  sizeof(struct smbdirect_negotiate_resp),
-						  0, SLAB_HWCACHE_ALIGN, NULL);
-	if (!sc->send_io.mem.cache)
-		return -ENOMEM;
-
-	sc->send_io.mem.pool = mempool_create(sp->send_credit_target,
-					      mempool_alloc_slab, mempool_free_slab,
-					      sc->send_io.mem.cache);
-	if (!sc->send_io.mem.pool)
-		goto err;
-
-	snprintf(name, sizeof(name), "smbdirect_recv_io_pool_%p", sc);
-	sc->recv_io.mem.cache = kmem_cache_create(name,
-						  sizeof(struct smbdirect_recv_io) +
-						  sp->max_recv_size,
-						  0, SLAB_HWCACHE_ALIGN, NULL);
-	if (!sc->recv_io.mem.cache)
-		goto err;
-
-	sc->recv_io.mem.pool =
-		mempool_create(sp->recv_credit_max, mempool_alloc_slab,
-			       mempool_free_slab, sc->recv_io.mem.cache);
-	if (!sc->recv_io.mem.pool)
-		goto err;
-
-	for (i = 0; i < sp->recv_credit_max; i++) {
-		recvmsg = mempool_alloc(sc->recv_io.mem.pool, KSMBD_DEFAULT_GFP);
-		if (!recvmsg)
-			goto err;
-		recvmsg->socket = sc;
-		recvmsg->sge.length = 0;
-		list_add(&recvmsg->list, &sc->recv_io.free.list);
-	}
-
-	return 0;
-err:
-	smb_direct_destroy_pools(sc);
-	return -ENOMEM;
-}
-
-static u32 smb_direct_rdma_rw_send_wrs(struct ib_device *dev, const struct ib_qp_init_attr *attr)
-{
-	/*
-	 * This could be split out of rdma_rw_init_qp()
-	 * and be a helper function next to rdma_rw_mr_factor()
-	 *
-	 * We can't check unlikely(rdma_rw_force_mr) here,
-	 * but that is most likely 0 anyway.
-	 */
-	u32 factor;
-
-	WARN_ON_ONCE(attr->port_num == 0);
-
-	/*
-	 * Each context needs at least one RDMA READ or WRITE WR.
-	 *
-	 * For some hardware we might need more, eventually we should ask the
-	 * HCA driver for a multiplier here.
-	 */
-	factor = 1;
-
-	/*
-	 * If the device needs MRs to perform RDMA READ or WRITE operations,
-	 * we'll need two additional MRs for the registrations and the
-	 * invalidation.
-	 */
-	if (rdma_protocol_iwarp(dev, attr->port_num) || dev->attrs.max_sgl_rd)
-		factor += 2;	/* inv + reg */
-
-	return factor * attr->cap.max_rdma_ctxs;
-}
-
-static int smb_direct_create_qpair(struct smbdirect_socket *sc)
-{
-	struct smbdirect_socket_parameters *sp = &sc->parameters;
-	int ret;
-	struct ib_qp_cap qp_cap;
-	struct ib_qp_init_attr qp_attr;
-	u32 max_send_wr;
-	u32 rdma_send_wr;
-
-	/*
-	 * Note that {rdma,ib}_create_qp() will call
-	 * rdma_rw_init_qp() if cap->max_rdma_ctxs is not 0.
-	 * It will adjust cap->max_send_wr to the required
-	 * number of additional WRs for the RDMA RW operations.
-	 * It will cap cap->max_send_wr to the device limit.
-	 *
-	 * +1 for ib_drain_qp
-	 */
-	qp_cap.max_send_wr = sp->send_credit_target + 1;
-	qp_cap.max_recv_wr = sp->recv_credit_max + 1;
-	qp_cap.max_send_sge = SMBDIRECT_SEND_IO_MAX_SGE;
-	qp_cap.max_recv_sge = SMBDIRECT_RECV_IO_MAX_SGE;
-	qp_cap.max_inline_data = 0;
-	qp_cap.max_rdma_ctxs = sc->rw_io.credits.max;
-
-	/*
-	 * Find out the number of max_send_wr
-	 * after rdma_rw_init_qp() adjusted it.
-	 *
-	 * We only do it on a temporary variable,
-	 * as rdma_create_qp() will trigger
-	 * rdma_rw_init_qp() again.
-	 */
-	memset(&qp_attr, 0, sizeof(qp_attr));
-	qp_attr.cap = qp_cap;
-	qp_attr.port_num = sc->rdma.cm_id->port_num;
-	rdma_send_wr = smb_direct_rdma_rw_send_wrs(sc->ib.dev, &qp_attr);
-	max_send_wr = qp_cap.max_send_wr + rdma_send_wr;
-
-	if (qp_cap.max_send_wr > sc->ib.dev->attrs.max_cqe ||
-	    qp_cap.max_send_wr > sc->ib.dev->attrs.max_qp_wr) {
-		pr_err("Possible CQE overrun: max_send_wr %d\n",
-		       qp_cap.max_send_wr);
-		pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n",
-		       IB_DEVICE_NAME_MAX,
-		       sc->ib.dev->name,
-		       sc->ib.dev->attrs.max_cqe,
-		       sc->ib.dev->attrs.max_qp_wr);
-		pr_err("consider lowering send_credit_target = %d\n",
-		       sp->send_credit_target);
-		return -EINVAL;
-	}
-
-	if (qp_cap.max_rdma_ctxs &&
-	    (max_send_wr >= sc->ib.dev->attrs.max_cqe ||
-	     max_send_wr >= sc->ib.dev->attrs.max_qp_wr)) {
-		pr_err("Possible CQE overrun: rdma_send_wr %d + max_send_wr %d = %d\n",
-		       rdma_send_wr, qp_cap.max_send_wr, max_send_wr);
-		pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n",
-		       IB_DEVICE_NAME_MAX,
-		       sc->ib.dev->name,
-		       sc->ib.dev->attrs.max_cqe,
-		       sc->ib.dev->attrs.max_qp_wr);
-		pr_err("consider lowering send_credit_target = %d, max_rdma_ctxs = %d\n",
-		       sp->send_credit_target, qp_cap.max_rdma_ctxs);
-		return -EINVAL;
-	}
-
-	if (qp_cap.max_recv_wr > sc->ib.dev->attrs.max_cqe ||
-	    qp_cap.max_recv_wr > sc->ib.dev->attrs.max_qp_wr) {
-		pr_err("Possible CQE overrun: max_recv_wr %d\n",
-		       qp_cap.max_recv_wr);
-		pr_err("device %.*s reporting max_cqe %d max_qp_wr %d\n",
-		       IB_DEVICE_NAME_MAX,
-		       sc->ib.dev->name,
-		       sc->ib.dev->attrs.max_cqe,
-		       sc->ib.dev->attrs.max_qp_wr);
-		pr_err("consider lowering receive_credit_max = %d\n",
-		       sp->recv_credit_max);
-		return -EINVAL;
-	}
-
-	if (qp_cap.max_send_sge > sc->ib.dev->attrs.max_send_sge ||
-	    qp_cap.max_recv_sge > sc->ib.dev->attrs.max_recv_sge) {
-		pr_err("device %.*s max_send_sge/max_recv_sge = %d/%d too small\n",
-		       IB_DEVICE_NAME_MAX,
-		       sc->ib.dev->name,
-		       sc->ib.dev->attrs.max_send_sge,
-		       sc->ib.dev->attrs.max_recv_sge);
-		return -EINVAL;
-	}
-
-	sc->ib.pd = ib_alloc_pd(sc->ib.dev, 0);
-	if (IS_ERR(sc->ib.pd)) {
-		pr_err("Can't create RDMA PD\n");
-		ret = PTR_ERR(sc->ib.pd);
-		sc->ib.pd = NULL;
-		return ret;
-	}
-
-	sc->ib.send_cq = ib_alloc_cq_any(sc->ib.dev, sc,
-					 max_send_wr,
-					 IB_POLL_WORKQUEUE);
-	if (IS_ERR(sc->ib.send_cq)) {
-		pr_err("Can't create RDMA send CQ\n");
-		ret = PTR_ERR(sc->ib.send_cq);
-		sc->ib.send_cq = NULL;
-		goto err;
-	}
-
-	sc->ib.recv_cq = ib_alloc_cq_any(sc->ib.dev, sc,
-					 qp_cap.max_recv_wr,
-					 IB_POLL_WORKQUEUE);
-	if (IS_ERR(sc->ib.recv_cq)) {
-		pr_err("Can't create RDMA recv CQ\n");
-		ret = PTR_ERR(sc->ib.recv_cq);
-		sc->ib.recv_cq = NULL;
-		goto err;
-	}
-
-	/*
-	 * We reset completely here!
-	 * As the above use was just temporary
-	 * to calc max_send_wr and rdma_send_wr.
-	 *
-	 * rdma_create_qp() will trigger rdma_rw_init_qp()
-	 * again if max_rdma_ctxs is not 0.
-	 */
-	memset(&qp_attr, 0, sizeof(qp_attr));
-	qp_attr.event_handler = smb_direct_qpair_handler;
-	qp_attr.qp_context = sc;
-	qp_attr.cap = qp_cap;
-	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
-	qp_attr.qp_type = IB_QPT_RC;
-	qp_attr.send_cq = sc->ib.send_cq;
-	qp_attr.recv_cq = sc->ib.recv_cq;
-	qp_attr.port_num = ~0;
-
-	ret = rdma_create_qp(sc->rdma.cm_id, sc->ib.pd, &qp_attr);
-	if (ret) {
-		pr_err("Can't create RDMA QP: %d\n", ret);
-		goto err;
-	}
-
-	sc->ib.qp = sc->rdma.cm_id->qp;
-	sc->rdma.cm_id->event_handler = smb_direct_cm_handler;
-
-	return 0;
-err:
-	if (sc->ib.qp) {
-		sc->ib.qp = NULL;
-		rdma_destroy_qp(sc->rdma.cm_id);
-	}
-	if (sc->ib.recv_cq) {
-		ib_destroy_cq(sc->ib.recv_cq);
-		sc->ib.recv_cq = NULL;
-	}
-	if (sc->ib.send_cq) {
-		ib_destroy_cq(sc->ib.send_cq);
-		sc->ib.send_cq = NULL;
-	}
-	if (sc->ib.pd) {
-		ib_dealloc_pd(sc->ib.pd);
-		sc->ib.pd = NULL;
-	}
-	return ret;
-}
-
-static int smb_direct_prepare(struct ksmbd_transport *t)
-{
-	struct smb_direct_transport *st = SMBD_TRANS(t);
-	struct smbdirect_socket *sc = &st->socket;
-	struct smbdirect_socket_parameters *sp = &sc->parameters;
-	struct smbdirect_recv_io *recvmsg;
-	struct smbdirect_negotiate_req *req;
-	unsigned long flags;
-	int ret;
-
-	/*
-	 * We are waiting to pass the following states:
-	 *
-	 * SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED
-	 * SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING
-	 * SMBDIRECT_SOCKET_NEGOTIATE_NEEDED
-	 *
-	 * To finally get to SMBDIRECT_SOCKET_NEGOTIATE_RUNNING
-	 * in order to continue below.
-	 *
-	 * Everything else is unexpected and an error.
-	 */
-	ksmbd_debug(RDMA, "Waiting for SMB_DIRECT negotiate request\n");
-	ret = wait_event_interruptible_timeout(sc->status_wait,
-					       sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_NEEDED &&
-					       sc->status != SMBDIRECT_SOCKET_RDMA_CONNECT_RUNNING &&
-					       sc->status != SMBDIRECT_SOCKET_NEGOTIATE_NEEDED,
-					       msecs_to_jiffies(sp->negotiate_timeout_msec));
-	if (ret <= 0 || sc->status != SMBDIRECT_SOCKET_NEGOTIATE_RUNNING)
-		return ret < 0 ? ret : -ETIMEDOUT;
-
-	recvmsg = get_first_reassembly(sc);
-	if (!recvmsg)
-		return -ECONNABORTED;
-
-	ret = smb_direct_check_recvmsg(recvmsg);
-	if (ret)
-		goto put;
-
-	req = (struct smbdirect_negotiate_req *)recvmsg->packet;
-	sp->max_recv_size = min_t(u32, sp->max_recv_size,
-				  le32_to_cpu(req->preferred_send_size));
-	sp->max_send_size = min_t(u32, sp->max_send_size,
-				  le32_to_cpu(req->max_receive_size));
-	sp->max_fragmented_send_size =
-		le32_to_cpu(req->max_fragmented_size);
-	/*
-	 * The maximum fragmented upper-layer payload receive size supported
-	 *
-	 * Assume max_payload_per_credit is
-	 * smb_direct_receive_credit_max - 24 = 1340
-	 *
-	 * The maximum number would be
-	 * smb_direct_receive_credit_max * max_payload_per_credit
-	 *
-	 * 1340 * 255 = 341700 (0x536C4)
-	 *
-	 * The minimum value from the spec is 131072 (0x20000)
-	 *
-	 * For now we use the logic we used before:
-	 * (1364 * 255) / 2 = 173910 (0x2A756)
-	 *
-	 * We need to adjust this here in case the peer
-	 * lowered sp->max_recv_size.
-	 *
-	 * TODO: instead of adjusting max_fragmented_recv_size
-	 * we should adjust the number of available buffers,
-	 * but for now we keep the current logic.
-	 */
-	sp->max_fragmented_recv_size =
-		(sp->recv_credit_max * sp->max_recv_size) / 2;
-	sc->recv_io.credits.target = le16_to_cpu(req->credits_requested);
-	sc->recv_io.credits.target = min_t(u16, sc->recv_io.credits.target, sp->recv_credit_max);
-	sc->recv_io.credits.target = max_t(u16, sc->recv_io.credits.target, 1);
-
-put:
-	spin_lock_irqsave(&sc->recv_io.reassembly.lock, flags);
-	sc->recv_io.reassembly.queue_length--;
-	list_del(&recvmsg->list);
-	spin_unlock_irqrestore(&sc->recv_io.reassembly.lock, flags);
-	put_recvmsg(sc, recvmsg);
-
-	if (ret == -ECONNABORTED)
-		return ret;
-
-	if (ret)
-		goto respond;
-
-	/*
-	 * We negotiated with success, so we need to refill the recv queue.
-	 * We do that with sc->idle.immediate_work still being disabled
-	 * via smbdirect_socket_init(), so that queue_work(sc->workqueue,
-	 * &sc->idle.immediate_work) in smb_direct_post_recv_credits()
-	 * is a no-op.
-	 *
-	 * The message that grants the credits to the client is
-	 * the negotiate response.
-	 */
-	INIT_WORK(&sc->recv_io.posted.refill_work, smb_direct_post_recv_credits);
-	smb_direct_post_recv_credits(&sc->recv_io.posted.refill_work);
-	if (unlikely(sc->first_error))
-		return sc->first_error;
-	INIT_WORK(&sc->idle.immediate_work, smb_direct_send_immediate_work);
-
-respond:
-	ret = smb_direct_send_negotiate_response(sc, ret);
-
-	return ret;
-}
-
-static int smb_direct_connect(struct smbdirect_socket *sc)
-{
-	struct smbdirect_recv_io *recv_io;
-	int ret;
-
-	ret = smb_direct_init_params(sc);
-	if (ret) {
-		pr_err("Can't configure RDMA parameters\n");
-		return ret;
-	}
-
-	ret = smb_direct_create_pools(sc);
-	if (ret) {
-		pr_err("Can't init RDMA pool: %d\n", ret);
-		return ret;
-	}
-
-	list_for_each_entry(recv_io, &sc->recv_io.free.list, list)
-		recv_io->cqe.done = recv_done;
-
-	ret = smb_direct_create_qpair(sc);
-	if (ret) {
-		pr_err("Can't accept RDMA client: %d\n", ret);
-		return ret;
-	}
-
-	ret = smb_direct_prepare_negotiation(sc);
-	if (ret) {
-		pr_err("Can't negotiate: %d\n", ret);
-		return ret;
-	}
-	return 0;
-}
-
-static bool rdma_frwr_is_supported(struct ib_device_attr *attrs)
-{
-	if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS))
-		return false;
-	if (attrs->max_fast_reg_page_list_len == 0)
-		return false;
-	return true;
-}
-
-static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id,
-					     struct rdma_cm_event *event)
-{
-	struct smb_direct_listener *listener = new_cm_id->context;
 	struct smb_direct_transport *t;
-	struct smbdirect_socket *sc;
-	struct smbdirect_socket_parameters *sp;
 	struct task_struct *handler;
-	u8 peer_initiator_depth;
-	u8 peer_responder_resources;
 	int ret;
 
-	if (!rdma_frwr_is_supported(&new_cm_id->device->attrs)) {
-		ksmbd_debug(RDMA,
-			    "Fast Registration Work Requests is not supported. device capabilities=%llx\n",
-			    new_cm_id->device->attrs.device_cap_flags);
-		return -EPROTONOSUPPORT;
-	}
-
-	t = alloc_transport(new_cm_id);
-	if (!t)
+	t = alloc_transport(client_sc);
+	if (!t) {
+		smbdirect_socket_release(client_sc);
 		return -ENOMEM;
-	sc = &t->socket;
-	sp = &sc->parameters;
-
-	peer_initiator_depth = event->param.conn.initiator_depth;
-	peer_responder_resources = event->param.conn.responder_resources;
-	if (rdma_protocol_iwarp(new_cm_id->device, new_cm_id->port_num) &&
-	    event->param.conn.private_data_len == 8) {
-		/*
-		 * Legacy clients with only iWarp MPA v1 support
-		 * need a private blob in order to negotiate
-		 * the IRD/ORD values.
-		 */
-		const __be32 *ird_ord_hdr = event->param.conn.private_data;
-		u32 ird32 = be32_to_cpu(ird_ord_hdr[0]);
-		u32 ord32 = be32_to_cpu(ird_ord_hdr[1]);
-
-		/*
-		 * cifs.ko sends the legacy IRD/ORD negotiation
-		 * event if iWarp MPA v2 was used.
-		 *
-		 * Here we check that the values match and only
-		 * mark the client as legacy if they don't match.
-		 */
-		if ((u32)event->param.conn.initiator_depth != ird32 ||
-		    (u32)event->param.conn.responder_resources != ord32) {
-			/*
-			 * There are broken clients (old cifs.ko)
-			 * using little endian and also
-			 * struct rdma_conn_param only uses u8
-			 * for initiator_depth and responder_resources,
-			 * so we truncate the value to U8_MAX.
-			 *
-			 * smb_direct_accept_client() will then
-			 * do the real negotiation in order to
-			 * select the minimum between client and
-			 * server.
-			 */
-			ird32 = min_t(u32, ird32, U8_MAX);
-			ord32 = min_t(u32, ord32, U8_MAX);
-
-			sc->rdma.legacy_iwarp = true;
-			peer_initiator_depth = (u8)ird32;
-			peer_responder_resources = (u8)ord32;
-		}
 	}
 
-	/*
-	 * First set what the we as server are able to support
-	 */
-	sp->initiator_depth = min_t(u8, sp->initiator_depth,
-				    new_cm_id->device->attrs.max_qp_rd_atom);
-
-	/*
-	 * negotiate the value by using the minimum
-	 * between client and server if the client provided
-	 * non 0 values.
-	 */
-	if (peer_initiator_depth != 0)
-		sp->initiator_depth = min_t(u8, sp->initiator_depth,
-					    peer_initiator_depth);
-	if (peer_responder_resources != 0)
-		sp->responder_resources = min_t(u8, sp->responder_resources,
-						peer_responder_resources);
-
-	ret = smb_direct_connect(sc);
-	if (ret)
-		goto out_err;
-
 	handler = kthread_run(ksmbd_conn_handler_loop,
 			      KSMBD_TRANS(t)->conn, "ksmbd:r%u",
 			      listener->port);
@@ -2764,41 +325,68 @@ out_err:
 	return ret;
 }
 
-static int smb_direct_listen_handler(struct rdma_cm_id *cm_id,
-				     struct rdma_cm_event *event)
+static int smb_direct_listener_kthread_fn(void *p)
 {
-	switch (event->event) {
-	case RDMA_CM_EVENT_CONNECT_REQUEST: {
-		int ret = smb_direct_handle_connect_request(cm_id, event);
+	struct smb_direct_listener *listener = (struct smb_direct_listener *)p;
+	struct smbdirect_socket *client_sc = NULL;
 
-		if (ret) {
-			pr_err("Can't create transport: %d\n", ret);
-			return ret;
-		}
+	while (!kthread_should_stop()) {
+		struct proto_accept_arg arg = { .err = -EINVAL, };
+		long timeo = MAX_SCHEDULE_TIMEOUT;
 
-		ksmbd_debug(RDMA, "Received connection request. cm_id=%p\n",
-			    cm_id);
-		break;
-	}
-	default:
-		pr_err("Unexpected listen event. cm_id=%p, event=%s (%d)\n",
-		       cm_id, rdma_event_msg(event->event), event->event);
-		break;
+		if (!listener->socket)
+			break;
+		client_sc = smbdirect_socket_accept(listener->socket, timeo, &arg);
+		if (!client_sc && arg.err == -EINVAL)
+			break;
+		if (!client_sc)
+			continue;
+
+		ksmbd_debug(CONN, "connect success: accepted new connection\n");
+		smb_direct_new_connection(listener, client_sc);
 	}
+
+	ksmbd_debug(CONN, "releasing socket\n");
 	return 0;
 }
 
+static void smb_direct_listener_destroy(struct smb_direct_listener *listener)
+{
+	int ret;
+
+	if (listener->socket)
+		smbdirect_socket_shutdown(listener->socket);
+
+	if (listener->thread) {
+		ret = kthread_stop(listener->thread);
+		if (ret)
+			pr_err("failed to stop forker thread\n");
+		listener->thread = NULL;
+	}
+
+	if (listener->socket) {
+		smbdirect_socket_release(listener->socket);
+		listener->socket = NULL;
+	}
+
+	listener->port = 0;
+}
+
 static int smb_direct_listen(struct smb_direct_listener *listener, int port)
 {
-	int ret;
-	struct rdma_cm_id *cm_id;
-	u8 node_type = RDMA_NODE_UNSPECIFIED;
+	struct net *net = current->nsproxy->net_ns;
+	struct task_struct *kthread;
 	struct sockaddr_in sin = {
 		.sin_family		= AF_INET,
 		.sin_addr.s_addr	= htonl(INADDR_ANY),
 		.sin_port		= htons(port),
 	};
+	struct smbdirect_socket_parameters init_params = {};
+	struct smbdirect_socket_parameters *sp;
+	struct smbdirect_socket *sc;
+	u64 port_flags = 0;
+	int ret;
 
 	switch (port) {
 	case SMB_DIRECT_PORT_IWARP:
@@ -2806,7 +394,7 @@ static int smb_direct_listen(struct smb_direct_listener *listener, int port)
 		 * only allow iWarp devices
 		 * for port 5445.
 		 */
-		node_type = RDMA_NODE_RNIC;
+		port_flags |= SMBDIRECT_FLAG_PORT_RANGE_ONLY_IW;
 		break;
 	case SMB_DIRECT_PORT_INFINIBAND:
 		/*
@@ -2815,119 +403,95 @@ static int smb_direct_listen(struct smb_direct_listener *listener, int port)
 		 *
 		 * (Basically don't allow iWarp devices)
 		 */
-		node_type = RDMA_NODE_IB_CA;
+		port_flags |= SMBDIRECT_FLAG_PORT_RANGE_ONLY_IB;
 		break;
 	default:
 		pr_err("unsupported smbdirect port=%d!\n", port);
 		return -ENODEV;
 	}
 
-	cm_id = rdma_create_id(&init_net, smb_direct_listen_handler,
-			       listener, RDMA_PS_TCP, IB_QPT_RC);
-	if (IS_ERR(cm_id)) {
-		pr_err("Can't create cm id: %ld\n", PTR_ERR(cm_id));
-		return PTR_ERR(cm_id);
+	ret = smbdirect_socket_create_kern(net, &sc);
+	if (ret) {
+		pr_err("smbdirect_socket_create_kern() failed: %d %1pe\n",
+		       ret, ERR_PTR(ret));
+		return ret;
 	}
 
-	ret = rdma_restrict_node_type(cm_id, node_type);
+	/*
+	 * Create the initial parameters
+	 */
+	sp = &init_params;
+	sp->flags |= port_flags;
+	sp->negotiate_timeout_msec = SMB_DIRECT_NEGOTIATE_TIMEOUT * 1000;
+	sp->initiator_depth = SMB_DIRECT_CM_INITIATOR_DEPTH;
+	sp->responder_resources = 1;
+	sp->recv_credit_max = smb_direct_receive_credit_max;
+	sp->send_credit_target = smb_direct_send_credit_target;
+	sp->max_send_size = smb_direct_max_send_size;
+	sp->max_fragmented_recv_size = smb_direct_max_fragmented_recv_size;
+	sp->max_recv_size = smb_direct_max_receive_size;
+	sp->max_read_write_size = smb_direct_max_read_write_size;
+	sp->keepalive_interval_msec = SMB_DIRECT_KEEPALIVE_SEND_INTERVAL * 1000;
+	sp->keepalive_timeout_msec = SMB_DIRECT_KEEPALIVE_RECV_TIMEOUT * 1000;
+
+	smbdirect_socket_set_logging(sc, NULL,
+				     smb_direct_logging_needed,
+				     smb_direct_logging_vaprintf);
+	ret = smbdirect_socket_set_initial_parameters(sc, sp);
 	if (ret) {
-		pr_err("rdma_restrict_node_type(%u) failed %d\n",
-		       node_type, ret);
+		pr_err("Failed smbdirect_socket_set_initial_parameters(): %d %1pe\n",
+		       ret, ERR_PTR(ret));
+		goto err;
+	}
+	ret = smbdirect_socket_set_kernel_settings(sc, IB_POLL_WORKQUEUE, KSMBD_DEFAULT_GFP);
+	if (ret) {
+		pr_err("Failed smbdirect_socket_set_kernel_settings(): %d %1pe\n",
+		       ret, ERR_PTR(ret));
 		goto err;
 	}
 
-	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+	ret = smbdirect_socket_bind(sc, (struct sockaddr *)&sin);
 	if (ret) {
-		pr_err("Can't bind: %d\n", ret);
+		pr_err("smbdirect_socket_bind() failed: %d %1pe\n",
+		       ret, ERR_PTR(ret));
 		goto err;
 	}
 
-	ret = rdma_listen(cm_id, 10);
+	ret = smbdirect_socket_listen(sc, 10);
 	if (ret) {
-		pr_err("Can't listen: %d\n", ret);
+		pr_err("Port[%d] smbdirect_socket_listen() failed: %d %1pe\n",
+		       port, ret, ERR_PTR(ret));
 		goto err;
 	}
 
 	listener->port = port;
-	listener->cm_id = cm_id;
+	listener->socket = sc;
+
+	kthread = kthread_run(smb_direct_listener_kthread_fn,
+			      listener,
+			      "ksmbd-smbdirect-listener-%u", port);
+	if (IS_ERR(kthread)) {
+		ret = PTR_ERR(kthread);
+		pr_err("Can't start ksmbd listen kthread: %d %1pe\n",
		       ret, ERR_PTR(ret));
+		goto err;
+	}
+
+	listener->thread = kthread;
 
 	return 0;
 err:
-	listener->port = 0;
-	listener->cm_id = NULL;
-	rdma_destroy_id(cm_id);
+	smb_direct_listener_destroy(listener);
 	return ret;
 }
 
-static int smb_direct_ib_client_add(struct ib_device *ib_dev)
-{
-	struct smb_direct_device *smb_dev;
-
-	if (!rdma_frwr_is_supported(&ib_dev->attrs))
-		return 0;
-
-	smb_dev = kzalloc_obj(*smb_dev, KSMBD_DEFAULT_GFP);
-	if (!smb_dev)
-		return -ENOMEM;
-	smb_dev->ib_dev = ib_dev;
-
-	write_lock(&smb_direct_device_lock);
-	list_add(&smb_dev->list, &smb_direct_device_list);
-	write_unlock(&smb_direct_device_lock);
-
-	ksmbd_debug(RDMA, "ib device added: name %s\n", ib_dev->name);
-	return 0;
-}
-
-static void smb_direct_ib_client_remove(struct ib_device *ib_dev,
-					void *client_data)
-{
-	struct smb_direct_device *smb_dev, *tmp;
-
-	write_lock(&smb_direct_device_lock);
-	list_for_each_entry_safe(smb_dev, tmp, &smb_direct_device_list, list) {
-		if (smb_dev->ib_dev == ib_dev) {
-			list_del(&smb_dev->list);
-			kfree(smb_dev);
-			break;
-		}
-	}
-	write_unlock(&smb_direct_device_lock);
-}
-
-static struct ib_client smb_direct_ib_client = {
-	.name	= "ksmbd_smb_direct_ib",
-	.add	= smb_direct_ib_client_add,
-	.remove	= smb_direct_ib_client_remove,
-};
-
 int ksmbd_rdma_init(void)
 {
 	int ret;
 
 	smb_direct_ib_listener = smb_direct_iw_listener = (struct smb_direct_listener) {
-		.cm_id = NULL,
+		.socket = NULL,
 	};
 
-	ret = ib_register_client(&smb_direct_ib_client);
-	if (ret) {
-		pr_err("failed to ib_register_client\n");
-		return ret;
-	}
-
-	/* When a client is running out of send credits, the credits are
-	 * granted by the server's sending a packet using this queue.
-	 * This avoids the situation that a clients cannot send packets
-	 * for lack of credits
-	 */
-	smb_direct_wq = alloc_workqueue("ksmbd-smb_direct-wq",
-					WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_PERCPU,
-					0);
-	if (!smb_direct_wq) {
-		ret = -ENOMEM;
-		goto err;
-	}
-
 	ret = smb_direct_listen(&smb_direct_ib_listener,
 				SMB_DIRECT_PORT_INFINIBAND);
 	if (ret) {
@@ -2935,8 +499,8 @@ int ksmbd_rdma_init(void)
 		goto err;
 	}
 
-	ksmbd_debug(RDMA, "InfiniBand/RoCEv1/RoCEv2 RDMA listener. cm_id=%p\n",
-		    smb_direct_ib_listener.cm_id);
+	ksmbd_debug(RDMA, "InfiniBand/RoCEv1/RoCEv2 RDMA listener. socket=%p\n",
+		    smb_direct_ib_listener.socket);
 
 	ret = smb_direct_listen(&smb_direct_iw_listener,
 				SMB_DIRECT_PORT_IWARP);
@@ -2945,107 +509,29 @@
 		goto err;
 	}
 
-	ksmbd_debug(RDMA, "iWarp RDMA listener. cm_id=%p\n",
-		    smb_direct_iw_listener.cm_id);
+	ksmbd_debug(RDMA, "iWarp RDMA listener. socket=%p\n",
+		    smb_direct_iw_listener.socket);
 
 	return 0;
 err:
 	ksmbd_rdma_stop_listening();
-	ksmbd_rdma_destroy();
 	return ret;
 }
 
 void ksmbd_rdma_stop_listening(void)
 {
-	if (!smb_direct_ib_listener.cm_id && !smb_direct_iw_listener.cm_id)
-		return;
-
-	ib_unregister_client(&smb_direct_ib_client);
-
-	if (smb_direct_ib_listener.cm_id)
-		rdma_destroy_id(smb_direct_ib_listener.cm_id);
-	if (smb_direct_iw_listener.cm_id)
-		rdma_destroy_id(smb_direct_iw_listener.cm_id);
-
-	smb_direct_ib_listener = smb_direct_iw_listener = (struct smb_direct_listener) {
-		.cm_id = NULL,
-	};
-}
-
-void ksmbd_rdma_destroy(void)
-{
-	if (smb_direct_wq) {
-		destroy_workqueue(smb_direct_wq);
-		smb_direct_wq = NULL;
-	}
-}
-
-static bool ksmbd_find_rdma_capable_netdev(struct net_device *netdev)
-{
-	struct smb_direct_device *smb_dev;
-	int i;
-	bool rdma_capable = false;
-
-	read_lock(&smb_direct_device_lock);
-	list_for_each_entry(smb_dev, &smb_direct_device_list, list) {
-		for (i = 0; i < smb_dev->ib_dev->phys_port_cnt; i++) {
-			struct net_device *ndev;
-
-			ndev = ib_device_get_netdev(smb_dev->ib_dev, i + 1);
-			if (!ndev)
-				continue;
-
-			if (ndev == netdev) {
-				dev_put(ndev);
-				rdma_capable = true;
-				goto out;
-			}
-			dev_put(ndev);
-		}
-	}
-out:
-	read_unlock(&smb_direct_device_lock);
-
-	if (rdma_capable == false) {
-		struct ib_device *ibdev;
-
-		ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_UNKNOWN);
-		if (ibdev) {
-			rdma_capable = rdma_frwr_is_supported(&ibdev->attrs);
-			ib_device_put(ibdev);
-		}
-	}
-
-	ksmbd_debug(RDMA, "netdev(%s) rdma capable : %s\n",
-		    netdev->name, str_true_false(rdma_capable));
-
-	return rdma_capable;
+	smb_direct_listener_destroy(&smb_direct_ib_listener);
+	smb_direct_listener_destroy(&smb_direct_iw_listener);
 }
 
 bool ksmbd_rdma_capable_netdev(struct net_device *netdev)
 {
-	struct net_device *lower_dev;
-	struct list_head *iter;
+	u8 node_type = smbdirect_netdev_rdma_capable_node_type(netdev);
 
-	if (ksmbd_find_rdma_capable_netdev(netdev))
-		return true;
-
-	/* check if netdev is bridge or VLAN */
-	if (netif_is_bridge_master(netdev) ||
-	    netdev->priv_flags & IFF_802_1Q_VLAN)
-		netdev_for_each_lower_dev(netdev, lower_dev, iter)
-			if (ksmbd_find_rdma_capable_netdev(lower_dev))
-				return true;
-
-	/* check if netdev is IPoIB safely without layer violation */
-	if (netdev->type == ARPHRD_INFINIBAND)
-		return true;
-
-	return false;
+	return node_type != RDMA_NODE_UNSPECIFIED;
}
 
 static const struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops = {
-	.prepare	= smb_direct_prepare,
 	.disconnect	= smb_direct_disconnect,
 	.shutdown	= smb_direct_shutdown,
 	.writev		= smb_direct_writev,
diff --git a/fs/smb/server/transport_rdma.h b/fs/smb/server/transport_rdma.h
index 3f93c6a9f7e4..05352dc47f95 100644
--- a/fs/smb/server/transport_rdma.h
+++ b/fs/smb/server/transport_rdma.h
@@ -14,17 +14,17 @@
 #ifdef CONFIG_SMB_SERVER_SMBDIRECT
 int ksmbd_rdma_init(void);
 void ksmbd_rdma_stop_listening(void);
-void ksmbd_rdma_destroy(void);
 bool ksmbd_rdma_capable_netdev(struct net_device *netdev);
 void init_smbd_max_io_size(unsigned int sz);
 unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt);
 #else
 static inline int ksmbd_rdma_init(void) { return 0; }
 static inline void ksmbd_rdma_stop_listening(void) { }
-static inline void ksmbd_rdma_destroy(void) { }
 static inline bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { return false; }
 static inline void init_smbd_max_io_size(unsigned int sz) { }
 static inline unsigned int get_smbd_max_read_write_size(struct ksmbd_transport *kt) { return 0; }
 #endif
 
+#include "../common/smbdirect/smbdirect.h"
+
 #endif	/* __KSMBD_TRANSPORT_RDMA_H__ */