diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index 3b2ad61995d4..0724a793798f 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -602,3 +602,31 @@ it does not modify the current namespace or any existing children. A namespace with ``ns_mode`` set to ``local`` cannot change ``child_ns_mode`` to ``global`` (returns ``-EPERM``). + +g2h_fallback +------------ + +Controls whether connections to CIDs not owned by the host-to-guest (H2G) +transport automatically fall back to the guest-to-host (G2H) transport. + +When enabled, if a ``connect()`` call targets a CID that the H2G transport +(e.g. vhost-vsock) does not serve, or if no H2G transport is loaded at all, the +connection is routed via the G2H transport (e.g. virtio-vsock) instead. This +allows a host running both nested VMs (via vhost-vsock) and sibling VMs +reachable through the hypervisor (e.g. Nitro Enclaves) to address both using +a single CID space, without requiring applications to set +``VMADDR_FLAG_TO_HOST``. + +When the fallback is taken, ``VMADDR_FLAG_TO_HOST`` is automatically set on +the remote address so that userspace can determine the path via +``getpeername()``. + +Note: With this sysctl enabled, userspace that attempts to talk to a guest +CID which is not served by the H2G transport will send vsock traffic to the +host via G2H. Environments that rely on H2G-only isolation should set it to 0. + +Values: + + - 0 - Connections to CIDs <= 2 or with ``VMADDR_FLAG_TO_HOST`` use G2H; + all others use H2G (or fail with ``-ENODEV`` if H2G is not loaded). + - 1 - Connections to CIDs not owned by H2G fall back to G2H.
(default) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 054f7a718f50..1d8ec6bed53e 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -91,6 +91,18 @@ static struct vhost_vsock *vhost_vsock_get(u32 guest_cid, struct net *net) return NULL; } +static bool vhost_transport_has_remote_cid(struct vsock_sock *vsk, u32 cid) +{ + struct sock *sk = sk_vsock(vsk); + struct net *net = sock_net(sk); + bool found; + + rcu_read_lock(); + found = !!vhost_vsock_get(cid, net); + rcu_read_unlock(); + return found; +} + static void vhost_transport_do_send_pkt(struct vhost_vsock *vsock, struct vhost_virtqueue *vq) @@ -424,6 +436,7 @@ static struct virtio_transport vhost_transport = { .module = THIS_MODULE, .get_local_cid = vhost_transport_get_local_cid, + .has_remote_cid = vhost_transport_has_remote_cid, .init = virtio_transport_do_socket_init, .destruct = virtio_transport_destruct, diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index 533d8e75f7bb..4e40063adab4 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -179,6 +179,15 @@ struct vsock_transport { /* Addressing. */ u32 (*get_local_cid)(void); + /* Check if this transport serves a specific remote CID. + * For H2G transports: return true if the CID belongs to a registered + * guest. If not implemented, all CIDs > VMADDR_CID_HOST go to H2G. + * For G2H transports: return true if the transport can reach arbitrary + * CIDs via the hypervisor (i.e. supports the fallback overlay). VMCI + * does not implement this as it only serves CIDs 0 and 2. 
+ */ + bool (*has_remote_cid)(struct vsock_sock *vsk, u32 remote_cid); + /* Read a single skb */ int (*read_skb)(struct vsock_sock *, skb_read_actor_t); diff --git a/include/net/netns/vsock.h b/include/net/netns/vsock.h index dc8cbe45f406..7f84aad92f57 100644 --- a/include/net/netns/vsock.h +++ b/include/net/netns/vsock.h @@ -20,5 +20,7 @@ struct netns_vsock { /* 0 = unlocked, 1 = locked to global, 2 = locked to local */ int child_ns_mode_locked; + + int g2h_fallback; }; #endif /* __NET_NET_NAMESPACE_VSOCK_H */ diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index f0ab2f13e9db..cc4b225250b9 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -545,9 +545,13 @@ static void vsock_deassign_transport(struct vsock_sock *vsk) * The vsk->remote_addr is used to decide which transport to use: * - remote CID == VMADDR_CID_LOCAL or g2h->local_cid or VMADDR_CID_HOST if * g2h is not loaded, will use local transport; - * - remote CID <= VMADDR_CID_HOST or h2g is not loaded or remote flags field - * includes VMADDR_FLAG_TO_HOST flag value, will use guest->host transport; - * - remote CID > VMADDR_CID_HOST will use host->guest transport; + * - remote CID <= VMADDR_CID_HOST or remote flags field includes + * VMADDR_FLAG_TO_HOST, will use guest->host transport; + * - remote CID > VMADDR_CID_HOST and h2g is loaded and h2g claims that CID, + * will use host->guest transport; + * - h2g not loaded or h2g does not claim that CID and g2h claims the CID via + * has_remote_cid, will use guest->host transport (when g2h_fallback=1) + * - anything else goes to h2g or returns -ENODEV if no h2g is available */ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) { @@ -581,11 +585,21 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) case SOCK_SEQPACKET: if (vsock_use_local_transport(remote_cid)) new_transport = transport_local; - else if (remote_cid <= VMADDR_CID_HOST || !transport_h2g || + else if (remote_cid 
<= VMADDR_CID_HOST || (remote_flags & VMADDR_FLAG_TO_HOST)) new_transport = transport_g2h; - else + else if (transport_h2g && + (!transport_h2g->has_remote_cid || + transport_h2g->has_remote_cid(vsk, remote_cid))) new_transport = transport_h2g; + else if (sock_net(sk)->vsock.g2h_fallback && + transport_g2h && transport_g2h->has_remote_cid && + transport_g2h->has_remote_cid(vsk, remote_cid)) { + vsk->remote_addr.svm_flags |= VMADDR_FLAG_TO_HOST; + new_transport = transport_g2h; + } else { + new_transport = transport_h2g; + } break; default: ret = -ESOCKTNOSUPPORT; @@ -2879,6 +2893,15 @@ static struct ctl_table vsock_table[] = { .mode = 0644, .proc_handler = vsock_net_child_mode_string }, + { + .procname = "g2h_fallback", + .data = &init_net.vsock.g2h_fallback, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, }; static int __net_init vsock_sysctl_register(struct net *net) @@ -2894,6 +2917,7 @@ static int __net_init vsock_sysctl_register(struct net *net) table[0].data = &net->vsock.mode; table[1].data = &net->vsock.child_ns_mode; + table[2].data = &net->vsock.g2h_fallback; } net->vsock.sysctl_hdr = register_net_sysctl_sz(net, "net/vsock", table, @@ -2928,6 +2952,7 @@ static void vsock_net_init(struct net *net) net->vsock.mode = vsock_net_child_mode(current->nsproxy->net_ns); net->vsock.child_ns_mode = net->vsock.mode; + net->vsock.g2h_fallback = 1; } static __net_init int vsock_sysctl_init_net(struct net *net) diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 77fe5b7b066c..57f2d6ec3ffc 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -547,11 +547,18 @@ bool virtio_transport_stream_allow(struct vsock_sock *vsk, u32 cid, u32 port) static bool virtio_transport_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid); +static bool virtio_transport_has_remote_cid(struct vsock_sock *vsk, u32 cid) +{ + /* The CID 
could be implemented by the host. Always assume it is. */ + return true; +} + static struct virtio_transport virtio_transport = { .transport = { .module = THIS_MODULE, .get_local_cid = virtio_transport_get_local_cid, + .has_remote_cid = virtio_transport_has_remote_cid, .init = virtio_transport_do_socket_init, .destruct = virtio_transport_destruct,