mirror of
https://github.com/torvalds/linux.git
synced 2026-04-18 14:53:58 -04:00
When no H2G transport is loaded, vsock currently routes all CIDs to the
G2H transport (commit 65b422d9b6 ("vsock: forward all packets to the
host when no H2G is registered")). Extend that existing behavior: when
an H2G transport is loaded but does not claim a given CID, the
connection falls back to G2H in the same way.
This matters in environments like Nitro Enclaves, where an instance may
run nested VMs via vhost-vsock (H2G) while also needing to reach sibling
enclaves at higher CIDs through virtio-vsock-pci (G2H). With the old
code, any CID > 2 was unconditionally routed to H2G when vhost was
loaded, making those enclaves unreachable without setting
VMADDR_FLAG_TO_HOST explicitly on every connect.
Requiring every application to set VMADDR_FLAG_TO_HOST creates friction:
tools like socat, iperf, and others would all need to learn about it.
The flag was introduced 6 years ago and I am still not aware of any tool
that supports it. Even if there were support, it would be cumbersome to
use. The most natural experience is a single CID address space where H2G
only wins for CIDs it actually owns, and everything else falls through to
G2H, extending the behavior that already exists when H2G is absent.
To give user space at least a hint that the kernel applied this logic,
automatically set the VMADDR_FLAG_TO_HOST on the remote address so it
can determine the path taken via getpeername().
Add a per-network namespace sysctl net.vsock.g2h_fallback (default 1).
At 0 it forces strict routing: H2G always wins for CID > VMADDR_CID_HOST,
or ENODEV if H2G is not loaded.
Signed-off-by: Alexander Graf <graf@amazon.com>
Tested-by: syzbot@syzkaller.appspotmail.com
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Link: https://patch.msgid.link/20260304230027.59857-1-graf@amazon.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
333 lines
11 KiB
C
333 lines
11 KiB
C
/* SPDX-License-Identifier: GPL-2.0-only */
|
|
/*
|
|
* VMware vSockets Driver
|
|
*
|
|
* Copyright (C) 2007-2013 VMware, Inc. All rights reserved.
|
|
*/
|
|
|
|
#ifndef __AF_VSOCK_H__
|
|
#define __AF_VSOCK_H__
|
|
|
|
#include <linux/kernel.h>
|
|
#include <linux/workqueue.h>
|
|
#include <net/netns/vsock.h>
|
|
#include <net/sock.h>
|
|
#include <uapi/linux/vm_sockets.h>
|
|
|
|
#include "vsock_addr.h"
|
|
|
|
#define LAST_RESERVED_PORT 1023
|
|
|
|
#define VSOCK_HASH_SIZE 251
|
|
extern struct list_head vsock_bind_table[VSOCK_HASH_SIZE + 1];
|
|
extern struct list_head vsock_connected_table[VSOCK_HASH_SIZE];
|
|
extern spinlock_t vsock_table_lock;
|
|
|
|
#define vsock_sk(__sk) ((struct vsock_sock *)__sk)
|
|
#define sk_vsock(__vsk) (&(__vsk)->sk)
|
|
|
|
struct vsock_sock {
|
|
/* sk must be the first member. */
|
|
struct sock sk;
|
|
const struct vsock_transport *transport;
|
|
struct sockaddr_vm local_addr;
|
|
struct sockaddr_vm remote_addr;
|
|
/* Links for the global tables of bound and connected sockets. */
|
|
struct list_head bound_table;
|
|
struct list_head connected_table;
|
|
/* Accessed without the socket lock held. This means it can never be
|
|
* modified outsided of socket create or destruct.
|
|
*/
|
|
bool trusted;
|
|
bool cached_peer_allow_dgram; /* Dgram communication allowed to
|
|
* cached peer?
|
|
*/
|
|
u32 cached_peer; /* Context ID of last dgram destination check. */
|
|
const struct cred *owner;
|
|
/* Rest are SOCK_STREAM only. */
|
|
long connect_timeout;
|
|
/* Listening socket that this came from. */
|
|
struct sock *listener;
|
|
/* Used for pending list and accept queue during connection handshake.
|
|
* The listening socket is the head for both lists. Sockets created
|
|
* for connection requests are placed in the pending list until they
|
|
* are connected, at which point they are put in the accept queue list
|
|
* so they can be accepted in accept(). If accept() cannot accept the
|
|
* connection, it is marked as rejected so the cleanup function knows
|
|
* to clean up the socket.
|
|
*/
|
|
struct list_head pending_links;
|
|
struct list_head accept_queue;
|
|
bool rejected;
|
|
struct delayed_work connect_work;
|
|
struct delayed_work pending_work;
|
|
struct delayed_work close_work;
|
|
bool close_work_scheduled;
|
|
u32 peer_shutdown;
|
|
bool sent_request;
|
|
bool ignore_connecting_rst;
|
|
|
|
/* Protected by lock_sock(sk) */
|
|
u64 buffer_size;
|
|
u64 buffer_min_size;
|
|
u64 buffer_max_size;
|
|
|
|
/* Private to transport. */
|
|
void *trans;
|
|
};
|
|
|
|
s64 vsock_connectible_has_data(struct vsock_sock *vsk);
|
|
s64 vsock_stream_has_data(struct vsock_sock *vsk);
|
|
s64 vsock_stream_has_space(struct vsock_sock *vsk);
|
|
struct sock *vsock_create_connected(struct sock *parent);
|
|
void vsock_data_ready(struct sock *sk);
|
|
|
|
/**** TRANSPORT ****/
|
|
|
|
struct vsock_transport_recv_notify_data {
|
|
u64 data1; /* Transport-defined. */
|
|
u64 data2; /* Transport-defined. */
|
|
bool notify_on_block;
|
|
};
|
|
|
|
struct vsock_transport_send_notify_data {
|
|
u64 data1; /* Transport-defined. */
|
|
u64 data2; /* Transport-defined. */
|
|
};
|
|
|
|
/* Transport features flags */
|
|
/* Transport provides host->guest communication */
|
|
#define VSOCK_TRANSPORT_F_H2G 0x00000001
|
|
/* Transport provides guest->host communication */
|
|
#define VSOCK_TRANSPORT_F_G2H 0x00000002
|
|
/* Transport provides DGRAM communication */
|
|
#define VSOCK_TRANSPORT_F_DGRAM 0x00000004
|
|
/* Transport provides local (loopback) communication */
|
|
#define VSOCK_TRANSPORT_F_LOCAL 0x00000008
|
|
|
|
struct vsock_transport {
|
|
struct module *module;
|
|
|
|
/* Initialize/tear-down socket. */
|
|
int (*init)(struct vsock_sock *, struct vsock_sock *);
|
|
void (*destruct)(struct vsock_sock *);
|
|
void (*release)(struct vsock_sock *);
|
|
|
|
/* Cancel all pending packets sent on vsock. */
|
|
int (*cancel_pkt)(struct vsock_sock *vsk);
|
|
|
|
/* Connections. */
|
|
int (*connect)(struct vsock_sock *);
|
|
|
|
/* DGRAM. */
|
|
int (*dgram_bind)(struct vsock_sock *, struct sockaddr_vm *);
|
|
int (*dgram_dequeue)(struct vsock_sock *vsk, struct msghdr *msg,
|
|
size_t len, int flags);
|
|
int (*dgram_enqueue)(struct vsock_sock *, struct sockaddr_vm *,
|
|
struct msghdr *, size_t len);
|
|
bool (*dgram_allow)(struct vsock_sock *vsk, u32 cid, u32 port);
|
|
|
|
/* STREAM. */
|
|
/* TODO: stream_bind() */
|
|
ssize_t (*stream_dequeue)(struct vsock_sock *, struct msghdr *,
|
|
size_t len, int flags);
|
|
ssize_t (*stream_enqueue)(struct vsock_sock *, struct msghdr *,
|
|
size_t len);
|
|
s64 (*stream_has_data)(struct vsock_sock *);
|
|
s64 (*stream_has_space)(struct vsock_sock *);
|
|
u64 (*stream_rcvhiwat)(struct vsock_sock *);
|
|
bool (*stream_is_active)(struct vsock_sock *);
|
|
bool (*stream_allow)(struct vsock_sock *vsk, u32 cid, u32 port);
|
|
|
|
/* SEQ_PACKET. */
|
|
ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg,
|
|
int flags);
|
|
int (*seqpacket_enqueue)(struct vsock_sock *vsk, struct msghdr *msg,
|
|
size_t len);
|
|
bool (*seqpacket_allow)(struct vsock_sock *vsk, u32 remote_cid);
|
|
u32 (*seqpacket_has_data)(struct vsock_sock *vsk);
|
|
|
|
/* Notification. */
|
|
int (*notify_poll_in)(struct vsock_sock *, size_t, bool *);
|
|
int (*notify_poll_out)(struct vsock_sock *, size_t, bool *);
|
|
int (*notify_recv_init)(struct vsock_sock *, size_t,
|
|
struct vsock_transport_recv_notify_data *);
|
|
int (*notify_recv_pre_block)(struct vsock_sock *, size_t,
|
|
struct vsock_transport_recv_notify_data *);
|
|
int (*notify_recv_pre_dequeue)(struct vsock_sock *, size_t,
|
|
struct vsock_transport_recv_notify_data *);
|
|
int (*notify_recv_post_dequeue)(struct vsock_sock *, size_t,
|
|
ssize_t, bool, struct vsock_transport_recv_notify_data *);
|
|
int (*notify_send_init)(struct vsock_sock *,
|
|
struct vsock_transport_send_notify_data *);
|
|
int (*notify_send_pre_block)(struct vsock_sock *,
|
|
struct vsock_transport_send_notify_data *);
|
|
int (*notify_send_pre_enqueue)(struct vsock_sock *,
|
|
struct vsock_transport_send_notify_data *);
|
|
int (*notify_send_post_enqueue)(struct vsock_sock *, ssize_t,
|
|
struct vsock_transport_send_notify_data *);
|
|
/* sk_lock held by the caller */
|
|
void (*notify_buffer_size)(struct vsock_sock *, u64 *);
|
|
int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val);
|
|
|
|
/* SIOCOUTQ ioctl */
|
|
ssize_t (*unsent_bytes)(struct vsock_sock *vsk);
|
|
|
|
/* Shutdown. */
|
|
int (*shutdown)(struct vsock_sock *, int);
|
|
|
|
/* Addressing. */
|
|
u32 (*get_local_cid)(void);
|
|
|
|
/* Check if this transport serves a specific remote CID.
|
|
* For H2G transports: return true if the CID belongs to a registered
|
|
* guest. If not implemented, all CIDs > VMADDR_CID_HOST go to H2G.
|
|
* For G2H transports: return true if the transport can reach arbitrary
|
|
* CIDs via the hypervisor (i.e. supports the fallback overlay). VMCI
|
|
* does not implement this as it only serves CIDs 0 and 2.
|
|
*/
|
|
bool (*has_remote_cid)(struct vsock_sock *vsk, u32 remote_cid);
|
|
|
|
/* Read a single skb */
|
|
int (*read_skb)(struct vsock_sock *, skb_read_actor_t);
|
|
|
|
/* Zero-copy. */
|
|
bool (*msgzerocopy_allow)(void);
|
|
};
|
|
|
|
/**** CORE ****/
|
|
|
|
int vsock_core_register(const struct vsock_transport *t, int features);
|
|
void vsock_core_unregister(const struct vsock_transport *t);
|
|
|
|
/* The transport may downcast this to access transport-specific functions */
|
|
const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk);
|
|
|
|
/**** UTILS ****/
|
|
|
|
/* vsock_table_lock must be held */
|
|
static inline bool __vsock_in_bound_table(struct vsock_sock *vsk)
|
|
{
|
|
return !list_empty(&vsk->bound_table);
|
|
}
|
|
|
|
/* vsock_table_lock must be held */
|
|
static inline bool __vsock_in_connected_table(struct vsock_sock *vsk)
|
|
{
|
|
return !list_empty(&vsk->connected_table);
|
|
}
|
|
|
|
void vsock_add_pending(struct sock *listener, struct sock *pending);
|
|
void vsock_remove_pending(struct sock *listener, struct sock *pending);
|
|
void vsock_enqueue_accept(struct sock *listener, struct sock *connected);
|
|
void vsock_insert_connected(struct vsock_sock *vsk);
|
|
void vsock_remove_bound(struct vsock_sock *vsk);
|
|
void vsock_remove_connected(struct vsock_sock *vsk);
|
|
struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr);
|
|
struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
|
|
struct sockaddr_vm *dst);
|
|
struct sock *vsock_find_bound_socket_net(struct sockaddr_vm *addr,
|
|
struct net *net);
|
|
struct sock *vsock_find_connected_socket_net(struct sockaddr_vm *src,
|
|
struct sockaddr_vm *dst,
|
|
struct net *net);
|
|
void vsock_remove_sock(struct vsock_sock *vsk);
|
|
void vsock_for_each_connected_socket(struct vsock_transport *transport,
|
|
void (*fn)(struct sock *sk));
|
|
int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk);
|
|
bool vsock_find_cid(unsigned int cid);
|
|
void vsock_linger(struct sock *sk);
|
|
|
|
/**** TAP ****/
|
|
|
|
struct vsock_tap {
|
|
struct net_device *dev;
|
|
struct module *module;
|
|
struct list_head list;
|
|
};
|
|
|
|
int vsock_add_tap(struct vsock_tap *vt);
|
|
int vsock_remove_tap(struct vsock_tap *vt);
|
|
void vsock_deliver_tap(struct sk_buff *build_skb(void *opaque), void *opaque);
|
|
int __vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
|
|
int flags);
|
|
int vsock_connectible_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
|
|
int flags);
|
|
int __vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
|
|
size_t len, int flags);
|
|
int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
|
|
size_t len, int flags);
|
|
|
|
extern struct proto vsock_proto;
|
|
#ifdef CONFIG_BPF_SYSCALL
|
|
int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
|
|
void __init vsock_bpf_build_proto(void);
|
|
#else
|
|
static inline void __init vsock_bpf_build_proto(void)
|
|
{}
|
|
#endif
|
|
|
|
static inline bool vsock_msgzerocopy_allow(const struct vsock_transport *t)
|
|
{
|
|
return t->msgzerocopy_allow && t->msgzerocopy_allow();
|
|
}
|
|
|
|
static inline enum vsock_net_mode vsock_net_mode(struct net *net)
|
|
{
|
|
if (!net)
|
|
return VSOCK_NET_MODE_GLOBAL;
|
|
|
|
return READ_ONCE(net->vsock.mode);
|
|
}
|
|
|
|
static inline bool vsock_net_mode_global(struct vsock_sock *vsk)
|
|
{
|
|
return vsock_net_mode(sock_net(sk_vsock(vsk))) == VSOCK_NET_MODE_GLOBAL;
|
|
}
|
|
|
|
static inline bool vsock_net_set_child_mode(struct net *net,
|
|
enum vsock_net_mode mode)
|
|
{
|
|
int new_locked = mode + 1;
|
|
int old_locked = 0; /* unlocked */
|
|
|
|
if (try_cmpxchg(&net->vsock.child_ns_mode_locked,
|
|
&old_locked, new_locked)) {
|
|
WRITE_ONCE(net->vsock.child_ns_mode, mode);
|
|
return true;
|
|
}
|
|
|
|
return old_locked == new_locked;
|
|
}
|
|
|
|
static inline enum vsock_net_mode vsock_net_child_mode(struct net *net)
|
|
{
|
|
return READ_ONCE(net->vsock.child_ns_mode);
|
|
}
|
|
|
|
/* Return true if two namespaces pass the mode rules. Otherwise, return false.
|
|
*
|
|
* A NULL namespace is treated as VSOCK_NET_MODE_GLOBAL.
|
|
*
|
|
* Read more about modes in the comment header of net/vmw_vsock/af_vsock.c.
|
|
*/
|
|
static inline bool vsock_net_check_mode(struct net *ns0, struct net *ns1)
|
|
{
|
|
enum vsock_net_mode mode0, mode1;
|
|
|
|
/* Any vsocks within the same network namespace are always reachable,
|
|
* regardless of the mode.
|
|
*/
|
|
if (net_eq(ns0, ns1))
|
|
return true;
|
|
|
|
mode0 = vsock_net_mode(ns0);
|
|
mode1 = vsock_net_mode(ns1);
|
|
|
|
/* Different namespaces are only reachable if they are both
|
|
* global mode.
|
|
*/
|
|
return mode0 == VSOCK_NET_MODE_GLOBAL && mode0 == mode1;
|
|
}
|
|
#endif /* __AF_VSOCK_H__ */
|