diff --git a/Documentation/arch/s390/vfio-ap.rst b/Documentation/arch/s390/vfio-ap.rst index eba1991fbdba..ac0c07f76ddd 100644 --- a/Documentation/arch/s390/vfio-ap.rst +++ b/Documentation/arch/s390/vfio-ap.rst @@ -431,17 +431,14 @@ matrix device. * callback interfaces open_device: - The vfio_ap driver uses this callback to register a - VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the matrix mdev - devices. The open_device callback is invoked by userspace to connect the - VFIO iommu group for the matrix mdev device to the MDEV bus. Access to the - KVM structure used to configure the KVM guest is provided via this callback. - The KVM structure, is used to configure the guest's access to the AP matrix - defined via the vfio_ap mediated device's sysfs attribute files. + the open_device callback is invoked by userspace to connect the + VFIO iommu group for the matrix mdev device to the MDEV bus. The + callback retrieves the KVM structure used to configure the KVM guest + and configures the guest's access to the AP matrix defined via the + vfio_ap mediated device's sysfs attribute files. close_device: - unregisters the VFIO_GROUP_NOTIFY_SET_KVM notifier callback function for the - matrix mdev device and deconfigures the guest's AP matrix. + this callback deconfigures the guest's AP matrix. ioctl: this callback handles the VFIO_DEVICE_GET_INFO and VFIO_DEVICE_RESET ioctls @@ -449,9 +446,8 @@ matrix device. Configure the guest's AP resources ---------------------------------- -Configuring the AP resources for a KVM guest will be performed when the -VFIO_GROUP_NOTIFY_SET_KVM notifier callback is invoked. The notifier -function is called when userspace connects to KVM. The guest's AP resources are +Configuring the AP resources for a KVM guest will be performed at the +time of ``open_device`` and ``close_device``. 
The guest's AP resources are configured via its APCB by: * Setting the bits in the APM corresponding to the APIDs assigned to the diff --git a/MAINTAINERS b/MAINTAINERS index 447189411512..41f4600e8a03 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -27902,6 +27902,12 @@ L: kvm@vger.kernel.org S: Maintained F: drivers/vfio/pci/hisilicon/ +VFIO ISM PCI DRIVER +M: Julian Ruess +L: kvm@vger.kernel.org +S: Maintained +F: drivers/vfio/pci/ism/ + VFIO MEDIATED DEVICE DRIVERS M: Kirti Wankhede L: kvm@vger.kernel.org diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c index 4f15016d2a5f..b2299e5bc6df 100644 --- a/drivers/vfio/group.c +++ b/drivers/vfio/group.c @@ -15,8 +15,13 @@ #include #include "vfio.h" +static char *vfio_devnode(const struct device *, umode_t *); +static const struct class vfio_class = { + .name = "vfio", + .devnode = vfio_devnode +}; + static struct vfio { - struct class *class; struct list_head group_list; struct mutex group_lock; /* locks group_list */ struct ida group_ida; @@ -456,7 +461,6 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep) * Device FDs hold a group file reference, therefore the group release * is only called when there are no open devices. 
*/ - WARN_ON(group->notifier.head); if (group->container) vfio_group_detach_container(group); if (group->iommufd) { @@ -527,7 +531,7 @@ static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, device_initialize(&group->dev); group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor); - group->dev.class = vfio.class; + group->dev.class = &vfio_class; group->dev.release = vfio_group_release; cdev_init(&group->cdev, &vfio_group_fops); group->cdev.owner = THIS_MODULE; @@ -541,7 +545,6 @@ static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, /* put in vfio_group_release() */ iommu_group_ref_get(iommu_group); group->type = type; - BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier); return group; } @@ -720,7 +723,6 @@ void vfio_device_remove_group(struct vfio_device *device) * properly hold the group reference. */ WARN_ON(!list_empty(&group->device_list)); - WARN_ON(group->notifier.head); /* * Revoke all users of group->iommu_group. At this point we know there @@ -901,13 +903,9 @@ int __init vfio_group_init(void) return ret; /* /dev/vfio/$GROUP */ - vfio.class = class_create("vfio"); - if (IS_ERR(vfio.class)) { - ret = PTR_ERR(vfio.class); + ret = class_register(&vfio_class); + if (ret) goto err_group_class; - } - - vfio.class->devnode = vfio_devnode; ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio"); if (ret) @@ -915,8 +913,7 @@ int __init vfio_group_init(void) return 0; err_alloc_chrdev: - class_destroy(vfio.class); - vfio.class = NULL; + class_unregister(&vfio_class); err_group_class: vfio_container_cleanup(); return ret; @@ -927,7 +924,6 @@ void vfio_group_cleanup(void) WARN_ON(!list_empty(&vfio.group_list)); ida_destroy(&vfio.group_ida); unregister_chrdev_region(vfio.group_devt, MINORMASK + 1); - class_destroy(vfio.class); - vfio.class = NULL; + class_unregister(&vfio_class); vfio_container_cleanup(); } diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 1e82b44bda1a..296bf01e185e 100644 --- 
a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -60,6 +60,8 @@ config VFIO_PCI_DMABUF source "drivers/vfio/pci/mlx5/Kconfig" +source "drivers/vfio/pci/ism/Kconfig" + source "drivers/vfio/pci/hisilicon/Kconfig" source "drivers/vfio/pci/pds/Kconfig" diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index e0a0757dd1d2..6138f1bf241d 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -11,6 +11,8 @@ obj-$(CONFIG_VFIO_PCI) += vfio-pci.o obj-$(CONFIG_MLX5_VFIO_PCI) += mlx5/ +obj-$(CONFIG_ISM_VFIO_PCI) += ism/ + obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/ obj-$(CONFIG_PDS_VFIO_PCI) += pds/ diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 1d367cff7dcf..bb121f635b9f 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -857,18 +857,12 @@ static long hisi_acc_vf_precopy_ioctl(struct file *filp, struct hisi_acc_vf_core_device *hisi_acc_vdev = migf->hisi_acc_vdev; loff_t *pos = &filp->f_pos; struct vfio_precopy_info info; - unsigned long minsz; int ret; - if (cmd != VFIO_MIG_GET_PRECOPY_INFO) - return -ENOTTY; - - minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - if (info.argsz < minsz) - return -EINVAL; + ret = vfio_check_precopy_ioctl(&hisi_acc_vdev->core_device.vdev, cmd, + arg, &info); + if (ret) + return ret; mutex_lock(&hisi_acc_vdev->state_mutex); if (hisi_acc_vdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY) { @@ -893,7 +887,8 @@ static long hisi_acc_vf_precopy_ioctl(struct file *filp, mutex_unlock(&migf->lock); mutex_unlock(&hisi_acc_vdev->state_mutex); - return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; + return copy_to_user((void __user *)arg, &info, + offsetofend(struct vfio_precopy_info, dirty_bytes)) ? 
-EFAULT : 0; out: mutex_unlock(&migf->lock); mutex_unlock(&hisi_acc_vdev->state_mutex); diff --git a/drivers/vfio/pci/ism/Kconfig b/drivers/vfio/pci/ism/Kconfig new file mode 100644 index 000000000000..02f47d25fed2 --- /dev/null +++ b/drivers/vfio/pci/ism/Kconfig @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 +config ISM_VFIO_PCI + tristate "VFIO support for ISM devices" + depends on S390 + select VFIO_PCI_CORE + help + This provides user space support for IBM Internal Shared Memory (ISM) + Adapter devices using the VFIO framework. + + If you don't know what to do here, say N. diff --git a/drivers/vfio/pci/ism/Makefile b/drivers/vfio/pci/ism/Makefile new file mode 100644 index 000000000000..32cc3c66dd11 --- /dev/null +++ b/drivers/vfio/pci/ism/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_ISM_VFIO_PCI) += ism-vfio-pci.o +ism-vfio-pci-y := main.o diff --git a/drivers/vfio/pci/ism/main.c b/drivers/vfio/pci/ism/main.c new file mode 100644 index 000000000000..f83e09b915ed --- /dev/null +++ b/drivers/vfio/pci/ism/main.c @@ -0,0 +1,408 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * vfio-ISM driver for s390 + * + * Copyright IBM Corp. 
+ */ + +#include +#include "../vfio_pci_priv.h" + +#define ISM_VFIO_PCI_OFFSET_SHIFT 48 +#define ISM_VFIO_PCI_OFFSET_TO_INDEX(off) ((off) >> ISM_VFIO_PCI_OFFSET_SHIFT) +#define ISM_VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << ISM_VFIO_PCI_OFFSET_SHIFT) +#define ISM_VFIO_PCI_OFFSET_MASK (((u64)(1) << ISM_VFIO_PCI_OFFSET_SHIFT) - 1) + +/* + * Use __zpci_load() to bypass automatic use of + * PCI MIO instructions which are not supported on ISM devices + */ +#define ISM_READ(size) \ + static int ism_read##size(struct zpci_dev *zdev, int bar, \ + size_t *filled, char __user *buf, \ + loff_t off) \ + { \ + u64 req, tmp; \ + u##size val; \ + int ret; \ + \ + req = ZPCI_CREATE_REQ(READ_ONCE(zdev->fh), bar, sizeof(val)); \ + ret = __zpci_load(&tmp, req, off); \ + if (ret) \ + return ret; \ + val = (u##size)tmp; \ + if (copy_to_user(buf, &val, sizeof(val))) \ + return -EFAULT; \ + *filled = sizeof(val); \ + return 0; \ + } + +ISM_READ(64); +ISM_READ(32); +ISM_READ(16); +ISM_READ(8); + +struct ism_vfio_pci_core_device { + struct vfio_pci_core_device core_device; + struct kmem_cache *store_block_cache; +}; + +static int ism_vfio_pci_open_device(struct vfio_device *core_vdev) +{ + struct ism_vfio_pci_core_device *ivpcd; + struct vfio_pci_core_device *vdev; + int ret; + + ivpcd = container_of(core_vdev, struct ism_vfio_pci_core_device, + core_device.vdev); + vdev = &ivpcd->core_device; + + ret = vfio_pci_core_enable(vdev); + if (ret) + return ret; + + vfio_pci_core_finish_enable(vdev); + return 0; +} + +/* + * ism_vfio_pci_do_io_r() + * + * On s390, kernel primitives such as ioread() and iowrite() are switched over + * from function-handle-based PCI load/stores instructions to PCI memory-I/O (MIO) + * loads/stores when these are available and not explicitly disabled. Since these + * instructions cannot be used with ISM devices, ensure that classic + * function-handle-based PCI instructions are used instead. 
+ */ +static ssize_t ism_vfio_pci_do_io_r(struct vfio_pci_core_device *vdev, + char __user *buf, loff_t off, size_t count, + int bar) +{ + struct zpci_dev *zdev = to_zpci(vdev->pdev); + ssize_t done = 0; + int ret; + + while (count) { + size_t filled; + + if (count >= 8 && IS_ALIGNED(off, 8)) { + ret = ism_read64(zdev, bar, &filled, buf, off); + if (ret) + return ret; + } else if (count >= 4 && IS_ALIGNED(off, 4)) { + ret = ism_read32(zdev, bar, &filled, buf, off); + if (ret) + return ret; + } else if (count >= 2 && IS_ALIGNED(off, 2)) { + ret = ism_read16(zdev, bar, &filled, buf, off); + if (ret) + return ret; + } else { + ret = ism_read8(zdev, bar, &filled, buf, off); + if (ret) + return ret; + } + + count -= filled; + done += filled; + off += filled; + buf += filled; + } + + return done; +} + +/* + * ism_vfio_pci_do_io_w() + * + * Ensure that the PCI store block (PCISTB) instruction is used as required by the + * ISM device. The ISM device also uses a 256 TiB BAR 0 for write operations, + * which requires a 48bit region address space (ISM_VFIO_PCI_OFFSET_SHIFT). 
+ */ +static ssize_t ism_vfio_pci_do_io_w(struct vfio_pci_core_device *vdev, + char __user *buf, loff_t off, size_t count, + int bar) +{ + struct zpci_dev *zdev = to_zpci(vdev->pdev); + struct ism_vfio_pci_core_device *ivpcd; + ssize_t ret; + void *data; + u64 req; + + if (count > zdev->maxstbl) + return -EINVAL; + if (((off % PAGE_SIZE) + count) > PAGE_SIZE) + return -EINVAL; + + ivpcd = container_of(vdev, struct ism_vfio_pci_core_device, + core_device); + data = kmem_cache_alloc(ivpcd->store_block_cache, GFP_KERNEL); + if (!data) + return -ENOMEM; + + if (copy_from_user(data, buf, count)) { + ret = -EFAULT; + goto out_free; + } + + req = ZPCI_CREATE_REQ(READ_ONCE(zdev->fh), bar, count); + ret = __zpci_store_block(data, req, off); + if (ret) + goto out_free; + + ret = count; + +out_free: + kmem_cache_free(ivpcd->store_block_cache, data); + return ret; +} + +static ssize_t ism_vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, + char __user *buf, size_t count, loff_t *ppos, + bool iswrite) +{ + int bar = ISM_VFIO_PCI_OFFSET_TO_INDEX(*ppos); + loff_t pos = *ppos & ISM_VFIO_PCI_OFFSET_MASK; + resource_size_t end; + ssize_t done = 0; + + if (pci_resource_start(vdev->pdev, bar)) + end = pci_resource_len(vdev->pdev, bar); + else + return -EINVAL; + + if (pos >= end) + return -EINVAL; + + count = min(count, (size_t)(end - pos)); + + if (iswrite) + done = ism_vfio_pci_do_io_w(vdev, buf, pos, count, bar); + else + done = ism_vfio_pci_do_io_r(vdev, buf, pos, count, bar); + + if (done >= 0) + *ppos += done; + + return done; +} + +static ssize_t ism_vfio_pci_config_rw(struct vfio_pci_core_device *vdev, + char __user *buf, size_t count, + loff_t *ppos, bool iswrite) +{ + loff_t pos = *ppos; + size_t done = 0; + int ret = 0; + + pos &= ISM_VFIO_PCI_OFFSET_MASK; + + while (count) { + /* + * zPCI must not use MIO instructions for config space access, + * so we can use common code path here. 
+ */ + ret = vfio_pci_config_rw_single(vdev, buf, count, &pos, iswrite); + if (ret < 0) + return ret; + + count -= ret; + done += ret; + buf += ret; + pos += ret; + } + + *ppos += done; + + return done; +} + +static ssize_t ism_vfio_pci_rw(struct vfio_device *core_vdev, char __user *buf, + size_t count, loff_t *ppos, bool iswrite) +{ + unsigned int index = ISM_VFIO_PCI_OFFSET_TO_INDEX(*ppos); + struct vfio_pci_core_device *vdev; + int ret; + + vdev = container_of(core_vdev, struct vfio_pci_core_device, vdev); + + if (!count) + return 0; + + switch (index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + ret = ism_vfio_pci_config_rw(vdev, buf, count, ppos, iswrite); + break; + + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: + ret = ism_vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite); + break; + + default: + return -EINVAL; + } + + return ret; +} + +static ssize_t ism_vfio_pci_read(struct vfio_device *core_vdev, + char __user *buf, size_t count, loff_t *ppos) +{ + return ism_vfio_pci_rw(core_vdev, buf, count, ppos, false); +} + +static ssize_t ism_vfio_pci_write(struct vfio_device *core_vdev, + const char __user *buf, size_t count, + loff_t *ppos) +{ + return ism_vfio_pci_rw(core_vdev, (char __user *)buf, count, ppos, + true); +} + +static int ism_vfio_pci_ioctl_get_region_info(struct vfio_device *core_vdev, + struct vfio_region_info *info, + struct vfio_info_cap *caps) +{ + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + struct pci_dev *pdev = vdev->pdev; + + switch (info->index) { + case VFIO_PCI_CONFIG_REGION_INDEX: + info->offset = ISM_VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = pdev->cfg_size; + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + break; + case VFIO_PCI_BAR0_REGION_INDEX ... 
VFIO_PCI_BAR5_REGION_INDEX: + info->offset = ISM_VFIO_PCI_INDEX_TO_OFFSET(info->index); + info->size = pci_resource_len(pdev, info->index); + if (!info->size) { + info->flags = 0; + break; + } + info->flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + break; + default: + info->offset = 0; + info->size = 0; + info->flags = 0; + return -EINVAL; + } + return 0; +} + +static int ism_vfio_pci_init_dev(struct vfio_device *core_vdev) +{ + struct zpci_dev *zdev = to_zpci(to_pci_dev(core_vdev->dev)); + struct ism_vfio_pci_core_device *ivpcd; + char cache_name[20]; + int ret; + + ivpcd = container_of(core_vdev, struct ism_vfio_pci_core_device, + core_device.vdev); + + snprintf(cache_name, sizeof(cache_name), "ism_sb_fid_%08x", zdev->fid); + + ivpcd->store_block_cache = + kmem_cache_create(cache_name, zdev->maxstbl, + (&(struct kmem_cache_args){ + .align = PAGE_SIZE, + .useroffset = 0, + .usersize = zdev->maxstbl, + }), + (SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT)); + if (!ivpcd->store_block_cache) + return -ENOMEM; + + ret = vfio_pci_core_init_dev(core_vdev); + if (ret) + kmem_cache_destroy(ivpcd->store_block_cache); + + return ret; +} + +static void ism_vfio_pci_release_dev(struct vfio_device *core_vdev) +{ + struct ism_vfio_pci_core_device *ivpcd = container_of( + core_vdev, struct ism_vfio_pci_core_device, core_device.vdev); + + kmem_cache_destroy(ivpcd->store_block_cache); + vfio_pci_core_release_dev(core_vdev); +} + +static const struct vfio_device_ops ism_pci_ops = { + .name = "ism-vfio-pci", + .init = ism_vfio_pci_init_dev, + .release = ism_vfio_pci_release_dev, + .open_device = ism_vfio_pci_open_device, + .close_device = vfio_pci_core_close_device, + .ioctl = vfio_pci_core_ioctl, + .get_region_info_caps = ism_vfio_pci_ioctl_get_region_info, + .device_feature = vfio_pci_core_ioctl_feature, + .read = ism_vfio_pci_read, + .write = ism_vfio_pci_write, + .request = vfio_pci_core_request, + .match = vfio_pci_core_match, + .match_token_uuid = 
vfio_pci_core_match_token_uuid, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, + .detach_ioas = vfio_iommufd_physical_detach_ioas, +}; + +static int ism_vfio_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + struct ism_vfio_pci_core_device *ivpcd; + int ret; + + ivpcd = vfio_alloc_device(ism_vfio_pci_core_device, core_device.vdev, + &pdev->dev, &ism_pci_ops); + if (IS_ERR(ivpcd)) + return PTR_ERR(ivpcd); + + dev_set_drvdata(&pdev->dev, &ivpcd->core_device); + + ret = vfio_pci_core_register_device(&ivpcd->core_device); + if (ret) + vfio_put_device(&ivpcd->core_device.vdev); + + return ret; +} + +static void ism_vfio_pci_remove(struct pci_dev *pdev) +{ + struct vfio_pci_core_device *core_device; + struct ism_vfio_pci_core_device *ivpcd; + + core_device = dev_get_drvdata(&pdev->dev); + ivpcd = container_of(core_device, struct ism_vfio_pci_core_device, + core_device); + + vfio_pci_core_unregister_device(&ivpcd->core_device); + vfio_put_device(&ivpcd->core_device.vdev); +} + +static const struct pci_device_id ism_device_table[] = { + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_IBM, + PCI_DEVICE_ID_IBM_ISM) }, + {} +}; +MODULE_DEVICE_TABLE(pci, ism_device_table); + +static struct pci_driver ism_vfio_pci_driver = { + .name = KBUILD_MODNAME, + .id_table = ism_device_table, + .probe = ism_vfio_pci_probe, + .remove = ism_vfio_pci_remove, + .err_handler = &vfio_pci_core_err_handlers, + .driver_managed_dma = true, +}; + +module_pci_driver(ism_vfio_pci_driver); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("vfio-pci variant driver for the IBM Internal Shared Memory (ISM) device"); +MODULE_AUTHOR("IBM Corporation"); diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index ca6d95f293cd..5fe0621b5fbd 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -87,7 +87,7 @@ int mlx5vf_cmd_resume_vhca(struct 
mlx5vf_pci_core_device *mvdev, u16 op_mod) int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, size_t *state_size, u64 *total_size, - u8 query_flags) + u8 *mig_state, u8 query_flags) { u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {}; u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {}; @@ -152,6 +152,10 @@ int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, MLX5_GET64(query_vhca_migration_state_out, out, remaining_total_size) : *state_size; + if (mig_state && mvdev->mig_state_cap) + *mig_state = MLX5_GET(query_vhca_migration_state_out, out, + migration_state); + return 0; } @@ -277,6 +281,9 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, if (MLX5_CAP_GEN_2(mvdev->mdev, migration_in_chunks)) mvdev->chunk_mode = 1; + if (MLX5_CAP_GEN_2(mvdev->mdev, migration_state)) + mvdev->mig_state_cap = 1; + end: mlx5_vf_put_core_dev(mvdev->mdev); } @@ -555,6 +562,7 @@ void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf) { spin_lock_irq(&buf->migf->list_lock); buf->stop_copy_chunk_num = 0; + buf->pre_copy_init_bytes_chunk = false; list_add_tail(&buf->buf_elm, &buf->migf->avail_list); spin_unlock_irq(&buf->migf->list_lock); } @@ -606,6 +614,8 @@ static void mlx5vf_save_callback_complete(struct mlx5_vf_migration_file *migf, struct mlx5vf_async_data *async_data) { + migf->inflight_save = 0; + wake_up_interruptible(&migf->poll_wait); kvfree(async_data->out); complete(&migf->save_comp); fput(migf->filp); @@ -687,7 +697,8 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) !next_required_umem_size; if (async_data->header_buf) { status = add_buf_header(async_data->header_buf, image_size, - initial_pre_copy); + initial_pre_copy || + async_data->buf->pre_copy_init_bytes_chunk); if (status) goto err; } @@ -706,9 +717,12 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) } } spin_unlock_irqrestore(&migf->list_lock, flags); - 
if (initial_pre_copy) { + if (initial_pre_copy || async_data->buf->pre_copy_init_bytes_chunk) { migf->pre_copy_initial_bytes += image_size; - migf->state = MLX5_MIGF_STATE_PRE_COPY; + if (initial_pre_copy) + migf->state = MLX5_MIGF_STATE_PRE_COPY; + if (async_data->buf->pre_copy_init_bytes_chunk) + async_data->buf->pre_copy_init_bytes_chunk = false; } if (stop_copy_last_chunk) migf->state = MLX5_MIGF_STATE_COMPLETE; @@ -809,6 +823,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, async_data->header_buf = header_buf; get_file(migf->filp); + migf->inflight_save = 1; err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in), async_data->out, out_size, mlx5vf_save_callback, @@ -819,6 +834,8 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, return 0; err_exec: + migf->inflight_save = 0; + wake_up_interruptible(&migf->poll_wait); if (header_buf) mlx5vf_put_data_buffer(header_buf); fput(migf->filp); diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index d7821b5ca772..deed0f132f39 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -62,6 +62,7 @@ struct mlx5_vhca_data_buffer { u32 *mkey_in; enum dma_data_direction dma_dir; u8 stop_copy_chunk_num; + bool pre_copy_init_bytes_chunk; struct list_head buf_elm; struct mlx5_vf_migration_file *migf; }; @@ -97,6 +98,7 @@ struct mlx5_vf_migration_file { u32 record_tag; u64 stop_copy_prep_size; u64 pre_copy_initial_bytes; + u64 pre_copy_initial_bytes_start; size_t next_required_umem_size; u8 num_ready_chunks; /* Upon chunk mode preserve another set of buffers for stop_copy phase */ @@ -111,6 +113,7 @@ struct mlx5_vf_migration_file { struct completion save_comp; struct mlx5_async_ctx async_ctx; struct mlx5vf_async_data async_data; + u8 inflight_save:1; }; struct mlx5_vhca_cq_buf { @@ -174,6 +177,7 @@ struct mlx5vf_pci_core_device { u8 mdev_detach:1; u8 log_active:1; u8 chunk_mode:1; + u8 mig_state_cap:1; struct completion tracker_comp; /* 
protect migration state */ struct mutex state_mutex; @@ -198,7 +202,7 @@ int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod); int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev, size_t *state_size, u64 *total_size, - u8 query_flags); + u8 *migration_state, u8 query_flags); void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, const struct vfio_migration_ops *mig_ops, const struct vfio_log_ops *log_ops); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index dbba6173894b..de306dee1d1a 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -179,7 +179,8 @@ static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len, !list_empty(&migf->buf_list) || migf->state == MLX5_MIGF_STATE_ERROR || migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR || - migf->state == MLX5_MIGF_STATE_PRE_COPY || + (migf->state == MLX5_MIGF_STATE_PRE_COPY && + !migf->inflight_save) || migf->state == MLX5_MIGF_STATE_COMPLETE)) return -ERESTARTSYS; } @@ -463,21 +464,16 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, struct mlx5_vhca_data_buffer *buf; struct vfio_precopy_info info = {}; loff_t *pos = &filp->f_pos; - unsigned long minsz; + u8 migration_state = 0; size_t inc_length = 0; - bool end_of_data = false; + bool reinit_state; + bool end_of_data; int ret; - if (cmd != VFIO_MIG_GET_PRECOPY_INFO) - return -ENOTTY; - - minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; + ret = vfio_check_precopy_ioctl(&mvdev->core_device.vdev, cmd, arg, + &info); + if (ret) + return ret; mutex_lock(&mvdev->state_mutex); if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY && @@ -498,7 +494,8 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned 
int cmd, * As so, the other code below is safe with the proper locks. */ ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length, - NULL, MLX5VF_QUERY_INC); + NULL, &migration_state, + MLX5VF_QUERY_INC); if (ret) goto err_state_unlock; } @@ -509,43 +506,70 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, goto err_migf_unlock; } - if (migf->pre_copy_initial_bytes > *pos) { - info.initial_bytes = migf->pre_copy_initial_bytes - *pos; - } else { - info.dirty_bytes = migf->max_pos - *pos; - if (!info.dirty_bytes) - end_of_data = true; - info.dirty_bytes += inc_length; - } - - if (!end_of_data || !inc_length) { - mutex_unlock(&migf->lock); - goto done; - } - - mutex_unlock(&migf->lock); /* - * We finished transferring the current state and the device has a - * dirty state, save a new state to be ready for. + * opt-in for VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2 serves + * as opt-in for VFIO_PRECOPY_INFO_REINIT as well */ - buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(inc_length, PAGE_SIZE), - DMA_FROM_DEVICE); - if (IS_ERR(buf)) { - ret = PTR_ERR(buf); - mlx5vf_mark_err(migf); - goto err_state_unlock; + reinit_state = mvdev->core_device.vdev.precopy_info_v2 && + migration_state == MLX5_QUERY_VHCA_MIG_STATE_OPER_MIGRATION_INIT; + end_of_data = !(migf->max_pos - *pos); + if (reinit_state) { + /* + * Any bytes already present in memory are treated as initial + * bytes, since the caller is required to read them before + * reaching the new initial-bytes region. 
+ */ + migf->pre_copy_initial_bytes_start = *pos; + migf->pre_copy_initial_bytes = migf->max_pos - *pos; + info.initial_bytes = migf->pre_copy_initial_bytes + inc_length; + info.flags |= VFIO_PRECOPY_INFO_REINIT; + } else { + if (migf->pre_copy_initial_bytes_start + + migf->pre_copy_initial_bytes > *pos) { + WARN_ON_ONCE(end_of_data); + info.initial_bytes = migf->pre_copy_initial_bytes_start + + migf->pre_copy_initial_bytes - *pos; + } else { + info.dirty_bytes = (migf->max_pos - *pos) + inc_length; + } + } + mutex_unlock(&migf->lock); + + if ((reinit_state || end_of_data) && inc_length) { + /* + * In case we finished transferring the current state and the + * device has a dirty state, or that the device has a new init + * state, save a new state to be ready for. + */ + buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(inc_length, PAGE_SIZE), + DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + mlx5vf_mark_err(migf); + goto err_state_unlock; + } + + buf->pre_copy_init_bytes_chunk = reinit_state; + ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true); + if (ret) { + mlx5vf_mark_err(migf); + mlx5vf_put_data_buffer(buf); + goto err_state_unlock; + } + + /* + * SAVE appends a header record via add_buf_header(), + * let's account it as well. 
+ */ + if (reinit_state) + info.initial_bytes += sizeof(struct mlx5_vf_migration_header); + else + info.dirty_bytes += sizeof(struct mlx5_vf_migration_header); } - ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true); - if (ret) { - mlx5vf_mark_err(migf); - mlx5vf_put_data_buffer(buf); - goto err_state_unlock; - } - -done: mlx5vf_state_mutex_unlock(mvdev); - if (copy_to_user((void __user *)arg, &info, minsz)) + if (copy_to_user((void __user *)arg, &info, + offsetofend(struct vfio_precopy_info, dirty_bytes))) return -EFAULT; return 0; @@ -575,7 +599,7 @@ static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev) if (migf->state == MLX5_MIGF_STATE_ERROR) return -ENODEV; - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL, + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL, NULL, MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL); if (ret) goto err; @@ -641,7 +665,7 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) if (ret) goto out; - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0); + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, NULL, 0); if (ret) goto out_pd; @@ -1128,7 +1152,7 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, enum mlx5_vf_migf_state state; size_t size; - ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &size, NULL, + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &size, NULL, NULL, MLX5VF_QUERY_INC | MLX5VF_QUERY_CLEANUP); if (ret) return ERR_PTR(ret); @@ -1253,7 +1277,7 @@ static int mlx5vf_pci_get_data_size(struct vfio_device *vdev, mutex_lock(&mvdev->state_mutex); ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_size, - &total_size, 0); + &total_size, NULL, 0); if (!ret) *stop_copy_length = total_size; mlx5vf_state_mutex_unlock(mvdev); diff --git a/drivers/vfio/pci/qat/Kconfig b/drivers/vfio/pci/qat/Kconfig index bf52cfa4b595..83f037d7e9a4 100644 --- 
a/drivers/vfio/pci/qat/Kconfig +++ b/drivers/vfio/pci/qat/Kconfig @@ -2,7 +2,7 @@ config QAT_VFIO_PCI tristate "VFIO support for QAT VF PCI devices" select VFIO_PCI_CORE - depends on CRYPTO_DEV_QAT_4XXX + depends on CRYPTO_DEV_QAT_4XXX || CRYPTO_DEV_QAT_420XX || CRYPTO_DEV_QAT_6XXX help This provides migration support for Intel(R) QAT Virtual Function using the VFIO framework. diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c index b982d4ae666c..ac9652539d66 100644 --- a/drivers/vfio/pci/qat/main.c +++ b/drivers/vfio/pci/qat/main.c @@ -121,18 +121,12 @@ static long qat_vf_precopy_ioctl(struct file *filp, unsigned int cmd, struct qat_mig_dev *mig_dev = qat_vdev->mdev; struct vfio_precopy_info info; loff_t *pos = &filp->f_pos; - unsigned long minsz; int ret = 0; - if (cmd != VFIO_MIG_GET_PRECOPY_INFO) - return -ENOTTY; - - minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - if (info.argsz < minsz) - return -EINVAL; + ret = vfio_check_precopy_ioctl(&qat_vdev->core_device.vdev, cmd, arg, + &info); + if (ret) + return ret; mutex_lock(&qat_vdev->state_mutex); if (qat_vdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY && @@ -160,7 +154,8 @@ out: mutex_unlock(&qat_vdev->state_mutex); if (ret) return ret; - return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; + return copy_to_user((void __user *)arg, &info, + offsetofend(struct vfio_precopy_info, dirty_bytes)) ? 
-EFAULT : 0; } static ssize_t qat_vf_save_read(struct file *filp, char __user *buf, @@ -677,6 +672,8 @@ static const struct pci_device_id qat_vf_vfio_pci_table[] = { { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4941) }, { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4943) }, { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4945) }, + /* Intel QAT GEN5 420xx VF device */ + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4947) }, /* Intel QAT GEN6 6xxx VF device */ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4949) }, {} diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index b4e39253f98d..a10ed733f0e3 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -1880,8 +1880,9 @@ static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_core_device *vdev, return i; } -static ssize_t vfio_config_do_rw(struct vfio_pci_core_device *vdev, char __user *buf, - size_t count, loff_t *ppos, bool iswrite) +ssize_t vfio_pci_config_rw_single(struct vfio_pci_core_device *vdev, + char __user *buf, size_t count, loff_t *ppos, + bool iswrite) { struct pci_dev *pdev = vdev->pdev; struct perm_bits *perm; @@ -1970,6 +1971,7 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_core_device *vdev, char __user return ret; } +EXPORT_SYMBOL_GPL(vfio_pci_config_rw_single); ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) @@ -1981,7 +1983,7 @@ ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, char __user *buf, pos &= VFIO_PCI_OFFSET_MASK; while (count) { - ret = vfio_config_do_rw(vdev, buf, count, &pos, iswrite); + ret = vfio_pci_config_rw_single(vdev, buf, count, &pos, iswrite); if (ret < 0) return ret; diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 3fea064d00de..ad52abc46c04 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ 
b/drivers/vfio/pci/vfio_pci_core.c @@ -2133,6 +2133,10 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) if (WARN_ON(vdev != dev_get_drvdata(dev))) return -EINVAL; + /* Drivers must set a name. Required for sequestering SR-IOV VFs */ + if (WARN_ON(!vdev->vdev.ops->name)) + return -EINVAL; + if (pdev->hdr_type != PCI_HEADER_TYPE_NORMAL) return -EINVAL; diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index 27ac280f00b9..fca9d0dfac90 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -37,6 +37,10 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_core_device *vdev, uint32_t flags, ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); +ssize_t vfio_pci_config_rw_single(struct vfio_pci_core_device *vdev, + char __user *buf, size_t count, loff_t *ppos, + bool iswrite); + ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite); diff --git a/drivers/vfio/pci/virtio/migrate.c b/drivers/vfio/pci/virtio/migrate.c index 35fa2d6ed611..7e11834ad512 100644 --- a/drivers/vfio/pci/virtio/migrate.c +++ b/drivers/vfio/pci/virtio/migrate.c @@ -443,19 +443,13 @@ static long virtiovf_precopy_ioctl(struct file *filp, unsigned int cmd, struct vfio_precopy_info info = {}; loff_t *pos = &filp->f_pos; bool end_of_data = false; - unsigned long minsz; u32 ctx_size = 0; int ret; - if (cmd != VFIO_MIG_GET_PRECOPY_INFO) - return -ENOTTY; - - minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; + ret = vfio_check_precopy_ioctl(&virtvdev->core_device.vdev, cmd, arg, + &info); + if (ret) + return ret; mutex_lock(&virtvdev->state_mutex); if (virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY && @@ -514,7 +508,8 @@ static long virtiovf_precopy_ioctl(struct file 
*filp, unsigned int cmd, done: virtiovf_state_mutex_unlock(virtvdev); - if (copy_to_user((void __user *)arg, &info, minsz)) + if (copy_to_user((void __user *)arg, &info, + offsetofend(struct vfio_precopy_info, dirty_bytes))) return -EFAULT; return 0; diff --git a/drivers/vfio/pci/xe/main.c b/drivers/vfio/pci/xe/main.c index 88acfcf840fc..4ecadbbfd86e 100644 --- a/drivers/vfio/pci/xe/main.c +++ b/drivers/vfio/pci/xe/main.c @@ -468,39 +468,46 @@ static const struct vfio_migration_ops xe_vfio_pci_migration_ops = { static void xe_vfio_pci_migration_init(struct xe_vfio_pci_core_device *xe_vdev) { struct vfio_device *core_vdev = &xe_vdev->core_device.vdev; - struct pci_dev *pdev = to_pci_dev(core_vdev->dev); - struct xe_device *xe = xe_sriov_vfio_get_pf(pdev); - if (!xe) + if (!xe_sriov_vfio_migration_supported(xe_vdev->xe)) return; - if (!xe_sriov_vfio_migration_supported(xe)) - return; - - mutex_init(&xe_vdev->state_mutex); - spin_lock_init(&xe_vdev->reset_lock); - - /* PF internal control uses vfid index starting from 1 */ - xe_vdev->vfid = pci_iov_vf_id(pdev) + 1; - xe_vdev->xe = xe; core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P; core_vdev->mig_ops = &xe_vfio_pci_migration_ops; } -static void xe_vfio_pci_migration_fini(struct xe_vfio_pci_core_device *xe_vdev) +static int xe_vfio_pci_vf_init(struct xe_vfio_pci_core_device *xe_vdev) { - if (!xe_vdev->vfid) - return; + struct vfio_device *core_vdev = &xe_vdev->core_device.vdev; + struct pci_dev *pdev = to_pci_dev(core_vdev->dev); + struct xe_device *xe = xe_sriov_vfio_get_pf(pdev); - mutex_destroy(&xe_vdev->state_mutex); + if (!pdev->is_virtfn) + return 0; + if (!xe) + return -ENODEV; + xe_vdev->xe = xe; + + /* PF internal control uses vfid index starting from 1 */ + xe_vdev->vfid = pci_iov_vf_id(pdev) + 1; + + xe_vfio_pci_migration_init(xe_vdev); + + return 0; } static int xe_vfio_pci_init_dev(struct vfio_device *core_vdev) { struct xe_vfio_pci_core_device *xe_vdev = container_of(core_vdev, 
struct xe_vfio_pci_core_device, core_device.vdev); + int ret; - xe_vfio_pci_migration_init(xe_vdev); + mutex_init(&xe_vdev->state_mutex); + spin_lock_init(&xe_vdev->reset_lock); + + ret = xe_vfio_pci_vf_init(xe_vdev); + if (ret) + return ret; return vfio_pci_core_init_dev(core_vdev); } @@ -510,7 +517,8 @@ static void xe_vfio_pci_release_dev(struct vfio_device *core_vdev) struct xe_vfio_pci_core_device *xe_vdev = container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev); - xe_vfio_pci_migration_fini(xe_vdev); + mutex_destroy(&xe_vdev->state_mutex); + vfio_pci_core_release_dev(core_vdev); } static const struct vfio_device_ops xe_vfio_pci_ops = { diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index 50128da18bca..0854f3fa1a22 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -90,7 +90,6 @@ struct vfio_group { struct mutex group_lock; struct kvm *kvm; struct file *opened_file; - struct blocking_notifier_head notifier; struct iommufd_ctx *iommufd; spinlock_t kvm_ref_lock; unsigned int cdev_device_open_cnt; diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 03cefdf99a4a..c8151ba54de3 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #include #include "vfio.h" diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 742477546b15..8666f35fb3f0 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -553,6 +553,7 @@ static void vfio_df_device_last_close(struct vfio_device_file *df) vfio_df_iommufd_unbind(df); else vfio_device_group_unuse_iommu(device); + device->precopy_info_v2 = 0; module_put(device->dev->driver->owner); } @@ -964,6 +965,23 @@ vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device, return 0; } +static int +vfio_ioctl_device_feature_migration_precopy_info_v2(struct vfio_device *device, + u32 flags, size_t argsz) +{ + int ret; + + if 
(!(device->migration_flags & VFIO_MIGRATION_PRE_COPY)) + return -EINVAL; + + ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0); + if (ret != 1) + return ret; + + device->precopy_info_v2 = 1; + return 0; +} + static int vfio_ioctl_device_feature_migration(struct vfio_device *device, u32 flags, void __user *arg, size_t argsz) @@ -1251,6 +1269,9 @@ static int vfio_ioctl_device_feature(struct vfio_device *device, return vfio_ioctl_device_feature_migration_data_size( device, feature.flags, arg->data, feature.argsz - minsz); + case VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2: + return vfio_ioctl_device_feature_migration_precopy_info_v2( + device, feature.flags, feature.argsz - minsz); default: if (unlikely(!device->ops->device_feature)) return -ENOTTY; diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 007f5138db2b..49f3ad4b1a7c 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -2203,7 +2203,8 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 sf_eq_usage[0x1]; u8 reserved_at_d3[0x5]; u8 multiplane[0x1]; - u8 reserved_at_d9[0x7]; + u8 migration_state[0x1]; + u8 reserved_at_da[0x6]; u8 cross_vhca_object_to_object_supported[0x20]; @@ -13322,13 +13323,24 @@ struct mlx5_ifc_query_vhca_migration_state_in_bits { u8 reserved_at_60[0x20]; }; +enum { + MLX5_QUERY_VHCA_MIG_STATE_UNINITIALIZED = 0x0, + MLX5_QUERY_VHCA_MIG_STATE_OPER_MIGRATION_IDLE = 0x1, + MLX5_QUERY_VHCA_MIG_STATE_OPER_MIGRATION_READY = 0x2, + MLX5_QUERY_VHCA_MIG_STATE_OPER_MIGRATION_DIRTY = 0x3, + MLX5_QUERY_VHCA_MIG_STATE_OPER_MIGRATION_INIT = 0x4, +}; + struct mlx5_ifc_query_vhca_migration_state_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; u8 syndrome[0x20]; - u8 reserved_at_40[0x40]; + u8 reserved_at_40[0x20]; + + u8 migration_state[0x4]; + u8 reserved_at_64[0x1c]; u8 required_umem_size[0x20]; diff --git a/include/linux/vfio.h b/include/linux/vfio.h index e90859956514..31b826efba00 100644 --- a/include/linux/vfio.h +++ 
b/include/linux/vfio.h @@ -16,6 +16,7 @@ #include #include #include +#include struct kvm; struct iommufd_ctx; @@ -52,6 +53,7 @@ struct vfio_device { struct vfio_device_set *dev_set; struct list_head dev_set_list; unsigned int migration_flags; + u8 precopy_info_v2; struct kvm *kvm; /* Members below here are private, not for driver use */ @@ -72,13 +74,11 @@ struct vfio_device { u8 iommufd_attached:1; #endif u8 cdev_opened:1; -#ifdef CONFIG_DEBUG_FS /* * debug_root is a static property of the vfio_device * which must be set prior to registering the vfio_device. */ struct dentry *debug_root; -#endif }; /** @@ -284,6 +284,44 @@ static inline int vfio_check_feature(u32 flags, size_t argsz, u32 supported_ops, return 1; } +/** + * vfio_check_precopy_ioctl - Validate user input for the VFIO_MIG_GET_PRECOPY_INFO ioctl + * @vdev: The vfio device + * @cmd: Cmd from the ioctl + * @arg: Arg from the ioctl + * @info: Driver pointer to hold the userspace input to the ioctl + * + * For use in a driver's get_precopy_info. Checks that the inputs to the + * VFIO_MIG_GET_PRECOPY_INFO ioctl are correct. + * + * Return: 0 on success, otherwise errno. 
+ */ + +static inline int +vfio_check_precopy_ioctl(struct vfio_device *vdev, unsigned int cmd, + unsigned long arg, struct vfio_precopy_info *info) +{ + unsigned long minsz; + + if (cmd != VFIO_MIG_GET_PRECOPY_INFO) + return -ENOTTY; + + minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); + + if (copy_from_user(info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info->argsz < minsz) + return -EINVAL; + + /* keep v1 behaviour as is for compatibility reasons */ + if (vdev->precopy_info_v2) + /* flags are output, set its initial value to 0 */ + info->flags = 0; + + return 0; +} + struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev, const struct vfio_device_ops *ops); #define vfio_alloc_device(dev_struct, member, dev, ops) \ diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index bb7b89330d35..5de618a3a5ee 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -141,7 +141,7 @@ struct vfio_info_cap_header { * * Retrieve information about the group. Fills in provided * struct vfio_group_info. Caller sets argsz. - * Return: 0 on succes, -errno on failure. + * Return: 0 on success, -errno on failure. * Availability: Always */ struct vfio_group_status { @@ -1266,6 +1266,19 @@ enum vfio_device_mig_state { * The initial_bytes field indicates the amount of initial precopy * data available from the device. This field should have a non-zero initial * value and decrease as migration data is read from the device. + * The presence of the VFIO_PRECOPY_INFO_REINIT output flag indicates + * that new initial data is present on the stream. + * The new initial data may result, for example, from device reconfiguration + * during migration that requires additional initialization data. + * In that case initial_bytes may report a non-zero value irrespective of + * any previously reported values, which progresses towards zero as precopy + * data is read from the data stream. 
dirty_bytes is also reset + * to zero and represents the state change of the device relative to the new + * initial_bytes. + * VFIO_PRECOPY_INFO_REINIT can be reported only after userspace opts in to + * VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2. Without this opt-in, the flags field + * of struct vfio_precopy_info is reserved for bug-compatibility reasons. + * + * It is recommended to leave PRE_COPY for STOP_COPY only after this field + * reaches zero. Leaving PRE_COPY earlier might make things slower. + * @@ -1301,6 +1314,7 @@ enum vfio_device_mig_state { struct vfio_precopy_info { __u32 argsz; __u32 flags; +#define VFIO_PRECOPY_INFO_REINIT (1 << 0) /* output - new initial data is present */ __aligned_u64 initial_bytes; __aligned_u64 dirty_bytes; }; @@ -1510,6 +1524,16 @@ struct vfio_device_feature_dma_buf { struct vfio_region_dma_range dma_ranges[] __counted_by(nr_ranges); }; +/* + * Enables the migration precopy_info_v2 behaviour. + * + * VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2. + * + * On SET, enables the v2 precopy_info behaviour, where the + * vfio_precopy_info.flags is a valid output field. 
+ */ +#define VFIO_DEVICE_FEATURE_MIG_PRECOPY_INFOv2 12 + /* -------- API for Type1 VFIO IOMMU -------- */ /** diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index bd92c38379b8..69b6d9defbce 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -68,9 +68,12 @@ * Global Structures */ +static const struct class mtty_class = { + .name = MTTY_CLASS_NAME +}; + static struct mtty_dev { dev_t vd_devt; - struct class *vd_class; struct cdev vd_cdev; struct idr vd_idr; struct device dev; @@ -837,18 +840,11 @@ static long mtty_precopy_ioctl(struct file *filp, unsigned int cmd, struct mdev_state *mdev_state = migf->mdev_state; loff_t *pos = &filp->f_pos; struct vfio_precopy_info info = {}; - unsigned long minsz; int ret; - if (cmd != VFIO_MIG_GET_PRECOPY_INFO) - return -ENOTTY; - - minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); - - if (copy_from_user(&info, (void __user *)arg, minsz)) - return -EFAULT; - if (info.argsz < minsz) - return -EINVAL; + ret = vfio_check_precopy_ioctl(&mdev_state->vdev, cmd, arg, &info); + if (ret) + return ret; mutex_lock(&mdev_state->state_mutex); if (mdev_state->state != VFIO_DEVICE_STATE_PRE_COPY && @@ -875,7 +871,8 @@ static long mtty_precopy_ioctl(struct file *filp, unsigned int cmd, info.initial_bytes = migf->filled_size - *pos; mutex_unlock(&migf->lock); - ret = copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0; + ret = copy_to_user((void __user *)arg, &info, + offsetofend(struct vfio_precopy_info, dirty_bytes)) ? 
-EFAULT : 0; unlock: mtty_state_mutex_unlock(mdev_state); return ret; @@ -1980,15 +1977,14 @@ static int __init mtty_dev_init(void) if (ret) goto err_cdev; - mtty_dev.vd_class = class_create(MTTY_CLASS_NAME); + ret = class_register(&mtty_class); - if (IS_ERR(mtty_dev.vd_class)) { + if (ret) { pr_err("Error: failed to register mtty_dev class\n"); - ret = PTR_ERR(mtty_dev.vd_class); goto err_driver; } - mtty_dev.dev.class = mtty_dev.vd_class; + mtty_dev.dev.class = &mtty_class; mtty_dev.dev.release = mtty_device_release; dev_set_name(&mtty_dev.dev, "%s", MTTY_NAME); @@ -2007,7 +2003,7 @@ err_device: device_del(&mtty_dev.dev); err_put: put_device(&mtty_dev.dev); - class_destroy(mtty_dev.vd_class); + class_unregister(&mtty_class); err_driver: mdev_unregister_driver(&mtty_driver); err_cdev: @@ -2026,8 +2022,7 @@ static void __exit mtty_dev_exit(void) mdev_unregister_driver(&mtty_driver); cdev_del(&mtty_dev.vd_cdev); unregister_chrdev_region(mtty_dev.vd_devt, MINORMASK + 1); - class_destroy(mtty_dev.vd_class); - mtty_dev.vd_class = NULL; + class_unregister(&mtty_class); pr_info("mtty_dev: Unloaded!\n"); } diff --git a/tools/testing/selftests/vfio/Makefile b/tools/testing/selftests/vfio/Makefile index 8e90e409e91d..0684932d91bf 100644 --- a/tools/testing/selftests/vfio/Makefile +++ b/tools/testing/selftests/vfio/Makefile @@ -1,6 +1,6 @@ ARCH ?= $(shell uname -m) -ifeq (,$(filter $(ARCH),arm64 x86_64)) +ifeq (,$(filter $(ARCH),aarch64 arm64 x86_64)) # Do nothing on unsupported architectures include ../lib.mk else diff --git a/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c b/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c index c75045bcab79..19d9630b24c2 100644 --- a/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c +++ b/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c @@ -65,10 +65,21 @@ static bool dsa_int_handle_request_required(struct vfio_pci_device *device) static int dsa_probe(struct vfio_pci_device *device) { - if (!vfio_pci_device_match(device, 
PCI_VENDOR_ID_INTEL, - PCI_DEVICE_ID_INTEL_DSA_SPR0)) + const u16 vendor_id = vfio_pci_config_readw(device, PCI_VENDOR_ID); + const u16 device_id = vfio_pci_config_readw(device, PCI_DEVICE_ID); + + if (vendor_id != PCI_VENDOR_ID_INTEL) return -EINVAL; + switch (device_id) { + case PCI_DEVICE_ID_INTEL_DSA_SPR0: + case PCI_DEVICE_ID_INTEL_DSA_DMR: + case PCI_DEVICE_ID_INTEL_DSA_GNRD: + break; + default: + return -EINVAL; + } + if (dsa_int_handle_request_required(device)) { dev_err(device, "Device requires requesting interrupt handles\n"); return -EINVAL; diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index 4e5871f1ebc3..fc75e04ef010 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -30,10 +30,12 @@ static void vfio_pci_irq_set(struct vfio_pci_device *device, u32 index, u32 vector, u32 count, int *fds) { - u8 buf[sizeof(struct vfio_irq_set) + sizeof(int) * count] = {}; + u8 buf[sizeof(struct vfio_irq_set) + sizeof(int) * count]; struct vfio_irq_set *irq = (void *)&buf; int *irq_fds = (void *)&irq->data; + memset(buf, 0, sizeof(buf)); + irq->argsz = sizeof(buf); irq->flags = VFIO_IRQ_SET_ACTION_TRIGGER; irq->index = index; diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c index 957a89ce7b3a..d7f25ef77671 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_mmio_test.c @@ -100,7 +100,6 @@ static void do_mmio_map_test(struct iommu *iommu, iommu_unmap(iommu, ®ion); } else { VFIO_ASSERT_NE(__iommu_map(iommu, ®ion), 0); - VFIO_ASSERT_NE(__iommu_unmap(iommu, ®ion, NULL), 0); } }