Merge tag 'pull-vfio-20230630' of https://github.com/legoater/qemu into staging

vfio queue:

* migration: New switchover ack to reduce downtime
* VFIO migration pre-copy support
* Removal of the VFIO migration experimental flag
* Alternate offset for GPUDirect Cliques
* Misc fixes

# -----BEGIN PGP SIGNATURE-----
#
# iQIzBAABCAAdFiEEoPZlSPBIlev+awtgUaNDx8/77KEFAmSeVHYACgkQUaNDx8/7
# 7KHeZw/+LRe9QQpx8hU//vKBvLet2QvI3WUaXGHiHbblbRT6HhiHjWHB2/8j6jji
# QhAGJ6w9yoKODyY0kGpVFEnkmXOKyqwWssBheV219ntZs09pFGxZr/ldUhT22aBN
# kH8mHU9BZ3J+zF/kKphpcIC1sPxVu/DlrtnJu5vDGuRAOu8+3kFV217JC1yGs1Vh
# n+KOho8a8oP9qxtzfvQ9iZ4dpBOOKpE9vscS12wJAlen93AGB6esR7VaLxDjExRP
# yL1pguQ8ZZ1gEXXbXO62djKo3IViobtD08KmCXTzQ6TVquLleJzqgjp+A0THnYAe
# J9Rlja7LpsO9MYSxmRE9WcQccC+sAGn/t/ufB0tL8zR43FvfhbF5H0PzBBY0H7YA
# JlzN+fgrKEEHJwMhXANNvSddhWCwvrkjNxo/80u3ySYMQR1Hav/tsXYBlk16e5nS
# fmtrFGTwhsVdy1Q6ZqEOyTni1eiYt5stEQMZFODdUNj6b9FugSZ0BK+2WN/M0CzU
# 6mKmJQgZAG/nBoRJm/XCO5OKQ6wm/4tm6F4HSH5EJ6mDT+DqETAk4GRUWTbYa2/G
# yAAOlhTMu8Xc/NhMeJ7Z99dyq0SM8pi/XpVEIv7p9yBak8ix60iCWZtDE8vlDv3M
# UfMVMTAvTS30kbS6FDN2Yyl6l8/ETdcwVIN4l02ipGzpMCtn9EQ=
# =dKUj
# -----END PGP SIGNATURE-----
# gpg: Signature made Fri 30 Jun 2023 06:05:10 AM CEST
# gpg:                using RSA key A0F66548F04895EBFE6B0B6051A343C7CFFBECA1
# gpg: Good signature from "Cédric Le Goater <clg@kaod.org>" [undefined]
# gpg: WARNING: This key is not certified with a trusted signature!
# gpg:          There is no indication that the signature belongs to the owner.
# Primary key fingerprint: A0F6 6548 F048 95EB FE6B  0B60 51A3 43C7 CFFB ECA1

* tag 'pull-vfio-20230630' of https://github.com/legoater/qemu:
  vfio/pci: Free leaked timer in vfio_realize error path
  vfio/pci: Fix a segfault in vfio_realize
  MAINTAINERS: Promote Cédric to VFIO co-maintainer
  vfio/migration: Make VFIO migration non-experimental
  vfio/migration: Reset bytes_transferred properly
  vfio/pci: Call vfio_prepare_kvm_msi_virq_batch() in MSI retry path
  hw/vfio/pci-quirks: Support alternate offset for GPUDirect Cliques
  vfio: Implement a common device info helper
  vfio/migration: Add support for switchover ack capability
  vfio/migration: Add VFIO migration pre-copy support
  vfio/migration: Store VFIO migration flags in VFIOMigration
  vfio/migration: Refactor vfio_save_block() to return saved data size
  tests: Add migration switchover ack capability test
  migration: Enable switchover ack capability
  migration: Implement switchover ack logic
  migration: Add switchover ack capability

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Richard Henderson committed 2 years ago
Commit 408015a97d

+ 1 - 1
MAINTAINERS

@@ -2051,7 +2051,7 @@ F: hw/usb/dev-serial.c
 
 VFIO
 M: Alex Williamson <alex.williamson@redhat.com>
-R: Cédric Le Goater <clg@redhat.com>
+M: Cédric Le Goater <clg@redhat.com>
 S: Supported
 F: hw/vfio/*
 F: include/hw/vfio/

+ 35 - 10
docs/devel/vfio-migration.rst

@@ -7,12 +7,21 @@ the guest is running on source host and restoring this saved state on the
 destination host. This document details how saving and restoring of VFIO
 devices is done in QEMU.
 
-Migration of VFIO devices currently consists of a single stop-and-copy phase.
-During the stop-and-copy phase the guest is stopped and the entire VFIO device
-data is transferred to the destination.
-
-The pre-copy phase of migration is currently not supported for VFIO devices.
-Support for VFIO pre-copy will be added later on.
+Migration of VFIO devices consists of two phases: the optional pre-copy phase,
+and the stop-and-copy phase. The pre-copy phase is iterative and allows
+accommodating VFIO devices that have a large amount of data that needs to be
+transferred. The iterative pre-copy phase of migration allows the guest to
+continue running while the VFIO device state is transferred to the
+destination, which helps to reduce the total downtime of the VM. VFIO devices
+opt in to pre-copy support by reporting the VFIO_MIGRATION_PRE_COPY flag in
+the VFIO_DEVICE_FEATURE_MIGRATION ioctl.
+
+When pre-copy is supported, it's possible to further reduce downtime by
+enabling the "switchover-ack" migration capability.
+The VFIO migration uAPI defines "initial bytes" as part of its pre-copy data
+stream and recommends that the initial bytes are sent and loaded in the
+destination before stopping the source VM. Enabling this migration capability
+guarantees that behavior and can thus reduce downtime even further.
 
 Note that currently VFIO migration is supported only for a single device. This
 is due to VFIO migration's lack of P2P support. However, P2P support is planned
@@ -29,10 +38,23 @@ VFIO implements the device hooks for the iterative approach as follows:
 * A ``load_setup`` function that sets the VFIO device on the destination in
   _RESUMING state.
 
+* A ``state_pending_estimate`` function that reports an estimate of the
+  remaining pre-copy data that the vendor driver has yet to save for the VFIO
+  device.
+
 * A ``state_pending_exact`` function that reads pending_bytes from the vendor
   driver, which indicates the amount of data that the vendor driver has yet to
   save for the VFIO device.
 
+* An ``is_active_iterate`` function that indicates ``save_live_iterate`` is
+  active only when the VFIO device is in pre-copy states.
+
+* A ``save_live_iterate`` function that reads the VFIO device's data from the
+  vendor driver during iterative pre-copy phase.
+
+* A ``switchover_ack_needed`` function that checks whether the VFIO device
+  uses the "switchover-ack" migration capability when this capability is
+  enabled.
+
 * A ``save_state`` function to save the device config space if it is present.
 
 * A ``save_live_complete_precopy`` function that sets the VFIO device in
@@ -111,8 +133,10 @@ Flow of state changes during Live migration
 ===========================================
 
 Below is the flow of state change during live migration.
-The values in the brackets represent the VM state, the migration state, and
+The values in the parentheses represent the VM state, the migration state, and
 the VFIO device state, respectively.
+The text in the square brackets represents the flow if the VFIO device supports
+pre-copy.
 
 Live migration save path
 ------------------------
@@ -124,11 +148,12 @@ Live migration save path
                                   |
                      migrate_init spawns migration_thread
                 Migration thread then calls each device's .save_setup()
-                       (RUNNING, _SETUP, _RUNNING)
+                  (RUNNING, _SETUP, _RUNNING [_PRE_COPY])
                                   |
-                      (RUNNING, _ACTIVE, _RUNNING)
-             If device is active, get pending_bytes by .state_pending_exact()
+                  (RUNNING, _ACTIVE, _RUNNING [_PRE_COPY])
+      If device is active, get pending_bytes by .state_pending_{estimate,exact}()
           If total pending_bytes >= threshold_size, call .save_live_iterate()
+                  [Data of VFIO device for pre-copy phase is copied]
         Iterate till total pending bytes converge and are less than threshold
                                   |
   On migration completion, vCPU stops and calls .save_live_complete_precopy for
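
The state transitions in the diagram above each map onto a VFIO_DEVICE_FEATURE
ioctl on the device fd. Below is a minimal sketch of one such transition,
assuming the v2 VFIO migration uAPI from the kernel's linux/vfio.h; the
function name mig_set_state is invented, and the code is simplified relative
to what QEMU does internally (error recovery elided):

    #include <errno.h>
    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    /*
     * Sketch: move a VFIO device to new_state, e.g. _RUNNING -> _PRE_COPY as
     * in the save path above.  On transitions that create a migration data
     * stream (_PRE_COPY, _STOP_COPY, _RESUMING), the kernel returns the
     * stream fd in mig_state->data_fd.
     */
    static int mig_set_state(int device_fd, enum vfio_device_mig_state new_state)
    {
        uint64_t buf[(sizeof(struct vfio_device_feature) +
                      sizeof(struct vfio_device_feature_mig_state) +
                      sizeof(uint64_t) - 1) / sizeof(uint64_t)] = {0};
        struct vfio_device_feature *feature = (void *)buf;
        struct vfio_device_feature_mig_state *mig_state = (void *)feature->data;

        feature->argsz = sizeof(buf);
        feature->flags = VFIO_DEVICE_FEATURE_SET |
                         VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
        mig_state->device_state = new_state;

        if (ioctl(device_fd, VFIO_DEVICE_FEATURE, feature)) {
            return -errno;
        }

        return mig_state->data_fd;
    }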

+ 5 - 32
hw/s390x/s390-pci-vfio.c

@@ -289,38 +289,11 @@ static void s390_pci_read_pfip(S390PCIBusDevice *pbdev,
     memcpy(pbdev->zpci_fn.pfip, cap->pfip, CLP_PFIP_NR_SEGMENTS);
 }
 
-static struct vfio_device_info *get_device_info(S390PCIBusDevice *pbdev,
-                                                uint32_t argsz)
+static struct vfio_device_info *get_device_info(S390PCIBusDevice *pbdev)
 {
-    struct vfio_device_info *info = g_malloc0(argsz);
-    VFIOPCIDevice *vfio_pci;
-    int fd;
+    VFIOPCIDevice *vfio_pci = container_of(pbdev->pdev, VFIOPCIDevice, pdev);
 
-    vfio_pci = container_of(pbdev->pdev, VFIOPCIDevice, pdev);
-    fd = vfio_pci->vbasedev.fd;
-
-    /*
-     * If the specified argsz is not large enough to contain all capabilities
-     * it will be updated upon return from the ioctl.  Retry until we have
-     * a big enough buffer to hold the entire capability chain.  On error,
-     * just exit and rely on CLP defaults.
-     */
-retry:
-    info->argsz = argsz;
-
-    if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) {
-        trace_s390_pci_clp_dev_info(vfio_pci->vbasedev.name);
-        g_free(info);
-        return NULL;
-    }
-
-    if (info->argsz > argsz) {
-        argsz = info->argsz;
-        info = g_realloc(info, argsz);
-        goto retry;
-    }
-
-    return info;
+    return vfio_get_device_info(vfio_pci->vbasedev.fd);
 }
 
 /*
@@ -335,7 +308,7 @@ bool s390_pci_get_host_fh(S390PCIBusDevice *pbdev, uint32_t *fh)
 
     assert(fh);
 
-    info = get_device_info(pbdev, sizeof(*info));
+    info = get_device_info(pbdev);
     if (!info) {
         return false;
     }
@@ -356,7 +329,7 @@ void s390_pci_get_clp_info(S390PCIBusDevice *pbdev)
 {
     g_autofree struct vfio_device_info *info = NULL;
 
-    info = get_device_info(pbdev, sizeof(*info));
+    info = get_device_info(pbdev);
     if (!info) {
         return;
     }

+ 53 - 15
hw/vfio/common.c

@@ -381,7 +381,7 @@ static unsigned int vfio_migratable_device_num(void)
     return device_num;
 }
 
-int vfio_block_multiple_devices_migration(Error **errp)
+int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp)
 {
     int ret;
 
@@ -390,6 +390,12 @@ int vfio_block_multiple_devices_migration(Error **errp)
         return 0;
     }
 
+    if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
+        error_setg(errp, "Migration is currently not supported with multiple "
+                         "VFIO devices");
+        return -EINVAL;
+    }
+
     error_setg(&multiple_devices_migration_blocker,
                "Migration is currently not supported with multiple "
                "VFIO devices");
@@ -427,7 +433,7 @@ static bool vfio_viommu_preset(void)
     return false;
 }
 
-int vfio_block_giommu_migration(Error **errp)
+int vfio_block_giommu_migration(VFIODevice *vbasedev, Error **errp)
 {
     int ret;
 
@@ -436,6 +442,12 @@ int vfio_block_giommu_migration(Error **errp)
         return 0;
     }
 
+    if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
+        error_setg(errp,
+                   "Migration is currently not supported with vIOMMU enabled");
+        return -EINVAL;
+    }
+
     error_setg(&giommu_migration_blocker,
                "Migration is currently not supported with vIOMMU enabled");
     ret = migrate_add_blocker(giommu_migration_blocker, errp);
@@ -492,7 +504,8 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
             }
 
             if (vbasedev->pre_copy_dirty_page_tracking == ON_OFF_AUTO_OFF &&
-                migration->device_state == VFIO_DEVICE_STATE_RUNNING) {
+                (migration->device_state == VFIO_DEVICE_STATE_RUNNING ||
+                 migration->device_state == VFIO_DEVICE_STATE_PRE_COPY)) {
                 return false;
             }
         }
@@ -537,7 +550,8 @@ static bool vfio_devices_all_running_and_mig_active(VFIOContainer *container)
                 return false;
             }
 
-            if (migration->device_state == VFIO_DEVICE_STATE_RUNNING) {
+            if (migration->device_state == VFIO_DEVICE_STATE_RUNNING ||
+                migration->device_state == VFIO_DEVICE_STATE_PRE_COPY) {
                 continue;
             } else {
                 return false;
@@ -2844,11 +2858,35 @@ void vfio_put_group(VFIOGroup *group)
     }
 }
 
+struct vfio_device_info *vfio_get_device_info(int fd)
+{
+    struct vfio_device_info *info;
+    uint32_t argsz = sizeof(*info);
+
+    info = g_malloc0(argsz);
+
+retry:
+    info->argsz = argsz;
+
+    if (ioctl(fd, VFIO_DEVICE_GET_INFO, info)) {
+        g_free(info);
+        return NULL;
+    }
+
+    if (info->argsz > argsz) {
+        argsz = info->argsz;
+        info = g_realloc(info, argsz);
+        goto retry;
+    }
+
+    return info;
+}
+
 int vfio_get_device(VFIOGroup *group, const char *name,
                     VFIODevice *vbasedev, Error **errp)
 {
-    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
-    int ret, fd;
+    g_autofree struct vfio_device_info *info = NULL;
+    int fd;
 
     fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
     if (fd < 0) {
@@ -2860,11 +2898,11 @@ int vfio_get_device(VFIOGroup *group, const char *name,
         return fd;
     }
 
-    ret = ioctl(fd, VFIO_DEVICE_GET_INFO, &dev_info);
-    if (ret) {
+    info = vfio_get_device_info(fd);
+    if (!info) {
         error_setg_errno(errp, errno, "error getting device info");
         close(fd);
-        return ret;
+        return -1;
     }
 
     /*
@@ -2892,14 +2930,14 @@ int vfio_get_device(VFIOGroup *group, const char *name,
     vbasedev->group = group;
     QLIST_INSERT_HEAD(&group->device_list, vbasedev, next);
 
-    vbasedev->num_irqs = dev_info.num_irqs;
-    vbasedev->num_regions = dev_info.num_regions;
-    vbasedev->flags = dev_info.flags;
+    vbasedev->num_irqs = info->num_irqs;
+    vbasedev->num_regions = info->num_regions;
+    vbasedev->flags = info->flags;
+
+    trace_vfio_get_device(name, info->flags, info->num_regions, info->num_irqs);
 
-    trace_vfio_get_device(name, dev_info.flags, dev_info.num_regions,
-                          dev_info.num_irqs);
+    vbasedev->reset_works = !!(info->flags & VFIO_DEVICE_FLAGS_RESET);
 
-    vbasedev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
     return 0;
 }
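
As a usage note, a hedged sketch of a caller of the new helper (the caller
function is hypothetical; g_autofree is GLib's cleanup attribute, used the
same way in the s390 conversion above):

    #include <errno.h>
    #include <glib.h>
    #include <linux/vfio.h>
    #include "hw/vfio/vfio-common.h"

    /*
     * Hypothetical caller: the argsz retry loop is hidden behind
     * vfio_get_device_info() and g_autofree releases the buffer on every
     * exit path.
     */
    static int example_device_resets(int device_fd)
    {
        g_autofree struct vfio_device_info *info =
            vfio_get_device_info(device_fd);

        if (!info) {
            return -errno;  /* VFIO_DEVICE_GET_INFO failed */
        }

        /* info->argsz now covers the full capability chain, if any */
        return !!(info->flags & VFIO_DEVICE_FLAGS_RESET);
    }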
 

+ 261 - 44
hw/vfio/migration.c

@@ -18,6 +18,8 @@
 #include "sysemu/runstate.h"
 #include "hw/vfio/vfio-common.h"
 #include "migration/migration.h"
+#include "migration/options.h"
+#include "migration/savevm.h"
 #include "migration/vmstate.h"
 #include "migration/qemu-file.h"
 #include "migration/register.h"
@@ -45,6 +47,7 @@
 #define VFIO_MIG_FLAG_DEV_CONFIG_STATE  (0xffffffffef100002ULL)
 #define VFIO_MIG_FLAG_DEV_SETUP_STATE   (0xffffffffef100003ULL)
 #define VFIO_MIG_FLAG_DEV_DATA_STATE    (0xffffffffef100004ULL)
+#define VFIO_MIG_FLAG_DEV_INIT_DATA_SENT (0xffffffffef100005ULL)
 
 /*
  * This is an arbitrary size based on migration of mlx5 devices, where typically
@@ -68,6 +71,8 @@ static const char *mig_state_to_str(enum vfio_device_mig_state state)
         return "STOP_COPY";
     case VFIO_DEVICE_STATE_RESUMING:
         return "RESUMING";
+    case VFIO_DEVICE_STATE_PRE_COPY:
+        return "PRE_COPY";
     default:
         return "UNKNOWN STATE";
     }
@@ -241,18 +246,45 @@ static int vfio_query_stop_copy_size(VFIODevice *vbasedev,
     return 0;
 }
 
-/* Returns 1 if end-of-stream is reached, 0 if more data and -errno if error */
-static int vfio_save_block(QEMUFile *f, VFIOMigration *migration)
+static int vfio_query_precopy_size(VFIOMigration *migration)
+{
+    struct vfio_precopy_info precopy = {
+        .argsz = sizeof(precopy),
+    };
+
+    migration->precopy_init_size = 0;
+    migration->precopy_dirty_size = 0;
+
+    if (ioctl(migration->data_fd, VFIO_MIG_GET_PRECOPY_INFO, &precopy)) {
+        return -errno;
+    }
+
+    migration->precopy_init_size = precopy.initial_bytes;
+    migration->precopy_dirty_size = precopy.dirty_bytes;
+
+    return 0;
+}
+
+/* Returns the size of saved data on success and -errno on error */
+static ssize_t vfio_save_block(QEMUFile *f, VFIOMigration *migration)
 {
     ssize_t data_size;
 
     data_size = read(migration->data_fd, migration->data_buffer,
                      migration->data_buffer_size);
     if (data_size < 0) {
+        /*
+         * Pre-copy emptied all the device state for now. For more information,
+         * please refer to the Linux kernel VFIO uAPI.
+         */
+        if (errno == ENOMSG) {
+            return 0;
+        }
+
         return -errno;
     }
     if (data_size == 0) {
-        return 1;
+        return 0;
     }
 
     qemu_put_be64(f, VFIO_MIG_FLAG_DEV_DATA_STATE);
@@ -262,7 +294,39 @@ static int vfio_save_block(QEMUFile *f, VFIOMigration *migration)
 
     trace_vfio_save_block(migration->vbasedev->name, data_size);
 
-    return qemu_file_get_error(f);
+    return qemu_file_get_error(f) ?: data_size;
+}
+
+static void vfio_update_estimated_pending_data(VFIOMigration *migration,
+                                               uint64_t data_size)
+{
+    if (!data_size) {
+        /*
+         * Pre-copy emptied all the device state for now, update estimated sizes
+         * accordingly.
+         */
+        migration->precopy_init_size = 0;
+        migration->precopy_dirty_size = 0;
+
+        return;
+    }
+
+    if (migration->precopy_init_size) {
+        uint64_t init_size = MIN(migration->precopy_init_size, data_size);
+
+        migration->precopy_init_size -= init_size;
+        data_size -= init_size;
+    }
+
+    migration->precopy_dirty_size -= MIN(migration->precopy_dirty_size,
+                                         data_size);
+}
+
+static bool vfio_precopy_supported(VFIODevice *vbasedev)
+{
+    VFIOMigration *migration = vbasedev->migration;
+
+    return migration->mig_flags & VFIO_MIGRATION_PRE_COPY;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -285,6 +349,28 @@ static int vfio_save_setup(QEMUFile *f, void *opaque)
         return -ENOMEM;
     }
 
+    if (vfio_precopy_supported(vbasedev)) {
+        int ret;
+
+        switch (migration->device_state) {
+        case VFIO_DEVICE_STATE_RUNNING:
+            ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_PRE_COPY,
+                                           VFIO_DEVICE_STATE_RUNNING);
+            if (ret) {
+                return ret;
+            }
+
+            vfio_query_precopy_size(migration);
+
+            break;
+        case VFIO_DEVICE_STATE_STOP:
+            /* vfio_save_complete_precopy() will go to STOP_COPY */
+            break;
+        default:
+            return -EINVAL;
+        }
+    }
+
     trace_vfio_save_setup(vbasedev->name, migration->data_buffer_size);
 
     qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
@@ -299,26 +385,43 @@ static void vfio_save_cleanup(void *opaque)
 
     g_free(migration->data_buffer);
     migration->data_buffer = NULL;
+    migration->precopy_init_size = 0;
+    migration->precopy_dirty_size = 0;
+    migration->initial_data_sent = false;
     vfio_migration_cleanup(vbasedev);
     trace_vfio_save_cleanup(vbasedev->name);
 }
 
+static void vfio_state_pending_estimate(void *opaque, uint64_t *must_precopy,
+                                        uint64_t *can_postcopy)
+{
+    VFIODevice *vbasedev = opaque;
+    VFIOMigration *migration = vbasedev->migration;
+
+    if (migration->device_state != VFIO_DEVICE_STATE_PRE_COPY) {
+        return;
+    }
+
+    *must_precopy +=
+        migration->precopy_init_size + migration->precopy_dirty_size;
+
+    trace_vfio_state_pending_estimate(vbasedev->name, *must_precopy,
+                                      *can_postcopy,
+                                      migration->precopy_init_size,
+                                      migration->precopy_dirty_size);
+}
+
 /*
  * Migration size of VFIO devices can be as little as a few KBs or as big as
  * many GBs. This value should be big enough to cover the worst case.
  */
 #define VFIO_MIG_STOP_COPY_SIZE (100 * GiB)
 
-/*
- * Only exact function is implemented and not estimate function. The reason is
- * that during pre-copy phase of migration the estimate function is called
- * repeatedly while pending RAM size is over the threshold, thus migration
- * can't converge and querying the VFIO device pending data size is useless.
- */
 static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy,
                                      uint64_t *can_postcopy)
 {
     VFIODevice *vbasedev = opaque;
+    VFIOMigration *migration = vbasedev->migration;
     uint64_t stop_copy_size = VFIO_MIG_STOP_COPY_SIZE;
 
     /*
@@ -328,16 +431,64 @@ static void vfio_state_pending_exact(void *opaque, uint64_t *must_precopy,
     vfio_query_stop_copy_size(vbasedev, &stop_copy_size);
     *must_precopy += stop_copy_size;
 
+    if (migration->device_state == VFIO_DEVICE_STATE_PRE_COPY) {
+        vfio_query_precopy_size(migration);
+
+        *must_precopy +=
+            migration->precopy_init_size + migration->precopy_dirty_size;
+    }
+
     trace_vfio_state_pending_exact(vbasedev->name, *must_precopy, *can_postcopy,
-                                   stop_copy_size);
+                                   stop_copy_size, migration->precopy_init_size,
+                                   migration->precopy_dirty_size);
+}
+
+static bool vfio_is_active_iterate(void *opaque)
+{
+    VFIODevice *vbasedev = opaque;
+    VFIOMigration *migration = vbasedev->migration;
+
+    return migration->device_state == VFIO_DEVICE_STATE_PRE_COPY;
+}
+
+static int vfio_save_iterate(QEMUFile *f, void *opaque)
+{
+    VFIODevice *vbasedev = opaque;
+    VFIOMigration *migration = vbasedev->migration;
+    ssize_t data_size;
+
+    data_size = vfio_save_block(f, migration);
+    if (data_size < 0) {
+        return data_size;
+    }
+
+    vfio_update_estimated_pending_data(migration, data_size);
+
+    if (migrate_switchover_ack() && !migration->precopy_init_size &&
+        !migration->initial_data_sent) {
+        qemu_put_be64(f, VFIO_MIG_FLAG_DEV_INIT_DATA_SENT);
+        migration->initial_data_sent = true;
+    } else {
+        qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
+    }
+
+    trace_vfio_save_iterate(vbasedev->name, migration->precopy_init_size,
+                            migration->precopy_dirty_size);
+
+    /*
+     * A VFIO device's pre-copy dirty_bytes is not guaranteed to reach zero.
+     * Return 1 so following handlers will not be potentially blocked.
+     */
+    return 1;
 }
 
 static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
 {
     VFIODevice *vbasedev = opaque;
+    ssize_t data_size;
     int ret;
 
-    /* We reach here with device state STOP only */
+    /* We reach here with device state STOP or STOP_COPY only */
     ret = vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
                                    VFIO_DEVICE_STATE_STOP);
     if (ret) {
@@ -345,11 +496,11 @@ static int vfio_save_complete_precopy(QEMUFile *f, void *opaque)
     }
 
     do {
-        ret = vfio_save_block(f, vbasedev->migration);
-        if (ret < 0) {
-            return ret;
+        data_size = vfio_save_block(f, vbasedev->migration);
+        if (data_size < 0) {
+            return data_size;
         }
-    } while (!ret);
+    } while (data_size);
 
     qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
     ret = qemu_file_get_error(f);
@@ -439,6 +590,24 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
             }
             break;
         }
+        case VFIO_MIG_FLAG_DEV_INIT_DATA_SENT:
+        {
+            if (!vfio_precopy_supported(vbasedev) ||
+                !migrate_switchover_ack()) {
+                error_report("%s: Received INIT_DATA_SENT but switchover ack "
+                             "is not used", vbasedev->name);
+                return -EINVAL;
+            }
+
+            ret = qemu_loadvm_approve_switchover();
+            if (ret) {
+                error_report(
+                    "%s: qemu_loadvm_approve_switchover failed, err=%d (%s)",
+                    vbasedev->name, ret, strerror(-ret));
+            }
+
+            return ret;
+        }
         default:
             error_report("%s: Unknown tag 0x%"PRIx64, vbasedev->name, data);
             return -EINVAL;
@@ -453,15 +622,26 @@ static int vfio_load_state(QEMUFile *f, void *opaque, int version_id)
     return ret;
 }
 
+static bool vfio_switchover_ack_needed(void *opaque)
+{
+    VFIODevice *vbasedev = opaque;
+
+    return vfio_precopy_supported(vbasedev);
+}
+
 static const SaveVMHandlers savevm_vfio_handlers = {
     .save_setup = vfio_save_setup,
     .save_cleanup = vfio_save_cleanup,
+    .state_pending_estimate = vfio_state_pending_estimate,
     .state_pending_exact = vfio_state_pending_exact,
+    .is_active_iterate = vfio_is_active_iterate,
+    .save_live_iterate = vfio_save_iterate,
     .save_live_complete_precopy = vfio_save_complete_precopy,
     .save_state = vfio_save_state,
     .load_setup = vfio_load_setup,
     .load_cleanup = vfio_load_cleanup,
     .load_state = vfio_load_state,
+    .switchover_ack_needed = vfio_switchover_ack_needed,
 };
 
 /* ---------------------------------------------------------------------- */
@@ -469,13 +649,18 @@ static const SaveVMHandlers savevm_vfio_handlers = {
 static void vfio_vmstate_change(void *opaque, bool running, RunState state)
 {
     VFIODevice *vbasedev = opaque;
+    VFIOMigration *migration = vbasedev->migration;
     enum vfio_device_mig_state new_state;
     int ret;
 
     if (running) {
         new_state = VFIO_DEVICE_STATE_RUNNING;
     } else {
-        new_state = VFIO_DEVICE_STATE_STOP;
+        new_state =
+            (migration->device_state == VFIO_DEVICE_STATE_PRE_COPY &&
+             (state == RUN_STATE_FINISH_MIGRATE || state == RUN_STATE_PAUSED)) ?
+                VFIO_DEVICE_STATE_STOP_COPY :
+                VFIO_DEVICE_STATE_STOP;
     }
 
     /*
@@ -512,7 +697,6 @@ static void vfio_migration_state_notifier(Notifier *notifier, void *data)
     case MIGRATION_STATUS_CANCELLING:
     case MIGRATION_STATUS_CANCELLED:
     case MIGRATION_STATUS_FAILED:
-        bytes_transferred = 0;
         /*
          * If setting the device in RUNNING state fails, the device should
          * be reset. To do so, use ERROR state as a recover state.
@@ -540,14 +724,6 @@ static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags)
     feature->argsz = sizeof(buf);
     feature->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION;
     if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
-        if (errno == ENOTTY) {
-            error_report("%s: VFIO migration is not supported in kernel",
-                         vbasedev->name);
-        } else {
-            error_report("%s: Failed to query VFIO migration support, err: %s",
-                         vbasedev->name, strerror(errno));
-        }
-
         return -errno;
     }
 
@@ -602,6 +778,7 @@ static int vfio_migration_init(VFIODevice *vbasedev)
     migration->vbasedev = vbasedev;
     migration->device_state = VFIO_DEVICE_STATE_RUNNING;
     migration->data_fd = -1;
+    migration->mig_flags = mig_flags;
 
     vbasedev->dirty_pages_supported = vfio_dma_logging_supported(vbasedev);
 
@@ -625,6 +802,27 @@ static int vfio_migration_init(VFIODevice *vbasedev)
     return 0;
 }
 
+static int vfio_block_migration(VFIODevice *vbasedev, Error *err, Error **errp)
+{
+    int ret;
+
+    if (vbasedev->enable_migration == ON_OFF_AUTO_ON) {
+        error_propagate(errp, err);
+        return -EINVAL;
+    }
+
+    vbasedev->migration_blocker = error_copy(err);
+    error_free(err);
+
+    ret = migrate_add_blocker(vbasedev->migration_blocker, errp);
+    if (ret < 0) {
+        error_free(vbasedev->migration_blocker);
+        vbasedev->migration_blocker = NULL;
+    }
+
+    return ret;
+}
+
 /* ---------------------------------------------------------------------- */
 
 int64_t vfio_mig_bytes_transferred(void)
@@ -632,42 +830,61 @@ int64_t vfio_mig_bytes_transferred(void)
     return bytes_transferred;
 }
 
+void vfio_reset_bytes_transferred(void)
+{
+    bytes_transferred = 0;
+}
+
 int vfio_migration_realize(VFIODevice *vbasedev, Error **errp)
 {
-    int ret = -ENOTSUP;
+    Error *err = NULL;
+    int ret;
 
-    if (!vbasedev->enable_migration) {
-        goto add_blocker;
+    if (vbasedev->enable_migration == ON_OFF_AUTO_OFF) {
+        error_setg(&err, "%s: Migration is disabled for VFIO device",
+                   vbasedev->name);
+        return vfio_block_migration(vbasedev, err, errp);
     }
 
     ret = vfio_migration_init(vbasedev);
     if (ret) {
-        goto add_blocker;
+        if (ret == -ENOTTY) {
+            error_setg(&err, "%s: VFIO migration is not supported in kernel",
+                       vbasedev->name);
+        } else {
+            error_setg(&err,
+                       "%s: Migration couldn't be initialized for VFIO device, "
+                       "err: %d (%s)",
+                       vbasedev->name, ret, strerror(-ret));
+        }
+
+        return vfio_block_migration(vbasedev, err, errp);
+    }
+
+    if (!vbasedev->dirty_pages_supported) {
+        if (vbasedev->enable_migration == ON_OFF_AUTO_AUTO) {
+            error_setg(&err,
+                       "%s: VFIO device doesn't support device dirty tracking",
+                       vbasedev->name);
+            return vfio_block_migration(vbasedev, err, errp);
+        }
+
+        warn_report("%s: VFIO device doesn't support device dirty tracking",
+                    vbasedev->name);
     }
 
-    ret = vfio_block_multiple_devices_migration(errp);
+    ret = vfio_block_multiple_devices_migration(vbasedev, errp);
     if (ret) {
         return ret;
     }
 
-    ret = vfio_block_giommu_migration(errp);
+    ret = vfio_block_giommu_migration(vbasedev, errp);
     if (ret) {
         return ret;
     }
 
-    trace_vfio_migration_probe(vbasedev->name);
+    trace_vfio_migration_realize(vbasedev->name);
     return 0;
-
-add_blocker:
-    error_setg(&vbasedev->migration_blocker,
-               "VFIO device doesn't support migration");
-
-    ret = migrate_add_blocker(vbasedev->migration_blocker, errp);
-    if (ret < 0) {
-        error_free(vbasedev->migration_blocker);
-        vbasedev->migration_blocker = NULL;
-    }
-    return ret;
 }
 
 void vfio_migration_exit(VFIODevice *vbasedev)

+ 40 - 1
hw/vfio/pci-quirks.c

@@ -1490,6 +1490,9 @@ void vfio_setup_resetfn_quirk(VFIOPCIDevice *vdev)
  * +---------------------------------+---------------------------------+
  *
  * https://lists.gnu.org/archive/html/qemu-devel/2017-08/pdfUda5iEpgOS.pdf
+ *
+ * Specification for Turing and later GPU architectures:
+ * https://lists.gnu.org/archive/html/qemu-devel/2023-06/pdf142OR4O4c2.pdf
  */
 static void get_nv_gpudirect_clique_id(Object *obj, Visitor *v,
                                        const char *name, void *opaque,
@@ -1530,7 +1533,9 @@ const PropertyInfo qdev_prop_nv_gpudirect_clique = {
 static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp)
 {
     PCIDevice *pdev = &vdev->pdev;
-    int ret, pos = 0xC8;
+    int ret, pos;
+    bool c8_conflict = false, d4_conflict = false;
+    uint8_t tmp;
 
     if (vdev->nv_gpudirect_clique == 0xFF) {
         return 0;
@@ -1547,6 +1552,40 @@ static int vfio_add_nv_gpudirect_cap(VFIOPCIDevice *vdev, Error **errp)
         return -EINVAL;
     }
 
+    /*
+     * Per the updated specification above, it's recommended to use offset
+     * D4h for Turing and later GPU architectures due to a conflict of the
+     * MSI-X capability at C8h.  We don't know how to determine the GPU
+     * architecture; instead we walk the capability chain to mark conflicts
+     * and choose one or error based on the result.
+     *
+     * NB. Cap list head in pdev->config is already cleared, read from device.
+     */
+    ret = pread(vdev->vbasedev.fd, &tmp, 1,
+                vdev->config_offset + PCI_CAPABILITY_LIST);
+    if (ret != 1 || !tmp) {
+        error_setg(errp, "NVIDIA GPUDirect Clique ID: error getting cap list");
+        return -EINVAL;
+    }
+
+    do {
+        if (tmp == 0xC8) {
+            c8_conflict = true;
+        } else if (tmp == 0xD4) {
+            d4_conflict = true;
+        }
+        tmp = pdev->config[tmp + PCI_CAP_LIST_NEXT];
+    } while (tmp);
+
+    if (!c8_conflict) {
+        pos = 0xC8;
+    } else if (!d4_conflict) {
+        pos = 0xD4;
+    } else {
+        error_setg(errp, "NVIDIA GPUDirect Clique ID: invalid config space");
+        return -EINVAL;
+    }
+
     ret = pci_add_capability(pdev, PCI_CAP_ID_VNDR, pos, 8, errp);
     if (ret < 0) {
         error_prepend(errp, "Failed to add NVIDIA GPUDirect cap: ");

+ 10 - 5
hw/vfio/pci.c

@@ -663,6 +663,8 @@ static void vfio_msi_enable(VFIOPCIDevice *vdev)
 
     vfio_disable_interrupts(vdev);
 
+    vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
+retry:
     /*
      * Setting vector notifiers needs to enable route for each vector.
      * Deferring to commit the KVM routes once rather than per vector
@@ -670,8 +672,6 @@ static void vfio_msi_enable(VFIOPCIDevice *vdev)
      */
     vfio_prepare_kvm_msi_virq_batch(vdev);
 
-    vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
-retry:
     vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors);
 
     for (i = 0; i < vdev->nr_vectors; i++) {
@@ -3221,7 +3221,12 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 
 out_deregister:
     pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
-    kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
+    if (vdev->irqchip_change_notifier.notify) {
+        kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
+    }
+    if (vdev->intx.mmap_timer) {
+        timer_free(vdev->intx.mmap_timer);
+    }
 out_teardown:
     vfio_teardown_msi(vdev);
     vfio_bars_exit(vdev);
@@ -3347,8 +3352,8 @@ static Property vfio_pci_dev_properties[] = {
                     VFIO_FEATURE_ENABLE_REQ_BIT, true),
     DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features,
                     VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false),
-    DEFINE_PROP_BOOL("x-enable-migration", VFIOPCIDevice,
-                     vbasedev.enable_migration, false),
+    DEFINE_PROP_ON_OFF_AUTO("enable-migration", VFIOPCIDevice,
+                            vbasedev.enable_migration, ON_OFF_AUTO_AUTO),
     DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false),
     DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice,
                      vbasedev.ram_block_discard_allowed, false),

+ 4 - 2
hw/vfio/trace-events

@@ -155,13 +155,15 @@ vfio_load_cleanup(const char *name) " (%s)"
 vfio_load_device_config_state(const char *name) " (%s)"
 vfio_load_state(const char *name, uint64_t data) " (%s) data 0x%"PRIx64
 vfio_load_state_device_data(const char *name, uint64_t data_size, int ret) " (%s) size 0x%"PRIx64" ret %d"
-vfio_migration_probe(const char *name) " (%s)"
+vfio_migration_realize(const char *name) " (%s)"
 vfio_migration_set_state(const char *name, const char *state) " (%s) state %s"
 vfio_migration_state_notifier(const char *name, const char *state) " (%s) state %s"
 vfio_save_block(const char *name, int data_size) " (%s) data_size %d"
 vfio_save_cleanup(const char *name) " (%s)"
 vfio_save_complete_precopy(const char *name, int ret) " (%s) ret %d"
 vfio_save_device_config_state(const char *name) " (%s)"
+vfio_save_iterate(const char *name, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64
 vfio_save_setup(const char *name, uint64_t data_buffer_size) " (%s) data buffer size 0x%"PRIx64
-vfio_state_pending_exact(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t stopcopy_size) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" stopcopy size 0x%"PRIx64
+vfio_state_pending_estimate(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64
+vfio_state_pending_exact(const char *name, uint64_t precopy, uint64_t postcopy, uint64_t stopcopy_size, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" stopcopy size 0x%"PRIx64" precopy initial size 0x%"PRIx64" precopy dirty size 0x%"PRIx64
 vfio_vmstate_change(const char *name, int running, const char *reason, const char *dev_state) " (%s) running %d reason %s device state %s"

+ 9 - 3
include/hw/vfio/vfio-common.h

@@ -66,6 +66,10 @@ typedef struct VFIOMigration {
     int data_fd;
     void *data_buffer;
     size_t data_buffer_size;
+    uint64_t mig_flags;
+    uint64_t precopy_init_size;
+    uint64_t precopy_dirty_size;
+    bool initial_data_sent;
 } VFIOMigration;
 
 typedef struct VFIOAddressSpace {
@@ -135,7 +139,7 @@ typedef struct VFIODevice {
     bool needs_reset;
     bool no_mmap;
     bool ram_block_discard_allowed;
-    bool enable_migration;
+    OnOffAuto enable_migration;
     VFIODeviceOps *ops;
     unsigned int num_irqs;
     unsigned int num_regions;
@@ -212,6 +216,7 @@ void vfio_region_finalize(VFIORegion *region);
 void vfio_reset_handler(void *opaque);
 VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp);
 void vfio_put_group(VFIOGroup *group);
+struct vfio_device_info *vfio_get_device_info(int fd);
 int vfio_get_device(VFIOGroup *group, const char *name,
                     VFIODevice *vbasedev, Error **errp);
 
@@ -220,10 +225,11 @@ typedef QLIST_HEAD(VFIOGroupList, VFIOGroup) VFIOGroupList;
 extern VFIOGroupList vfio_group_list;
 
 bool vfio_mig_active(void);
-int vfio_block_multiple_devices_migration(Error **errp);
+int vfio_block_multiple_devices_migration(VFIODevice *vbasedev, Error **errp);
 void vfio_unblock_multiple_devices_migration(void);
-int vfio_block_giommu_migration(Error **errp);
+int vfio_block_giommu_migration(VFIODevice *vbasedev, Error **errp);
 int64_t vfio_mig_bytes_transferred(void);
+void vfio_reset_bytes_transferred(void);
 
 #ifdef CONFIG_LINUX
 int vfio_get_region_info(VFIODevice *vbasedev, int index,

+ 2 - 0
include/migration/register.h

@@ -71,6 +71,8 @@ typedef struct SaveVMHandlers {
     int (*load_cleanup)(void *opaque);
     /* Called when postcopy migration wants to resume from failure */
     int (*resume_prepare)(MigrationState *s, void *opaque);
+    /* Checks if switchover ack should be used. Called only in dest */
+    bool (*switchover_ack_needed)(void *opaque);
 } SaveVMHandlers;
 
 int register_savevm_live(const char *idstr,
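
To make the new hook concrete, here is a minimal, hypothetical sketch of an
iterative device that participates in switchover ack (ExampleDev and the
example_* names are invented; qemu_loadvm_approve_switchover() is the
destination-side call added in migration/savevm.c below):

    #include "qemu/osdep.h"
    #include "migration/register.h"
    #include "migration/savevm.h"

    typedef struct ExampleDev {
        bool initial_chunk_loaded;
    } ExampleDev;

    /* Destination: declare that this device wants to delay switchover */
    static bool example_switchover_ack_needed(void *opaque)
    {
        return true;
    }

    static int example_load_state(QEMUFile *f, void *opaque, int version_id)
    {
        ExampleDev *d = opaque;

        /* ... load one chunk of device state from f ... */

        if (!d->initial_chunk_loaded) {
            d->initial_chunk_loaded = true;
            /*
             * One approval per device that returned true above; when the
             * pending count reaches zero, the ACK is sent to the source.
             */
            return qemu_loadvm_approve_switchover();
        }

        return 0;
    }

    static const SaveVMHandlers example_savevm_handlers = {
        .load_state = example_load_state,
        .switchover_ack_needed = example_switchover_ack_needed,
    };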

+ 31 - 2
migration/migration.c

@@ -78,6 +78,7 @@ enum mig_rp_message_type {
     MIG_RP_MSG_REQ_PAGES,    /* data (start: be64, len: be32) */
     MIG_RP_MSG_RECV_BITMAP,  /* send recved_bitmap back to source */
     MIG_RP_MSG_RESUME_ACK,   /* tell source that we are ready to resume */
+    MIG_RP_MSG_SWITCHOVER_ACK, /* Tell source it's OK to do switchover */
 
     MIG_RP_MSG_MAX
 };
@@ -760,6 +761,11 @@ bool migration_has_all_channels(void)
     return true;
 }
 
+int migrate_send_rp_switchover_ack(MigrationIncomingState *mis)
+{
+    return migrate_send_rp_message(mis, MIG_RP_MSG_SWITCHOVER_ACK, 0, NULL);
+}
+
 /*
  * Send a 'SHUT' message on the return channel with the given value
  * to indicate that we've finished with the RP.  Non-0 value indicates
@@ -1405,6 +1411,7 @@ void migrate_init(MigrationState *s)
     s->vm_old_state = -1;
     s->iteration_initial_bytes = 0;
     s->threshold_size = 0;
+    s->switchover_acked = false;
 }
 
 int migrate_add_blocker_internal(Error *reason, Error **errp)
@@ -1621,6 +1628,7 @@ static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc,
      */
     memset(&mig_stats, 0, sizeof(mig_stats));
     memset(&compression_counters, 0, sizeof(compression_counters));
+    reset_vfio_bytes_transferred();
 
     return true;
 }
@@ -1721,6 +1729,7 @@ static struct rp_cmd_args {
     [MIG_RP_MSG_REQ_PAGES_ID]   = { .len = -1, .name = "REQ_PAGES_ID" },
     [MIG_RP_MSG_RECV_BITMAP]    = { .len = -1, .name = "RECV_BITMAP" },
     [MIG_RP_MSG_RESUME_ACK]     = { .len =  4, .name = "RESUME_ACK" },
+    [MIG_RP_MSG_SWITCHOVER_ACK] = { .len =  0, .name = "SWITCHOVER_ACK" },
     [MIG_RP_MSG_MAX]            = { .len = -1, .name = "MAX" },
 };
 
@@ -1959,6 +1968,11 @@ retry:
             }
             break;
 
+        case MIG_RP_MSG_SWITCHOVER_ACK:
+            ms->switchover_acked = true;
+            trace_source_return_path_thread_switchover_acked();
+            break;
+
         default:
             break;
         }
@@ -2693,6 +2707,20 @@ static void migration_update_counters(MigrationState *s,
                               bandwidth, s->threshold_size);
 }
 
+static bool migration_can_switchover(MigrationState *s)
+{
+    if (!migrate_switchover_ack()) {
+        return true;
+    }
+
+    /* No reason to wait for switchover ACK if VM is stopped */
+    if (!runstate_is_running()) {
+        return true;
+    }
+
+    return s->switchover_acked;
+}
+
 /* Migration thread iteration status */
 typedef enum {
     MIG_ITERATE_RESUME,         /* Resume current iteration */
@@ -2708,6 +2736,7 @@ static MigIterateState migration_iteration_run(MigrationState *s)
 {
     uint64_t must_precopy, can_postcopy;
     bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
+    bool can_switchover = migration_can_switchover(s);
 
     qemu_savevm_state_pending_estimate(&must_precopy, &can_postcopy);
     uint64_t pending_size = must_precopy + can_postcopy;
@@ -2720,14 +2749,14 @@ static MigIterateState migration_iteration_run(MigrationState *s)
         trace_migrate_pending_exact(pending_size, must_precopy, can_postcopy);
     }
 
-    if (!pending_size || pending_size < s->threshold_size) {
+    if ((!pending_size || pending_size < s->threshold_size) && can_switchover) {
         trace_migration_thread_low_pending(pending_size);
         migration_completion(s);
         return MIG_ITERATE_BREAK;
     }
 
     /* Still a significant amount to transfer */
-    if (!in_postcopy && must_precopy <= s->threshold_size &&
+    if (!in_postcopy && must_precopy <= s->threshold_size && can_switchover &&
         qatomic_read(&s->start_postcopy)) {
         if (postcopy_start(s)) {
             error_report("%s: postcopy failed to start", __func__);

+ 15 - 0
migration/migration.h

@@ -210,6 +210,13 @@ struct MigrationIncomingState {
      * contains valid information.
      */
     QemuMutex page_request_mutex;
+
+    /*
+     * Number of devices that have yet to approve switchover. When this reaches
+     * zero an ACK that it's OK to do switchover is sent to the source. No lock
+     * is needed as this field is updated serially.
+     */
+    unsigned int switchover_ack_pending_num;
 };
 
 MigrationIncomingState *migration_incoming_get_current(void);
@@ -440,6 +447,12 @@ struct MigrationState {
 
     /* QEMU_VM_VMDESCRIPTION content filled for all non-iterable devices. */
     JSONWriter *vmdesc;
+
+    /*
+     * Indicates whether an ACK from the destination that it's OK to do
+     * switchover has been received.
+     */
+    bool switchover_acked;
 };
 
 void migrate_set_state(int *state, int old_state, int new_state);
@@ -480,6 +493,7 @@ int migrate_send_rp_message_req_pages(MigrationIncomingState *mis,
 void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis,
                                  char *block_name);
 void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value);
+int migrate_send_rp_switchover_ack(MigrationIncomingState *mis);
 
 void dirty_bitmap_mig_before_vm_start(void);
 void dirty_bitmap_mig_cancel_outgoing(void);
@@ -500,6 +514,7 @@ bool migration_rate_limit(void);
 void migration_cancel(const Error *error);
 
 void populate_vfio_info(MigrationInfo *info);
+void reset_vfio_bytes_transferred(void);
 void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page);
 
 #endif

+ 17 - 0
migration/options.c

@@ -185,6 +185,8 @@ Property migration_properties[] = {
     DEFINE_PROP_MIG_CAP("x-zero-copy-send",
             MIGRATION_CAPABILITY_ZERO_COPY_SEND),
 #endif
+    DEFINE_PROP_MIG_CAP("x-switchover-ack",
+                        MIGRATION_CAPABILITY_SWITCHOVER_ACK),
 
     DEFINE_PROP_END_OF_LIST(),
 };
@@ -308,6 +310,13 @@ bool migrate_return_path(void)
     return s->capabilities[MIGRATION_CAPABILITY_RETURN_PATH];
 }
 
+bool migrate_switchover_ack(void)
+{
+    MigrationState *s = migrate_get_current();
+
+    return s->capabilities[MIGRATION_CAPABILITY_SWITCHOVER_ACK];
+}
+
 bool migrate_validate_uuid(void)
 {
     MigrationState *s = migrate_get_current();
@@ -547,6 +556,14 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, Error **errp)
         }
     }
 
+    if (new_caps[MIGRATION_CAPABILITY_SWITCHOVER_ACK]) {
+        if (!new_caps[MIGRATION_CAPABILITY_RETURN_PATH]) {
+            error_setg(errp, "Capability 'switchover-ack' requires capability "
+                             "'return-path'");
+            return false;
+        }
+    }
+
     return true;
 }
 

+ 1 - 0
migration/options.h

@@ -40,6 +40,7 @@ bool migrate_postcopy_ram(void);
 bool migrate_rdma_pin_all(void);
 bool migrate_release_ram(void);
 bool migrate_return_path(void);
+bool migrate_switchover_ack(void);
 bool migrate_validate_uuid(void);
 bool migrate_xbzrle(void);
 bool migrate_zero_blocks(void);

+ 55 - 0
migration/savevm.c

@@ -1622,6 +1622,7 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
     migrate_init(ms);
     memset(&mig_stats, 0, sizeof(mig_stats));
     memset(&compression_counters, 0, sizeof(compression_counters));
+    reset_vfio_bytes_transferred();
     ms->to_dst_file = f;
 
     qemu_mutex_unlock_iothread();
@@ -2360,6 +2361,21 @@ static int loadvm_process_command(QEMUFile *f)
             error_report("CMD_OPEN_RETURN_PATH failed");
             return -1;
         }
+
+        /*
+         * Switchover ack is enabled but no device uses it, so send an ACK to
+         * source that it's OK to switchover. Do it here, after return path has
+         * been created.
+         */
+        if (migrate_switchover_ack() && !mis->switchover_ack_pending_num) {
+            int ret = migrate_send_rp_switchover_ack(mis);
+            if (ret) {
+                error_report(
+                    "Could not send switchover ack RP MSG, err %d (%s)", ret,
+                    strerror(-ret));
+                return ret;
+            }
+        }
         break;
 
     case MIG_CMD_PING:
@@ -2586,6 +2602,23 @@ static int qemu_loadvm_state_header(QEMUFile *f)
     return 0;
 }
 
+static void qemu_loadvm_state_switchover_ack_needed(MigrationIncomingState *mis)
+{
+    SaveStateEntry *se;
+
+    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+        if (!se->ops || !se->ops->switchover_ack_needed) {
+            continue;
+        }
+
+        if (se->ops->switchover_ack_needed(se->opaque)) {
+            mis->switchover_ack_pending_num++;
+        }
+    }
+
+    trace_loadvm_state_switchover_ack_needed(mis->switchover_ack_pending_num);
+}
+
 static int qemu_loadvm_state_setup(QEMUFile *f)
 {
     SaveStateEntry *se;
@@ -2789,6 +2822,10 @@ int qemu_loadvm_state(QEMUFile *f)
         return -EINVAL;
     }
 
+    if (migrate_switchover_ack()) {
+        qemu_loadvm_state_switchover_ack_needed(mis);
+    }
+
     cpu_synchronize_all_pre_loadvm();
 
     ret = qemu_loadvm_state_main(f, mis);
@@ -2862,6 +2899,24 @@ int qemu_load_device_state(QEMUFile *f)
     return 0;
 }
 
+int qemu_loadvm_approve_switchover(void)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+
+    if (!mis->switchover_ack_pending_num) {
+        return -EINVAL;
+    }
+
+    mis->switchover_ack_pending_num--;
+    trace_loadvm_approve_switchover(mis->switchover_ack_pending_num);
+
+    if (mis->switchover_ack_pending_num) {
+        return 0;
+    }
+
+    return migrate_send_rp_switchover_ack(mis);
+}
+
 bool save_snapshot(const char *name, bool overwrite, const char *vmstate,
                   bool has_devices, strList *devices, Error **errp)
 {

+ 1 - 0
migration/savevm.h

@@ -65,6 +65,7 @@ int qemu_loadvm_state(QEMUFile *f);
 void qemu_loadvm_state_cleanup(void);
 int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
 int qemu_load_device_state(QEMUFile *f);
+int qemu_loadvm_approve_switchover(void);
 int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
         bool in_postcopy, bool inactivate_disks);
 

+ 15 - 2
migration/target.c

@@ -14,12 +14,25 @@
 #include "hw/vfio/vfio-common.h"
 #endif
 
+#ifdef CONFIG_VFIO
 void populate_vfio_info(MigrationInfo *info)
 {
-#ifdef CONFIG_VFIO
     if (vfio_mig_active()) {
         info->vfio = g_malloc0(sizeof(*info->vfio));
         info->vfio->transferred = vfio_mig_bytes_transferred();
     }
-#endif
 }
+
+void reset_vfio_bytes_transferred(void)
+{
+    vfio_reset_bytes_transferred();
+}
+#else
+void populate_vfio_info(MigrationInfo *info)
+{
+}
+
+void reset_vfio_bytes_transferred(void)
+{
+}
+#endif

+ 3 - 0
migration/trace-events

@@ -7,6 +7,7 @@ qemu_loadvm_state_section_partend(uint32_t section_id) "%u"
 qemu_loadvm_state_post_main(int ret) "%d"
 qemu_loadvm_state_section_startfull(uint32_t section_id, const char *idstr, uint32_t instance_id, uint32_t version_id) "%u(%s) %u %u"
 qemu_savevm_send_packaged(void) ""
+loadvm_state_switchover_ack_needed(unsigned int switchover_ack_pending_num) "Switchover ack pending num=%u"
 loadvm_state_setup(void) ""
 loadvm_state_cleanup(void) ""
 loadvm_handle_cmd_packaged(unsigned int length) "%u"
@@ -23,6 +24,7 @@ loadvm_postcopy_ram_handle_discard_end(void) ""
 loadvm_postcopy_ram_handle_discard_header(const char *ramid, uint16_t len) "%s: %ud"
 loadvm_process_command(const char *s, uint16_t len) "com=%s len=%d"
 loadvm_process_command_ping(uint32_t val) "0x%x"
+loadvm_approve_switchover(unsigned int switchover_ack_pending_num) "Switchover ack pending num=%u"
 postcopy_ram_listen_thread_exit(void) ""
 postcopy_ram_listen_thread_start(void) ""
 qemu_savevm_send_postcopy_advise(void) ""
@@ -180,6 +182,7 @@ source_return_path_thread_loop_top(void) ""
 source_return_path_thread_pong(uint32_t val) "0x%x"
 source_return_path_thread_shut(uint32_t val) "0x%x"
 source_return_path_thread_resume_ack(uint32_t v) "%"PRIu32
+source_return_path_thread_switchover_acked(void) ""
 migration_thread_low_pending(uint64_t pending) "%" PRIu64
 migrate_transferred(uint64_t tranferred, uint64_t time_spent, uint64_t bandwidth, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %" PRIu64 " max_size %" PRId64
 process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d"

+ 11 - 1
qapi/migration.json

@@ -487,6 +487,16 @@
 #     and should not affect the correctness of postcopy migration.
 #     (since 7.1)
 #
+# @switchover-ack: If enabled, migration will not stop the source VM
+#     and complete the migration until an ACK is received from the
+#     destination that it's OK to do so.  Exactly when this ACK is
+#     sent depends on the migrated devices that use this feature.
+#     For example, a device can use it to make sure some of its data
+#     is sent and loaded in the destination before doing switchover.
+#     This can reduce downtime if devices that support this capability
+#     are present.  'return-path' capability must be enabled to use
+#     it.  (since 8.1)
+#
 # Features:
 #
 # @unstable: Members @x-colo and @x-ignore-shared are experimental.
@@ -502,7 +512,7 @@
            'dirty-bitmaps', 'postcopy-blocktime', 'late-block-activate',
            { 'name': 'x-ignore-shared', 'features': [ 'unstable' ] },
            'validate-uuid', 'background-snapshot',
-           'zero-copy-send', 'postcopy-preempt'] }
+           'zero-copy-send', 'postcopy-preempt', 'switchover-ack'] }
 
 ##
 # @MigrationCapabilityStatus:
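
For reference, a hedged sketch of turning the capability on over QMP from a
libqtest harness (the helper name is invented; the new migration-test below
achieves the same via its migrate_set_capability() helper):

    #include "qemu/osdep.h"
    #include "libqtest.h"
    #include "qapi/qmp/qdict.h"

    /*
     * Hypothetical helper: 'switchover-ack' requires 'return-path', so both
     * capabilities are enabled together.  Must be run on both source and
     * destination before migration starts.
     */
    static void enable_switchover_ack(QTestState *who)
    {
        QDict *rsp;

        rsp = qtest_qmp(who,
                        "{ 'execute': 'migrate-set-capabilities', "
                        "'arguments': { 'capabilities': [ "
                        "{ 'capability': 'return-path', 'state': true }, "
                        "{ 'capability': 'switchover-ack', 'state': true } "
                        "] } }");
        g_assert(qdict_haskey(rsp, "return"));
        qobject_unref(rsp);
    }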

+ 31 - 0
tests/qtest/migration-test.c

@@ -1693,6 +1693,33 @@ static void test_precopy_tcp_plain(void)
     test_precopy_common(&args);
 }
 
+static void *test_migrate_switchover_ack_start(QTestState *from, QTestState *to)
+{
+    migrate_set_capability(from, "return-path", true);
+    migrate_set_capability(to, "return-path", true);
+
+    migrate_set_capability(from, "switchover-ack", true);
+    migrate_set_capability(to, "switchover-ack", true);
+
+    return NULL;
+}
+
+static void test_precopy_tcp_switchover_ack(void)
+{
+    MigrateCommon args = {
+        .listen_uri = "tcp:127.0.0.1:0",
+        .start_hook = test_migrate_switchover_ack_start,
+        /*
+         * Source VM must be running in order to consider the switchover ACK
+         * when deciding to do switchover or not.
+         */
+        .live = true,
+    };
+
+    test_precopy_common(&args);
+}
+
 #ifdef CONFIG_GNUTLS
 static void test_precopy_tcp_tls_psk_match(void)
 {
@@ -2737,6 +2764,10 @@ int main(int argc, char **argv)
 #endif /* CONFIG_GNUTLS */
 
     qtest_add_func("/migration/precopy/tcp/plain", test_precopy_tcp_plain);
+
+    qtest_add_func("/migration/precopy/tcp/plain/switchover-ack",
+                   test_precopy_tcp_switchover_ack);
+
 #ifdef CONFIG_GNUTLS
     qtest_add_func("/migration/precopy/tcp/tls/psk/match",
                    test_precopy_tcp_tls_psk_match);