123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684 |
- /*
- * Multifd VFIO migration
- *
- * Copyright (C) 2024,2025 Oracle and/or its affiliates.
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- *
- * SPDX-License-Identifier: GPL-2.0-or-later
- */
- #include "qemu/osdep.h"
- #include "hw/vfio/vfio-common.h"
- #include "migration/misc.h"
- #include "qapi/error.h"
- #include "qemu/bswap.h"
- #include "qemu/error-report.h"
- #include "qemu/lockable.h"
- #include "qemu/main-loop.h"
- #include "qemu/thread.h"
- #include "io/channel-buffer.h"
- #include "migration/qemu-file.h"
- #include "migration-multifd.h"
- #include "trace.h"
/* Packet flag bit: the payload of this packet is the device config state. */
#define VFIO_DEVICE_STATE_CONFIG_STATE (1)

/* Packet format version written by the save side and required on load. */
#define VFIO_DEVICE_STATE_PACKET_VER_CURRENT (0)
- typedef struct VFIODeviceStatePacket {
- uint32_t version;
- uint32_t idx;
- uint32_t flags;
- uint8_t data[0];
- } QEMU_PACKED VFIODeviceStatePacket;
- /* type safety */
- typedef struct VFIOStateBuffers {
- GArray *array;
- } VFIOStateBuffers;
/* One slot of the load buffer array; the slot index is the packet idx. */
typedef struct VFIOStateBuffer {
    bool is_present;    /* a packet has been stored in this slot */
    char *data;         /* copy of the packet payload */
    size_t len;         /* payload length in bytes */
} VFIOStateBuffer;
- typedef struct VFIOMultifd {
- bool load_bufs_thread_running;
- bool load_bufs_thread_want_exit;
- VFIOStateBuffers load_bufs;
- QemuCond load_bufs_buffer_ready_cond;
- QemuCond load_bufs_thread_finished_cond;
- QemuMutex load_bufs_mutex; /* Lock order: this lock -> BQL */
- uint32_t load_buf_idx;
- uint32_t load_buf_idx_last;
- } VFIOMultifd;
- static void vfio_state_buffer_clear(gpointer data)
- {
- VFIOStateBuffer *lb = data;
- if (!lb->is_present) {
- return;
- }
- g_clear_pointer(&lb->data, g_free);
- lb->is_present = false;
- }
- static void vfio_state_buffers_init(VFIOStateBuffers *bufs)
- {
- bufs->array = g_array_new(FALSE, TRUE, sizeof(VFIOStateBuffer));
- g_array_set_clear_func(bufs->array, vfio_state_buffer_clear);
- }
- static void vfio_state_buffers_destroy(VFIOStateBuffers *bufs)
- {
- g_clear_pointer(&bufs->array, g_array_unref);
- }
- static void vfio_state_buffers_assert_init(VFIOStateBuffers *bufs)
- {
- assert(bufs->array);
- }
- static unsigned int vfio_state_buffers_size_get(VFIOStateBuffers *bufs)
- {
- return bufs->array->len;
- }
- static void vfio_state_buffers_size_set(VFIOStateBuffers *bufs,
- unsigned int size)
- {
- g_array_set_size(bufs->array, size);
- }
- static VFIOStateBuffer *vfio_state_buffers_at(VFIOStateBuffers *bufs,
- unsigned int idx)
- {
- return &g_array_index(bufs->array, VFIOStateBuffer, idx);
- }
- /* called with load_bufs_mutex locked */
- static bool vfio_load_state_buffer_insert(VFIODevice *vbasedev,
- VFIODeviceStatePacket *packet,
- size_t packet_total_size,
- Error **errp)
- {
- VFIOMigration *migration = vbasedev->migration;
- VFIOMultifd *multifd = migration->multifd;
- VFIOStateBuffer *lb;
- vfio_state_buffers_assert_init(&multifd->load_bufs);
- if (packet->idx >= vfio_state_buffers_size_get(&multifd->load_bufs)) {
- vfio_state_buffers_size_set(&multifd->load_bufs, packet->idx + 1);
- }
- lb = vfio_state_buffers_at(&multifd->load_bufs, packet->idx);
- if (lb->is_present) {
- error_setg(errp, "%s: state buffer %" PRIu32 " already filled",
- vbasedev->name, packet->idx);
- return false;
- }
- assert(packet->idx >= multifd->load_buf_idx);
- lb->data = g_memdup2(&packet->data, packet_total_size - sizeof(*packet));
- lb->len = packet_total_size - sizeof(*packet);
- lb->is_present = true;
- return true;
- }
- bool vfio_multifd_load_state_buffer(void *opaque, char *data, size_t data_size,
- Error **errp)
- {
- VFIODevice *vbasedev = opaque;
- VFIOMigration *migration = vbasedev->migration;
- VFIOMultifd *multifd = migration->multifd;
- VFIODeviceStatePacket *packet = (VFIODeviceStatePacket *)data;
- if (!vfio_multifd_transfer_enabled(vbasedev)) {
- error_setg(errp,
- "%s: got device state packet but not doing multifd transfer",
- vbasedev->name);
- return false;
- }
- assert(multifd);
- if (data_size < sizeof(*packet)) {
- error_setg(errp, "%s: packet too short at %zu (min is %zu)",
- vbasedev->name, data_size, sizeof(*packet));
- return false;
- }
- packet->version = be32_to_cpu(packet->version);
- if (packet->version != VFIO_DEVICE_STATE_PACKET_VER_CURRENT) {
- error_setg(errp, "%s: packet has unknown version %" PRIu32,
- vbasedev->name, packet->version);
- return false;
- }
- packet->idx = be32_to_cpu(packet->idx);
- packet->flags = be32_to_cpu(packet->flags);
- if (packet->idx == UINT32_MAX) {
- error_setg(errp, "%s: packet index is invalid", vbasedev->name);
- return false;
- }
- trace_vfio_load_state_device_buffer_incoming(vbasedev->name, packet->idx);
- /*
- * Holding BQL here would violate the lock order and can cause
- * a deadlock once we attempt to lock load_bufs_mutex below.
- */
- assert(!bql_locked());
- WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
- /* config state packet should be the last one in the stream */
- if (packet->flags & VFIO_DEVICE_STATE_CONFIG_STATE) {
- multifd->load_buf_idx_last = packet->idx;
- }
- if (!vfio_load_state_buffer_insert(vbasedev, packet, data_size,
- errp)) {
- return false;
- }
- qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
- }
- return true;
- }
- static bool vfio_load_bufs_thread_load_config(VFIODevice *vbasedev,
- Error **errp)
- {
- VFIOMigration *migration = vbasedev->migration;
- VFIOMultifd *multifd = migration->multifd;
- VFIOStateBuffer *lb;
- g_autoptr(QIOChannelBuffer) bioc = NULL;
- g_autoptr(QEMUFile) f_out = NULL, f_in = NULL;
- uint64_t mig_header;
- int ret;
- assert(multifd->load_buf_idx == multifd->load_buf_idx_last);
- lb = vfio_state_buffers_at(&multifd->load_bufs, multifd->load_buf_idx);
- assert(lb->is_present);
- bioc = qio_channel_buffer_new(lb->len);
- qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-load");
- f_out = qemu_file_new_output(QIO_CHANNEL(bioc));
- qemu_put_buffer(f_out, (uint8_t *)lb->data, lb->len);
- ret = qemu_fflush(f_out);
- if (ret) {
- error_setg(errp, "%s: load config state flush failed: %d",
- vbasedev->name, ret);
- return false;
- }
- qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL);
- f_in = qemu_file_new_input(QIO_CHANNEL(bioc));
- mig_header = qemu_get_be64(f_in);
- if (mig_header != VFIO_MIG_FLAG_DEV_CONFIG_STATE) {
- error_setg(errp, "%s: expected FLAG_DEV_CONFIG_STATE but got %" PRIx64,
- vbasedev->name, mig_header);
- return false;
- }
- bql_lock();
- ret = vfio_load_device_config_state(f_in, vbasedev);
- bql_unlock();
- if (ret < 0) {
- error_setg(errp, "%s: vfio_load_device_config_state() failed: %d",
- vbasedev->name, ret);
- return false;
- }
- return true;
- }
- static VFIOStateBuffer *vfio_load_state_buffer_get(VFIOMultifd *multifd)
- {
- VFIOStateBuffer *lb;
- unsigned int bufs_len;
- bufs_len = vfio_state_buffers_size_get(&multifd->load_bufs);
- if (multifd->load_buf_idx >= bufs_len) {
- assert(multifd->load_buf_idx == bufs_len);
- return NULL;
- }
- lb = vfio_state_buffers_at(&multifd->load_bufs,
- multifd->load_buf_idx);
- if (!lb->is_present) {
- return NULL;
- }
- return lb;
- }
- static bool vfio_load_state_buffer_write(VFIODevice *vbasedev,
- VFIOStateBuffer *lb,
- Error **errp)
- {
- VFIOMigration *migration = vbasedev->migration;
- VFIOMultifd *multifd = migration->multifd;
- g_autofree char *buf = NULL;
- char *buf_cur;
- size_t buf_len;
- if (!lb->len) {
- return true;
- }
- trace_vfio_load_state_device_buffer_load_start(vbasedev->name,
- multifd->load_buf_idx);
- /* lb might become re-allocated when we drop the lock */
- buf = g_steal_pointer(&lb->data);
- buf_cur = buf;
- buf_len = lb->len;
- while (buf_len > 0) {
- ssize_t wr_ret;
- int errno_save;
- /*
- * Loading data to the device takes a while,
- * drop the lock during this process.
- */
- qemu_mutex_unlock(&multifd->load_bufs_mutex);
- wr_ret = write(migration->data_fd, buf_cur, buf_len);
- errno_save = errno;
- qemu_mutex_lock(&multifd->load_bufs_mutex);
- if (wr_ret < 0) {
- error_setg(errp,
- "%s: writing state buffer %" PRIu32 " failed: %d",
- vbasedev->name, multifd->load_buf_idx, errno_save);
- return false;
- }
- assert(wr_ret <= buf_len);
- buf_len -= wr_ret;
- buf_cur += wr_ret;
- }
- trace_vfio_load_state_device_buffer_load_end(vbasedev->name,
- multifd->load_buf_idx);
- return true;
- }
- static bool vfio_load_bufs_thread_want_exit(VFIOMultifd *multifd,
- bool *should_quit)
- {
- return multifd->load_bufs_thread_want_exit || qatomic_read(should_quit);
- }
- /*
- * This thread is spawned by vfio_multifd_switchover_start() which gets
- * called upon encountering the switchover point marker in main migration
- * stream.
- *
- * It exits after either:
- * * completing loading the remaining device state and device config, OR:
- * * encountering some error while doing the above, OR:
- * * being forcefully aborted by the migration core by it setting should_quit
- * or by vfio_load_cleanup_load_bufs_thread() setting
- * multifd->load_bufs_thread_want_exit.
- */
- static bool vfio_load_bufs_thread(void *opaque, bool *should_quit, Error **errp)
- {
- VFIODevice *vbasedev = opaque;
- VFIOMigration *migration = vbasedev->migration;
- VFIOMultifd *multifd = migration->multifd;
- bool ret = false;
- trace_vfio_load_bufs_thread_start(vbasedev->name);
- assert(multifd);
- QEMU_LOCK_GUARD(&multifd->load_bufs_mutex);
- assert(multifd->load_bufs_thread_running);
- while (true) {
- VFIOStateBuffer *lb;
- /*
- * Always check cancellation first after the buffer_ready wait below in
- * case that cond was signalled by vfio_load_cleanup_load_bufs_thread().
- */
- if (vfio_load_bufs_thread_want_exit(multifd, should_quit)) {
- error_setg(errp, "operation cancelled");
- goto thread_exit;
- }
- assert(multifd->load_buf_idx <= multifd->load_buf_idx_last);
- lb = vfio_load_state_buffer_get(multifd);
- if (!lb) {
- trace_vfio_load_state_device_buffer_starved(vbasedev->name,
- multifd->load_buf_idx);
- qemu_cond_wait(&multifd->load_bufs_buffer_ready_cond,
- &multifd->load_bufs_mutex);
- continue;
- }
- if (multifd->load_buf_idx == multifd->load_buf_idx_last) {
- break;
- }
- if (multifd->load_buf_idx == 0) {
- trace_vfio_load_state_device_buffer_start(vbasedev->name);
- }
- if (!vfio_load_state_buffer_write(vbasedev, lb, errp)) {
- goto thread_exit;
- }
- if (multifd->load_buf_idx == multifd->load_buf_idx_last - 1) {
- trace_vfio_load_state_device_buffer_end(vbasedev->name);
- }
- multifd->load_buf_idx++;
- }
- if (!vfio_load_bufs_thread_load_config(vbasedev, errp)) {
- goto thread_exit;
- }
- ret = true;
- thread_exit:
- /*
- * Notify possibly waiting vfio_load_cleanup_load_bufs_thread() that
- * this thread is exiting.
- */
- multifd->load_bufs_thread_running = false;
- qemu_cond_signal(&multifd->load_bufs_thread_finished_cond);
- trace_vfio_load_bufs_thread_end(vbasedev->name);
- return ret;
- }
- static VFIOMultifd *vfio_multifd_new(void)
- {
- VFIOMultifd *multifd = g_new(VFIOMultifd, 1);
- vfio_state_buffers_init(&multifd->load_bufs);
- qemu_mutex_init(&multifd->load_bufs_mutex);
- multifd->load_buf_idx = 0;
- multifd->load_buf_idx_last = UINT32_MAX;
- qemu_cond_init(&multifd->load_bufs_buffer_ready_cond);
- multifd->load_bufs_thread_running = false;
- multifd->load_bufs_thread_want_exit = false;
- qemu_cond_init(&multifd->load_bufs_thread_finished_cond);
- return multifd;
- }
- /*
- * Terminates vfio_load_bufs_thread by setting
- * multifd->load_bufs_thread_want_exit and signalling all the conditions
- * the thread could be blocked on.
- *
- * Waits for the thread to signal that it had finished.
- */
- static void vfio_load_cleanup_load_bufs_thread(VFIOMultifd *multifd)
- {
- /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
- bql_unlock();
- WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
- while (multifd->load_bufs_thread_running) {
- multifd->load_bufs_thread_want_exit = true;
- qemu_cond_signal(&multifd->load_bufs_buffer_ready_cond);
- qemu_cond_wait(&multifd->load_bufs_thread_finished_cond,
- &multifd->load_bufs_mutex);
- }
- }
- bql_lock();
- }
- static void vfio_multifd_free(VFIOMultifd *multifd)
- {
- vfio_load_cleanup_load_bufs_thread(multifd);
- qemu_cond_destroy(&multifd->load_bufs_thread_finished_cond);
- vfio_state_buffers_destroy(&multifd->load_bufs);
- qemu_cond_destroy(&multifd->load_bufs_buffer_ready_cond);
- qemu_mutex_destroy(&multifd->load_bufs_mutex);
- g_free(multifd);
- }
- void vfio_multifd_cleanup(VFIODevice *vbasedev)
- {
- VFIOMigration *migration = vbasedev->migration;
- g_clear_pointer(&migration->multifd, vfio_multifd_free);
- }
/*
 * Multifd device state transfer is supported only when both
 * multifd_device_state_supported() and migrate_send_switchover_start()
 * return true.
 */
bool vfio_multifd_transfer_supported(void)
{
    if (!multifd_device_state_supported()) {
        return false;
    }

    return migrate_send_switchover_start();
}
- bool vfio_multifd_transfer_enabled(VFIODevice *vbasedev)
- {
- VFIOMigration *migration = vbasedev->migration;
- return migration->multifd_transfer;
- }
- bool vfio_multifd_setup(VFIODevice *vbasedev, bool alloc_multifd, Error **errp)
- {
- VFIOMigration *migration = vbasedev->migration;
- /*
- * Make a copy of this setting at the start in case it is changed
- * mid-migration.
- */
- if (vbasedev->migration_multifd_transfer == ON_OFF_AUTO_AUTO) {
- migration->multifd_transfer = vfio_multifd_transfer_supported();
- } else {
- migration->multifd_transfer =
- vbasedev->migration_multifd_transfer == ON_OFF_AUTO_ON;
- }
- if (!vfio_multifd_transfer_enabled(vbasedev)) {
- /* Nothing further to check or do */
- return true;
- }
- if (!vfio_multifd_transfer_supported()) {
- error_setg(errp,
- "%s: Multifd device transfer requested but unsupported in the current config",
- vbasedev->name);
- return false;
- }
- if (alloc_multifd) {
- assert(!migration->multifd);
- migration->multifd = vfio_multifd_new();
- }
- return true;
- }
- void vfio_multifd_emit_dummy_eos(VFIODevice *vbasedev, QEMUFile *f)
- {
- assert(vfio_multifd_transfer_enabled(vbasedev));
- /*
- * Emit dummy NOP data on the main migration channel since the actual
- * device state transfer is done via multifd channels.
- */
- qemu_put_be64(f, VFIO_MIG_FLAG_END_OF_STATE);
- }
- static bool
- vfio_save_complete_precopy_thread_config_state(VFIODevice *vbasedev,
- char *idstr,
- uint32_t instance_id,
- uint32_t idx,
- Error **errp)
- {
- g_autoptr(QIOChannelBuffer) bioc = NULL;
- g_autoptr(QEMUFile) f = NULL;
- int ret;
- g_autofree VFIODeviceStatePacket *packet = NULL;
- size_t packet_len;
- bioc = qio_channel_buffer_new(0);
- qio_channel_set_name(QIO_CHANNEL(bioc), "vfio-device-config-save");
- f = qemu_file_new_output(QIO_CHANNEL(bioc));
- if (vfio_save_device_config_state(f, vbasedev, errp)) {
- return false;
- }
- ret = qemu_fflush(f);
- if (ret) {
- error_setg(errp, "%s: save config state flush failed: %d",
- vbasedev->name, ret);
- return false;
- }
- packet_len = sizeof(*packet) + bioc->usage;
- packet = g_malloc0(packet_len);
- packet->version = cpu_to_be32(VFIO_DEVICE_STATE_PACKET_VER_CURRENT);
- packet->idx = cpu_to_be32(idx);
- packet->flags = cpu_to_be32(VFIO_DEVICE_STATE_CONFIG_STATE);
- memcpy(&packet->data, bioc->data, bioc->usage);
- if (!multifd_queue_device_state(idstr, instance_id,
- (char *)packet, packet_len)) {
- error_setg(errp, "%s: multifd config data queuing failed",
- vbasedev->name);
- return false;
- }
- vfio_mig_add_bytes_transferred(packet_len);
- return true;
- }
- /*
- * This thread is spawned by the migration core directly via
- * .save_live_complete_precopy_thread SaveVMHandler.
- *
- * It exits after either:
- * * completing saving the remaining device state and device config, OR:
- * * encountering some error while doing the above, OR:
- * * being forcefully aborted by the migration core by
- * multifd_device_state_save_thread_should_exit() returning true.
- */
- bool
- vfio_multifd_save_complete_precopy_thread(SaveLiveCompletePrecopyThreadData *d,
- Error **errp)
- {
- VFIODevice *vbasedev = d->handler_opaque;
- VFIOMigration *migration = vbasedev->migration;
- bool ret = false;
- g_autofree VFIODeviceStatePacket *packet = NULL;
- uint32_t idx;
- if (!vfio_multifd_transfer_enabled(vbasedev)) {
- /* Nothing to do, vfio_save_complete_precopy() does the transfer. */
- return true;
- }
- trace_vfio_save_complete_precopy_thread_start(vbasedev->name,
- d->idstr, d->instance_id);
- /* We reach here with device state STOP or STOP_COPY only */
- if (vfio_migration_set_state(vbasedev, VFIO_DEVICE_STATE_STOP_COPY,
- VFIO_DEVICE_STATE_STOP, errp)) {
- goto thread_exit;
- }
- packet = g_malloc0(sizeof(*packet) + migration->data_buffer_size);
- packet->version = cpu_to_be32(VFIO_DEVICE_STATE_PACKET_VER_CURRENT);
- for (idx = 0; ; idx++) {
- ssize_t data_size;
- size_t packet_size;
- if (multifd_device_state_save_thread_should_exit()) {
- error_setg(errp, "operation cancelled");
- goto thread_exit;
- }
- data_size = read(migration->data_fd, &packet->data,
- migration->data_buffer_size);
- if (data_size < 0) {
- error_setg(errp, "%s: reading state buffer %" PRIu32 " failed: %d",
- vbasedev->name, idx, errno);
- goto thread_exit;
- } else if (data_size == 0) {
- break;
- }
- packet->idx = cpu_to_be32(idx);
- packet_size = sizeof(*packet) + data_size;
- if (!multifd_queue_device_state(d->idstr, d->instance_id,
- (char *)packet, packet_size)) {
- error_setg(errp, "%s: multifd data queuing failed", vbasedev->name);
- goto thread_exit;
- }
- vfio_mig_add_bytes_transferred(packet_size);
- }
- if (!vfio_save_complete_precopy_thread_config_state(vbasedev,
- d->idstr,
- d->instance_id,
- idx, errp)) {
- goto thread_exit;
- }
- ret = true;
- thread_exit:
- trace_vfio_save_complete_precopy_thread_end(vbasedev->name, ret);
- return ret;
- }
- int vfio_multifd_switchover_start(VFIODevice *vbasedev)
- {
- VFIOMigration *migration = vbasedev->migration;
- VFIOMultifd *multifd = migration->multifd;
- assert(multifd);
- /* The lock order is load_bufs_mutex -> BQL so unlock BQL here first */
- bql_unlock();
- WITH_QEMU_LOCK_GUARD(&multifd->load_bufs_mutex) {
- assert(!multifd->load_bufs_thread_running);
- multifd->load_bufs_thread_running = true;
- }
- bql_lock();
- qemu_loadvm_start_load_thread(vfio_load_bufs_thread, vbasedev);
- return 0;
- }
|