migration.h

/*
 * QEMU live migration
 *
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#ifndef QEMU_MIGRATION_H
#define QEMU_MIGRATION_H

#include "exec/cpu-common.h"
#include "hw/qdev-core.h"
#include "qapi/qapi-types-migration.h"
#include "qapi/qmp/json-writer.h"
#include "qemu/thread.h"
#include "qemu/coroutine_int.h"
#include "io/channel.h"
#include "io/channel-buffer.h"
#include "net/announce.h"
#include "qom/object.h"
#include "postcopy-ram.h"

struct PostcopyBlocktimeContext;

#define MIGRATION_RESUME_ACK_VALUE  (1)
/*
 * 1<<6=64 pages -> 256K chunk when page size is 4K.  This gives us the
 * benefit that all chunks are aligned to 64 pages, so the clear bitmaps
 * are always aligned to a long word.
 */
#define CLEAR_BITMAP_SHIFT_MIN            6
/*
 * 1<<18=256K pages -> 1G chunk when page size is 4K.  This is the
 * default value to use if none is specified.
 */
#define CLEAR_BITMAP_SHIFT_DEFAULT       18
/*
 * 1<<31=2G pages -> 8T chunk when page size is 4K.  This should be big
 * enough while making sure we won't overflow easily.
 */
#define CLEAR_BITMAP_SHIFT_MAX           31
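/*
 * Illustrative sketch (an assumption, not part of the original header): how a
 * clear-bitmap shift value maps to a chunk size in bytes.  The helper name
 * and the 4K page size are placeholders for the example only.
 *
 *     #include <stdint.h>
 *
 *     static inline uint64_t example_clear_chunk_bytes(uint64_t page_size,
 *                                                      unsigned int shift)
 *     {
 *         return page_size << shift;   // 4K << 6  == 256K
 *     }                                // 4K << 18 == 1G
 *                                      // 4K << 31 == 8T
 */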
/* This is an abstraction of a "temp huge page" for postcopy's purpose */
typedef struct {
    /*
     * This points to a temporary huge page as a buffer for UFFDIO_COPY.  It's
     * mmap()ed and needs to be freed during cleanup.
     */
    void *tmp_huge_page;
    /*
     * This points to the host page we're going to install for this temp page.
     * Once we've received the whole page, it tells us where to put it.
     */
    void *host_addr;
    /* Number of small pages copied (in size of TARGET_PAGE_SIZE) */
    unsigned int target_pages;
    /* Whether this page contains all zeros */
    bool all_zero;
} PostcopyTmpPage;
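/*
 * Illustrative sketch (an assumption, not part of the original header): the
 * intended lifecycle of a PostcopyTmpPage.  The function, the memcpy-based
 * fill and the page-size parameters are hypothetical; the real code installs
 * the completed page with UFFDIO_COPY before resetting.
 *
 *     static void example_fill_tmp_page(PostcopyTmpPage *tmp,
 *                                       const void *small_page,
 *                                       size_t small_size, size_t huge_size)
 *     {
 *         // Accumulate one small page into the temporary huge-page buffer.
 *         memcpy((char *)tmp->tmp_huge_page + tmp->target_pages * small_size,
 *                small_page, small_size);
 *         tmp->target_pages++;
 *
 *         // Once the huge page is complete, it would be installed at
 *         // tmp->host_addr and the structure reset for reuse.
 *         if (tmp->target_pages == huge_size / small_size) {
 *             postcopy_temp_page_reset(tmp);
 *         }
 *     }
 */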
typedef enum {
    PREEMPT_THREAD_NONE = 0,
    PREEMPT_THREAD_CREATED,
    PREEMPT_THREAD_QUIT,
} PreemptThreadStatus;
/* State for the incoming migration */
struct MigrationIncomingState {
    QEMUFile *from_src_file;
    /* Previously received RAM's RAMBlock pointer */
    RAMBlock *last_recv_block[RAM_CHANNEL_MAX];
    /* A hook to allow cleanup at the end of incoming migration */
    void *transport_data;
    void (*transport_cleanup)(void *data);
    /*
     * Used to sync thread creations.  Note that we can't create threads in
     * parallel with this sem.
     */
    QemuSemaphore thread_sync_sem;
    /*
     * Reset at the start of the main state load, set once the main thread
     * finishes loading state.
     */
    QemuEvent main_thread_load_event;

    /* For network announces */
    AnnounceTimer announce_timer;

    size_t largest_page_size;
    bool have_fault_thread;
    QemuThread fault_thread;
    /* Set this when we want the fault thread to quit */
    bool fault_thread_quit;

    bool have_listen_thread;
    QemuThread listen_thread;

    /* For the kernel to send us notifications */
    int userfault_fd;
    /* To notify the fault_thread to wake, e.g., when we need to quit */
    int userfault_event_fd;

    QEMUFile *to_src_file;
    QemuMutex rp_mutex;    /* We send replies from multiple threads */
    /* RAMBlock of last request sent to source */
    RAMBlock *last_rb;
    /*
     * Number of postcopy channels including the default precopy channel, so
     * vanilla postcopy will only contain one channel which contains both
     * precopy and postcopy streams.
     *
     * This is calculated when the src requests to enable postcopy but before
     * it starts.  Its value can depend on e.g. whether postcopy preemption is
     * enabled.
     */
    unsigned int postcopy_channels;
    /* QEMUFile for postcopy only; it'll be handled by a separate thread */
    QEMUFile *postcopy_qemufile_dst;
    /*
     * When postcopy_qemufile_dst is properly set up, this sem is posted.
     * One can wait on this semaphore to wait until the preempt channel is
     * properly set up.
     */
    QemuSemaphore postcopy_qemufile_dst_done;
    /* Postcopy priority thread is used to receive postcopy requested pages */
    QemuThread postcopy_prio_thread;
    /*
     * Always set by the main vm load thread only, but can be read by the
     * postcopy preempt thread.  "volatile" makes sure all reads will be
     * up to date across cores.
     */
    volatile PreemptThreadStatus preempt_thread_status;
    /*
     * Used to sync between the ram load main thread and the fast ram load
     * thread.  It protects postcopy_qemufile_dst, which is the postcopy
     * fast channel.
     *
     * The ram fast load thread holds it for most of its lifetime because
     * it needs to continuously read data from the channel; it only releases
     * this mutex if postcopy is interrupted, so that the ram load main
     * thread can take the mutex over and properly release the broken
     * channel.
     */
    QemuMutex postcopy_prio_thread_mutex;
    /*
     * An array of temp host huge pages to be used, one for each postcopy
     * channel.
     */
    PostcopyTmpPage *postcopy_tmp_pages;
    /* This is shared for all postcopy channels */
    void *postcopy_tmp_zero_page;
    /* PostCopyFD's for external userfaultfds & handlers of shared memory */
    GArray *postcopy_remote_fds;

    QEMUBH *bh;

    int state;

    bool have_colo_incoming_thread;
    QemuThread colo_incoming_thread;
    /* The coroutine we should enter (back) after failover */
    Coroutine *migration_incoming_co;
    QemuSemaphore colo_incoming_sem;

    /*
     * PostcopyBlocktimeContext to keep information for postcopy
     * live migration, to calculate vCPU block time
     */
    struct PostcopyBlocktimeContext *blocktime_ctx;

    /* notify PAUSED postcopy incoming migrations to try to continue */
    QemuSemaphore postcopy_pause_sem_dst;
    QemuSemaphore postcopy_pause_sem_fault;
    /*
     * This semaphore is used to allow the ram fast load thread (only when
     * postcopy preempt is enabled) to fall asleep when a network
     * interruption is detected.  When the recovery is done, the main load
     * thread will kick the fast ram load thread using this semaphore.
     */
    QemuSemaphore postcopy_pause_sem_fast_load;

    /* List of listening socket addresses */
    SocketAddressList *socket_address_list;

    /* A tree of pages that we requested from the source VM */
    GTree *page_requested;
    /* For debugging purposes only, but would be nice to keep */
    int page_requested_count;
    /*
     * The mutex helps to maintain the requested pages that we sent to the
     * source, IOW, to guarantee coherence between the page_requested tree and
     * the per-ramblock receivedmap.  Note! This does not guarantee consistency
     * of the real page copy procedures (using UFFDIO_[ZERO]COPY).  E.g., even
     * if one bit in receivedmap is cleared, UFFDIO_COPY could have happened
     * for that page already.  This is intended so that holders of the mutex
     * won't be serialized against and blocked by slow operations like
     * UFFDIO_* ioctls.  However, this should be enough to make sure the
     * page_requested tree always contains valid information.
     */
    QemuMutex page_request_mutex;
};

MigrationIncomingState *migration_incoming_get_current(void);
void migration_incoming_state_destroy(void);
void migration_incoming_transport_cleanup(MigrationIncomingState *mis);
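/*
 * Illustrative sketch (an assumption, not part of the original header): how a
 * transport could register its cleanup hook on the incoming state.  The
 * example_transport type and functions are hypothetical.
 *
 *     struct example_transport { int fd; };
 *
 *     static void example_transport_cleanup(void *opaque)
 *     {
 *         struct example_transport *t = opaque;
 *         close(t->fd);
 *         g_free(t);
 *     }
 *
 *     static void example_transport_register(struct example_transport *t)
 *     {
 *         MigrationIncomingState *mis = migration_incoming_get_current();
 *         mis->transport_data = t;
 *         mis->transport_cleanup = example_transport_cleanup;
 *         // migration_incoming_transport_cleanup(mis) would later invoke
 *         // the registered hook at the end of the incoming migration.
 *     }
 */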

/*
 * Functions to work with blocktime context
 */
void fill_destination_postcopy_migration_info(MigrationInfo *info);

#define TYPE_MIGRATION "migration"

typedef struct MigrationClass MigrationClass;
DECLARE_OBJ_CHECKERS(MigrationState, MigrationClass,
                     MIGRATION_OBJ, TYPE_MIGRATION)

struct MigrationClass {
    /*< private >*/
    DeviceClass parent_class;
};
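/*
 * Illustrative note (an assumption, not part of the original header):
 * DECLARE_OBJ_CHECKERS() above generates the usual QOM cast helpers, so code
 * elsewhere can turn a generic Object into a MigrationState roughly like:
 *
 *     static MigrationState *example_to_migration(Object *obj)
 *     {
 *         return MIGRATION_OBJ(obj);   // checked cast generated above
 *     }
 */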

struct MigrationState {
    /*< private >*/
    DeviceState parent_obj;

    /*< public >*/
    QemuThread thread;
    QEMUBH *vm_start_bh;
    QEMUBH *cleanup_bh;
    /* Protected by qemu_file_lock */
    QEMUFile *to_dst_file;
    /* Postcopy specific transfer channel */
    QEMUFile *postcopy_qemufile_src;
    /*
     * It is posted when the preempt channel is established.  Note: this is
     * used for both the start and the recovery of a postcopy migration.
     * We'll post to this sem every time a new preempt channel is created in
     * the main thread, and we keep post() and wait() in pairs.
     */
    QemuSemaphore postcopy_qemufile_src_sem;
    QIOChannelBuffer *bioc;
    /*
     * Protects to_dst_file/from_dst_file pointers.  We need to make sure we
     * won't yield or hang during the critical section, since this lock will
     * be used in the OOB command handler.
     */
    QemuMutex qemu_file_lock;

    /*
     * Used to allow urgent requests to override rate limiting.
     */
    QemuSemaphore rate_limit_sem;

    /* pages already sent at the beginning of the current iteration */
    uint64_t iteration_initial_pages;

    /* pages transferred per second */
    double pages_per_second;

    /* bytes already sent at the beginning of the current iteration */
    uint64_t iteration_initial_bytes;
    /* time at the start of the current iteration */
    int64_t iteration_start_time;
    /*
     * The final stage happens when the remaining data is smaller than
     * this threshold; it's calculated from the requested downtime and
     * measured bandwidth.
     */
    int64_t threshold_size;

    /* params from 'migrate-set-parameters' */
    MigrationParameters parameters;

    int state;

    /* State related to return path */
    struct {
        /* Protected by qemu_file_lock */
        QEMUFile     *from_dst_file;
        QemuThread    rp_thread;
        bool          error;
        /*
         * We could also check for a non-zero rp_thread, but there's no
         * "official" way to do this, so this bool makes it slightly more
         * elegant.  Checking from_dst_file for this is racy because
         * from_dst_file will be cleared in the rp_thread!
         */
        bool          rp_thread_created;
        QemuSemaphore rp_sem;
        /*
         * We post to this when we get a PONG from the destination.  So far
         * it's an easy way to know the main channel has been successfully
         * established on the destination QEMU.
         */
        QemuSemaphore rp_pong_acks;
    } rp_state;

    double mbps;
    /* Timestamp when recent migration starts (ms) */
    int64_t start_time;
    /* Total time used by latest migration (ms) */
    int64_t total_time;
    /* Timestamp when VM is down (ms) to migrate the last stuff */
    int64_t downtime_start;
    int64_t downtime;
    int64_t expected_downtime;
    bool enabled_capabilities[MIGRATION_CAPABILITY__MAX];
    int64_t setup_time;
    /*
     * Whether the guest was running when we entered the completion stage.
     * If migration is interrupted for any reason, we need to keep the
     * guest running on the source.
     */
    bool vm_was_running;

    /* Flag set once the migration has been asked to enter postcopy */
    bool start_postcopy;
    /* Flag set after postcopy has sent the device state */
    bool postcopy_after_devices;

    /* Flag set once the migration thread is running (and needs joining) */
    bool migration_thread_running;

    /* Flag set once the migration thread called bdrv_inactivate_all */
    bool block_inactive;

    /* Migration is waiting for guest to unplug device */
    QemuSemaphore wait_unplug_sem;

    /* Migration is paused due to pause-before-switchover */
    QemuSemaphore pause_sem;

    /* The semaphore is used to notify COLO thread that failover is finished */
    QemuSemaphore colo_exit_sem;

    /* The event is used to notify COLO thread to do checkpoint */
    QemuEvent colo_checkpoint_event;
    int64_t colo_checkpoint_time;
    QEMUTimer *colo_delay_timer;

    /*
     * The first error that has occurred.  We use the mutex to be able to
     * return the first error message.
     */
    Error *error;
    /* mutex to protect errp */
    QemuMutex error_mutex;

    /*
     * Do we have to clean up -b/-i from old migrate parameters?
     * This feature is deprecated and will be removed.
     */
    bool must_remove_block_options;

    /*
     * Global switch on whether we need to store the global state
     * during migration.
     */
    bool store_global_state;

    /* Whether we send QEMU_VM_CONFIGURATION during migration */
    bool send_configuration;
    /* Whether we send section footer during migration */
    bool send_section_footer;

    /* Needed by postcopy-pause state */
    QemuSemaphore postcopy_pause_sem;
    QemuSemaphore postcopy_pause_rp_sem;
    /*
     * Whether we abort the migration if decompression errors are
     * detected at the destination.  It is left at false for qemu
     * older than 3.0, since only newer qemu sends streams that
     * do not trigger spurious decompression errors.
     */
    bool decompress_error_check;

    /*
     * This variable only affects behavior when postcopy preempt mode is
     * enabled.
     *
     * When set:
     *
     * - postcopy preempt src QEMU instance will generate an EOS message at
     *   the end of migration to shut the preempt channel on dest side.
     *
     * - postcopy preempt channel will be created at the setup phase on src
     *   QEMU.
     *
     * When clear:
     *
     * - postcopy preempt src QEMU instance will _not_ generate an EOS
     *   message at the end of migration; the dest QEMU will shut down the
     *   channel itself.
     *
     * - postcopy preempt channel will be created at the switching phase
     *   from precopy -> postcopy (to avoid a race condition of misordered
     *   creation of channels).
     *
     * NOTE: See message-id <ZBoShWArKDPpX/D7@work-vm> on the qemu-devel
     * mailing list for more information on the possible race.  Everyone
     * should probably just keep this value untouched after it is set by the
     * machine type (or the default).
     */
    bool preempt_pre_7_2;

    /*
     * This decides the size of the guest memory chunk that will be used
     * to track dirty bitmap clearing.  The size of a memory chunk will
     * be GUEST_PAGE_SIZE << N.  Say, N=0 means we will clear the dirty
     * bitmap for each page to send (1<<0=1); N=10 means we will clear the
     * dirty bitmap only once for 1<<10=1K continuous guest pages
     * (which is a 4M chunk when the page size is 4K).
     */
    uint8_t clear_bitmap_shift;

    /*
     * This saves the hostname when the outgoing migration starts
     */
    char *hostname;

    /* QEMU_VM_VMDESCRIPTION content filled for all non-iterable devices. */
    JSONWriter *vmdesc;
};
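/*
 * Illustrative sketch (an assumption, not part of the original header): how
 * the per-iteration counters above could feed the throughput fields.  The
 * function and the now_ms/bytes_now/pages_now inputs are hypothetical.
 *
 *     static void example_update_rates(MigrationState *s, int64_t now_ms,
 *                                      uint64_t bytes_now, uint64_t pages_now)
 *     {
 *         double elapsed_s = (now_ms - s->iteration_start_time) / 1000.0;
 *
 *         if (elapsed_s > 0) {
 *             // megabits per second over this iteration
 *             s->mbps = (bytes_now - s->iteration_initial_bytes) * 8.0 /
 *                       elapsed_s / 1e6;
 *             s->pages_per_second =
 *                 (pages_now - s->iteration_initial_pages) / elapsed_s;
 *         }
 *     }
 */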

void migrate_set_state(int *state, int old_state, int new_state);

void migration_fd_process_incoming(QEMUFile *f, Error **errp);
void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp);
void migration_incoming_process(void);

bool migration_has_all_channels(void);

uint64_t migrate_max_downtime(void);

void migrate_set_error(MigrationState *s, const Error *error);
void migrate_fd_error(MigrationState *s, const Error *error);

void migrate_fd_connect(MigrationState *s, Error *error_in);

bool migration_is_setup_or_active(int state);
bool migration_is_running(int state);

void migrate_init(MigrationState *s);
bool migration_is_blocked(Error **errp);
/* True if outgoing migration has entered postcopy phase */
bool migration_in_postcopy(void);
MigrationState *migrate_get_current(void);

bool migrate_postcopy(void);

bool migrate_release_ram(void);
bool migrate_postcopy_ram(void);
bool migrate_zero_blocks(void);
bool migrate_dirty_bitmaps(void);
bool migrate_ignore_shared(void);
bool migrate_validate_uuid(void);

bool migrate_auto_converge(void);
bool migrate_use_multifd(void);
bool migrate_pause_before_switchover(void);
int migrate_multifd_channels(void);
MultiFDCompression migrate_multifd_compression(void);
int migrate_multifd_zlib_level(void);
int migrate_multifd_zstd_level(void);

#ifdef CONFIG_LINUX
bool migrate_use_zero_copy_send(void);
#else
#define migrate_use_zero_copy_send() (false)
#endif
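/*
 * Illustrative sketch (an assumption, not part of the original header):
 * callers can test migrate_use_zero_copy_send() unconditionally, since the
 * macro above folds it to false on non-Linux builds at compile time.  The
 * flag value and function below are placeholders for the example only.
 *
 *     #define EXAMPLE_FLAG_ZERO_COPY 1   // placeholder flag for illustration
 *
 *     static int example_pick_write_flags(void)
 *     {
 *         return migrate_use_zero_copy_send() ? EXAMPLE_FLAG_ZERO_COPY : 0;
 *     }
 */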
int migrate_use_tls(void);
int migrate_use_xbzrle(void);
uint64_t migrate_xbzrle_cache_size(void);
bool migrate_colo_enabled(void);

bool migrate_use_block(void);
bool migrate_use_block_incremental(void);
int migrate_max_cpu_throttle(void);
bool migrate_use_return_path(void);

uint64_t ram_get_total_transferred_pages(void);

bool migrate_use_compression(void);
int migrate_compress_level(void);
int migrate_compress_threads(void);
int migrate_compress_wait_thread(void);
int migrate_decompress_threads(void);
bool migrate_use_events(void);
bool migrate_postcopy_blocktime(void);
bool migrate_background_snapshot(void);
bool migrate_postcopy_preempt(void);

/* Sending on the return path - generic and then for each message type */
void migrate_send_rp_shut(MigrationIncomingState *mis,
                          uint32_t value);
void migrate_send_rp_pong(MigrationIncomingState *mis,
                          uint32_t value);
int migrate_send_rp_req_pages(MigrationIncomingState *mis, RAMBlock *rb,
                              ram_addr_t start, uint64_t haddr);
int migrate_send_rp_message_req_pages(MigrationIncomingState *mis,
                                      RAMBlock *rb, ram_addr_t start);
void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis,
                                 char *block_name);
void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value);
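/*
 * Illustrative sketch (an assumption, not part of the original header):
 * during postcopy recovery the destination acknowledges the resumed channel;
 * roughly, the helpers above combine like this (function name hypothetical).
 *
 *     static void example_ack_resume(MigrationIncomingState *mis)
 *     {
 *         // Tell the source the destination is ready to continue.
 *         migrate_send_rp_resume_ack(mis, MIGRATION_RESUME_ACK_VALUE);
 *     }
 */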

void dirty_bitmap_mig_before_vm_start(void);
void dirty_bitmap_mig_cancel_outgoing(void);
void dirty_bitmap_mig_cancel_incoming(void);
bool check_dirty_bitmap_mig_alias_map(const BitmapMigrationNodeAliasList *bbm,
                                      Error **errp);

void migrate_add_address(SocketAddress *address);
int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque);

#define qemu_ram_foreach_block \
  #warning "Use foreach_not_ignored_block in migration code"

void migration_make_urgent_request(void);
void migration_consume_urgent_request(void);
bool migration_rate_limit(void);
void migration_cancel(const Error *error);

void populate_vfio_info(MigrationInfo *info);
void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page);

#endif