/*
 * QEMU live migration
 *
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

  13. #ifndef QEMU_MIGRATION_H
  14. #define QEMU_MIGRATION_H
  15. #include "exec/cpu-common.h"
  16. #include "hw/qdev-core.h"
  17. #include "qapi/qapi-types-migration.h"
  18. #include "qobject/json-writer.h"
  19. #include "qemu/thread.h"
  20. #include "qemu/coroutine.h"
  21. #include "io/channel.h"
  22. #include "io/channel-buffer.h"
  23. #include "net/announce.h"
  24. #include "qom/object.h"
  25. #include "postcopy-ram.h"
  26. #include "system/runstate.h"
  27. #include "migration/misc.h"
  28. #define MIGRATION_THREAD_SNAPSHOT "mig/snapshot"
  29. #define MIGRATION_THREAD_DIRTY_RATE "mig/dirtyrate"
  30. #define MIGRATION_THREAD_SRC_MAIN "mig/src/main"
  31. #define MIGRATION_THREAD_SRC_MULTIFD "mig/src/send_%d"
  32. #define MIGRATION_THREAD_SRC_RETURN "mig/src/return"
  33. #define MIGRATION_THREAD_SRC_TLS "mig/src/tls"
  34. #define MIGRATION_THREAD_DST_COLO "mig/dst/colo"
  35. #define MIGRATION_THREAD_DST_MULTIFD "mig/dst/recv_%d"
  36. #define MIGRATION_THREAD_DST_FAULT "mig/dst/fault"
  37. #define MIGRATION_THREAD_DST_LISTEN "mig/dst/listen"
  38. #define MIGRATION_THREAD_DST_PREEMPT "mig/dst/preempt"
  39. struct PostcopyBlocktimeContext;
  40. #define MIGRATION_RESUME_ACK_VALUE (1)
  41. /*
  42. * 1<<6=64 pages -> 256K chunk when page size is 4K. This gives us
  43. * the benefit that all the chunks are 64 pages aligned then the
  44. * bitmaps are always aligned to LONG.
  45. */
  46. #define CLEAR_BITMAP_SHIFT_MIN 6
  47. /*
  48. * 1<<18=256K pages -> 1G chunk when page size is 4K. This is the
  49. * default value to use if no one specified.
  50. */
  51. #define CLEAR_BITMAP_SHIFT_DEFAULT 18
  52. /*
  53. * 1<<31=2G pages -> 8T chunk when page size is 4K. This should be
  54. * big enough and make sure we won't overflow easily.
  55. */
  56. #define CLEAR_BITMAP_SHIFT_MAX 31
  57. /* This is an abstraction of a "temp huge page" for postcopy's purpose */
  58. typedef struct {
  59. /*
  60. * This points to a temporary huge page as a buffer for UFFDIO_COPY. It's
  61. * mmap()ed and needs to be freed when cleanup.
  62. */
  63. void *tmp_huge_page;
  64. /*
  65. * This points to the host page we're going to install for this temp page.
  66. * It tells us after we've received the whole page, where we should put it.
  67. */
  68. void *host_addr;
  69. /* Number of small pages copied (in size of TARGET_PAGE_SIZE) */
  70. unsigned int target_pages;
  71. /* Whether this page contains all zeros */
  72. bool all_zero;
  73. } PostcopyTmpPage;
  74. typedef enum {
  75. PREEMPT_THREAD_NONE = 0,
  76. PREEMPT_THREAD_CREATED,
  77. PREEMPT_THREAD_QUIT,
  78. } PreemptThreadStatus;
  79. /* State for the incoming migration */
  80. struct MigrationIncomingState {
  81. QEMUFile *from_src_file;
  82. /* Previously received RAM's RAMBlock pointer */
  83. RAMBlock *last_recv_block[RAM_CHANNEL_MAX];
  84. /* A hook to allow cleanup at the end of incoming migration */
  85. void *transport_data;
  86. void (*transport_cleanup)(void *data);
  87. /*
  88. * Used to sync thread creations. Note that we can't create threads in
  89. * parallel with this sem.
  90. */
  91. QemuSemaphore thread_sync_sem;
  92. /*
  93. * Free at the start of the main state load, set as the main thread finishes
  94. * loading state.
  95. */
  96. QemuEvent main_thread_load_event;
  97. /* For network announces */
  98. AnnounceTimer announce_timer;
  99. size_t largest_page_size;
  100. bool have_fault_thread;
  101. QemuThread fault_thread;
  102. /* Set this when we want the fault thread to quit */
  103. bool fault_thread_quit;
  104. bool have_listen_thread;
  105. QemuThread listen_thread;
  106. /* For the kernel to send us notifications */
  107. int userfault_fd;
  108. /* To notify the fault_thread to wake, e.g., when need to quit */
  109. int userfault_event_fd;
  110. QEMUFile *to_src_file;
  111. QemuMutex rp_mutex; /* We send replies from multiple threads */
  112. /* RAMBlock of last request sent to source */
  113. RAMBlock *last_rb;
  114. /*
  115. * Number of postcopy channels including the default precopy channel, so
  116. * vanilla postcopy will only contain one channel which contain both
  117. * precopy and postcopy streams.
  118. *
  119. * This is calculated when the src requests to enable postcopy but before
  120. * it starts. Its value can depend on e.g. whether postcopy preemption is
  121. * enabled.
  122. */
  123. unsigned int postcopy_channels;
  124. /* QEMUFile for postcopy only; it'll be handled by a separate thread */
  125. QEMUFile *postcopy_qemufile_dst;
  126. /*
  127. * When postcopy_qemufile_dst is properly setup, this sem is posted.
  128. * One can wait on this semaphore to wait until the preempt channel is
  129. * properly setup.
  130. */
  131. QemuSemaphore postcopy_qemufile_dst_done;
  132. /* Postcopy priority thread is used to receive postcopy requested pages */
  133. QemuThread postcopy_prio_thread;
  134. /*
  135. * Always set by the main vm load thread only, but can be read by the
  136. * postcopy preempt thread. "volatile" makes sure all reads will be
  137. * up-to-date across cores.
  138. */
  139. volatile PreemptThreadStatus preempt_thread_status;
  140. /*
  141. * Used to sync between the ram load main thread and the fast ram load
  142. * thread. It protects postcopy_qemufile_dst, which is the postcopy
  143. * fast channel.
  144. *
  145. * The ram fast load thread will take it mostly for the whole lifecycle
  146. * because it needs to continuously read data from the channel, and
  147. * it'll only release this mutex if postcopy is interrupted, so that
  148. * the ram load main thread will take this mutex over and properly
  149. * release the broken channel.
  150. */
  151. QemuMutex postcopy_prio_thread_mutex;
  152. /*
  153. * An array of temp host huge pages to be used, one for each postcopy
  154. * channel.
  155. */
  156. PostcopyTmpPage *postcopy_tmp_pages;
  157. /* This is shared for all postcopy channels */
  158. void *postcopy_tmp_zero_page;
  159. /* PostCopyFD's for external userfaultfds & handlers of shared memory */
  160. GArray *postcopy_remote_fds;
  161. MigrationStatus state;
  162. /*
  163. * The incoming migration coroutine, non-NULL during qemu_loadvm_state().
  164. * Used to wake the migration incoming coroutine from rdma code. How much is
  165. * it safe - it's a question.
  166. */
  167. Coroutine *loadvm_co;
  168. /* The coroutine we should enter (back) after failover */
  169. Coroutine *colo_incoming_co;
  170. QemuSemaphore colo_incoming_sem;
  171. /*
  172. * PostcopyBlocktimeContext to keep information for postcopy
  173. * live migration, to calculate vCPU block time
  174. * */
  175. struct PostcopyBlocktimeContext *blocktime_ctx;
  176. /* notify PAUSED postcopy incoming migrations to try to continue */
  177. QemuSemaphore postcopy_pause_sem_dst;
  178. QemuSemaphore postcopy_pause_sem_fault;
  179. /*
  180. * This semaphore is used to allow the ram fast load thread (only when
  181. * postcopy preempt is enabled) fall into sleep when there's network
  182. * interruption detected. When the recovery is done, the main load
  183. * thread will kick the fast ram load thread using this semaphore.
  184. */
  185. QemuSemaphore postcopy_pause_sem_fast_load;
  186. /* List of listening socket addresses */
  187. SocketAddressList *socket_address_list;
  188. /* A tree of pages that we requested to the source VM */
  189. GTree *page_requested;
  190. /*
  191. * For postcopy only, count the number of requested page faults that
  192. * still haven't been resolved.
  193. */
  194. int page_requested_count;
  195. /*
  196. * The mutex helps to maintain the requested pages that we sent to the
  197. * source, IOW, to guarantee coherent between the page_requests tree and
  198. * the per-ramblock receivedmap. Note! This does not guarantee consistency
  199. * of the real page copy procedures (using UFFDIO_[ZERO]COPY). E.g., even
  200. * if one bit in receivedmap is cleared, UFFDIO_COPY could have happened
  201. * for that page already. This is intended so that the mutex won't
  202. * serialize and blocked by slow operations like UFFDIO_* ioctls. However
  203. * this should be enough to make sure the page_requested tree always
  204. * contains valid information.
  205. */
  206. QemuMutex page_request_mutex;
  207. /*
  208. * If postcopy preempt is enabled, there is a chance that the main
  209. * thread finished loading its data before the preempt channel has
  210. * finished loading the urgent pages. If that happens, the two threads
  211. * will use this condvar to synchronize, so the main thread will always
  212. * wait until all pages received.
  213. */
  214. QemuCond page_request_cond;
  215. /*
  216. * Number of devices that have yet to approve switchover. When this reaches
  217. * zero an ACK that it's OK to do switchover is sent to the source. No lock
  218. * is needed as this field is updated serially.
  219. */
  220. unsigned int switchover_ack_pending_num;
  221. /* Do exit on incoming migration failure */
  222. bool exit_on_error;
  223. };
  224. MigrationIncomingState *migration_incoming_get_current(void);
  225. void migration_incoming_state_destroy(void);
  226. void migration_incoming_transport_cleanup(MigrationIncomingState *mis);
  227. /*
  228. * Functions to work with blocktime context
  229. */
  230. void fill_destination_postcopy_migration_info(MigrationInfo *info);
  231. #define TYPE_MIGRATION "migration"
  232. typedef struct MigrationClass MigrationClass;
  233. DECLARE_OBJ_CHECKERS(MigrationState, MigrationClass,
  234. MIGRATION_OBJ, TYPE_MIGRATION)
  235. struct MigrationClass {
  236. /*< private >*/
  237. DeviceClass parent_class;
  238. };
  239. struct MigrationState {
  240. /*< private >*/
  241. DeviceState parent_obj;
  242. /*< public >*/
  243. QemuThread thread;
  244. /* Protected by qemu_file_lock */
  245. QEMUFile *to_dst_file;
  246. /* Postcopy specific transfer channel */
  247. QEMUFile *postcopy_qemufile_src;
  248. /*
  249. * It is posted when the preempt channel is established. Note: this is
  250. * used for both the start or recover of a postcopy migration. We'll
  251. * post to this sem every time a new preempt channel is created in the
  252. * main thread, and we keep post() and wait() in pair.
  253. */
  254. QemuSemaphore postcopy_qemufile_src_sem;
  255. QIOChannelBuffer *bioc;
  256. /*
  257. * Protects to_dst_file/from_dst_file pointers. We need to make sure we
  258. * won't yield or hang during the critical section, since this lock will be
  259. * used in OOB command handler.
  260. */
  261. QemuMutex qemu_file_lock;
  262. /*
  263. * Used to allow urgent requests to override rate limiting.
  264. */
  265. QemuSemaphore rate_limit_sem;
  266. /* pages already send at the beginning of current iteration */
  267. uint64_t iteration_initial_pages;
  268. /* pages transferred per second */
  269. double pages_per_second;
  270. /* bytes already send at the beginning of current iteration */
  271. uint64_t iteration_initial_bytes;
  272. /* time at the start of current iteration */
  273. int64_t iteration_start_time;
  274. /*
  275. * The final stage happens when the remaining data is smaller than
  276. * this threshold; it's calculated from the requested downtime and
  277. * measured bandwidth, or avail-switchover-bandwidth if specified.
  278. */
  279. uint64_t threshold_size;
  280. /* params from 'migrate-set-parameters' */
  281. MigrationParameters parameters;
  282. MigrationStatus state;
  283. /* State related to return path */
  284. struct {
  285. /* Protected by qemu_file_lock */
  286. QEMUFile *from_dst_file;
  287. QemuThread rp_thread;
  288. /*
  289. * We can also check non-zero of rp_thread, but there's no "official"
  290. * way to do this, so this bool makes it slightly more elegant.
  291. * Checking from_dst_file for this is racy because from_dst_file will
  292. * be cleared in the rp_thread!
  293. */
  294. bool rp_thread_created;
  295. /*
  296. * Used to synchronize between migration main thread and return
  297. * path thread. The migration thread can wait() on this sem, while
  298. * other threads (e.g., return path thread) can kick it using a
  299. * post().
  300. */
  301. QemuSemaphore rp_sem;
  302. /*
  303. * We post to this when we got one PONG from dest. So far it's an
  304. * easy way to know the main channel has successfully established
  305. * on dest QEMU.
  306. */
  307. QemuSemaphore rp_pong_acks;
  308. } rp_state;
  309. double mbps;
  310. /* Timestamp when recent migration starts (ms) */
  311. int64_t start_time;
  312. /* Total time used by latest migration (ms) */
  313. int64_t total_time;
  314. /* Timestamp when VM is down (ms) to migrate the last stuff */
  315. int64_t downtime_start;
  316. int64_t downtime;
  317. int64_t expected_downtime;
  318. bool capabilities[MIGRATION_CAPABILITY__MAX];
  319. int64_t setup_time;
  320. /*
  321. * State before stopping the vm by vm_stop_force_state().
  322. * If migration is interrupted by any reason, we need to continue
  323. * running the guest on source if it was running or restore its stopped
  324. * state.
  325. */
  326. RunState vm_old_state;
  327. /* Flag set once the migration has been asked to enter postcopy */
  328. bool start_postcopy;
  329. /* Flag set once the migration thread is running (and needs joining) */
  330. bool migration_thread_running;
  331. /* Migration is waiting for guest to unplug device */
  332. QemuSemaphore wait_unplug_sem;
  333. /* Migration is paused due to pause-before-switchover */
  334. QemuSemaphore pause_sem;
  335. /* The semaphore is used to notify COLO thread that failover is finished */
  336. QemuSemaphore colo_exit_sem;
  337. /* The event is used to notify COLO thread to do checkpoint */
  338. QemuEvent colo_checkpoint_event;
  339. int64_t colo_checkpoint_time;
  340. QEMUTimer *colo_delay_timer;
  341. /* The first error that has occurred.
  342. We used the mutex to be able to return the 1st error message */
  343. Error *error;
  344. /* mutex to protect errp */
  345. QemuMutex error_mutex;
  346. /*
  347. * Global switch on whether we need to store the global state
  348. * during migration.
  349. */
  350. bool store_global_state;
  351. /* Whether we send QEMU_VM_CONFIGURATION during migration */
  352. bool send_configuration;
  353. /* Whether we send section footer during migration */
  354. bool send_section_footer;
  355. /* Needed by postcopy-pause state */
  356. QemuSemaphore postcopy_pause_sem;
  357. /*
  358. * This variable only affects behavior when postcopy preempt mode is
  359. * enabled.
  360. *
  361. * When set:
  362. *
  363. * - postcopy preempt src QEMU instance will generate an EOS message at
  364. * the end of migration to shut the preempt channel on dest side.
  365. *
  366. * - postcopy preempt channel will be created at the setup phase on src
  367. QEMU.
  368. *
  369. * When clear:
  370. *
  371. * - postcopy preempt src QEMU instance will _not_ generate an EOS
  372. * message at the end of migration, the dest qemu will shutdown the
  373. * channel itself.
  374. *
  375. * - postcopy preempt channel will be created at the switching phase
  376. * from precopy -> postcopy (to avoid race condition of misordered
  377. * creation of channels).
  378. *
  379. * NOTE: See message-id <ZBoShWArKDPpX/D7@work-vm> on qemu-devel
  380. * mailing list for more information on the possible race. Everyone
  381. * should probably just keep this value untouched after set by the
  382. * machine type (or the default).
  383. */
  384. bool preempt_pre_7_2;
  385. /*
  386. * flush every channel after each section sent.
  387. *
  388. * This assures that we can't mix pages from one iteration through
  389. * ram pages with pages for the following iteration. We really
  390. * only need to do this flush after we have go through all the
  391. * dirty pages. For historical reasons, we do that after each
  392. * section. This is suboptimal (we flush too many times).
  393. * Default value is false. (since 8.1)
  394. */
  395. bool multifd_flush_after_each_section;
  396. /*
  397. * This decides the size of guest memory chunk that will be used
  398. * to track dirty bitmap clearing. The size of memory chunk will
  399. * be GUEST_PAGE_SIZE << N. Say, N=0 means we will clear dirty
  400. * bitmap for each page to send (1<<0=1); N=10 means we will clear
  401. * dirty bitmap only once for 1<<10=1K continuous guest pages
  402. * (which is in 4M chunk).
  403. */
  404. uint8_t clear_bitmap_shift;
  405. /*
  406. * This save hostname when out-going migration starts
  407. */
  408. char *hostname;
  409. /* QEMU_VM_VMDESCRIPTION content filled for all non-iterable devices. */
  410. JSONWriter *vmdesc;
  411. /*
  412. * Indicates whether an ACK from the destination that it's OK to do
  413. * switchover has been received.
  414. */
  415. bool switchover_acked;
  416. /* Is this a rdma migration */
  417. bool rdma_migration;
  418. GSource *hup_source;
  419. };
  420. void migrate_set_state(MigrationStatus *state, MigrationStatus old_state,
  421. MigrationStatus new_state);
  422. void migration_fd_process_incoming(QEMUFile *f);
  423. void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp);
  424. void migration_incoming_process(void);
  425. bool migration_has_all_channels(void);
  426. void migrate_set_error(MigrationState *s, const Error *error);
  427. bool migrate_has_error(MigrationState *s);
  428. void migrate_fd_connect(MigrationState *s, Error *error_in);
  429. int migration_call_notifiers(MigrationState *s, MigrationEventType type,
  430. Error **errp);
  431. int migrate_init(MigrationState *s, Error **errp);
  432. bool migration_is_blocked(Error **errp);
  433. /* True if outgoing migration has entered postcopy phase */
  434. bool migration_in_postcopy(void);
  435. bool migration_postcopy_is_alive(MigrationStatus state);
  436. MigrationState *migrate_get_current(void);
  437. bool migration_has_failed(MigrationState *);
  438. bool migrate_mode_is_cpr(MigrationState *);
  439. uint64_t ram_get_total_transferred_pages(void);
  440. /* Sending on the return path - generic and then for each message type */
  441. void migrate_send_rp_shut(MigrationIncomingState *mis,
  442. uint32_t value);
  443. void migrate_send_rp_pong(MigrationIncomingState *mis,
  444. uint32_t value);
  445. int migrate_send_rp_req_pages(MigrationIncomingState *mis, RAMBlock *rb,
  446. ram_addr_t start, uint64_t haddr);
  447. int migrate_send_rp_message_req_pages(MigrationIncomingState *mis,
  448. RAMBlock *rb, ram_addr_t start);
  449. void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis,
  450. char *block_name);
  451. void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value);
  452. int migrate_send_rp_switchover_ack(MigrationIncomingState *mis);
  453. void dirty_bitmap_mig_before_vm_start(void);
  454. void dirty_bitmap_mig_cancel_outgoing(void);
  455. void dirty_bitmap_mig_cancel_incoming(void);
  456. bool check_dirty_bitmap_mig_alias_map(const BitmapMigrationNodeAliasList *bbm,
  457. Error **errp);
  458. void migrate_add_address(SocketAddress *address);
  459. int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque);
  460. #define qemu_ram_foreach_block \
  461. #warning "Use foreach_not_ignored_block in migration code"
  462. void migration_make_urgent_request(void);
  463. void migration_consume_urgent_request(void);
  464. bool migration_rate_limit(void);
  465. void migration_bh_schedule(QEMUBHFunc *cb, void *opaque);
  466. void migration_cancel(const Error *error);
  467. void migration_populate_vfio_info(MigrationInfo *info);
  468. void migration_reset_vfio_bytes_transferred(void);
  469. void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page);
  470. /*
  471. * Migration thread waiting for return path thread. Return non-zero if an
  472. * error is detected.
  473. */
  474. int migration_rp_wait(MigrationState *s);
  475. /*
  476. * Kick the migration thread waiting for return path messages. NOTE: the
  477. * name can be slightly confusing (when read as "kick the rp thread"), just
  478. * to remember the target is always the migration thread.
  479. */
  480. void migration_rp_kick(MigrationState *s);
  481. void migration_bitmap_sync_precopy(bool last_stage);
  482. /* migration/block-dirty-bitmap.c */
  483. void dirty_bitmap_mig_init(void);
  484. bool should_send_vmdesc(void);
  485. /* migration/block-active.c */
  486. void migration_block_active_setup(bool active);
  487. #endif