block-migration.c

/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour   <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu-common.h"
#include "block/block_int.h"
#include "hw/hw.h"
#include "qemu/queue.h"
#include "qemu/timer.h"
#include "migration/block.h"
#include "migration/migration.h"
#include "sysemu/blockdev.h"
#include <assert.h>

#define BLOCK_SIZE                      (1 << 20)
#define BDRV_SECTORS_PER_DIRTY_CHUNK    (BLOCK_SIZE >> BDRV_SECTOR_BITS)

#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
#define BLK_MIG_FLAG_EOS                0x02
#define BLK_MIG_FLAG_PROGRESS           0x04
#define BLK_MIG_FLAG_ZERO_BLOCK         0x08

#define MAX_IS_ALLOCATED_SEARCH 65536

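/* Every record in the block-migration stream begins with a 64-bit value
 * whose low bits (below BDRV_SECTOR_BITS) hold the flags above and whose
 * high bits hold the sector offset (or, for PROGRESS records, the
 * completed percentage).
 */
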
//#define DEBUG_BLK_MIGRATION

#ifdef DEBUG_BLK_MIGRATION
#define DPRINTF(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

typedef struct BlkMigDevState {
    /* Written during setup phase.  Can be read without a lock.  */
    BlockDriverState *bs;
    int shared_base;
    int64_t total_sectors;
    QSIMPLEQ_ENTRY(BlkMigDevState) entry;

    /* Only used by migration thread.  Does not need a lock.  */
    int bulk_completed;
    int64_t cur_sector;
    int64_t cur_dirty;

    /* Protected by block migration lock.  */
    unsigned long *aio_bitmap;
    int64_t completed_sectors;
    BdrvDirtyBitmap *dirty_bitmap;
} BlkMigDevState;

typedef struct BlkMigBlock {
    /* Only used by migration thread.  */
    uint8_t *buf;
    BlkMigDevState *bmds;
    int64_t sector;
    int nr_sectors;
    struct iovec iov;
    QEMUIOVector qiov;
    BlockDriverAIOCB *aiocb;

    /* Protected by block migration lock.  */
    int ret;
    QSIMPLEQ_ENTRY(BlkMigBlock) entry;
} BlkMigBlock;

typedef struct BlkMigState {
    /* Written during setup phase.  Can be read without a lock.  */
    int blk_enable;
    int shared_base;
    QSIMPLEQ_HEAD(bmds_list, BlkMigDevState) bmds_list;
    int64_t total_sector_sum;
    bool zero_blocks;

    /* Protected by lock.  */
    QSIMPLEQ_HEAD(blk_list, BlkMigBlock) blk_list;
    int submitted;
    int read_done;

    /* Only used by migration thread.  Does not need a lock.  */
    int transferred;
    int prev_progress;
    int bulk_completed;

    /* Lock must be taken _inside_ the iothread lock.  */
    QemuMutex lock;
} BlkMigState;

static BlkMigState block_mig_state;

static void blk_mig_lock(void)
{
    qemu_mutex_lock(&block_mig_state.lock);
}

static void blk_mig_unlock(void)
{
    qemu_mutex_unlock(&block_mig_state.lock);
}

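/* Serialize one BlkMigBlock: the 64-bit header, a length-prefixed device
 * name, then the BLOCK_SIZE payload.  All-zero buffers are sent as a
 * header-only BLK_MIG_FLAG_ZERO_BLOCK record when the zero-blocks
 * capability is enabled.
 */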
/* Must run outside of the iothread lock during the bulk phase,
 * or the VM will stall.
 */
static void blk_send(QEMUFile *f, BlkMigBlock * blk)
{
    int len;
    uint64_t flags = BLK_MIG_FLAG_DEVICE_BLOCK;

    if (block_mig_state.zero_blocks &&
        buffer_is_zero(blk->buf, BLOCK_SIZE)) {
        flags |= BLK_MIG_FLAG_ZERO_BLOCK;
    }

    /* sector number and flags */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | flags);

    /* device name */
    len = strlen(blk->bmds->bs->device_name);
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *)blk->bmds->bs->device_name, len);

    /* if a block is zero we need to flush here since the network
     * bandwidth is now a lot higher than the storage device bandwidth.
     * thus if we queue zero blocks we slow down the migration */
    if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
        qemu_fflush(f);
        return;
    }

    qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
}

int blk_mig_active(void)
{
    return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list);
}

uint64_t blk_mig_bytes_transferred(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    blk_mig_lock();
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->completed_sectors;
    }
    blk_mig_unlock();
    return sum << BDRV_SECTOR_BITS;
}

uint64_t blk_mig_bytes_remaining(void)
{
    return blk_mig_bytes_total() - blk_mig_bytes_transferred();
}

uint64_t blk_mig_bytes_total(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->total_sectors;
    }
    return sum << BDRV_SECTOR_BITS;
}

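/* The aio_bitmap holds one bit per BDRV_SECTORS_PER_DIRTY_CHUNK-sized
 * chunk; a set bit means an asynchronous read for that chunk is still in
 * flight, so the dirty-block pass must not resubmit it.
 */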
/* Called with migration lock held.  */
static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if ((sector << BDRV_SECTOR_BITS) < bdrv_getlength(bmds->bs)) {
        return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}

/* Called with migration lock held.  */
static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
                                  int nb_sectors, int set)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bmds->aio_bitmap[idx];
        if (set) {
            val |= 1UL << bit;
        } else {
            val &= ~(1UL << bit);
        }
        bmds->aio_bitmap[idx] = val;
    }
}

static void alloc_aio_bitmap(BlkMigDevState *bmds)
{
    BlockDriverState *bs = bmds->bs;
    int64_t bitmap_size;

    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
            BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;

    bmds->aio_bitmap = g_malloc0(bitmap_size);
}

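/* AIO completion callback: record the read result, queue the block on
 * blk_list so flush_blks() can send it, and clear its in-flight bit.
 */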
/* Never hold migration lock when yielding to the main loop!  */
static void blk_mig_read_cb(void *opaque, int ret)
{
    BlkMigBlock *blk = opaque;

    blk_mig_lock();
    blk->ret = ret;

    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
    bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);

    block_mig_state.submitted--;
    block_mig_state.read_done++;
    assert(block_mig_state.submitted >= 0);
    blk_mig_unlock();
}

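/* Submit an asynchronous read for the next bulk chunk of @bmds, skipping
 * unallocated regions when migrating on top of a shared base image.
 * Returns 1 once the device's bulk phase is finished, 0 otherwise.
 */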
/* Called with no lock taken.  */
static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
{
    int64_t total_sectors = bmds->total_sectors;
    int64_t cur_sector = bmds->cur_sector;
    BlockDriverState *bs = bmds->bs;
    BlkMigBlock *blk;
    int nr_sectors;

    if (bmds->shared_base) {
        qemu_mutex_lock_iothread();
        while (cur_sector < total_sectors &&
               !bdrv_is_allocated(bs, cur_sector, MAX_IS_ALLOCATED_SEARCH,
                                  &nr_sectors)) {
            cur_sector += nr_sectors;
        }
        qemu_mutex_unlock_iothread();
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = bmds->completed_sectors = total_sectors;
        return 1;
    }

    bmds->completed_sectors = cur_sector;

    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = total_sectors - cur_sector;
    }

    blk = g_malloc(sizeof(BlkMigBlock));
    blk->buf = g_malloc(BLOCK_SIZE);
    blk->bmds = bmds;
    blk->sector = cur_sector;
    blk->nr_sectors = nr_sectors;

    blk->iov.iov_base = blk->buf;
    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

    blk_mig_lock();
    block_mig_state.submitted++;
    blk_mig_unlock();

    qemu_mutex_lock_iothread();
    blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
                                nr_sectors, blk_mig_read_cb, blk);

    bdrv_reset_dirty(bs, cur_sector, nr_sectors);
    qemu_mutex_unlock_iothread();

    bmds->cur_sector = cur_sector + nr_sectors;
    return (bmds->cur_sector >= total_sectors);
}

/* Called with iothread lock taken.  */
static void set_dirty_tracking(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->dirty_bitmap = bdrv_create_dirty_bitmap(bmds->bs, BLOCK_SIZE);
    }
}

static void unset_dirty_tracking(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap);
    }
}

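/* bdrv_iterate() callback: register every writable block device of
 * non-zero length for migration and take a reference on it.
 */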
static void init_blk_migration_it(void *opaque, BlockDriverState *bs)
{
    BlkMigDevState *bmds;
    int64_t sectors;

    if (!bdrv_is_read_only(bs)) {
        sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
        if (sectors <= 0) {
            return;
        }

        bmds = g_malloc0(sizeof(BlkMigDevState));
        bmds->bs = bs;
        bmds->bulk_completed = 0;
        bmds->total_sectors = sectors;
        bmds->completed_sectors = 0;
        bmds->shared_base = block_mig_state.shared_base;
        alloc_aio_bitmap(bmds);
        bdrv_set_in_use(bs, 1);
        bdrv_ref(bs);

        block_mig_state.total_sector_sum += sectors;

        if (bmds->shared_base) {
            DPRINTF("Start migration for %s with shared base image\n",
                    bs->device_name);
        } else {
            DPRINTF("Start full migration for %s\n", bs->device_name);
        }

        QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
    }
}

static void init_blk_migration(QEMUFile *f)
{
    block_mig_state.submitted = 0;
    block_mig_state.read_done = 0;
    block_mig_state.transferred = 0;
    block_mig_state.total_sector_sum = 0;
    block_mig_state.prev_progress = -1;
    block_mig_state.bulk_completed = 0;
    block_mig_state.zero_blocks = migrate_zero_blocks();

    bdrv_iterate(init_blk_migration_it, NULL);
}

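/* Advance the bulk phase by one chunk on the first device that has not
 * finished it yet, and emit a BLK_MIG_FLAG_PROGRESS record whenever the
 * overall percentage changes.  Returns 0 once every device has completed
 * its bulk phase, 1 while bulk work remains.
 */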
/* Called with no lock taken.  */
static int blk_mig_save_bulked_block(QEMUFile *f)
{
    int64_t completed_sector_sum = 0;
    BlkMigDevState *bmds;
    int progress;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->bulk_completed == 0) {
            if (mig_save_device_bulk(f, bmds) == 1) {
                /* completed bulk section for this device */
                bmds->bulk_completed = 1;
            }
            completed_sector_sum += bmds->completed_sectors;
            ret = 1;
            break;
        } else {
            completed_sector_sum += bmds->completed_sectors;
        }
    }

    if (block_mig_state.total_sector_sum != 0) {
        progress = completed_sector_sum * 100 /
                   block_mig_state.total_sector_sum;
    } else {
        progress = 100;
    }
    if (progress != block_mig_state.prev_progress) {
        block_mig_state.prev_progress = progress;
        qemu_put_be64(f, (progress << BDRV_SECTOR_BITS)
                         | BLK_MIG_FLAG_PROGRESS);
        DPRINTF("Completed %d %%\r", progress);
    }

    return ret;
}

static void blk_mig_reset_dirty_cursor(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->cur_dirty = 0;
    }
}

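/* Scan the dirty bitmap from bmds->cur_dirty and transfer at most one
 * dirty chunk: read it asynchronously when @is_async is set (iterate
 * phase), or synchronously and send it inline otherwise (complete phase).
 * Returns 1 when the cursor has passed the end of the device, 0 if a
 * dirty chunk was found, or a negative value on a failed synchronous read.
 */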
/* Called with iothread lock taken.  */
static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
                                 int is_async)
{
    BlkMigBlock *blk;
    int64_t total_sectors = bmds->total_sectors;
    int64_t sector;
    int nr_sectors;
    int ret = -EIO;

    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
        blk_mig_lock();
        if (bmds_aio_inflight(bmds, sector)) {
            blk_mig_unlock();
            bdrv_drain_all();
        } else {
            blk_mig_unlock();
        }
        if (bdrv_get_dirty(bmds->bs, bmds->dirty_bitmap, sector)) {

            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - sector;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }
            blk = g_malloc(sizeof(BlkMigBlock));
            blk->buf = g_malloc(BLOCK_SIZE);
            blk->bmds = bmds;
            blk->sector = sector;
            blk->nr_sectors = nr_sectors;

            if (is_async) {
                blk->iov.iov_base = blk->buf;
                blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
                qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

                blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
                                            nr_sectors, blk_mig_read_cb, blk);

                blk_mig_lock();
                block_mig_state.submitted++;
                bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
                blk_mig_unlock();
            } else {
                ret = bdrv_read(bmds->bs, sector, blk->buf, nr_sectors);
                if (ret < 0) {
                    goto error;
                }
                blk_send(f, blk);

                g_free(blk->buf);
                g_free(blk);
            }

            bdrv_reset_dirty(bmds->bs, sector, nr_sectors);
            break;
        }
        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        bmds->cur_dirty = sector;
    }

    return (bmds->cur_dirty >= bmds->total_sectors);

error:
    DPRINTF("Error reading sector %" PRId64 "\n", sector);
    g_free(blk->buf);
    g_free(blk);
    return ret;
}

/* Called with iothread lock taken.
 *
 * return value:
 * 0: too much data for max_downtime
 * 1: little enough data for max_downtime
 */
static int blk_mig_save_dirty_block(QEMUFile *f, int is_async)
{
    BlkMigDevState *bmds;
    int ret = 1;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        ret = mig_save_device_dirty(f, bmds, is_async);
        if (ret <= 0) {
            break;
        }
    }

    return ret;
}

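/* Drain the queue of completed asynchronous reads: send each block to the
 * migration stream until the rate limit is reached, and report the first
 * error returned by an AIO read, if any.
 */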
/* Called with no locks taken.  */
static int flush_blks(QEMUFile *f)
{
    BlkMigBlock *blk;
    int ret = 0;

    DPRINTF("%s Enter submitted %d read_done %d transferred %d\n",
            __FUNCTION__, block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);

    blk_mig_lock();
    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        if (qemu_file_rate_limit(f)) {
            break;
        }
        if (blk->ret < 0) {
            ret = blk->ret;
            break;
        }

        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        blk_mig_unlock();
        blk_send(f, blk);
        blk_mig_lock();

        g_free(blk->buf);
        g_free(blk);

        block_mig_state.read_done--;
        block_mig_state.transferred++;
        assert(block_mig_state.read_done >= 0);
    }
    blk_mig_unlock();

    DPRINTF("%s Exit submitted %d read_done %d transferred %d\n", __FUNCTION__,
            block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);
    return ret;
}

/* Called with iothread lock taken.  */
static int64_t get_remaining_dirty(void)
{
    BlkMigDevState *bmds;
    int64_t dirty = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        dirty += bdrv_get_dirty_count(bmds->bs, bmds->dirty_bitmap);
    }

    return dirty << BDRV_SECTOR_BITS;
}

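/* Tear down block-migration state: drain outstanding AIO, drop the dirty
 * bitmaps, and free every per-device structure along with any blocks that
 * were read but never sent.
 */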
/* Called with iothread lock taken.  */
static void blk_mig_cleanup(void)
{
    BlkMigDevState *bmds;
    BlkMigBlock *blk;

    bdrv_drain_all();

    unset_dirty_tracking();

    blk_mig_lock();
    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
        bdrv_set_in_use(bmds->bs, 0);
        bdrv_unref(bmds->bs);
        g_free(bmds->aio_bitmap);
        g_free(bmds);
    }

    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        g_free(blk->buf);
        g_free(blk);
    }
    blk_mig_unlock();
}

static void block_migration_cancel(void *opaque)
{
    blk_mig_cleanup();
}

static int block_save_setup(QEMUFile *f, void *opaque)
{
    int ret;

    DPRINTF("Enter save live setup submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    qemu_mutex_lock_iothread();
    init_blk_migration(f);

    /* start tracking dirty blocks */
    set_dirty_tracking();
    qemu_mutex_unlock_iothread();

    ret = flush_blks(f);
    blk_mig_reset_dirty_cursor();
    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ret;
}

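/* Iteration step: flush reads that have already completed, then keep
 * submitting new work while the amount of buffered data (submitted plus
 * read_done blocks) stays below the stream's rate limit.  Returns the
 * number of bytes written by this call, or a negative value on error.
 */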
static int block_save_iterate(QEMUFile *f, void *opaque)
{
    int ret;
    int64_t last_ftell = qemu_ftell(f);

    DPRINTF("Enter save live iterate submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* control the rate of transfer */
    blk_mig_lock();
    while ((block_mig_state.submitted +
            block_mig_state.read_done) * BLOCK_SIZE <
           qemu_file_get_rate_limit(f)) {
        blk_mig_unlock();
        if (block_mig_state.bulk_completed == 0) {
            /* first finish the bulk phase */
            if (blk_mig_save_bulked_block(f) == 0) {
                /* finished saving bulk on all devices */
                block_mig_state.bulk_completed = 1;
            }
            ret = 0;
        } else {
            /* Always called with iothread lock taken for
             * simplicity, block_save_complete also calls it.
             */
            qemu_mutex_lock_iothread();
            ret = blk_mig_save_dirty_block(f, 1);
            qemu_mutex_unlock_iothread();
        }
        if (ret < 0) {
            return ret;
        }
        blk_mig_lock();
        if (ret != 0) {
            /* no more dirty blocks */
            break;
        }
    }
    blk_mig_unlock();

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);
    return qemu_ftell(f) - last_ftell;
}

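/* Final stage of the outgoing migration: flush anything still queued, then
 * synchronously send every remaining dirty chunk before reporting 100%
 * progress and EOS and tearing the state down.
 */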
/* Called with iothread lock taken.  */
static int block_save_complete(QEMUFile *f, void *opaque)
{
    int ret;

    DPRINTF("Enter save live complete submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* we know for sure that the bulk phase is completed and
       all async reads have completed */
    blk_mig_lock();
    assert(block_mig_state.submitted == 0);
    blk_mig_unlock();

    do {
        ret = blk_mig_save_dirty_block(f, 0);
        if (ret < 0) {
            return ret;
        }
    } while (ret == 0);

    /* report completion */
    qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);

    DPRINTF("Block migration completed\n");

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    blk_mig_cleanup();
    return 0;
}

static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
{
    /* Estimate pending number of bytes to send */
    uint64_t pending;

    qemu_mutex_lock_iothread();
    blk_mig_lock();
    pending = get_remaining_dirty() +
                       block_mig_state.submitted * BLOCK_SIZE +
                       block_mig_state.read_done * BLOCK_SIZE;

    /* Report at least one block pending during bulk phase */
    if (pending == 0 && !block_mig_state.bulk_completed) {
        pending = BLOCK_SIZE;
    }
    blk_mig_unlock();
    qemu_mutex_unlock_iothread();

    DPRINTF("Enter save live pending %" PRIu64 "\n", pending);
    return pending;
}

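/* Destination side: parse records until BLK_MIG_FLAG_EOS.  Each record
 * starts with a 64-bit header whose low bits carry the flags and whose
 * high bits carry the sector offset; DEVICE_BLOCK records are followed by
 * a length-prefixed device name and, unless ZERO_BLOCK is set, by
 * BLOCK_SIZE bytes of data to write at that offset.
 */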
static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    static int banner_printed;
    int len, flags;
    char device_name[256];
    int64_t addr;
    BlockDriverState *bs, *bs_prev = NULL;
    uint8_t *buf;
    int64_t total_sectors = 0;
    int nr_sectors;
    int ret;

    do {
        addr = qemu_get_be64(f);

        flags = addr & ~BDRV_SECTOR_MASK;
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            /* get device name */
            len = qemu_get_byte(f);
            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            bs = bdrv_find(device_name);
            if (!bs) {
                fprintf(stderr, "Error unknown block device %s\n",
                        device_name);
                return -EINVAL;
            }

            if (bs != bs_prev) {
                bs_prev = bs;
                total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
                if (total_sectors <= 0) {
                    error_report("Error getting length of block device %s",
                                 device_name);
                    return -EINVAL;
                }
            }

            if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - addr;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }

            if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
                ret = bdrv_write_zeroes(bs, addr, nr_sectors,
                                        BDRV_REQ_MAY_UNMAP);
            } else {
                buf = g_malloc(BLOCK_SIZE);
                qemu_get_buffer(f, buf, BLOCK_SIZE);
                ret = bdrv_write(bs, addr, buf, nr_sectors);
                g_free(buf);
            }

            if (ret < 0) {
                return ret;
            }
        } else if (flags & BLK_MIG_FLAG_PROGRESS) {
            if (!banner_printed) {
                printf("Receiving block device images\n");
                banner_printed = 1;
            }
            printf("Completed %d %%%c", (int)addr,
                   (addr == 100) ? '\n' : '\r');
            fflush(stdout);
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            fprintf(stderr, "Unknown block migration flags: %#x\n", flags);
            return -EINVAL;
        }

        ret = qemu_file_get_error(f);
        if (ret != 0) {
            return ret;
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    return 0;
}

static void block_set_params(const MigrationParams *params, void *opaque)
{
    block_mig_state.blk_enable = params->blk;
    block_mig_state.shared_base = params->shared;

    /* shared base means that blk_enable = 1 */
    block_mig_state.blk_enable |= params->shared;
}

static bool block_is_active(void *opaque)
{
    return block_mig_state.blk_enable == 1;
}

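/* Hooks wired into the generic live-migration machinery; the "block"
 * section is registered in blk_mig_init() below.
 */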
SaveVMHandlers savevm_block_handlers = {
    .set_params = block_set_params,
    .save_live_setup = block_save_setup,
    .save_live_iterate = block_save_iterate,
    .save_live_complete = block_save_complete,
    .save_live_pending = block_save_pending,
    .load_state = block_load,
    .cancel = block_migration_cancel,
    .is_active = block_is_active,
};

void blk_mig_init(void)
{
    QSIMPLEQ_INIT(&block_mig_state.bmds_list);
    QSIMPLEQ_INIT(&block_mig_state.blk_list);
    qemu_mutex_init(&block_mig_state.lock);

    register_savevm_live(NULL, "block", 0, 1, &savevm_block_handlers,
                         &block_mig_state);
}