block-migration.c

/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour   <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu-common.h"
#include "block/block_int.h"
#include "hw/hw.h"
#include "qemu/queue.h"
#include "qemu/timer.h"
#include "migration/block.h"
#include "migration/migration.h"
#include "sysemu/blockdev.h"
#include <assert.h>

#define BLOCK_SIZE                       (1 << 20)
#define BDRV_SECTORS_PER_DIRTY_CHUNK     (BLOCK_SIZE >> BDRV_SECTOR_BITS)

#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
#define BLK_MIG_FLAG_EOS                0x02
#define BLK_MIG_FLAG_PROGRESS           0x04
#define BLK_MIG_FLAG_ZERO_BLOCK         0x08

#define MAX_IS_ALLOCATED_SEARCH 65536

//#define DEBUG_BLK_MIGRATION

#ifdef DEBUG_BLK_MIGRATION
#define DPRINTF(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

typedef struct BlkMigDevState {
    /* Written during setup phase.  Can be read without a lock.  */
    BlockDriverState *bs;
    int shared_base;
    int64_t total_sectors;
    QSIMPLEQ_ENTRY(BlkMigDevState) entry;

    /* Only used by migration thread.  Does not need a lock.  */
    int bulk_completed;
    int64_t cur_sector;
    int64_t cur_dirty;

    /* Protected by block migration lock.  */
    unsigned long *aio_bitmap;
    int64_t completed_sectors;
    BdrvDirtyBitmap *dirty_bitmap;
    Error *blocker;
} BlkMigDevState;

typedef struct BlkMigBlock {
    /* Only used by migration thread.  */
    uint8_t *buf;
    BlkMigDevState *bmds;
    int64_t sector;
    int nr_sectors;
    struct iovec iov;
    QEMUIOVector qiov;
    BlockDriverAIOCB *aiocb;

    /* Protected by block migration lock.  */
    int ret;
    QSIMPLEQ_ENTRY(BlkMigBlock) entry;
} BlkMigBlock;

typedef struct BlkMigState {
    /* Written during setup phase.  Can be read without a lock.  */
    int blk_enable;
    int shared_base;
    QSIMPLEQ_HEAD(bmds_list, BlkMigDevState) bmds_list;
    int64_t total_sector_sum;
    bool zero_blocks;

    /* Protected by lock.  */
    QSIMPLEQ_HEAD(blk_list, BlkMigBlock) blk_list;
    int submitted;
    int read_done;

    /* Only used by migration thread.  Does not need a lock.  */
    int transferred;
    int prev_progress;
    int bulk_completed;

    /* Lock must be taken _inside_ the iothread lock.  */
    QemuMutex lock;
} BlkMigState;

static BlkMigState block_mig_state;

static void blk_mig_lock(void)
{
    qemu_mutex_lock(&block_mig_state.lock);
}

static void blk_mig_unlock(void)
{
    qemu_mutex_unlock(&block_mig_state.lock);
}
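/* On-the-wire layout produced by blk_send() and consumed by block_load()
 * (summary derived from the code below):
 *
 *   be64:      (sector number << BDRV_SECTOR_BITS) | flags
 *   u8:        length of the device name
 *   len bytes: device name (no NUL terminator on the wire)
 *   BLOCK_SIZE bytes of payload, omitted when BLK_MIG_FLAG_ZERO_BLOCK is set
 *
 * Progress and end-of-section markers reuse the same be64 word with
 * BLK_MIG_FLAG_PROGRESS or BLK_MIG_FLAG_EOS and carry no payload.
 */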
/* Must run outside of the iothread lock during the bulk phase,
 * or the VM will stall.
 */

static void blk_send(QEMUFile *f, BlkMigBlock * blk)
{
    int len;
    uint64_t flags = BLK_MIG_FLAG_DEVICE_BLOCK;

    if (block_mig_state.zero_blocks &&
        buffer_is_zero(blk->buf, BLOCK_SIZE)) {
        flags |= BLK_MIG_FLAG_ZERO_BLOCK;
    }

    /* sector number and flags */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | flags);

    /* device name */
    len = strlen(blk->bmds->bs->device_name);
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *)blk->bmds->bs->device_name, len);

    /* if a block is zero we need to flush here since the network
     * bandwidth is now a lot higher than the storage device bandwidth.
     * thus if we queue zero blocks we slow down the migration */
    if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
        qemu_fflush(f);
        return;
    }

    qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
}

int blk_mig_active(void)
{
    return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list);
}

uint64_t blk_mig_bytes_transferred(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    blk_mig_lock();
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->completed_sectors;
    }
    blk_mig_unlock();
    return sum << BDRV_SECTOR_BITS;
}

uint64_t blk_mig_bytes_remaining(void)
{
    return blk_mig_bytes_total() - blk_mig_bytes_transferred();
}

uint64_t blk_mig_bytes_total(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->total_sectors;
    }
    return sum << BDRV_SECTOR_BITS;
}
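/* aio_bitmap keeps one bit per BLOCK_SIZE-sized chunk, marking chunks that
 * have an asynchronous read in flight.  A worked example of the index math
 * used below (assuming 512-byte sectors, so BDRV_SECTORS_PER_DIRTY_CHUNK is
 * 2048, and 64-bit unsigned long): sector 1048576 lies in chunk 512, which
 * maps to aio_bitmap[512 / 64] == aio_bitmap[8], bit 512 % 64 == 0.
 */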
/* Called with migration lock held.  */

static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if ((sector << BDRV_SECTOR_BITS) < bdrv_getlength(bmds->bs)) {
        return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}

/* Called with migration lock held.  */

static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
                                  int nb_sectors, int set)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bmds->aio_bitmap[idx];
        if (set) {
            val |= 1UL << bit;
        } else {
            val &= ~(1UL << bit);
        }
        bmds->aio_bitmap[idx] = val;
    }
}

static void alloc_aio_bitmap(BlkMigDevState *bmds)
{
    BlockDriverState *bs = bmds->bs;
    int64_t bitmap_size;

    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
            BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;

    bmds->aio_bitmap = g_malloc0(bitmap_size);
}

/* Never hold migration lock when yielding to the main loop!  */

static void blk_mig_read_cb(void *opaque, int ret)
{
    BlkMigBlock *blk = opaque;

    blk_mig_lock();
    blk->ret = ret;

    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
    bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);

    block_mig_state.submitted--;
    block_mig_state.read_done++;
    assert(block_mig_state.submitted >= 0);
    blk_mig_unlock();
}
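/* Bulk phase: each call to mig_save_device_bulk() issues one BLOCK_SIZE
 * read with bdrv_aio_readv(); blk_mig_read_cb() above queues the completed
 * buffer on blk_list, and flush_blks() later writes queued buffers to the
 * migration stream, subject to the rate limit.
 */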
/* Called with no lock taken.  */

static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
{
    int64_t total_sectors = bmds->total_sectors;
    int64_t cur_sector = bmds->cur_sector;
    BlockDriverState *bs = bmds->bs;
    BlkMigBlock *blk;
    int nr_sectors;

    if (bmds->shared_base) {
        qemu_mutex_lock_iothread();
        while (cur_sector < total_sectors &&
               !bdrv_is_allocated(bs, cur_sector, MAX_IS_ALLOCATED_SEARCH,
                                  &nr_sectors)) {
            cur_sector += nr_sectors;
        }
        qemu_mutex_unlock_iothread();
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = bmds->completed_sectors = total_sectors;
        return 1;
    }

    bmds->completed_sectors = cur_sector;

    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = total_sectors - cur_sector;
    }

    blk = g_malloc(sizeof(BlkMigBlock));
    blk->buf = g_malloc(BLOCK_SIZE);
    blk->bmds = bmds;
    blk->sector = cur_sector;
    blk->nr_sectors = nr_sectors;

    blk->iov.iov_base = blk->buf;
    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

    blk_mig_lock();
    block_mig_state.submitted++;
    blk_mig_unlock();

    qemu_mutex_lock_iothread();
    blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
                                nr_sectors, blk_mig_read_cb, blk);

    bdrv_reset_dirty(bs, cur_sector, nr_sectors);
    qemu_mutex_unlock_iothread();

    bmds->cur_sector = cur_sector + nr_sectors;
    return (bmds->cur_sector >= total_sectors);
}
/* Called with iothread lock taken.  */

static int set_dirty_tracking(void)
{
    BlkMigDevState *bmds;
    int ret;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->dirty_bitmap = bdrv_create_dirty_bitmap(bmds->bs, BLOCK_SIZE,
                                                      NULL);
        if (!bmds->dirty_bitmap) {
            ret = -errno;
            goto fail;
        }
    }
    return 0;

fail:
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->dirty_bitmap) {
            bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap);
        }
    }
    return ret;
}

static void unset_dirty_tracking(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap);
    }
}

static void init_blk_migration_it(void *opaque, BlockDriverState *bs)
{
    BlkMigDevState *bmds;
    int64_t sectors;

    if (!bdrv_is_read_only(bs)) {
        sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
        if (sectors <= 0) {
            return;
        }

        bmds = g_malloc0(sizeof(BlkMigDevState));
        bmds->bs = bs;
        bmds->bulk_completed = 0;
        bmds->total_sectors = sectors;
        bmds->completed_sectors = 0;
        bmds->shared_base = block_mig_state.shared_base;
        alloc_aio_bitmap(bmds);
        error_setg(&bmds->blocker, "block device is in use by migration");
        bdrv_op_block_all(bs, bmds->blocker);
        bdrv_ref(bs);

        block_mig_state.total_sector_sum += sectors;

        if (bmds->shared_base) {
            DPRINTF("Start migration for %s with shared base image\n",
                    bs->device_name);
        } else {
            DPRINTF("Start full migration for %s\n", bs->device_name);
        }

        QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
    }
}

static void init_blk_migration(QEMUFile *f)
{
    block_mig_state.submitted = 0;
    block_mig_state.read_done = 0;
    block_mig_state.transferred = 0;
    block_mig_state.total_sector_sum = 0;
    block_mig_state.prev_progress = -1;
    block_mig_state.bulk_completed = 0;
    block_mig_state.zero_blocks = migrate_zero_blocks();

    bdrv_iterate(init_blk_migration_it, NULL);
}
/* Called with no lock taken.  */

static int blk_mig_save_bulked_block(QEMUFile *f)
{
    int64_t completed_sector_sum = 0;
    BlkMigDevState *bmds;
    int progress;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->bulk_completed == 0) {
            if (mig_save_device_bulk(f, bmds) == 1) {
                /* completed bulk section for this device */
                bmds->bulk_completed = 1;
            }
            completed_sector_sum += bmds->completed_sectors;
            ret = 1;
            break;
        } else {
            completed_sector_sum += bmds->completed_sectors;
        }
    }

    if (block_mig_state.total_sector_sum != 0) {
        progress = completed_sector_sum * 100 /
                   block_mig_state.total_sector_sum;
    } else {
        progress = 100;
    }
    if (progress != block_mig_state.prev_progress) {
        block_mig_state.prev_progress = progress;
        qemu_put_be64(f, (progress << BDRV_SECTOR_BITS)
                         | BLK_MIG_FLAG_PROGRESS);
        DPRINTF("Completed %d %%\r", progress);
    }

    return ret;
}

static void blk_mig_reset_dirty_cursor(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->cur_dirty = 0;
    }
}
/* Called with iothread lock taken.  */

static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
                                 int is_async)
{
    BlkMigBlock *blk;
    int64_t total_sectors = bmds->total_sectors;
    int64_t sector;
    int nr_sectors;
    int ret = -EIO;

    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
        blk_mig_lock();
        if (bmds_aio_inflight(bmds, sector)) {
            blk_mig_unlock();
            bdrv_drain_all();
        } else {
            blk_mig_unlock();
        }
        if (bdrv_get_dirty(bmds->bs, bmds->dirty_bitmap, sector)) {

            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - sector;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }
            blk = g_malloc(sizeof(BlkMigBlock));
            blk->buf = g_malloc(BLOCK_SIZE);
            blk->bmds = bmds;
            blk->sector = sector;
            blk->nr_sectors = nr_sectors;

            if (is_async) {
                blk->iov.iov_base = blk->buf;
                blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
                qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

                blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
                                            nr_sectors, blk_mig_read_cb, blk);

                blk_mig_lock();
                block_mig_state.submitted++;
                bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
                blk_mig_unlock();
            } else {
                ret = bdrv_read(bmds->bs, sector, blk->buf, nr_sectors);
                if (ret < 0) {
                    goto error;
                }
                blk_send(f, blk);

                g_free(blk->buf);
                g_free(blk);
            }

            bdrv_reset_dirty(bmds->bs, sector, nr_sectors);
            break;
        }
        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        bmds->cur_dirty = sector;
    }

    return (bmds->cur_dirty >= bmds->total_sectors);

error:
    DPRINTF("Error reading sector %" PRId64 "\n", sector);
    g_free(blk->buf);
    g_free(blk);
    return ret;
}

/* Called with iothread lock taken.
 *
 * return value:
 * 0: too much data for max_downtime
 * 1: few enough data for max_downtime
 */
static int blk_mig_save_dirty_block(QEMUFile *f, int is_async)
{
    BlkMigDevState *bmds;
    int ret = 1;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        ret = mig_save_device_dirty(f, bmds, is_async);
        if (ret <= 0) {
            break;
        }
    }

    return ret;
}
/* Called with no locks taken.  */

static int flush_blks(QEMUFile *f)
{
    BlkMigBlock *blk;
    int ret = 0;

    DPRINTF("%s Enter submitted %d read_done %d transferred %d\n",
            __FUNCTION__, block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);

    blk_mig_lock();
    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        if (qemu_file_rate_limit(f)) {
            break;
        }
        if (blk->ret < 0) {
            ret = blk->ret;
            break;
        }

        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        blk_mig_unlock();
        blk_send(f, blk);
        blk_mig_lock();

        g_free(blk->buf);
        g_free(blk);

        block_mig_state.read_done--;
        block_mig_state.transferred++;
        assert(block_mig_state.read_done >= 0);
    }
    blk_mig_unlock();

    DPRINTF("%s Exit submitted %d read_done %d transferred %d\n", __FUNCTION__,
            block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);
    return ret;
}

/* Called with iothread lock taken.  */

static int64_t get_remaining_dirty(void)
{
    BlkMigDevState *bmds;
    int64_t dirty = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        dirty += bdrv_get_dirty_count(bmds->bs, bmds->dirty_bitmap);
    }

    return dirty << BDRV_SECTOR_BITS;
}
/* Called with iothread lock taken.  */

static void blk_mig_cleanup(void)
{
    BlkMigDevState *bmds;
    BlkMigBlock *blk;

    bdrv_drain_all();

    unset_dirty_tracking();

    blk_mig_lock();
    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
        bdrv_op_unblock_all(bmds->bs, bmds->blocker);
        error_free(bmds->blocker);
        bdrv_unref(bmds->bs);
        g_free(bmds->aio_bitmap);
        g_free(bmds);
    }

    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        g_free(blk->buf);
        g_free(blk);
    }
    blk_mig_unlock();
}

static void block_migration_cancel(void *opaque)
{
    blk_mig_cleanup();
}
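/* Handler flow (summary of the functions below): block_save_setup()
 * registers every writable block device and starts dirty tracking;
 * block_save_iterate() first streams each device once in bulk and then
 * resends chunks the guest has re-dirtied, using asynchronous reads;
 * block_save_complete() drains the remaining dirty chunks synchronously and
 * emits the final progress and EOS markers before cleaning up.
 */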
static int block_save_setup(QEMUFile *f, void *opaque)
{
    int ret;

    DPRINTF("Enter save live setup submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    qemu_mutex_lock_iothread();
    init_blk_migration(f);

    /* start tracking dirty blocks */
    ret = set_dirty_tracking();

    if (ret) {
        qemu_mutex_unlock_iothread();
        return ret;
    }

    qemu_mutex_unlock_iothread();

    ret = flush_blks(f);
    blk_mig_reset_dirty_cursor();
    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ret;
}

static int block_save_iterate(QEMUFile *f, void *opaque)
{
    int ret;
    int64_t last_ftell = qemu_ftell(f);
    int64_t delta_ftell;

    DPRINTF("Enter save live iterate submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* control the rate of transfer */
    blk_mig_lock();
    while ((block_mig_state.submitted +
            block_mig_state.read_done) * BLOCK_SIZE <
           qemu_file_get_rate_limit(f)) {
        blk_mig_unlock();
        if (block_mig_state.bulk_completed == 0) {
            /* first finish the bulk phase */
            if (blk_mig_save_bulked_block(f) == 0) {
                /* finished saving bulk on all devices */
                block_mig_state.bulk_completed = 1;
            }
            ret = 0;
        } else {
            /* Always called with iothread lock taken for
             * simplicity, block_save_complete also calls it.
             */
            qemu_mutex_lock_iothread();
            ret = blk_mig_save_dirty_block(f, 1);
            qemu_mutex_unlock_iothread();
        }
        if (ret < 0) {
            return ret;
        }
        blk_mig_lock();
        if (ret != 0) {
            /* no more dirty blocks */
            break;
        }
    }
    blk_mig_unlock();

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);
    delta_ftell = qemu_ftell(f) - last_ftell;
    if (delta_ftell > 0) {
        return 1;
    } else if (delta_ftell < 0) {
        return -1;
    } else {
        return 0;
    }
}
/* Called with iothread lock taken.  */

static int block_save_complete(QEMUFile *f, void *opaque)
{
    int ret;

    DPRINTF("Enter save live complete submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* we know for sure that the bulk phase is completed and
       all async reads have completed */
    blk_mig_lock();
    assert(block_mig_state.submitted == 0);
    blk_mig_unlock();

    do {
        ret = blk_mig_save_dirty_block(f, 0);
        if (ret < 0) {
            return ret;
        }
    } while (ret == 0);

    /* report completion */
    qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);

    DPRINTF("Block migration completed\n");

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    blk_mig_cleanup();
    return 0;
}

static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
{
    /* Estimate pending number of bytes to send */
    uint64_t pending;

    qemu_mutex_lock_iothread();
    blk_mig_lock();
    pending = get_remaining_dirty() +
              block_mig_state.submitted * BLOCK_SIZE +
              block_mig_state.read_done * BLOCK_SIZE;

    /* Report at least one block pending during bulk phase */
    if (pending <= max_size && !block_mig_state.bulk_completed) {
        pending = max_size + BLOCK_SIZE;
    }
    blk_mig_unlock();
    qemu_mutex_unlock_iothread();

    DPRINTF("Enter save live pending %" PRIu64 "\n", pending);
    return pending;
}
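/* The load side mirrors blk_send(): read the sector/flags word, look up the
 * device by name, clamp nr_sectors at the end of the device, then either
 * expand a zero block with bdrv_write_zeroes() or write the BLOCK_SIZE
 * payload, looping until BLK_MIG_FLAG_EOS is seen.
 */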
static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    static int banner_printed;
    int len, flags;
    char device_name[256];
    int64_t addr;
    BlockDriverState *bs, *bs_prev = NULL;
    uint8_t *buf;
    int64_t total_sectors = 0;
    int nr_sectors;
    int ret;

    do {
        addr = qemu_get_be64(f);

        flags = addr & ~BDRV_SECTOR_MASK;
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            /* get device name */
            len = qemu_get_byte(f);
            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            bs = bdrv_find(device_name);
            if (!bs) {
                fprintf(stderr, "Error unknown block device %s\n",
                        device_name);
                return -EINVAL;
            }

            if (bs != bs_prev) {
                bs_prev = bs;
                total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
                if (total_sectors <= 0) {
                    error_report("Error getting length of block device %s",
                                 device_name);
                    return -EINVAL;
                }
            }

            if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - addr;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }

            if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
                ret = bdrv_write_zeroes(bs, addr, nr_sectors,
                                        BDRV_REQ_MAY_UNMAP);
            } else {
                buf = g_malloc(BLOCK_SIZE);
                qemu_get_buffer(f, buf, BLOCK_SIZE);
                ret = bdrv_write(bs, addr, buf, nr_sectors);
                g_free(buf);
            }

            if (ret < 0) {
                return ret;
            }
        } else if (flags & BLK_MIG_FLAG_PROGRESS) {
            if (!banner_printed) {
                printf("Receiving block device images\n");
                banner_printed = 1;
            }
            printf("Completed %d %%%c", (int)addr,
                   (addr == 100) ? '\n' : '\r');
            fflush(stdout);
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            fprintf(stderr, "Unknown block migration flags: %#x\n", flags);
            return -EINVAL;
        }
        ret = qemu_file_get_error(f);
        if (ret != 0) {
            return ret;
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    return 0;
}

static void block_set_params(const MigrationParams *params, void *opaque)
{
    block_mig_state.blk_enable = params->blk;
    block_mig_state.shared_base = params->shared;

    /* shared base means that blk_enable = 1 */
    block_mig_state.blk_enable |= params->shared;
}

static bool block_is_active(void *opaque)
{
    return block_mig_state.blk_enable == 1;
}

static SaveVMHandlers savevm_block_handlers = {
    .set_params = block_set_params,
    .save_live_setup = block_save_setup,
    .save_live_iterate = block_save_iterate,
    .save_live_complete = block_save_complete,
    .save_live_pending = block_save_pending,
    .load_state = block_load,
    .cancel = block_migration_cancel,
    .is_active = block_is_active,
};

void blk_mig_init(void)
{
    QSIMPLEQ_INIT(&block_mig_state.bmds_list);
    QSIMPLEQ_INIT(&block_mig_state.blk_list);
    qemu_mutex_init(&block_mig_state.lock);

    register_savevm_live(NULL, "block", 0, 1, &savevm_block_handlers,
                         &block_mig_state);
}