block-migration.c

/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu-common.h"
#include "block/block.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "hw/hw.h"
#include "qemu/queue.h"
#include "qemu/timer.h"
#include "migration/block.h"
#include "migration/migration.h"
#include "sysemu/blockdev.h"
#include <assert.h>
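
/* Data is moved in chunks of BLOCK_SIZE bytes; one chunk covers
 * BDRV_SECTORS_PER_DIRTY_CHUNK 512-byte sectors.  Because chunk offsets on
 * the wire are sector-aligned, the low bits of the 64-bit header word are
 * free to carry the BLK_MIG_FLAG_* values defined below.
 */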

#define BLOCK_SIZE                      (1 << 20)
#define BDRV_SECTORS_PER_DIRTY_CHUNK    (BLOCK_SIZE >> BDRV_SECTOR_BITS)

#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
#define BLK_MIG_FLAG_EOS                0x02
#define BLK_MIG_FLAG_PROGRESS           0x04
#define BLK_MIG_FLAG_ZERO_BLOCK         0x08

#define MAX_IS_ALLOCATED_SEARCH 65536

//#define DEBUG_BLK_MIGRATION

#ifdef DEBUG_BLK_MIGRATION
#define DPRINTF(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

typedef struct BlkMigDevState {
    /* Written during setup phase. Can be read without a lock. */
    BlockDriverState *bs;
    int shared_base;
    int64_t total_sectors;
    QSIMPLEQ_ENTRY(BlkMigDevState) entry;

    /* Only used by migration thread. Does not need a lock. */
    int bulk_completed;
    int64_t cur_sector;
    int64_t cur_dirty;

    /* Protected by block migration lock. */
    unsigned long *aio_bitmap;
    int64_t completed_sectors;
    BdrvDirtyBitmap *dirty_bitmap;
    Error *blocker;
} BlkMigDevState;

typedef struct BlkMigBlock {
    /* Only used by migration thread. */
    uint8_t *buf;
    BlkMigDevState *bmds;
    int64_t sector;
    int nr_sectors;
    struct iovec iov;
    QEMUIOVector qiov;
    BlockAIOCB *aiocb;

    /* Protected by block migration lock. */
    int ret;
    QSIMPLEQ_ENTRY(BlkMigBlock) entry;
} BlkMigBlock;

typedef struct BlkMigState {
    /* Written during setup phase. Can be read without a lock. */
    int blk_enable;
    int shared_base;
    QSIMPLEQ_HEAD(bmds_list, BlkMigDevState) bmds_list;
    int64_t total_sector_sum;
    bool zero_blocks;

    /* Protected by lock. */
    QSIMPLEQ_HEAD(blk_list, BlkMigBlock) blk_list;
    int submitted;
    int read_done;

    /* Only used by migration thread. Does not need a lock. */
    int transferred;
    int prev_progress;
    int bulk_completed;

    /* Lock must be taken _inside_ the iothread lock. */
    QemuMutex lock;
} BlkMigState;

static BlkMigState block_mig_state;

static void blk_mig_lock(void)
{
    qemu_mutex_lock(&block_mig_state.lock);
}

static void blk_mig_unlock(void)
{
    qemu_mutex_unlock(&block_mig_state.lock);
}

/* Must run outside of the iothread lock during the bulk phase,
 * or the VM will stall.
 */
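
/* Emit one chunk on the migration stream: a 64-bit word holding the sector
 * number (shifted left by BDRV_SECTOR_BITS) OR'ed with the BLK_MIG_FLAG_*
 * bits, then the device name as a length byte plus string, and finally the
 * BLOCK_SIZE payload unless the buffer was detected as all zeroes.
 */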

static void blk_send(QEMUFile *f, BlkMigBlock *blk)
{
    int len;
    uint64_t flags = BLK_MIG_FLAG_DEVICE_BLOCK;

    if (block_mig_state.zero_blocks &&
        buffer_is_zero(blk->buf, BLOCK_SIZE)) {
        flags |= BLK_MIG_FLAG_ZERO_BLOCK;
    }

    /* sector number and flags */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | flags);

    /* device name */
    len = strlen(bdrv_get_device_name(blk->bmds->bs));
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *)bdrv_get_device_name(blk->bmds->bs), len);

    /* if a block is zero we need to flush here since the network
     * bandwidth is now a lot higher than the storage device bandwidth.
     * thus if we queue zero blocks we slow down the migration */
    if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
        qemu_fflush(f);
        return;
    }

    qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
}

int blk_mig_active(void)
{
    return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list);
}

uint64_t blk_mig_bytes_transferred(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    blk_mig_lock();
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->completed_sectors;
    }
    blk_mig_unlock();
    return sum << BDRV_SECTOR_BITS;
}

uint64_t blk_mig_bytes_remaining(void)
{
    return blk_mig_bytes_total() - blk_mig_bytes_transferred();
}

uint64_t blk_mig_bytes_total(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->total_sectors;
    }
    return sum << BDRV_SECTOR_BITS;
}

/* Called with migration lock held. */

static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (sector < bdrv_nb_sectors(bmds->bs)) {
        return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}

/* Called with migration lock held. */

static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
                                  int nb_sectors, int set)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bmds->aio_bitmap[idx];
        if (set) {
            val |= 1UL << bit;
        } else {
            val &= ~(1UL << bit);
        }
        bmds->aio_bitmap[idx] = val;
    }
}
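
/* aio_bitmap has one bit per BDRV_SECTORS_PER_DIRTY_CHUNK-sized chunk and
 * marks chunks whose asynchronous read has been submitted but has not yet
 * completed; the dirty pass drains pending I/O when it meets such a chunk.
 */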

static void alloc_aio_bitmap(BlkMigDevState *bmds)
{
    BlockDriverState *bs = bmds->bs;
    int64_t bitmap_size;

    bitmap_size = bdrv_nb_sectors(bs) + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;

    bmds->aio_bitmap = g_malloc0(bitmap_size);
}

/* Never hold migration lock when yielding to the main loop! */

static void blk_mig_read_cb(void *opaque, int ret)
{
    BlkMigBlock *blk = opaque;

    blk_mig_lock();
    blk->ret = ret;

    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
    bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);

    block_mig_state.submitted--;
    block_mig_state.read_done++;
    assert(block_mig_state.submitted >= 0);
    blk_mig_unlock();
}

/* Called with no lock taken. */
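
/* Submit an asynchronous read for the next unsent chunk of the bulk phase;
 * with a shared base image, unallocated regions are skipped.  Returns 1 once
 * the whole device has been queued, 0 otherwise.
 */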

static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
{
    int64_t total_sectors = bmds->total_sectors;
    int64_t cur_sector = bmds->cur_sector;
    BlockDriverState *bs = bmds->bs;
    BlkMigBlock *blk;
    int nr_sectors;

    if (bmds->shared_base) {
        qemu_mutex_lock_iothread();
        while (cur_sector < total_sectors &&
               !bdrv_is_allocated(bs, cur_sector, MAX_IS_ALLOCATED_SEARCH,
                                  &nr_sectors)) {
            cur_sector += nr_sectors;
        }
        qemu_mutex_unlock_iothread();
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = bmds->completed_sectors = total_sectors;
        return 1;
    }

    bmds->completed_sectors = cur_sector;

    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = total_sectors - cur_sector;
    }

    blk = g_new(BlkMigBlock, 1);
    blk->buf = g_malloc(BLOCK_SIZE);
    blk->bmds = bmds;
    blk->sector = cur_sector;
    blk->nr_sectors = nr_sectors;

    blk->iov.iov_base = blk->buf;
    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

    blk_mig_lock();
    block_mig_state.submitted++;
    blk_mig_unlock();

    qemu_mutex_lock_iothread();
    blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
                                nr_sectors, blk_mig_read_cb, blk);

    bdrv_reset_dirty(bs, cur_sector, nr_sectors);
    qemu_mutex_unlock_iothread();

    bmds->cur_sector = cur_sector + nr_sectors;
    return (bmds->cur_sector >= total_sectors);
}

/* Called with iothread lock taken. */
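
/* Create a BLOCK_SIZE-granularity dirty bitmap for every migrated device so
 * that writes issued after the bulk copy can be tracked and resent during
 * the dirty phase.
 */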

static int set_dirty_tracking(void)
{
    BlkMigDevState *bmds;
    int ret;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->dirty_bitmap = bdrv_create_dirty_bitmap(bmds->bs, BLOCK_SIZE,
                                                      NULL);
        if (!bmds->dirty_bitmap) {
            ret = -errno;
            goto fail;
        }
    }
    return 0;

fail:
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->dirty_bitmap) {
            bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap);
        }
    }
    return ret;
}

static void unset_dirty_tracking(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap);
    }
}

static void init_blk_migration(QEMUFile *f)
{
    BlockDriverState *bs;
    BlkMigDevState *bmds;
    int64_t sectors;

    block_mig_state.submitted = 0;
    block_mig_state.read_done = 0;
    block_mig_state.transferred = 0;
    block_mig_state.total_sector_sum = 0;
    block_mig_state.prev_progress = -1;
    block_mig_state.bulk_completed = 0;
    block_mig_state.zero_blocks = migrate_zero_blocks();

    for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) {
        if (bdrv_is_read_only(bs)) {
            continue;
        }

        sectors = bdrv_nb_sectors(bs);
        if (sectors <= 0) {
            return;
        }

        bmds = g_new0(BlkMigDevState, 1);
        bmds->bs = bs;
        bmds->bulk_completed = 0;
        bmds->total_sectors = sectors;
        bmds->completed_sectors = 0;
        bmds->shared_base = block_mig_state.shared_base;
        alloc_aio_bitmap(bmds);
        error_setg(&bmds->blocker, "block device is in use by migration");
        bdrv_op_block_all(bs, bmds->blocker);
        bdrv_ref(bs);

        block_mig_state.total_sector_sum += sectors;

        if (bmds->shared_base) {
            DPRINTF("Start migration for %s with shared base image\n",
                    bdrv_get_device_name(bs));
        } else {
            DPRINTF("Start full migration for %s\n", bdrv_get_device_name(bs));
        }

        QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
    }
}

/* Called with no lock taken. */

static int blk_mig_save_bulked_block(QEMUFile *f)
{
    int64_t completed_sector_sum = 0;
    BlkMigDevState *bmds;
    int progress;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->bulk_completed == 0) {
            if (mig_save_device_bulk(f, bmds) == 1) {
                /* completed bulk section for this device */
                bmds->bulk_completed = 1;
            }
            completed_sector_sum += bmds->completed_sectors;
            ret = 1;
            break;
        } else {
            completed_sector_sum += bmds->completed_sectors;
        }
    }

    if (block_mig_state.total_sector_sum != 0) {
        progress = completed_sector_sum * 100 /
                   block_mig_state.total_sector_sum;
    } else {
        progress = 100;
    }
    if (progress != block_mig_state.prev_progress) {
        block_mig_state.prev_progress = progress;
        qemu_put_be64(f, (progress << BDRV_SECTOR_BITS)
                         | BLK_MIG_FLAG_PROGRESS);
        DPRINTF("Completed %d %%\r", progress);
    }

    return ret;
}

static void blk_mig_reset_dirty_cursor(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->cur_dirty = 0;
    }
}

/* Called with iothread lock taken. */
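
/* Scan the dirty bitmap from bmds->cur_dirty and send at most one dirty
 * chunk per call: asynchronously while iterating, synchronously when called
 * from block_save_complete.  Returns 1 when the whole device has been
 * scanned, a negative value on a read error.
 */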

static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
                                 int is_async)
{
    BlkMigBlock *blk;
    int64_t total_sectors = bmds->total_sectors;
    int64_t sector;
    int nr_sectors;
    int ret = -EIO;

    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
        blk_mig_lock();
        if (bmds_aio_inflight(bmds, sector)) {
            blk_mig_unlock();
            bdrv_drain_all();
        } else {
            blk_mig_unlock();
        }
        if (bdrv_get_dirty(bmds->bs, bmds->dirty_bitmap, sector)) {
            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - sector;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }
            blk = g_new(BlkMigBlock, 1);
            blk->buf = g_malloc(BLOCK_SIZE);
            blk->bmds = bmds;
            blk->sector = sector;
            blk->nr_sectors = nr_sectors;

            if (is_async) {
                blk->iov.iov_base = blk->buf;
                blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
                qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

                blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
                                            nr_sectors, blk_mig_read_cb, blk);

                blk_mig_lock();
                block_mig_state.submitted++;
                bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
                blk_mig_unlock();
            } else {
                ret = bdrv_read(bmds->bs, sector, blk->buf, nr_sectors);
                if (ret < 0) {
                    goto error;
                }
                blk_send(f, blk);

                g_free(blk->buf);
                g_free(blk);
            }

            bdrv_reset_dirty(bmds->bs, sector, nr_sectors);
            break;
        }
        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        bmds->cur_dirty = sector;
    }

    return (bmds->cur_dirty >= bmds->total_sectors);

error:
    DPRINTF("Error reading sector %" PRId64 "\n", sector);
    g_free(blk->buf);
    g_free(blk);
    return ret;
}

/* Called with iothread lock taken.
 *
 * return value:
 * 0: too much data for max_downtime
 * 1: little enough data for max_downtime
 */
static int blk_mig_save_dirty_block(QEMUFile *f, int is_async)
{
    BlkMigDevState *bmds;
    int ret = 1;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        ret = mig_save_device_dirty(f, bmds, is_async);
        if (ret <= 0) {
            break;
        }
    }

    return ret;
}

/* Called with no locks taken. */
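
/* Move completed asynchronous reads from blk_list onto the migration stream,
 * stopping early if the rate limit is reached or a read reported an error.
 */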

static int flush_blks(QEMUFile *f)
{
    BlkMigBlock *blk;
    int ret = 0;

    DPRINTF("%s Enter submitted %d read_done %d transferred %d\n",
            __FUNCTION__, block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);

    blk_mig_lock();
    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        if (qemu_file_rate_limit(f)) {
            break;
        }
        if (blk->ret < 0) {
            ret = blk->ret;
            break;
        }

        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        blk_mig_unlock();
        blk_send(f, blk);
        blk_mig_lock();

        g_free(blk->buf);
        g_free(blk);

        block_mig_state.read_done--;
        block_mig_state.transferred++;
        assert(block_mig_state.read_done >= 0);
    }
    blk_mig_unlock();

    DPRINTF("%s Exit submitted %d read_done %d transferred %d\n", __FUNCTION__,
            block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);
    return ret;
}

/* Called with iothread lock taken. */

static int64_t get_remaining_dirty(void)
{
    BlkMigDevState *bmds;
    int64_t dirty = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        dirty += bdrv_get_dirty_count(bmds->bs, bmds->dirty_bitmap);
    }

    return dirty << BDRV_SECTOR_BITS;
}

/* Called with iothread lock taken. */

static void blk_mig_cleanup(void)
{
    BlkMigDevState *bmds;
    BlkMigBlock *blk;

    bdrv_drain_all();

    unset_dirty_tracking();

    blk_mig_lock();
    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
        bdrv_op_unblock_all(bmds->bs, bmds->blocker);
        error_free(bmds->blocker);
        bdrv_unref(bmds->bs);
        g_free(bmds->aio_bitmap);
        g_free(bmds);
    }

    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        g_free(blk->buf);
        g_free(blk);
    }
    blk_mig_unlock();
}

static void block_migration_cancel(void *opaque)
{
    blk_mig_cleanup();
}

static int block_save_setup(QEMUFile *f, void *opaque)
{
    int ret;

    DPRINTF("Enter save live setup submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    qemu_mutex_lock_iothread();
    init_blk_migration(f);

    /* start tracking dirty blocks */
    ret = set_dirty_tracking();

    if (ret) {
        qemu_mutex_unlock_iothread();
        return ret;
    }

    qemu_mutex_unlock_iothread();

    ret = flush_blks(f);
    blk_mig_reset_dirty_cursor();
    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ret;
}
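
/* One iteration of the live phase: flush completed reads, queue new reads
 * (bulk copy first, then dirty chunks) until the amount of buffered data
 * reaches the rate limit, flush again and terminate the section with an EOS
 * marker.
 */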

static int block_save_iterate(QEMUFile *f, void *opaque)
{
    int ret;
    int64_t last_ftell = qemu_ftell(f);
    int64_t delta_ftell;

    DPRINTF("Enter save live iterate submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* control the rate of transfer */
    blk_mig_lock();
    while ((block_mig_state.submitted +
            block_mig_state.read_done) * BLOCK_SIZE <
           qemu_file_get_rate_limit(f)) {
        blk_mig_unlock();
        if (block_mig_state.bulk_completed == 0) {
            /* first finish the bulk phase */
            if (blk_mig_save_bulked_block(f) == 0) {
                /* finished saving bulk on all devices */
                block_mig_state.bulk_completed = 1;
            }
            ret = 0;
        } else {
            /* Always called with iothread lock taken for
             * simplicity, block_save_complete also calls it.
             */
            qemu_mutex_lock_iothread();
            ret = blk_mig_save_dirty_block(f, 1);
            qemu_mutex_unlock_iothread();
        }
        if (ret < 0) {
            return ret;
        }
        blk_mig_lock();
        if (ret != 0) {
            /* no more dirty blocks */
            break;
        }
    }
    blk_mig_unlock();

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);
    delta_ftell = qemu_ftell(f) - last_ftell;
    if (delta_ftell > 0) {
        return 1;
    } else if (delta_ftell < 0) {
        return -1;
    } else {
        return 0;
    }
}

/* Called with iothread lock taken. */

static int block_save_complete(QEMUFile *f, void *opaque)
{
    int ret;

    DPRINTF("Enter save live complete submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* we know for sure that the bulk save has completed and
       all async reads have completed */
    blk_mig_lock();
    assert(block_mig_state.submitted == 0);
    blk_mig_unlock();

    do {
        ret = blk_mig_save_dirty_block(f, 0);
        if (ret < 0) {
            return ret;
        }
    } while (ret == 0);

    /* report completion */
    qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);

    DPRINTF("Block migration completed\n");

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    blk_mig_cleanup();
    return 0;
}
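
/* Estimate how many bytes still have to be sent: the dirty bytes recorded in
 * the bitmaps plus the data already buffered or in flight.  During the bulk
 * phase the estimate is forced above max_size so that migration keeps
 * iterating.
 */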

static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
{
    /* Estimate pending number of bytes to send */
    uint64_t pending;

    qemu_mutex_lock_iothread();
    blk_mig_lock();
    pending = get_remaining_dirty() +
              block_mig_state.submitted * BLOCK_SIZE +
              block_mig_state.read_done * BLOCK_SIZE;

    /* Report at least one block pending during bulk phase */
    if (pending <= max_size && !block_mig_state.bulk_completed) {
        pending = max_size + BLOCK_SIZE;
    }
    blk_mig_unlock();
    qemu_mutex_unlock_iothread();

    DPRINTF("Enter save live pending %" PRIu64 "\n", pending);
    return pending;
}
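
/* Destination side: read (sector | flags) headers until EOS, look up the
 * named device and write each received chunk (or zero it when
 * BLK_MIG_FLAG_ZERO_BLOCK is set) at the corresponding offset.
 */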

static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    static int banner_printed;
    int len, flags;
    char device_name[256];
    int64_t addr;
    BlockDriverState *bs, *bs_prev = NULL;
    uint8_t *buf;
    int64_t total_sectors = 0;
    int nr_sectors;
    int ret;

    do {
        addr = qemu_get_be64(f);

        flags = addr & ~BDRV_SECTOR_MASK;
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            /* get device name */
            len = qemu_get_byte(f);
            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            bs = bdrv_find(device_name);
            if (!bs) {
                fprintf(stderr, "Error unknown block device %s\n",
                        device_name);
                return -EINVAL;
            }

            if (bs != bs_prev) {
                bs_prev = bs;
                total_sectors = bdrv_nb_sectors(bs);
                if (total_sectors <= 0) {
                    error_report("Error getting length of block device %s",
                                 device_name);
                    return -EINVAL;
                }
            }

            if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - addr;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }

            if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
                ret = bdrv_write_zeroes(bs, addr, nr_sectors,
                                        BDRV_REQ_MAY_UNMAP);
            } else {
                buf = g_malloc(BLOCK_SIZE);
                qemu_get_buffer(f, buf, BLOCK_SIZE);
                ret = bdrv_write(bs, addr, buf, nr_sectors);
                g_free(buf);
            }

            if (ret < 0) {
                return ret;
            }
        } else if (flags & BLK_MIG_FLAG_PROGRESS) {
            if (!banner_printed) {
                printf("Receiving block device images\n");
                banner_printed = 1;
            }
            printf("Completed %d %%%c", (int)addr,
                   (addr == 100) ? '\n' : '\r');
            fflush(stdout);
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            fprintf(stderr, "Unknown block migration flags: %#x\n", flags);
            return -EINVAL;
        }
        ret = qemu_file_get_error(f);
        if (ret != 0) {
            return ret;
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    return 0;
}

static void block_set_params(const MigrationParams *params, void *opaque)
{
    block_mig_state.blk_enable = params->blk;
    block_mig_state.shared_base = params->shared;

    /* shared base means that blk_enable = 1 */
    block_mig_state.blk_enable |= params->shared;
}

static bool block_is_active(void *opaque)
{
    return block_mig_state.blk_enable == 1;
}

static SaveVMHandlers savevm_block_handlers = {
    .set_params = block_set_params,
    .save_live_setup = block_save_setup,
    .save_live_iterate = block_save_iterate,
    .save_live_complete = block_save_complete,
    .save_live_pending = block_save_pending,
    .load_state = block_load,
    .cancel = block_migration_cancel,
    .is_active = block_is_active,
};

void blk_mig_init(void)
{
    QSIMPLEQ_INIT(&block_mig_state.bmds_list);
    QSIMPLEQ_INIT(&block_mig_state.blk_list);
    qemu_mutex_init(&block_mig_state.lock);
    register_savevm_live(NULL, "block", 0, 1, &savevm_block_handlers,
                         &block_mig_state);
}