/*
 * Live block commit
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Jeff Cody   <jcody@redhat.com>
 *  Based on stream.c by Stefan Hajnoczi
 *
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */
#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "trace.h"
#include "block/block_int.h"
#include "block/blockjob_int.h"
#include "qapi/error.h"
#include "qemu/ratelimit.h"
#include "qemu/memalign.h"
#include "system/block-backend.h"
enum {
    /*
     * Size of data buffer for populating the image file. This should be large
     * enough to process multiple clusters in a single call, so that populating
     * contiguous regions of the image is efficient.
     */
    COMMIT_BUFFER_SIZE = 512 * 1024, /* in bytes */
};
/* State for one live commit block job (see commit_start()) */
typedef struct CommitBlockJob {
    BlockJob common;
    /* Filter node (bdrv_commit_top) inserted above 'top' for the job's
     * lifetime; replaced/dropped again on completion or abort */
    BlockDriverState *commit_top_bs;
    /* BlockBackend attached to 'top' (source of the data being committed) */
    BlockBackend *top;
    /* BlockBackend attached to the base node (write target); released in
     * commit_prepare() before the backing chain is restored */
    BlockBackend *base;
    /* The base node itself (kept for unfreeze/reopen after s->base is gone) */
    BlockDriverState *base_bs;
    /* Result of bdrv_find_overlay(top, base): first non-filter overlay of
     * base, used as the allocation query endpoint in commit_run() */
    BlockDriverState *base_overlay;
    /* Error policy fed to block_job_error_action() on I/O failure */
    BlockdevOnError on_error;
    /* True if base was read-only before the job and must be reverted to
     * read-only in commit_clean() */
    bool base_read_only;
    /* True while the commit_top_bs..base_bs chain is frozen */
    bool chain_frozen;
    /* Backing file string handed to bdrv_drop_intermediate() */
    char *backing_file_str;
    bool backing_mask_protocol;
} CommitBlockJob;
/*
 * Job .prepare callback: on successful completion, unfreeze the chain,
 * release the base BlockBackend and collapse the intermediate nodes
 * (commit_top_bs down to base_bs) out of the backing chain.
 * Returns the result of bdrv_drop_intermediate().
 */
static int commit_prepare(Job *job)
{
    CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);

    bdrv_graph_rdlock_main_loop();
    bdrv_unfreeze_backing_chain(s->commit_top_bs, s->base_bs);
    s->chain_frozen = false;
    bdrv_graph_rdunlock_main_loop();

    /* Remove base node parent that still uses BLK_PERM_WRITE/RESIZE before
     * the normal backing chain can be restored. */
    blk_unref(s->base);
    s->base = NULL;

    /* FIXME: bdrv_drop_intermediate treats total failures and partial failures
     * identically. Further work is needed to disambiguate these cases. */
    return bdrv_drop_intermediate(s->commit_top_bs, s->base_bs,
                                  s->backing_file_str,
                                  s->backing_mask_protocol);
}
/*
 * Job .abort callback: undo the graph manipulation done by commit_start().
 * Unfreezes the chain (if still frozen), drops the base BlockBackend and
 * the job's node references, then replaces the commit_top filter with its
 * backing node so the original chain is visible again.
 */
static void commit_abort(Job *job)
{
    CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
    BlockDriverState *top_bs = blk_bs(s->top);
    BlockDriverState *commit_top_backing_bs;

    if (s->chain_frozen) {
        bdrv_graph_rdlock_main_loop();
        bdrv_unfreeze_backing_chain(s->commit_top_bs, s->base_bs);
        bdrv_graph_rdunlock_main_loop();
    }

    /* Make sure commit_top_bs and top stay around until bdrv_replace_node() */
    bdrv_ref(top_bs);
    bdrv_ref(s->commit_top_bs);

    if (s->base) {
        blk_unref(s->base);
    }

    /* free the blockers on the intermediate nodes so that bdrv_replace_nodes
     * can succeed */
    block_job_remove_all_bdrv(&s->common);

    /* If bdrv_drop_intermediate() failed (or was not invoked), remove the
     * commit filter driver from the backing chain now. Do this as the final
     * step so that the 'consistent read' permission can be granted.
     *
     * XXX Can (or should) we somehow keep 'consistent read' blocked even
     * after the failed/cancelled commit job is gone? If we already wrote
     * something to base, the intermediate images aren't valid any more. */
    bdrv_graph_rdlock_main_loop();
    commit_top_backing_bs = s->commit_top_bs->backing->bs;
    bdrv_graph_rdunlock_main_loop();

    bdrv_drained_begin(commit_top_backing_bs);
    bdrv_graph_wrlock();
    bdrv_replace_node(s->commit_top_bs, commit_top_backing_bs, &error_abort);
    bdrv_graph_wrunlock();
    bdrv_drained_end(commit_top_backing_bs);

    /* Drop the references taken above */
    bdrv_unref(s->commit_top_bs);
    bdrv_unref(top_bs);
}
/*
 * Job .clean callback: runs after either prepare or abort. Restores base's
 * read-only state if we flipped it in commit_start(), and frees the
 * remaining job-owned resources.
 */
static void commit_clean(Job *job)
{
    CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);

    /* restore base open flags here if appropriate (e.g., change the base back
     * to r/o). These reopens do not need to be atomic, since we won't abort
     * even on failure here */
    if (s->base_read_only) {
        bdrv_reopen_set_read_only(s->base_bs, true, NULL);
    }

    g_free(s->backing_file_str);
    blk_unref(s->top);
}
/*
 * Job .run callback (coroutine context): copy every range that is allocated
 * in the top..base_overlay chain down into the base image, one
 * COMMIT_BUFFER_SIZE window at a time, honouring the job's rate limit and
 * on_error policy. Returns 0 on success (or cancellation) or a negative
 * errno on a reported error.
 */
static int coroutine_fn commit_run(Job *job, Error **errp)
{
    CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
    int64_t offset;
    int ret = 0;
    int64_t n = 0; /* bytes */
    QEMU_AUTO_VFREE void *buf = NULL;
    int64_t len, base_len;

    len = blk_co_getlength(s->top);
    if (len < 0) {
        return len;
    }
    job_progress_set_remaining(&s->common.job, len);

    base_len = blk_co_getlength(s->base);
    if (base_len < 0) {
        return base_len;
    }

    /* Grow base if top holds more data than fits; commit_start() acquired
     * BLK_PERM_RESIZE on base for exactly this case. */
    if (base_len < len) {
        ret = blk_co_truncate(s->base, len, false, PREALLOC_MODE_OFF, 0, NULL);
        if (ret) {
            return ret;
        }
    }

    buf = blk_blockalign(s->top, COMMIT_BUFFER_SIZE);

    for (offset = 0; offset < len; offset += n) {
        bool copy;
        bool error_in_source = true;

        /* Note that even when no rate limit is applied we need to yield
         * with no pending I/O here so that bdrv_drain_all() returns.
         */
        block_job_ratelimit_sleep(&s->common);
        if (job_is_cancelled(&s->common.job)) {
            break;
        }
        /* Copy if allocated above the base */
        ret = blk_co_is_allocated_above(s->top, s->base_overlay, true,
                                        offset, COMMIT_BUFFER_SIZE, &n);
        copy = (ret > 0);
        trace_commit_one_iteration(s, offset, n, ret);
        if (copy) {
            assert(n < SIZE_MAX);

            ret = blk_co_pread(s->top, offset, n, buf, 0);
            if (ret >= 0) {
                ret = blk_co_pwrite(s->base, offset, n, buf, 0);
                if (ret < 0) {
                    /* Failure happened writing to base, not reading top */
                    error_in_source = false;
                }
            }
        }
        if (ret < 0) {
            BlockErrorAction action =
                block_job_error_action(&s->common, s->on_error,
                                       error_in_source, -ret);
            if (action == BLOCK_ERROR_ACTION_REPORT) {
                return ret;
            } else {
                /* Retry this window: n = 0 keeps offset in place */
                n = 0;
                continue;
            }
        }
        /* Publish progress */
        job_progress_update(&s->common.job, n);

        if (copy) {
            block_job_ratelimit_processed_bytes(&s->common, n);
        }
    }

    return 0;
}
/* Block job driver for live commit; callbacks are defined above */
static const BlockJobDriver commit_job_driver = {
    .job_driver = {
        .instance_size = sizeof(CommitBlockJob),
        .job_type      = JOB_TYPE_COMMIT,
        .free          = block_job_free,
        .user_resume   = block_job_user_resume,
        .run           = commit_run,
        .prepare       = commit_prepare,
        .abort         = commit_abort,
        .clean         = commit_clean
    },
};
/* Reads on the commit_top filter are forwarded to its backing child */
static int coroutine_fn GRAPH_RDLOCK
bdrv_commit_top_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
                       QEMUIOVector *qiov, BdrvRequestFlags flags)
{
    return bdrv_co_preadv(bs->backing, offset, bytes, qiov, flags);
}
/* The filter reports its backing node's filename as its own */
static GRAPH_RDLOCK void bdrv_commit_top_refresh_filename(BlockDriverState *bs)
{
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
            bs->backing->bs->filename);
}
  201. static void bdrv_commit_top_child_perm(BlockDriverState *bs, BdrvChild *c,
  202. BdrvChildRole role,
  203. BlockReopenQueue *reopen_queue,
  204. uint64_t perm, uint64_t shared,
  205. uint64_t *nperm, uint64_t *nshared)
  206. {
  207. *nperm = 0;
  208. *nshared = BLK_PERM_ALL;
  209. }
/* Dummy node that provides consistent read to its users without requiring it
 * from its backing file and that allows writes on the backing file chain. */
static BlockDriver bdrv_commit_top = {
    .format_name                = "commit_top",
    .bdrv_co_preadv             = bdrv_commit_top_preadv,
    .bdrv_refresh_filename      = bdrv_commit_top_refresh_filename,
    .bdrv_child_perm            = bdrv_commit_top_child_perm,
    .is_filter                  = true,
    .filtered_child_is_backing  = true,
};
/*
 * Create and start a live commit job that merges 'top' (and all nodes down
 * to, but excluding, 'base') into 'base'.
 *
 * @job_id: ID for the new job (may be NULL for an auto-generated one --
 *          passed through to block_job_create())
 * @bs: active ("device") node the job is attached to
 * @base: node that receives the committed data; reopened r/w if necessary
 * @top: topmost node whose data is committed down
 * @creation_flags, @speed, @on_error: standard block job parameters
 * @backing_file_str: value to write as the new backing file string when the
 *                    intermediate nodes are dropped
 * @filter_node_name: node name for the inserted commit_top filter; if NULL
 *                    the filter is marked implicit
 * @errp: set on failure; on failure no job exists and the graph is restored
 */
void commit_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *base, BlockDriverState *top,
                  int creation_flags, int64_t speed,
                  BlockdevOnError on_error, const char *backing_file_str,
                  bool backing_mask_protocol,
                  const char *filter_node_name, Error **errp)
{
    CommitBlockJob *s;
    BlockDriverState *iter;
    BlockDriverState *commit_top_bs = NULL;
    BlockDriverState *filtered_base;
    int64_t base_size, top_size;
    uint64_t base_perms, iter_shared_perms;
    int ret;

    GLOBAL_STATE_CODE();

    assert(top != bs);
    bdrv_graph_rdlock_main_loop();
    if (bdrv_skip_filters(top) == bdrv_skip_filters(base)) {
        error_setg(errp, "Invalid files for merge: top and base are the same");
        bdrv_graph_rdunlock_main_loop();
        return;
    }
    bdrv_graph_rdunlock_main_loop();

    base_size = bdrv_getlength(base);
    if (base_size < 0) {
        error_setg_errno(errp, -base_size, "Could not inquire base image size");
        return;
    }

    top_size = bdrv_getlength(top);
    if (top_size < 0) {
        error_setg_errno(errp, -top_size, "Could not inquire top image size");
        return;
    }

    /* RESIZE is only needed if top's data may not fit into base yet */
    base_perms = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE;
    if (base_size < top_size) {
        base_perms |= BLK_PERM_RESIZE;
    }

    s = block_job_create(job_id, &commit_job_driver, NULL, bs, 0, BLK_PERM_ALL,
                         speed, creation_flags, NULL, NULL, errp);
    if (!s) {
        return;
    }

    /* convert base to r/w, if necessary */
    s->base_read_only = bdrv_is_read_only(base);
    if (s->base_read_only) {
        if (bdrv_reopen_set_read_only(base, false, errp) != 0) {
            goto fail;
        }
    }

    /* Insert commit_top block node above top, so we can block consistent read
     * on the backing chain below it */
    commit_top_bs = bdrv_new_open_driver(&bdrv_commit_top, filter_node_name, 0,
                                         errp);
    if (commit_top_bs == NULL) {
        goto fail;
    }
    if (!filter_node_name) {
        commit_top_bs->implicit = true;
    }

    /* So that we can always drop this node */
    commit_top_bs->never_freeze = true;

    commit_top_bs->total_sectors = top->total_sectors;

    ret = bdrv_append(commit_top_bs, top, errp);
    bdrv_unref(commit_top_bs); /* referenced by new parents or failed */
    if (ret < 0) {
        commit_top_bs = NULL;
        goto fail;
    }

    s->commit_top_bs = commit_top_bs;

    /*
     * Block all nodes between top and base, because they will
     * disappear from the chain after this operation.
     * Note that this assumes that the user is fine with removing all
     * nodes (including R/W filters) between top and base. Assuring
     * this is the responsibility of the interface (i.e. whoever calls
     * commit_start()).
     */
    bdrv_graph_wrlock();
    s->base_overlay = bdrv_find_overlay(top, base);
    assert(s->base_overlay);

    /*
     * The topmost node with
     * bdrv_skip_filters(filtered_base) == bdrv_skip_filters(base)
     */
    filtered_base = bdrv_cow_bs(s->base_overlay);
    assert(bdrv_skip_filters(filtered_base) == bdrv_skip_filters(base));

    /*
     * XXX BLK_PERM_WRITE needs to be allowed so we don't block ourselves
     * at s->base (if writes are blocked for a node, they are also blocked
     * for its backing file). The other options would be a second filter
     * driver above s->base.
     */
    iter_shared_perms = BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE;

    for (iter = top; iter != base; iter = bdrv_filter_or_cow_bs(iter)) {
        if (iter == filtered_base) {
            /*
             * From here on, all nodes are filters on the base. This
             * allows us to share BLK_PERM_CONSISTENT_READ.
             */
            iter_shared_perms |= BLK_PERM_CONSISTENT_READ;
        }

        ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
                                 iter_shared_perms, errp);
        if (ret < 0) {
            bdrv_graph_wrunlock();
            goto fail;
        }
    }

    if (bdrv_freeze_backing_chain(commit_top_bs, base, errp) < 0) {
        bdrv_graph_wrunlock();
        goto fail;
    }
    s->chain_frozen = true;

    ret = block_job_add_bdrv(&s->common, "base", base, 0, BLK_PERM_ALL, errp);
    bdrv_graph_wrunlock();

    if (ret < 0) {
        goto fail;
    }

    s->base = blk_new(s->common.job.aio_context,
                      base_perms,
                      BLK_PERM_CONSISTENT_READ
                      | BLK_PERM_WRITE_UNCHANGED);
    ret = blk_insert_bs(s->base, base, errp);
    if (ret < 0) {
        goto fail;
    }
    blk_set_disable_request_queuing(s->base, true);
    s->base_bs = base;

    /* Required permissions are already taken with block_job_add_bdrv() */
    s->top = blk_new(s->common.job.aio_context, 0, BLK_PERM_ALL);
    ret = blk_insert_bs(s->top, top, errp);
    if (ret < 0) {
        goto fail;
    }
    blk_set_disable_request_queuing(s->top, true);

    s->backing_file_str = g_strdup(backing_file_str);
    s->backing_mask_protocol = backing_mask_protocol;
    s->on_error = on_error;

    trace_commit_start(bs, base, top, s);
    job_start(&s->common.job);
    return;

fail:
    /* Undo everything done so far, in reverse order of acquisition */
    if (s->chain_frozen) {
        bdrv_graph_rdlock_main_loop();
        bdrv_unfreeze_backing_chain(commit_top_bs, base);
        bdrv_graph_rdunlock_main_loop();
    }
    if (s->base) {
        blk_unref(s->base);
    }
    if (s->top) {
        blk_unref(s->top);
    }
    if (s->base_read_only) {
        bdrv_reopen_set_read_only(base, true, NULL);
    }
    job_early_fail(&s->common.job);
    /* commit_top_bs has to be replaced after deleting the block job,
     * otherwise this would fail because of lack of permissions. */
    if (commit_top_bs) {
        bdrv_drained_begin(top);
        bdrv_graph_wrlock();
        bdrv_replace_node(commit_top_bs, top, &error_abort);
        bdrv_graph_wrunlock();
        bdrv_drained_end(top);
    }
}
#define COMMIT_BUF_SIZE (2048 * BDRV_SECTOR_SIZE)

/* commit COW file into the raw image */
/*
 * Synchronous, offline commit of @bs into its immediate COW backing file.
 * Temporarily inserts a commit_top filter between @bs and its backing file
 * so writes to the backing file are possible, copies all allocated data
 * down, empties @bs, and restores the original graph and read-only state.
 * Returns 0 on success or a negative errno.
 */
int bdrv_commit(BlockDriverState *bs)
{
    BlockBackend *src, *backing;
    BlockDriverState *backing_file_bs = NULL;
    BlockDriverState *commit_top_bs = NULL;
    BlockDriver *drv = bs->drv;
    AioContext *ctx;
    int64_t offset, length, backing_length;
    int ro;
    int64_t n;
    int ret = 0;
    QEMU_AUTO_VFREE uint8_t *buf = NULL;
    Error *local_err = NULL;

    GLOBAL_STATE_CODE();
    GRAPH_RDLOCK_GUARD_MAINLOOP();

    if (!drv)
        return -ENOMEDIUM;

    backing_file_bs = bdrv_cow_bs(bs);

    if (!backing_file_bs) {
        return -ENOTSUP;
    }

    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT_SOURCE, NULL) ||
        bdrv_op_is_blocked(backing_file_bs, BLOCK_OP_TYPE_COMMIT_TARGET, NULL))
    {
        return -EBUSY;
    }

    /* Remember the r/o state so it can be restored in the cleanup path */
    ro = bdrv_is_read_only(backing_file_bs);

    if (ro) {
        if (bdrv_reopen_set_read_only(backing_file_bs, false, NULL)) {
            return -EACCES;
        }
    }

    ctx = bdrv_get_aio_context(bs);
    /* WRITE_UNCHANGED is required for bdrv_make_empty() */
    src = blk_new(ctx, BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED,
                  BLK_PERM_ALL);
    backing = blk_new(ctx, BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);

    ret = blk_insert_bs(src, bs, &local_err);
    if (ret < 0) {
        error_report_err(local_err);
        goto ro_cleanup;
    }

    /* Insert commit_top block node above backing, so we can write to it */
    commit_top_bs = bdrv_new_open_driver(&bdrv_commit_top, NULL, BDRV_O_RDWR,
                                         &local_err);
    if (commit_top_bs == NULL) {
        error_report_err(local_err);
        goto ro_cleanup;
    }

    bdrv_set_backing_hd(commit_top_bs, backing_file_bs, &error_abort);
    bdrv_set_backing_hd(bs, commit_top_bs, &error_abort);

    ret = blk_insert_bs(backing, backing_file_bs, &local_err);
    if (ret < 0) {
        error_report_err(local_err);
        goto ro_cleanup;
    }

    length = blk_getlength(src);
    if (length < 0) {
        ret = length;
        goto ro_cleanup;
    }

    backing_length = blk_getlength(backing);
    if (backing_length < 0) {
        ret = backing_length;
        goto ro_cleanup;
    }

    /* If our top snapshot is larger than the backing file image,
     * grow the backing file image if possible. If not possible,
     * we must return an error */
    if (length > backing_length) {
        ret = blk_truncate(backing, length, false, PREALLOC_MODE_OFF, 0,
                           &local_err);
        if (ret < 0) {
            error_report_err(local_err);
            goto ro_cleanup;
        }
    }

    /* blk_try_blockalign() for src will choose an alignment that works for
     * backing as well, so no need to compare the alignment manually. */
    buf = blk_try_blockalign(src, COMMIT_BUF_SIZE);
    if (buf == NULL) {
        ret = -ENOMEM;
        goto ro_cleanup;
    }

    /* Copy every allocated window of @bs down into the backing file */
    for (offset = 0; offset < length; offset += n) {
        ret = bdrv_is_allocated(bs, offset, COMMIT_BUF_SIZE, &n);
        if (ret < 0) {
            goto ro_cleanup;
        }
        if (ret) {
            ret = blk_pread(src, offset, n, buf, 0);
            if (ret < 0) {
                goto ro_cleanup;
            }

            ret = blk_pwrite(backing, offset, n, buf, 0);
            if (ret < 0) {
                goto ro_cleanup;
            }
        }
    }

    ret = blk_make_empty(src, NULL);
    /* Ignore -ENOTSUP */
    if (ret < 0 && ret != -ENOTSUP) {
        goto ro_cleanup;
    }

    blk_flush(src);

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    blk_flush(backing);

    ret = 0;
ro_cleanup:
    blk_unref(backing);
    /* Remove the commit_top filter again if it is still in the chain */
    if (bdrv_cow_bs(bs) != backing_file_bs) {
        bdrv_set_backing_hd(bs, backing_file_bs, &error_abort);
    }
    bdrv_unref(commit_top_bs);
    blk_unref(src);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen_set_read_only(backing_file_bs, true, NULL);
    }

    return ret;
}