/* block/stream.c */
  1. /*
  2. * Image streaming
  3. *
  4. * Copyright IBM, Corp. 2011
  5. *
  6. * Authors:
  7. * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
  8. *
  9. * This work is licensed under the terms of the GNU LGPL, version 2 or later.
  10. * See the COPYING.LIB file in the top-level directory.
  11. *
  12. */
  13. #include "qemu/osdep.h"
  14. #include "trace.h"
  15. #include "block/block_int.h"
  16. #include "block/blockjob_int.h"
  17. #include "qapi/error.h"
  18. #include "qapi/qmp/qdict.h"
  19. #include "qemu/ratelimit.h"
  20. #include "sysemu/block-backend.h"
  21. #include "block/copy-on-read.h"
enum {
    /*
     * Maximum chunk size to feed to copy-on-read. This should be
     * large enough to process multiple clusters in a single call, so
     * that populating contiguous regions of the image is efficient.
     */
    STREAM_CHUNK = 512 * 1024, /* in bytes */
};
/* Per-job state for an image-streaming block job. */
typedef struct StreamBlockJob {
    BlockJob common;                /* generic block-job state; embedded so
                                     * container_of() can recover this struct
                                     * from a Job* in the driver callbacks */
    BlockBackend *blk;              /* backend used by stream_populate() for
                                     * the copy-on-read prefetch reads */
    BlockDriverState *base_overlay; /* COW overlay (stream from this) */
    BlockDriverState *above_base;   /* Node directly above the base */
    BlockDriverState *cor_filter_bs; /* implicit copy-on-read filter; dropped
                                      * in stream_prepare()/stream_clean() */
    BlockDriverState *target_bs;    /* node whose backing chain is streamed */
    BlockdevOnError on_error;       /* policy applied on I/O errors */
    char *backing_file_str;         /* backing-file string to record in the
                                     * image header; owned, freed in clean */
    bool backing_mask_protocol;     /* if true, record "raw" instead of a
                                     * protocol driver name as backing format */
    bool bs_read_only;              /* target was read-only before the job;
                                     * restore that mode in stream_clean() */
} StreamBlockJob;
  42. static int coroutine_fn stream_populate(BlockBackend *blk,
  43. int64_t offset, uint64_t bytes)
  44. {
  45. assert(bytes < SIZE_MAX);
  46. return blk_co_preadv(blk, offset, bytes, NULL, BDRV_REQ_PREFETCH);
  47. }
/*
 * .prepare callback of the stream job: complete the graph change.
 *
 * Drops the copy-on-read filter, re-parents the unfiltered target node
 * onto the (filtered) base, and records the new backing file string in
 * the image header.  Runs in the main loop (GLOBAL_STATE_CODE()).
 *
 * Returns 0 on success or a negative errno value on failure.
 */
static int stream_prepare(Job *job)
{
    StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);
    BlockDriverState *unfiltered_bs;
    BlockDriverState *unfiltered_bs_cow;
    BlockDriverState *base;
    BlockDriverState *unfiltered_base;
    Error *local_err = NULL;
    int ret = 0;

    GLOBAL_STATE_CODE();

    bdrv_graph_rdlock_main_loop();
    unfiltered_bs = bdrv_skip_filters(s->target_bs);
    unfiltered_bs_cow = bdrv_cow_bs(unfiltered_bs);
    bdrv_graph_rdunlock_main_loop();

    /*
     * We should drop the filter at this point, as the filter holds the
     * backing chain.
     */
    bdrv_cor_filter_drop(s->cor_filter_bs);
    s->cor_filter_bs = NULL;

    /*
     * bdrv_set_backing_hd() requires that the unfiltered_bs and the COW child
     * of unfiltered_bs is drained. Drain already here and use
     * bdrv_set_backing_hd_drained() instead because the polling during
     * drained_begin() might change the graph, and if we do this only later, we
     * may end up working with the wrong base node (or it might even have gone
     * away by the time we want to use it).
     */
    bdrv_drained_begin(unfiltered_bs);
    if (unfiltered_bs_cow) {
        /* Keep the COW child alive across the drained section */
        bdrv_ref(unfiltered_bs_cow);
        bdrv_drained_begin(unfiltered_bs_cow);
    }

    /* Re-query the base only now that the graph can no longer change */
    bdrv_graph_rdlock_main_loop();
    base = bdrv_filter_or_cow_bs(s->above_base);
    unfiltered_base = bdrv_skip_filters(base);
    bdrv_graph_rdunlock_main_loop();

    if (unfiltered_bs_cow) {
        const char *base_id = NULL, *base_fmt = NULL;
        if (unfiltered_base) {
            /* Caller-supplied string wins over the base node's filename */
            base_id = s->backing_file_str ?: unfiltered_base->filename;
            if (unfiltered_base->drv) {
                if (s->backing_mask_protocol &&
                    unfiltered_base->drv->protocol_name) {
                    /* Hide protocol drivers behind "raw" in the header */
                    base_fmt = "raw";
                } else {
                    base_fmt = unfiltered_base->drv->format_name;
                }
            }
        }

        bdrv_graph_wrlock();
        bdrv_set_backing_hd_drained(unfiltered_bs, base, &local_err);
        bdrv_graph_wrunlock();

        /*
         * This call will do I/O, so the graph can change again from here on.
         * We have already completed the graph change, so we are not in danger
         * of operating on the wrong node any more if this happens.
         */
        ret = bdrv_change_backing_file(unfiltered_bs, base_id, base_fmt, false);
        if (local_err) {
            error_report_err(local_err);
            ret = -EPERM;
            goto out;
        }
    }

out:
    if (unfiltered_bs_cow) {
        bdrv_drained_end(unfiltered_bs_cow);
        bdrv_unref(unfiltered_bs_cow);
    }
    bdrv_drained_end(unfiltered_bs);
    return ret;
}
/*
 * .clean callback of the stream job: release resources on any exit path.
 *
 * Drops the copy-on-read filter if stream_prepare() did not already do
 * so (e.g. on cancel/error), releases the BlockBackend, restores the
 * target's original read-only mode, and frees the backing-file string.
 */
static void stream_clean(Job *job)
{
    StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);

    /* NULL after stream_prepare() succeeded; non-NULL on early failure */
    if (s->cor_filter_bs) {
        bdrv_cor_filter_drop(s->cor_filter_bs);
        s->cor_filter_bs = NULL;
    }

    blk_unref(s->blk);
    s->blk = NULL;

    /* Reopen the image back in read-only mode if necessary */
    if (s->bs_read_only) {
        /* Give up write permissions before making it read-only */
        bdrv_reopen_set_read_only(s->target_bs, true, NULL);
    }

    g_free(s->backing_file_str);
}
/*
 * .run callback: the coroutine main loop of the stream job.
 *
 * Walks the target image in STREAM_CHUNK-sized steps and prefetches every
 * range that is allocated in an intermediate image (between the top and
 * base_overlay) but not in the top, so the top image becomes independent
 * of the intermediate chain.
 *
 * Returns 0 on success/cancel, or the first error that the configured
 * on_error policy did not recover from.
 */
static int coroutine_fn stream_run(Job *job, Error **errp)
{
    StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);
    BlockDriverState *unfiltered_bs = NULL;
    int64_t len = -1;
    int64_t offset = 0;
    int error = 0;          /* first ignored error, returned at the end */
    int64_t n = 0; /* bytes */

    WITH_GRAPH_RDLOCK_GUARD() {
        unfiltered_bs = bdrv_skip_filters(s->target_bs);
        if (unfiltered_bs == s->base_overlay) {
            /* Nothing to stream */
            return 0;
        }

        len = bdrv_co_getlength(s->target_bs);
        if (len < 0) {
            return len;
        }
    }
    job_progress_set_remaining(&s->common.job, len);

    for ( ; offset < len; offset += n) {
        bool copy;
        int ret = -1;

        /* Note that even when no rate limit is applied we need to yield
         * with no pending I/O here so that bdrv_drain_all() returns.
         */
        block_job_ratelimit_sleep(&s->common);
        if (job_is_cancelled(&s->common.job)) {
            break;
        }

        copy = false;

        WITH_GRAPH_RDLOCK_GUARD() {
            /* n is set to the number of bytes covered by the answer */
            ret = bdrv_co_is_allocated(unfiltered_bs, offset, STREAM_CHUNK, &n);
            if (ret == 1) {
                /* Allocated in the top, no need to copy. */
            } else if (ret >= 0) {
                /*
                 * Copy if allocated in the intermediate images. Limit to the
                 * known-unallocated area [offset, offset+n*BDRV_SECTOR_SIZE).
                 */
                ret = bdrv_co_is_allocated_above(bdrv_cow_bs(unfiltered_bs),
                                                 s->base_overlay, true,
                                                 offset, n, &n);
                /* Finish early if end of backing file has been reached */
                if (ret == 0 && n == 0) {
                    n = len - offset;
                }

                copy = (ret > 0);
            }
        }
        trace_stream_one_iteration(s, offset, n, ret);
        if (copy) {
            ret = stream_populate(s->blk, offset, n);
        }
        if (ret < 0) {
            BlockErrorAction action =
                block_job_error_action(&s->common, s->on_error, true, -ret);
            if (action == BLOCK_ERROR_ACTION_STOP) {
                /* Retry the same range after the job is resumed */
                n = 0;
                continue;
            }
            if (error == 0) {
                error = ret;
            }
            if (action == BLOCK_ERROR_ACTION_REPORT) {
                break;
            }
        }

        /* Publish progress */
        job_progress_update(&s->common.job, n);
        if (copy) {
            block_job_ratelimit_processed_bytes(&s->common, n);
        }
    }

    /* Do not remove the backing file if an error was there but ignored. */
    return error;
}
/* Callback table binding the stream job to the generic job machinery. */
static const BlockJobDriver stream_job_driver = {
    .job_driver = {
        .instance_size = sizeof(StreamBlockJob),
        .job_type      = JOB_TYPE_STREAM,
        .free          = block_job_free,
        .run           = stream_run,       /* coroutine main loop */
        .prepare       = stream_prepare,   /* graph change on completion */
        .clean         = stream_clean,     /* teardown on every exit path */
        .user_resume   = block_job_user_resume,
    },
};
/*
 * Create and start an image-streaming job on @bs.
 *
 * Exactly one of @base / @bottom (or neither) may be given: @base is the
 * legacy interface naming the first node NOT to stream, @bottom the new
 * interface naming the last node TO stream.  @backing_file_str optionally
 * overrides the backing-file string written to the image header on
 * completion and is only valid with @base.
 *
 * On success the job is started and owns its resources; on failure an
 * error is set in @errp and all partially-acquired resources are rolled
 * back (including re-establishing read-only mode on @bs if it was
 * temporarily reopened read-write).
 */
void stream_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *base, const char *backing_file_str,
                  bool backing_mask_protocol,
                  BlockDriverState *bottom,
                  int creation_flags, int64_t speed,
                  BlockdevOnError on_error,
                  const char *filter_node_name,
                  Error **errp)
{
    StreamBlockJob *s = NULL;
    BlockDriverState *iter;
    bool bs_read_only;
    int basic_flags = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED;
    BlockDriverState *base_overlay;
    BlockDriverState *cor_filter_bs = NULL;
    BlockDriverState *above_base;
    QDict *opts;
    int ret;

    GLOBAL_STATE_CODE();

    /* @base and @bottom are mutually exclusive interfaces */
    assert(!(base && bottom));
    assert(!(backing_file_str && bottom));

    bdrv_graph_rdlock_main_loop();
    if (bottom) {
        /*
         * New simple interface. The code is written in terms of old interface
         * with @base parameter (still, it doesn't freeze link to base, so in
         * this mean old code is correct for new interface). So, for now, just
         * emulate base_overlay and above_base. Still, when old interface
         * finally removed, we should refactor code to use only "bottom", but
         * not "*base*" things.
         */
        assert(!bottom->drv->is_filter);
        base_overlay = above_base = bottom;
    } else {
        base_overlay = bdrv_find_overlay(bs, base);
        if (!base_overlay) {
            error_setg(errp, "'%s' is not in the backing chain of '%s'",
                       base->node_name, bs->node_name);
            goto out_rdlock;
        }

        /*
         * Find the node directly above @base. @base_overlay is a COW overlay,
         * so it must have a bdrv_cow_child(), but it is the immediate overlay
         * of @base, so between the two there can only be filters.
         */
        above_base = base_overlay;
        if (bdrv_cow_bs(above_base) != base) {
            /* Walk down through the filters until the parent of @base */
            above_base = bdrv_cow_bs(above_base);
            while (bdrv_filter_bs(above_base) != base) {
                above_base = bdrv_filter_bs(above_base);
            }
        }
    }

    /* Make sure that the image is opened in read-write mode */
    bs_read_only = bdrv_is_read_only(bs);
    if (bs_read_only) {
        /* Hold the chain during reopen */
        if (bdrv_freeze_backing_chain(bs, above_base, errp) < 0) {
            goto out_rdlock;
        }

        ret = bdrv_reopen_set_read_only(bs, false, errp);

        /* failure, or cor-filter will hold the chain */
        bdrv_unfreeze_backing_chain(bs, above_base);

        if (ret < 0) {
            goto out_rdlock;
        }
    }
    bdrv_graph_rdunlock_main_loop();

    /*
     * Insert an implicit copy-on-read filter above @bs so that guest reads
     * during the job also populate the top image.
     */
    opts = qdict_new();

    qdict_put_str(opts, "driver", "copy-on-read");
    qdict_put_str(opts, "file", bdrv_get_node_name(bs));
    /* Pass the base_overlay node name as 'bottom' to COR driver */
    qdict_put_str(opts, "bottom", base_overlay->node_name);
    if (filter_node_name) {
        qdict_put_str(opts, "node-name", filter_node_name);
    }

    cor_filter_bs = bdrv_insert_node(bs, opts, BDRV_O_RDWR, errp);
    if (!cor_filter_bs) {
        goto fail;
    }

    /* Unnamed filters stay invisible to the user */
    if (!filter_node_name) {
        cor_filter_bs->implicit = true;
    }

    s = block_job_create(job_id, &stream_job_driver, NULL, cor_filter_bs,
                         0, BLK_PERM_ALL,
                         speed, creation_flags, NULL, NULL, errp);
    if (!s) {
        goto fail;
    }

    s->blk = blk_new_with_bs(cor_filter_bs, BLK_PERM_CONSISTENT_READ,
                             basic_flags | BLK_PERM_WRITE, errp);
    if (!s->blk) {
        goto fail;
    }
    /*
     * Disable request queuing in the BlockBackend to avoid deadlocks on drain:
     * The job reports that it's busy until it reaches a pause point.
     */
    blk_set_disable_request_queuing(s->blk, true);
    blk_set_allow_aio_context_change(s->blk, true);

    /*
     * Prevent concurrent jobs trying to modify the graph structure here, we
     * already have our own plans. Also don't allow resize as the image size is
     * queried only at the job start and then cached.
     */
    bdrv_graph_wrlock();
    if (block_job_add_bdrv(&s->common, "active node", bs, 0,
                           basic_flags | BLK_PERM_WRITE, errp)) {
        bdrv_graph_wrunlock();
        goto fail;
    }

    /* Block all intermediate nodes between bs and base, because they will
     * disappear from the chain after this operation. The streaming job reads
     * every block only once, assuming that it doesn't change, so forbid writes
     * and resizes. Reassign the base node pointer because the backing BS of the
     * above_base node might change after the call to
     * bdrv_reopen_set_read_only() due to parallel block jobs running.
     */
    base = bdrv_filter_or_cow_bs(above_base);
    for (iter = bdrv_filter_or_cow_bs(bs); iter != base;
         iter = bdrv_filter_or_cow_bs(iter))
    {
        ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
                                 basic_flags, errp);
        if (ret < 0) {
            bdrv_graph_wrunlock();
            goto fail;
        }
    }
    bdrv_graph_wrunlock();

    s->base_overlay = base_overlay;
    s->above_base = above_base;
    s->backing_file_str = g_strdup(backing_file_str);
    s->backing_mask_protocol = backing_mask_protocol;
    s->cor_filter_bs = cor_filter_bs;
    s->target_bs = bs;
    s->bs_read_only = bs_read_only;

    s->on_error = on_error;
    trace_stream_start(bs, base, s);
    job_start(&s->common.job);
    return;

fail:
    if (s) {
        job_early_fail(&s->common.job);
    }
    if (cor_filter_bs) {
        bdrv_cor_filter_drop(cor_filter_bs);
    }
    if (bs_read_only) {
        /* Restore the original read-only mode on rollback */
        bdrv_reopen_set_read_only(bs, true, NULL);
    }
    return;

out_rdlock:
    bdrv_graph_rdunlock_main_loop();
}