replication.c 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750
  1. /*
  2. * Replication Block filter
  3. *
  4. * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
  5. * Copyright (c) 2016 Intel Corporation
  6. * Copyright (c) 2016 FUJITSU LIMITED
  7. *
  8. * Author:
  9. * Wen Congyang <wency@cn.fujitsu.com>
  10. *
  11. * This work is licensed under the terms of the GNU GPL, version 2 or later.
  12. * See the COPYING file in the top-level directory.
  13. */
  14. #include "qemu/osdep.h"
  15. #include "qemu/module.h"
  16. #include "qemu/option.h"
  17. #include "block/nbd.h"
  18. #include "block/blockjob.h"
  19. #include "block/block_int.h"
  20. #include "block/block_backup.h"
  21. #include "system/block-backend.h"
  22. #include "qapi/error.h"
  23. #include "qapi/qmp/qdict.h"
  24. #include "block/replication.h"
  25. typedef enum {
  26. BLOCK_REPLICATION_NONE, /* block replication is not started */
  27. BLOCK_REPLICATION_RUNNING, /* block replication is running */
  28. BLOCK_REPLICATION_FAILOVER, /* failover is running in background */
  29. BLOCK_REPLICATION_FAILOVER_FAILED, /* failover failed */
  30. BLOCK_REPLICATION_DONE, /* block replication is done */
  31. } ReplicationStage;
  32. typedef struct BDRVReplicationState {
  33. ReplicationMode mode;
  34. ReplicationStage stage;
  35. BlockJob *commit_job;
  36. BdrvChild *hidden_disk;
  37. BdrvChild *secondary_disk;
  38. BlockJob *backup_job;
  39. char *top_id;
  40. ReplicationState *rs;
  41. Error *blocker;
  42. bool orig_hidden_read_only;
  43. bool orig_secondary_read_only;
  44. int error;
  45. } BDRVReplicationState;
  46. static void replication_start(ReplicationState *rs, ReplicationMode mode,
  47. Error **errp);
  48. static void replication_do_checkpoint(ReplicationState *rs, Error **errp);
  49. static void replication_get_error(ReplicationState *rs, Error **errp);
  50. static void replication_stop(ReplicationState *rs, bool failover,
  51. Error **errp);
  52. #define REPLICATION_MODE "mode"
  53. #define REPLICATION_TOP_ID "top-id"
  54. static QemuOptsList replication_runtime_opts = {
  55. .name = "replication",
  56. .head = QTAILQ_HEAD_INITIALIZER(replication_runtime_opts.head),
  57. .desc = {
  58. {
  59. .name = REPLICATION_MODE,
  60. .type = QEMU_OPT_STRING,
  61. },
  62. {
  63. .name = REPLICATION_TOP_ID,
  64. .type = QEMU_OPT_STRING,
  65. },
  66. { /* end of list */ }
  67. },
  68. };
  69. static ReplicationOps replication_ops = {
  70. .start = replication_start,
  71. .checkpoint = replication_do_checkpoint,
  72. .get_error = replication_get_error,
  73. .stop = replication_stop,
  74. };
  75. static int replication_open(BlockDriverState *bs, QDict *options,
  76. int flags, Error **errp)
  77. {
  78. int ret;
  79. BDRVReplicationState *s = bs->opaque;
  80. QemuOpts *opts = NULL;
  81. const char *mode;
  82. const char *top_id;
  83. ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
  84. if (ret < 0) {
  85. return ret;
  86. }
  87. ret = -EINVAL;
  88. opts = qemu_opts_create(&replication_runtime_opts, NULL, 0, &error_abort);
  89. if (!qemu_opts_absorb_qdict(opts, options, errp)) {
  90. goto fail;
  91. }
  92. mode = qemu_opt_get(opts, REPLICATION_MODE);
  93. if (!mode) {
  94. error_setg(errp, "Missing the option mode");
  95. goto fail;
  96. }
  97. if (!strcmp(mode, "primary")) {
  98. s->mode = REPLICATION_MODE_PRIMARY;
  99. top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
  100. if (top_id) {
  101. error_setg(errp,
  102. "The primary side does not support option top-id");
  103. goto fail;
  104. }
  105. } else if (!strcmp(mode, "secondary")) {
  106. s->mode = REPLICATION_MODE_SECONDARY;
  107. top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
  108. s->top_id = g_strdup(top_id);
  109. if (!s->top_id) {
  110. error_setg(errp, "Missing the option top-id");
  111. goto fail;
  112. }
  113. } else {
  114. error_setg(errp,
  115. "The option mode's value should be primary or secondary");
  116. goto fail;
  117. }
  118. s->rs = replication_new(bs, &replication_ops);
  119. ret = 0;
  120. fail:
  121. qemu_opts_del(opts);
  122. return ret;
  123. }
  124. static void replication_close(BlockDriverState *bs)
  125. {
  126. BDRVReplicationState *s = bs->opaque;
  127. Job *commit_job;
  128. GLOBAL_STATE_CODE();
  129. if (s->stage == BLOCK_REPLICATION_RUNNING) {
  130. replication_stop(s->rs, false, NULL);
  131. }
  132. if (s->stage == BLOCK_REPLICATION_FAILOVER) {
  133. commit_job = &s->commit_job->job;
  134. assert(commit_job->aio_context == qemu_get_current_aio_context());
  135. job_cancel_sync(commit_job, false);
  136. }
  137. if (s->mode == REPLICATION_MODE_SECONDARY) {
  138. g_free(s->top_id);
  139. }
  140. replication_remove(s->rs);
  141. }
  142. static void replication_child_perm(BlockDriverState *bs, BdrvChild *c,
  143. BdrvChildRole role,
  144. BlockReopenQueue *reopen_queue,
  145. uint64_t perm, uint64_t shared,
  146. uint64_t *nperm, uint64_t *nshared)
  147. {
  148. if (role & BDRV_CHILD_PRIMARY) {
  149. *nperm = BLK_PERM_CONSISTENT_READ;
  150. } else {
  151. *nperm = 0;
  152. }
  153. if ((bs->open_flags & (BDRV_O_INACTIVE | BDRV_O_RDWR)) == BDRV_O_RDWR) {
  154. *nperm |= BLK_PERM_WRITE;
  155. }
  156. *nshared = BLK_PERM_CONSISTENT_READ
  157. | BLK_PERM_WRITE
  158. | BLK_PERM_WRITE_UNCHANGED;
  159. return;
  160. }
  161. static int64_t coroutine_fn GRAPH_RDLOCK
  162. replication_co_getlength(BlockDriverState *bs)
  163. {
  164. return bdrv_co_getlength(bs->file->bs);
  165. }
  166. static int replication_get_io_status(BDRVReplicationState *s)
  167. {
  168. switch (s->stage) {
  169. case BLOCK_REPLICATION_NONE:
  170. return -EIO;
  171. case BLOCK_REPLICATION_RUNNING:
  172. return 0;
  173. case BLOCK_REPLICATION_FAILOVER:
  174. return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 0;
  175. case BLOCK_REPLICATION_FAILOVER_FAILED:
  176. return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 1;
  177. case BLOCK_REPLICATION_DONE:
  178. /*
  179. * active commit job completes, and active disk and secondary_disk
  180. * is swapped, so we can operate bs->file directly
  181. */
  182. return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 0;
  183. default:
  184. abort();
  185. }
  186. }
  187. static int replication_return_value(BDRVReplicationState *s, int ret)
  188. {
  189. if (s->mode == REPLICATION_MODE_SECONDARY) {
  190. return ret;
  191. }
  192. if (ret < 0) {
  193. s->error = ret;
  194. ret = 0;
  195. }
  196. return ret;
  197. }
  198. static int coroutine_fn GRAPH_RDLOCK
  199. replication_co_readv(BlockDriverState *bs, int64_t sector_num,
  200. int remaining_sectors, QEMUIOVector *qiov)
  201. {
  202. BDRVReplicationState *s = bs->opaque;
  203. int ret;
  204. if (s->mode == REPLICATION_MODE_PRIMARY) {
  205. /* We only use it to forward primary write requests */
  206. return -EIO;
  207. }
  208. ret = replication_get_io_status(s);
  209. if (ret < 0) {
  210. return ret;
  211. }
  212. ret = bdrv_co_preadv(bs->file, sector_num * BDRV_SECTOR_SIZE,
  213. remaining_sectors * BDRV_SECTOR_SIZE, qiov, 0);
  214. return replication_return_value(s, ret);
  215. }
  216. static int coroutine_fn GRAPH_RDLOCK
  217. replication_co_writev(BlockDriverState *bs, int64_t sector_num,
  218. int remaining_sectors, QEMUIOVector *qiov, int flags)
  219. {
  220. BDRVReplicationState *s = bs->opaque;
  221. QEMUIOVector hd_qiov;
  222. uint64_t bytes_done = 0;
  223. BdrvChild *top = bs->file;
  224. BdrvChild *base = s->secondary_disk;
  225. BdrvChild *target;
  226. int ret;
  227. int64_t n;
  228. ret = replication_get_io_status(s);
  229. if (ret < 0) {
  230. goto out;
  231. }
  232. if (ret == 0) {
  233. ret = bdrv_co_pwritev(top, sector_num * BDRV_SECTOR_SIZE,
  234. remaining_sectors * BDRV_SECTOR_SIZE, qiov, 0);
  235. return replication_return_value(s, ret);
  236. }
  237. /*
  238. * Failover failed, only write to active disk if the sectors
  239. * have already been allocated in active disk/hidden disk.
  240. */
  241. qemu_iovec_init(&hd_qiov, qiov->niov);
  242. while (remaining_sectors > 0) {
  243. int64_t count;
  244. ret = bdrv_co_is_allocated_above(top->bs, base->bs, false,
  245. sector_num * BDRV_SECTOR_SIZE,
  246. remaining_sectors * BDRV_SECTOR_SIZE,
  247. &count);
  248. if (ret < 0) {
  249. goto out1;
  250. }
  251. assert(QEMU_IS_ALIGNED(count, BDRV_SECTOR_SIZE));
  252. n = count >> BDRV_SECTOR_BITS;
  253. qemu_iovec_reset(&hd_qiov);
  254. qemu_iovec_concat(&hd_qiov, qiov, bytes_done, count);
  255. target = ret ? top : base;
  256. ret = bdrv_co_pwritev(target, sector_num * BDRV_SECTOR_SIZE,
  257. n * BDRV_SECTOR_SIZE, &hd_qiov, 0);
  258. if (ret < 0) {
  259. goto out1;
  260. }
  261. remaining_sectors -= n;
  262. sector_num += n;
  263. bytes_done += count;
  264. }
  265. out1:
  266. qemu_iovec_destroy(&hd_qiov);
  267. out:
  268. return ret;
  269. }
  270. static void GRAPH_UNLOCKED
  271. secondary_do_checkpoint(BlockDriverState *bs, Error **errp)
  272. {
  273. BDRVReplicationState *s = bs->opaque;
  274. BdrvChild *active_disk;
  275. Error *local_err = NULL;
  276. int ret;
  277. GRAPH_RDLOCK_GUARD_MAINLOOP();
  278. if (!s->backup_job) {
  279. error_setg(errp, "Backup job was cancelled unexpectedly");
  280. return;
  281. }
  282. backup_do_checkpoint(s->backup_job, &local_err);
  283. if (local_err) {
  284. error_propagate(errp, local_err);
  285. return;
  286. }
  287. active_disk = bs->file;
  288. if (!active_disk->bs->drv) {
  289. error_setg(errp, "Active disk %s is ejected",
  290. active_disk->bs->node_name);
  291. return;
  292. }
  293. ret = bdrv_make_empty(active_disk, errp);
  294. if (ret < 0) {
  295. return;
  296. }
  297. if (!s->hidden_disk->bs->drv) {
  298. error_setg(errp, "Hidden disk %s is ejected",
  299. s->hidden_disk->bs->node_name);
  300. return;
  301. }
  302. ret = bdrv_make_empty(s->hidden_disk, errp);
  303. if (ret < 0) {
  304. return;
  305. }
  306. }
  307. /* This function is supposed to be called twice:
  308. * first with writable = true, then with writable = false.
  309. * The first call puts s->hidden_disk and s->secondary_disk in
  310. * r/w mode, and the second puts them back in their original state.
  311. */
  312. static void reopen_backing_file(BlockDriverState *bs, bool writable,
  313. Error **errp)
  314. {
  315. BDRVReplicationState *s = bs->opaque;
  316. BdrvChild *hidden_disk, *secondary_disk;
  317. BlockReopenQueue *reopen_queue = NULL;
  318. GLOBAL_STATE_CODE();
  319. GRAPH_RDLOCK_GUARD_MAINLOOP();
  320. /*
  321. * s->hidden_disk and s->secondary_disk may not be set yet, as they will
  322. * only be set after the children are writable.
  323. */
  324. hidden_disk = bs->file->bs->backing;
  325. secondary_disk = hidden_disk->bs->backing;
  326. if (writable) {
  327. s->orig_hidden_read_only = bdrv_is_read_only(hidden_disk->bs);
  328. s->orig_secondary_read_only = bdrv_is_read_only(secondary_disk->bs);
  329. }
  330. if (s->orig_hidden_read_only) {
  331. QDict *opts = qdict_new();
  332. qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
  333. reopen_queue = bdrv_reopen_queue(reopen_queue, hidden_disk->bs,
  334. opts, true);
  335. }
  336. if (s->orig_secondary_read_only) {
  337. QDict *opts = qdict_new();
  338. qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
  339. reopen_queue = bdrv_reopen_queue(reopen_queue, secondary_disk->bs,
  340. opts, true);
  341. }
  342. if (reopen_queue) {
  343. bdrv_reopen_multiple(reopen_queue, errp);
  344. }
  345. }
  346. static void backup_job_cleanup(BlockDriverState *bs)
  347. {
  348. BDRVReplicationState *s = bs->opaque;
  349. BlockDriverState *top_bs;
  350. s->backup_job = NULL;
  351. top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
  352. if (!top_bs) {
  353. return;
  354. }
  355. bdrv_op_unblock_all(top_bs, s->blocker);
  356. error_free(s->blocker);
  357. reopen_backing_file(bs, false, NULL);
  358. }
  359. static void backup_job_completed(void *opaque, int ret)
  360. {
  361. BlockDriverState *bs = opaque;
  362. BDRVReplicationState *s = bs->opaque;
  363. if (s->stage != BLOCK_REPLICATION_FAILOVER) {
  364. /* The backup job is cancelled unexpectedly */
  365. s->error = -EIO;
  366. }
  367. backup_job_cleanup(bs);
  368. }
  369. static bool GRAPH_RDLOCK
  370. check_top_bs(BlockDriverState *top_bs, BlockDriverState *bs)
  371. {
  372. BdrvChild *child;
  373. /* The bs itself is the top_bs */
  374. if (top_bs == bs) {
  375. return true;
  376. }
  377. /* Iterate over top_bs's children */
  378. QLIST_FOREACH(child, &top_bs->children, next) {
  379. if (child->bs == bs || check_top_bs(child->bs, bs)) {
  380. return true;
  381. }
  382. }
  383. return false;
  384. }
  385. static void replication_start(ReplicationState *rs, ReplicationMode mode,
  386. Error **errp)
  387. {
  388. BlockDriverState *bs = rs->opaque;
  389. BDRVReplicationState *s;
  390. BlockDriverState *top_bs;
  391. BdrvChild *active_disk, *hidden_disk, *secondary_disk;
  392. int64_t active_length, hidden_length, disk_length;
  393. Error *local_err = NULL;
  394. BackupPerf perf = { .use_copy_range = true, .max_workers = 1 };
  395. GLOBAL_STATE_CODE();
  396. s = bs->opaque;
  397. if (s->stage == BLOCK_REPLICATION_DONE ||
  398. s->stage == BLOCK_REPLICATION_FAILOVER) {
  399. /*
  400. * This case happens when a secondary is promoted to primary.
  401. * Ignore the request because the secondary side of replication
  402. * doesn't have to do anything anymore.
  403. */
  404. return;
  405. }
  406. if (s->stage != BLOCK_REPLICATION_NONE) {
  407. error_setg(errp, "Block replication is running or done");
  408. return;
  409. }
  410. if (s->mode != mode) {
  411. error_setg(errp, "The parameter mode's value is invalid, needs %d,"
  412. " but got %d", s->mode, mode);
  413. return;
  414. }
  415. switch (s->mode) {
  416. case REPLICATION_MODE_PRIMARY:
  417. break;
  418. case REPLICATION_MODE_SECONDARY:
  419. bdrv_graph_rdlock_main_loop();
  420. active_disk = bs->file;
  421. if (!active_disk || !active_disk->bs || !active_disk->bs->backing) {
  422. error_setg(errp, "Active disk doesn't have backing file");
  423. bdrv_graph_rdunlock_main_loop();
  424. return;
  425. }
  426. hidden_disk = active_disk->bs->backing;
  427. if (!hidden_disk->bs || !hidden_disk->bs->backing) {
  428. error_setg(errp, "Hidden disk doesn't have backing file");
  429. bdrv_graph_rdunlock_main_loop();
  430. return;
  431. }
  432. secondary_disk = hidden_disk->bs->backing;
  433. if (!secondary_disk->bs || !bdrv_has_blk(secondary_disk->bs)) {
  434. error_setg(errp, "The secondary disk doesn't have block backend");
  435. bdrv_graph_rdunlock_main_loop();
  436. return;
  437. }
  438. bdrv_graph_rdunlock_main_loop();
  439. /* verify the length */
  440. active_length = bdrv_getlength(active_disk->bs);
  441. hidden_length = bdrv_getlength(hidden_disk->bs);
  442. disk_length = bdrv_getlength(secondary_disk->bs);
  443. if (active_length < 0 || hidden_length < 0 || disk_length < 0 ||
  444. active_length != hidden_length || hidden_length != disk_length) {
  445. error_setg(errp, "Active disk, hidden disk, secondary disk's length"
  446. " are not the same");
  447. return;
  448. }
  449. /* Must be true, or the bdrv_getlength() calls would have failed */
  450. assert(active_disk->bs->drv && hidden_disk->bs->drv);
  451. bdrv_graph_rdlock_main_loop();
  452. if (!active_disk->bs->drv->bdrv_make_empty ||
  453. !hidden_disk->bs->drv->bdrv_make_empty) {
  454. error_setg(errp,
  455. "Active disk or hidden disk doesn't support make_empty");
  456. bdrv_graph_rdunlock_main_loop();
  457. return;
  458. }
  459. bdrv_graph_rdunlock_main_loop();
  460. /* reopen the backing file in r/w mode */
  461. reopen_backing_file(bs, true, &local_err);
  462. if (local_err) {
  463. error_propagate(errp, local_err);
  464. return;
  465. }
  466. bdrv_graph_wrlock();
  467. bdrv_ref(hidden_disk->bs);
  468. s->hidden_disk = bdrv_attach_child(bs, hidden_disk->bs, "hidden disk",
  469. &child_of_bds, BDRV_CHILD_DATA,
  470. &local_err);
  471. if (local_err) {
  472. error_propagate(errp, local_err);
  473. bdrv_graph_wrunlock();
  474. return;
  475. }
  476. bdrv_ref(secondary_disk->bs);
  477. s->secondary_disk = bdrv_attach_child(bs, secondary_disk->bs,
  478. "secondary disk", &child_of_bds,
  479. BDRV_CHILD_DATA, &local_err);
  480. if (local_err) {
  481. error_propagate(errp, local_err);
  482. bdrv_graph_wrunlock();
  483. return;
  484. }
  485. /* start backup job now */
  486. error_setg(&s->blocker,
  487. "Block device is in use by internal backup job");
  488. top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
  489. if (!top_bs || !bdrv_is_root_node(top_bs) ||
  490. !check_top_bs(top_bs, bs)) {
  491. error_setg(errp, "No top_bs or it is invalid");
  492. bdrv_graph_wrunlock();
  493. reopen_backing_file(bs, false, NULL);
  494. return;
  495. }
  496. bdrv_op_block_all(top_bs, s->blocker);
  497. bdrv_graph_wrunlock();
  498. s->backup_job = backup_job_create(
  499. NULL, s->secondary_disk->bs, s->hidden_disk->bs,
  500. 0, MIRROR_SYNC_MODE_NONE, NULL, 0, false, false,
  501. NULL, &perf,
  502. BLOCKDEV_ON_ERROR_REPORT,
  503. BLOCKDEV_ON_ERROR_REPORT, JOB_INTERNAL,
  504. backup_job_completed, bs, NULL, &local_err);
  505. if (local_err) {
  506. error_propagate(errp, local_err);
  507. backup_job_cleanup(bs);
  508. return;
  509. }
  510. job_start(&s->backup_job->job);
  511. break;
  512. default:
  513. abort();
  514. }
  515. s->stage = BLOCK_REPLICATION_RUNNING;
  516. if (s->mode == REPLICATION_MODE_SECONDARY) {
  517. secondary_do_checkpoint(bs, errp);
  518. }
  519. s->error = 0;
  520. }
  521. static void replication_do_checkpoint(ReplicationState *rs, Error **errp)
  522. {
  523. BlockDriverState *bs = rs->opaque;
  524. BDRVReplicationState *s = bs->opaque;
  525. if (s->stage == BLOCK_REPLICATION_DONE ||
  526. s->stage == BLOCK_REPLICATION_FAILOVER) {
  527. /*
  528. * This case happens when a secondary was promoted to primary.
  529. * Ignore the request because the secondary side of replication
  530. * doesn't have to do anything anymore.
  531. */
  532. return;
  533. }
  534. if (s->mode == REPLICATION_MODE_SECONDARY) {
  535. secondary_do_checkpoint(bs, errp);
  536. }
  537. }
  538. static void replication_get_error(ReplicationState *rs, Error **errp)
  539. {
  540. BlockDriverState *bs = rs->opaque;
  541. BDRVReplicationState *s = bs->opaque;
  542. if (s->stage == BLOCK_REPLICATION_NONE) {
  543. error_setg(errp, "Block replication is not running");
  544. return;
  545. }
  546. if (s->error) {
  547. error_setg(errp, "I/O error occurred");
  548. return;
  549. }
  550. }
  551. static void replication_done(void *opaque, int ret)
  552. {
  553. BlockDriverState *bs = opaque;
  554. BDRVReplicationState *s = bs->opaque;
  555. if (ret == 0) {
  556. s->stage = BLOCK_REPLICATION_DONE;
  557. bdrv_graph_wrlock();
  558. bdrv_unref_child(bs, s->secondary_disk);
  559. s->secondary_disk = NULL;
  560. bdrv_unref_child(bs, s->hidden_disk);
  561. s->hidden_disk = NULL;
  562. bdrv_graph_wrunlock();
  563. s->error = 0;
  564. } else {
  565. s->stage = BLOCK_REPLICATION_FAILOVER_FAILED;
  566. s->error = -EIO;
  567. }
  568. }
  569. static void replication_stop(ReplicationState *rs, bool failover, Error **errp)
  570. {
  571. BlockDriverState *bs = rs->opaque;
  572. BDRVReplicationState *s = bs->opaque;
  573. if (s->stage == BLOCK_REPLICATION_DONE ||
  574. s->stage == BLOCK_REPLICATION_FAILOVER) {
  575. /*
  576. * This case happens when a secondary was promoted to primary.
  577. * Ignore the request because the secondary side of replication
  578. * doesn't have to do anything anymore.
  579. */
  580. return;
  581. }
  582. if (s->stage != BLOCK_REPLICATION_RUNNING) {
  583. error_setg(errp, "Block replication is not running");
  584. return;
  585. }
  586. switch (s->mode) {
  587. case REPLICATION_MODE_PRIMARY:
  588. s->stage = BLOCK_REPLICATION_DONE;
  589. s->error = 0;
  590. break;
  591. case REPLICATION_MODE_SECONDARY:
  592. /*
  593. * This BDS will be closed, and the job should be completed
  594. * before the BDS is closed, because we will access hidden
  595. * disk, secondary disk in backup_job_completed().
  596. */
  597. if (s->backup_job) {
  598. job_cancel_sync(&s->backup_job->job, true);
  599. }
  600. if (!failover) {
  601. secondary_do_checkpoint(bs, errp);
  602. s->stage = BLOCK_REPLICATION_DONE;
  603. return;
  604. }
  605. bdrv_graph_rdlock_main_loop();
  606. s->stage = BLOCK_REPLICATION_FAILOVER;
  607. s->commit_job = commit_active_start(
  608. NULL, bs->file->bs, s->secondary_disk->bs,
  609. JOB_INTERNAL, 0, BLOCKDEV_ON_ERROR_REPORT,
  610. NULL, replication_done, bs, true, errp);
  611. bdrv_graph_rdunlock_main_loop();
  612. break;
  613. default:
  614. abort();
  615. }
  616. }
  617. static const char *const replication_strong_runtime_opts[] = {
  618. REPLICATION_MODE,
  619. REPLICATION_TOP_ID,
  620. NULL
  621. };
  622. static BlockDriver bdrv_replication = {
  623. .format_name = "replication",
  624. .instance_size = sizeof(BDRVReplicationState),
  625. .bdrv_open = replication_open,
  626. .bdrv_close = replication_close,
  627. .bdrv_child_perm = replication_child_perm,
  628. .bdrv_co_getlength = replication_co_getlength,
  629. .bdrv_co_readv = replication_co_readv,
  630. .bdrv_co_writev = replication_co_writev,
  631. .is_filter = true,
  632. .strong_runtime_opts = replication_strong_runtime_opts,
  633. };
  634. static void bdrv_replication_init(void)
  635. {
  636. bdrv_register(&bdrv_replication);
  637. }
  638. block_init(bdrv_replication_init);