/*
 * Dedicated thread for virtio-blk I/O processing
 *
 * Copyright 2012 IBM, Corp.
 * Copyright 2012 Red Hat, Inc. and/or its affiliates
 *
 * Authors:
 *   Stefan Hajnoczi <stefanha@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "trace.h"
#include "qemu/iov.h"
#include "event-poll.h"
#include "qemu/thread.h"
#include "vring.h"
#include "ioq.h"
#include "migration/migration.h"
#include "hw/virtio-blk.h"
#include "hw/dataplane/virtio-blk.h"

enum {
    SEG_MAX = 126,                  /* maximum number of I/O segments */
    VRING_MAX = SEG_MAX + 2,        /* maximum number of vring descriptors */
    REQ_MAX = VRING_MAX,            /* maximum number of requests in the vring,
                                     * is VRING_MAX / 2 with traditional and
                                     * VRING_MAX with indirect descriptors */
};

typedef struct {
    struct iocb iocb;               /* Linux AIO control block */
    QEMUIOVector *inhdr;            /* iovecs for virtio_blk_inhdr */
    unsigned int head;              /* vring descriptor index */
    struct iovec *bounce_iov;       /* used if guest buffers are unaligned */
    QEMUIOVector *read_qiov;        /* for read completion with bounce buffer */
} VirtIOBlockRequest;

struct VirtIOBlockDataPlane {
    bool started;
    bool stopping;
    QEMUBH *start_bh;
    QemuThread thread;

    VirtIOBlkConf *blk;
    int fd;                         /* image file descriptor */

    VirtIODevice *vdev;
    Vring vring;                    /* virtqueue vring */
    EventNotifier *guest_notifier;  /* irq */

    EventPoll event_poll;           /* event poller */
    EventHandler io_handler;        /* Linux AIO completion handler */
    EventHandler notify_handler;    /* virtqueue notify handler */

    IOQueue ioqueue;                /* Linux AIO queue (should really be per
                                       dataplane thread) */
    VirtIOBlockRequest requests[REQ_MAX]; /* pool of requests, managed by the
                                             queue */

    unsigned int num_reqs;

    Error *migration_blocker;
};

/* Raise an interrupt to signal guest, if necessary */
static void notify_guest(VirtIOBlockDataPlane *s)
{
    if (!vring_should_notify(s->vdev, &s->vring)) {
        return;
    }

    event_notifier_set(s->guest_notifier);
}

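/* Linux AIO completion callback: translate the return value into a virtio
 * status byte, undo any bounce buffering, and push the used descriptor onto
 * the vring.
 */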
static void complete_request(struct iocb *iocb, ssize_t ret, void *opaque)
{
    VirtIOBlockDataPlane *s = opaque;
    VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb);
    struct virtio_blk_inhdr hdr;
    int len;

    if (likely(ret >= 0)) {
        hdr.status = VIRTIO_BLK_S_OK;
        len = ret;
    } else {
        hdr.status = VIRTIO_BLK_S_IOERR;
        len = 0;
    }

    trace_virtio_blk_data_plane_complete_request(s, req->head, ret);

    if (req->read_qiov) {
        assert(req->bounce_iov);
        qemu_iovec_from_buf(req->read_qiov, 0, req->bounce_iov->iov_base, len);
        qemu_iovec_destroy(req->read_qiov);
        g_slice_free(QEMUIOVector, req->read_qiov);
    }

    if (req->bounce_iov) {
        qemu_vfree(req->bounce_iov->iov_base);
        g_slice_free(struct iovec, req->bounce_iov);
    }

    qemu_iovec_from_buf(req->inhdr, 0, &hdr, sizeof(hdr));
    qemu_iovec_destroy(req->inhdr);
    g_slice_free(QEMUIOVector, req->inhdr);

    /* According to the virtio specification len should be the number of bytes
     * written to, but for virtio-blk it seems to be the number of bytes
     * transferred plus the status bytes.
     */
    vring_push(&s->vring, req->head, len + sizeof(hdr));

    s->num_reqs--;
}

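/* Complete a request without submitting disk I/O, e.g. for unsupported
 * commands or synchronous flushes.  Fills in the status byte and notifies the
 * guest immediately.
 */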
static void complete_request_early(VirtIOBlockDataPlane *s, unsigned int head,
                                   QEMUIOVector *inhdr, unsigned char status)
{
    struct virtio_blk_inhdr hdr = {
        .status = status,
    };

    qemu_iovec_from_buf(inhdr, 0, &hdr, sizeof(hdr));
    qemu_iovec_destroy(inhdr);
    g_slice_free(QEMUIOVector, inhdr);

    vring_push(&s->vring, head, sizeof(hdr));
    notify_guest(s);
}

/* Get disk serial number */
static void do_get_id_cmd(VirtIOBlockDataPlane *s,
                          struct iovec *iov, unsigned int iov_cnt,
                          unsigned int head, QEMUIOVector *inhdr)
{
    char id[VIRTIO_BLK_ID_BYTES];

    /* Serial number not NUL-terminated when shorter than buffer */
    strncpy(id, s->blk->serial ? s->blk->serial : "", sizeof(id));
    iov_from_buf(iov, iov_cnt, 0, id, sizeof(id));
    complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK);
}

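/* Submit a read or write as a Linux AIO request.  Since the image is opened
 * with cache=none (O_DIRECT), guest buffers that are not suitably aligned
 * are redirected through an aligned bounce buffer; reads are copied back to
 * the guest buffers on completion.
 */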
static int do_rdwr_cmd(VirtIOBlockDataPlane *s, bool read,
                       struct iovec *iov, unsigned int iov_cnt,
                       long long offset, unsigned int head,
                       QEMUIOVector *inhdr)
{
    struct iocb *iocb;
    QEMUIOVector qiov;
    struct iovec *bounce_iov = NULL;
    QEMUIOVector *read_qiov = NULL;

    qemu_iovec_init_external(&qiov, iov, iov_cnt);
    if (!bdrv_qiov_is_aligned(s->blk->conf.bs, &qiov)) {
        void *bounce_buffer = qemu_blockalign(s->blk->conf.bs, qiov.size);

        if (read) {
            /* Need to copy back from bounce buffer on completion */
            read_qiov = g_slice_new(QEMUIOVector);
            qemu_iovec_init(read_qiov, iov_cnt);
            qemu_iovec_concat_iov(read_qiov, iov, iov_cnt, 0, qiov.size);
        } else {
            qemu_iovec_to_buf(&qiov, 0, bounce_buffer, qiov.size);
        }

        /* Redirect I/O to aligned bounce buffer */
        bounce_iov = g_slice_new(struct iovec);
        bounce_iov->iov_base = bounce_buffer;
        bounce_iov->iov_len = qiov.size;
        iov = bounce_iov;
        iov_cnt = 1;
    }

    iocb = ioq_rdwr(&s->ioqueue, read, iov, iov_cnt, offset);

    /* Fill in virtio block metadata needed for completion */
    VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb);
    req->head = head;
    req->inhdr = inhdr;
    req->bounce_iov = bounce_iov;
    req->read_qiov = read_qiov;
    return 0;
}

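/* Parse one request from the vring: the out header (request type and sector)
 * comes first, followed by the data iovecs, followed by a one-byte in header
 * that receives the completion status.
 */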
static int process_request(IOQueue *ioq, struct iovec iov[],
                           unsigned int out_num, unsigned int in_num,
                           unsigned int head)
{
    VirtIOBlockDataPlane *s = container_of(ioq, VirtIOBlockDataPlane, ioqueue);
    struct iovec *in_iov = &iov[out_num];
    struct virtio_blk_outhdr outhdr;
    QEMUIOVector *inhdr;
    size_t in_size;

    /* Copy in outhdr */
    if (unlikely(iov_to_buf(iov, out_num, 0, &outhdr,
                            sizeof(outhdr)) != sizeof(outhdr))) {
        error_report("virtio-blk request outhdr too short");
        return -EFAULT;
    }
    iov_discard_front(&iov, &out_num, sizeof(outhdr));

    /* Grab inhdr for later */
    in_size = iov_size(in_iov, in_num);
    if (in_size < sizeof(struct virtio_blk_inhdr)) {
        error_report("virtio_blk request inhdr too short");
        return -EFAULT;
    }
    inhdr = g_slice_new(QEMUIOVector);
    qemu_iovec_init(inhdr, 1);
    qemu_iovec_concat_iov(inhdr, in_iov, in_num,
                          in_size - sizeof(struct virtio_blk_inhdr),
                          sizeof(struct virtio_blk_inhdr));
    iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));

    /* TODO Linux sets the barrier bit even when not advertised! */
    outhdr.type &= ~VIRTIO_BLK_T_BARRIER;

    switch (outhdr.type) {
    case VIRTIO_BLK_T_IN:
        do_rdwr_cmd(s, true, in_iov, in_num, outhdr.sector * 512, head, inhdr);
        return 0;

    case VIRTIO_BLK_T_OUT:
        do_rdwr_cmd(s, false, iov, out_num, outhdr.sector * 512, head, inhdr);
        return 0;

    case VIRTIO_BLK_T_SCSI_CMD:
        /* TODO support SCSI commands */
        complete_request_early(s, head, inhdr, VIRTIO_BLK_S_UNSUPP);
        return 0;

    case VIRTIO_BLK_T_FLUSH:
        /* TODO fdsync not supported by Linux AIO, do it synchronously here! */
        if (qemu_fdatasync(s->fd) < 0) {
            complete_request_early(s, head, inhdr, VIRTIO_BLK_S_IOERR);
        } else {
            complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK);
        }
        return 0;

    case VIRTIO_BLK_T_GET_ID:
        do_get_id_cmd(s, in_iov, in_num, head, inhdr);
        return 0;

    default:
        error_report("virtio-blk unsupported request type %#x", outhdr.type);
        qemu_iovec_destroy(inhdr);
        g_slice_free(QEMUIOVector, inhdr);
        return -EFAULT;
    }
}

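/* Called by the event loop when the guest kicks the virtqueue.  Drains the
 * vring, translating descriptors into iovecs, and submits the resulting I/O
 * in a single io_submit() batch.
 */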
static void handle_notify(EventHandler *handler)
{
    VirtIOBlockDataPlane *s = container_of(handler, VirtIOBlockDataPlane,
                                           notify_handler);

    /* There is one array of iovecs into which all new requests are extracted
     * from the vring.  Requests are read from the vring and the translated
     * descriptors are written to the iovecs array.  The iovecs do not have to
     * persist across handle_notify() calls because the kernel copies the
     * iovecs on io_submit().
     *
     * Handling io_submit() EAGAIN may require storing the requests across
     * handle_notify() calls until the kernel has sufficient resources to
     * accept more I/O.  This is not implemented yet.
     */
    struct iovec iovec[VRING_MAX];
    struct iovec *end = &iovec[VRING_MAX];
    struct iovec *iov = iovec;

    /* When a request is read from the vring, the index of the first descriptor
     * (aka head) is returned so that the completed request can be pushed onto
     * the vring later.
     *
     * The number of hypervisor read-only iovecs is out_num.  The number of
     * hypervisor write-only iovecs is in_num.
     */
    int head;
    unsigned int out_num = 0, in_num = 0;
    unsigned int num_queued;

    for (;;) {
        /* Disable guest->host notifies to avoid unnecessary vmexits */
        vring_disable_notification(s->vdev, &s->vring);

        for (;;) {
            head = vring_pop(s->vdev, &s->vring, iov, end, &out_num, &in_num);
            if (head < 0) {
                break; /* no more requests */
            }

            trace_virtio_blk_data_plane_process_request(s, out_num, in_num,
                                                        head);

            if (process_request(&s->ioqueue, iov, out_num, in_num, head) < 0) {
                vring_set_broken(&s->vring);
                break;
            }
            iov += out_num + in_num;
        }

        if (likely(head == -EAGAIN)) { /* vring emptied */
            /* Re-enable guest->host notifies and stop processing the vring.
             * But if the guest has snuck in more descriptors, keep processing.
             */
            if (vring_enable_notification(s->vdev, &s->vring)) {
                break;
            }
        } else { /* head == -ENOBUFS or fatal error, iovecs[] is depleted */
            /* Since there are no iovecs[] left, stop processing for now.  Do
             * not re-enable guest->host notifies since the I/O completion
             * handler knows to check for more vring descriptors anyway.
             */
            break;
        }
    }

    num_queued = ioq_num_queued(&s->ioqueue);
    if (num_queued > 0) {
        s->num_reqs += num_queued;

        int rc = ioq_submit(&s->ioqueue);
        if (unlikely(rc < 0)) {
            fprintf(stderr, "ioq_submit failed %d\n", rc);
            exit(1);
        }
    }
}

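/* Called by the event loop when Linux AIO completions are available.  Pushes
 * completed requests onto the vring and raises the guest interrupt if
 * necessary.
 */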
static void handle_io(EventHandler *handler)
{
    VirtIOBlockDataPlane *s = container_of(handler, VirtIOBlockDataPlane,
                                           io_handler);

    if (ioq_run_completion(&s->ioqueue, complete_request, s) > 0) {
        notify_guest(s);
    }

    /* If there were more requests than iovecs, the vring will not be empty yet
     * so check again.  There should now be enough resources to process more
     * requests.
     */
    if (unlikely(vring_more_avail(&s->vring))) {
        handle_notify(&s->notify_handler);
    }
}

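/* Event loop of the dataplane thread.  Keeps polling until a stop has been
 * requested and all in-flight requests have completed.
 */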
static void *data_plane_thread(void *opaque)
{
    VirtIOBlockDataPlane *s = opaque;

    do {
        event_poll(&s->event_poll);
    } while (!s->stopping || s->num_reqs > 0);
    return NULL;
}

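/* Bottom half that spawns the dataplane thread; scheduled from
 * virtio_blk_data_plane_start() so the thread inherits iothread cpusets.
 */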
static void start_data_plane_bh(void *opaque)
{
    VirtIOBlockDataPlane *s = opaque;

    qemu_bh_delete(s->start_bh);
    s->start_bh = NULL;
    qemu_thread_create(&s->thread, data_plane_thread,
                       s, QEMU_THREAD_JOINABLE);
}

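/* Allocate a dataplane for vdev if x-data-plane is enabled.  Returns false
 * when the device or drive configuration is incompatible.
 */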
bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *blk,
                                  VirtIOBlockDataPlane **dataplane)
{
    VirtIOBlockDataPlane *s;
    int fd;

    *dataplane = NULL;

    if (!blk->data_plane) {
        return true;
    }

    if (blk->scsi) {
        error_report("device is incompatible with x-data-plane, use scsi=off");
        return false;
    }

    if (blk->config_wce) {
        error_report("device is incompatible with x-data-plane, "
                     "use config-wce=off");
        return false;
    }

    fd = raw_get_aio_fd(blk->conf.bs);
    if (fd < 0) {
        error_report("drive is incompatible with x-data-plane, "
                     "use format=raw,cache=none,aio=native");
        return false;
    }

    s = g_new0(VirtIOBlockDataPlane, 1);
    s->vdev = vdev;
    s->fd = fd;
    s->blk = blk;

    /* Prevent block operations that conflict with data plane thread */
    bdrv_set_in_use(blk->conf.bs, 1);

    error_setg(&s->migration_blocker,
               "x-data-plane does not support migration");
    migrate_add_blocker(s->migration_blocker);

    *dataplane = s;
    return true;
}

void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s)
{
    if (!s) {
        return;
    }

    virtio_blk_data_plane_stop(s);
    migrate_del_blocker(s->migration_blocker);
    error_free(s->migration_blocker);
    bdrv_set_in_use(s->blk->conf.bs, 0);
    g_free(s);
}

void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s)
{
    VirtQueue *vq;
    int i;

    if (s->started) {
        return;
    }

    vq = virtio_get_queue(s->vdev, 0);
    if (!vring_setup(&s->vring, s->vdev, 0)) {
        return;
    }

    event_poll_init(&s->event_poll);

    /* Set up guest notifier (irq) */
    if (s->vdev->binding->set_guest_notifiers(s->vdev->binding_opaque, 1,
                                              true) != 0) {
        fprintf(stderr, "virtio-blk failed to set guest notifier, "
                "ensure -enable-kvm is set\n");
        exit(1);
    }
    s->guest_notifier = virtio_queue_get_guest_notifier(vq);

    /* Set up virtqueue notify */
    if (s->vdev->binding->set_host_notifier(s->vdev->binding_opaque,
                                            0, true) != 0) {
        fprintf(stderr, "virtio-blk failed to set host notifier\n");
        exit(1);
    }
    event_poll_add(&s->event_poll, &s->notify_handler,
                   virtio_queue_get_host_notifier(vq),
                   handle_notify);

    /* Set up ioqueue */
    ioq_init(&s->ioqueue, s->fd, REQ_MAX);
    for (i = 0; i < ARRAY_SIZE(s->requests); i++) {
        ioq_put_iocb(&s->ioqueue, &s->requests[i].iocb);
    }
    event_poll_add(&s->event_poll, &s->io_handler,
                   ioq_get_notifier(&s->ioqueue), handle_io);

    s->started = true;
    trace_virtio_blk_data_plane_start(s);

    /* Kick right away to begin processing requests already in vring */
    event_notifier_set(virtio_queue_get_host_notifier(vq));

    /* Spawn thread in BH so it inherits iothread cpusets */
    s->start_bh = qemu_bh_new(start_data_plane_bh, s);
    qemu_bh_schedule(s->start_bh);
}

void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s)
{
    if (!s->started || s->stopping) {
        return;
    }
    s->stopping = true;
    trace_virtio_blk_data_plane_stop(s);

    /* Stop thread or cancel pending thread creation BH */
    if (s->start_bh) {
        qemu_bh_delete(s->start_bh);
        s->start_bh = NULL;
    } else {
        event_poll_notify(&s->event_poll);
        qemu_thread_join(&s->thread);
    }

    ioq_cleanup(&s->ioqueue);

    s->vdev->binding->set_host_notifier(s->vdev->binding_opaque, 0, false);

    event_poll_cleanup(&s->event_poll);

    /* Clean up guest notifier (irq) */
    s->vdev->binding->set_guest_notifiers(s->vdev->binding_opaque, 1, false);

    vring_teardown(&s->vring);
    s->started = false;
    s->stopping = false;
}