linux-aio.c

/*
 * Linux native AIO support.
 *
 * Copyright (C) 2009 IBM, Corp.
 * Copyright (C) 2009 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
#include "qemu-common.h"
#include "qemu-aio.h"
#include "block_int.h"
#include "block/raw-posix-aio.h"

#include <sys/eventfd.h>
#include <libaio.h>
/*
 * Queue size (per-device).
 *
 * XXX: eventually we need to communicate this to the guest and/or make it
 *      tunable by the guest.  If we get more outstanding requests at a time
 *      than this we will get EAGAIN from io_submit which is communicated to
 *      the guest as an I/O error.
 */
#define MAX_EVENTS 128
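
/*
 * A minimal sketch of what that overflow looks like from a caller's side
 * (hypothetical caller code, not part of this file): when io_submit()
 * fails, laio_submit() below returns NULL and the caller is expected to
 * fail the request, e.g.:
 *
 *     acb = laio_submit(bs, aio_ctx, fd, sector_num, qiov, nb_sectors,
 *                       cb, opaque, QEMU_AIO_READ);
 *     if (acb == NULL) {
 *         cb(opaque, -EIO);   // surfaces to the guest as an I/O error
 *     }
 */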
struct qemu_laiocb {
    BlockDriverAIOCB common;
    struct qemu_laio_state *ctx;
    struct iocb iocb;           /* control block handed to io_submit() */
    ssize_t ret;                /* result; -EINPROGRESS while in flight */
    size_t nbytes;              /* expected transfer size in bytes */
    QEMUIOVector *qiov;
    bool is_read;
    QLIST_ENTRY(qemu_laiocb) node;
};

struct qemu_laio_state {
    io_context_t ctx;           /* kernel AIO context from io_setup() */
    int efd;                    /* eventfd the kernel signals completions on */
    int count;                  /* number of requests currently in flight */
};

/* Combine the two result fields of an io_event into one signed value. */
static inline ssize_t io_event_ret(struct io_event *ev)
{
    return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res);
}
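
/*
 * For example (assuming the usual libaio behaviour, where res carries the
 * request's return code and res2 is zero for preadv/pwritev requests): a
 * read that fails with EIO completes with res == (unsigned long)-EIO and
 * res2 == 0, so the expression above casts back to the ssize_t -EIO, while
 * a successful 4096-byte read completes with res == 4096.
 */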

/*
 * Completes an AIO request (calls the callback and frees the ACB).
 */
static void qemu_laio_process_completion(struct qemu_laio_state *s,
    struct qemu_laiocb *laiocb)
{
    int ret;

    s->count--;

    ret = laiocb->ret;
    /* The callback is skipped for cancelled requests; see laio_cancel(). */
    if (ret != -ECANCELED) {
        if (ret == laiocb->nbytes) {
            ret = 0;
        } else if (ret >= 0) {
            /* Short reads mean EOF, pad with zeros. */
            if (laiocb->is_read) {
                qemu_iovec_memset_skip(laiocb->qiov, 0,
                    laiocb->qiov->size - ret, ret);
            } else {
                /* A short write is an error. */
                ret = -EINVAL;
            }
        }

        laiocb->common.cb(laiocb->common.opaque, ret);
    }

    qemu_aio_release(laiocb);
}
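
/*
 * Worked example with hypothetical numbers: a 4096-byte read that hits EOF
 * after 512 bytes completes with ret == 512, so qemu_iovec_memset_skip()
 * above zero-fills the remaining 3584 bytes while skipping the 512 bytes
 * that were actually transferred, and the callback still sees ret == 512.
 */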

static void qemu_laio_completion_cb(void *opaque)
{
    struct qemu_laio_state *s = opaque;

    while (1) {
        struct io_event events[MAX_EVENTS];
        uint64_t val;
        ssize_t ret;
        struct timespec ts = { 0 };
        int nevents, i;

        do {
            ret = read(s->efd, &val, sizeof(val));
        } while (ret == -1 && errno == EINTR);

        /* The eventfd is non-blocking; EAGAIN means no completions pending. */
        if (ret == -1 && errno == EAGAIN)
            break;

        if (ret != 8)
            break;

        /*
         * val completions have been signalled since the last read, so
         * asking io_getevents() for at least that many cannot block.
         */
        do {
            nevents = io_getevents(s->ctx, val, MAX_EVENTS, events, &ts);
        } while (nevents == -EINTR);

        for (i = 0; i < nevents; i++) {
            struct iocb *iocb = events[i].obj;
            struct qemu_laiocb *laiocb =
                    container_of(iocb, struct qemu_laiocb, iocb);

            laiocb->ret = io_event_ret(&events[i]);
            qemu_laio_process_completion(s, laiocb);
        }
    }
}
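
/*
 * To illustrate the eventfd accounting above (following eventfd(2)
 * semantics; the numbers are hypothetical): each completed iocb registered
 * with io_set_eventfd() adds 1 to the eventfd's counter, and read()
 * returns the accumulated count and resets it to zero. So if three
 * requests complete between iterations, read() yields val == 3 and
 * io_getevents(s->ctx, 3, MAX_EVENTS, events, &ts) returns those three
 * events immediately. val can never exceed MAX_EVENTS because that is
 * also the queue depth passed to io_setup().
 */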

/* Returns nonzero while requests are in flight, so qemu_aio_wait() polls on. */
static int qemu_laio_flush_cb(void *opaque)
{
    struct qemu_laio_state *s = opaque;

    return (s->count > 0) ? 1 : 0;
}

static void laio_cancel(BlockDriverAIOCB *blockacb)
{
    struct qemu_laiocb *laiocb = (struct qemu_laiocb *)blockacb;
    struct io_event event;
    int ret;

    if (laiocb->ret != -EINPROGRESS)
        return;

    /*
     * Note that as of Linux 2.6.31 neither the block device code nor any
     * filesystem implements cancellation of AIO requests.
     * Thus the polling loop below is the normal code path.
     */
    ret = io_cancel(laiocb->ctx->ctx, &laiocb->iocb, &event);
    if (ret == 0) {
        laiocb->ret = -ECANCELED;
        return;
    }

    /*
     * We have to wait for the iocb to finish.
     *
     * The only way to get the iocb status update is by polling the io context.
     * We might be able to do this slightly more optimally by removing the
     * O_NONBLOCK flag.
     */
    while (laiocb->ret == -EINPROGRESS)
        qemu_laio_completion_cb(laiocb->ctx);
}
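
/*
 * In other words, there are two outcomes here: if io_cancel() succeeds,
 * laiocb->ret is marked -ECANCELED so that the callback is skipped should
 * the request still flow through qemu_laio_process_completion(); if it
 * fails (the common case, per the comment above), the loop synchronously
 * drains completions until the request finishes and its callback runs
 * normally.
 */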

static AIOPool laio_pool = {
    .aiocb_size = sizeof(struct qemu_laiocb),
    .cancel     = laio_cancel,
};

BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque, int type)
{
    struct qemu_laio_state *s = aio_ctx;
    struct qemu_laiocb *laiocb;
    struct iocb *iocbs;
    off_t offset = sector_num * 512;

    laiocb = qemu_aio_get(&laio_pool, bs, cb, opaque);
    if (!laiocb)
        return NULL;
    laiocb->nbytes = nb_sectors * 512;
    laiocb->ctx = s;
    laiocb->ret = -EINPROGRESS;
    laiocb->is_read = (type == QEMU_AIO_READ);
    laiocb->qiov = qiov;

    iocbs = &laiocb->iocb;

    switch (type) {
    case QEMU_AIO_WRITE:
        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
    case QEMU_AIO_READ:
        io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
    /* The Linux kernel does not currently support other operations. */
    default:
        fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
                __func__, type);
        goto out_free_aiocb;
    }
    /* Have the kernel signal completions on the state's eventfd. */
    io_set_eventfd(&laiocb->iocb, s->efd);
    s->count++;

    if (io_submit(s->ctx, 1, &iocbs) < 0)
        goto out_dec_count;
    return &laiocb->common;

out_dec_count:
    s->count--;
out_free_aiocb:
    qemu_aio_release(laiocb);
    return NULL;
}

void *laio_init(void)
{
    struct qemu_laio_state *s;

    s = g_malloc0(sizeof(*s));
    s->efd = eventfd(0, 0);
    if (s->efd == -1)
        goto out_free_state;
    /* Non-blocking reads let the completion handler drain until EAGAIN. */
    fcntl(s->efd, F_SETFL, O_NONBLOCK);

    if (io_setup(MAX_EVENTS, &s->ctx) != 0)
        goto out_close_efd;

    qemu_aio_set_fd_handler(s->efd, qemu_laio_completion_cb, NULL,
        qemu_laio_flush_cb, NULL, s);

    return s;

out_close_efd:
    close(s->efd);
out_free_state:
    g_free(s);
    return NULL;
}
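
/*
 * A minimal usage sketch of this API (a hypothetical caller; in QEMU the
 * real user is the raw-posix block driver): set up the state once, then
 * submit requests against the driver's file descriptor and let the event
 * loop registered by laio_init() invoke the completion callbacks.
 *
 *     void *aio_ctx = laio_init();
 *     if (aio_ctx) {
 *         BlockDriverAIOCB *acb = laio_submit(bs, aio_ctx, fd, sector_num,
 *                                             qiov, nb_sectors, cb, opaque,
 *                                             QEMU_AIO_READ);
 *         // acb == NULL means the request could not be queued
 *     }
 */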