linux-aio.c

/*
 * Linux native AIO support.
 *
 * Copyright (C) 2009 IBM, Corp.
 * Copyright (C) 2009 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
#include "qemu-common.h"
#include "qemu-aio.h"
#include "block/raw-posix-aio.h"

#include <sys/eventfd.h>
#include <libaio.h>

/*
 * Queue size (per-device).
 *
 * XXX: eventually we need to communicate this to the guest and/or make it
 * tunable by the guest.  If we get more outstanding requests at a time
 * than this, we will get EAGAIN from io_submit, which is communicated to
 * the guest as an I/O error.
 */
#define MAX_EVENTS 128

struct qemu_laiocb {
    BlockDriverAIOCB common;
    struct qemu_laio_state *ctx;
    struct iocb iocb;
    ssize_t ret;        /* bytes transferred, -errno, or -EINPROGRESS */
    size_t nbytes;      /* requested transfer size in bytes */
    QEMUIOVector *qiov;
    bool is_read;
    QLIST_ENTRY(qemu_laiocb) node;
};

struct qemu_laio_state {
    io_context_t ctx;   /* kernel AIO context from io_setup() */
    int efd;            /* eventfd signalled on request completion */
    int count;          /* number of requests currently in flight */
};
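
/*
 * An io_event reports completion status in two fields; reassemble res2
 * (high 32 bits) and res (low 32 bits) into one signed value: the number
 * of bytes transferred, or a negative errno.
 */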
static inline ssize_t io_event_ret(struct io_event *ev)
{
    return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res);
}

/*
 * Completes an AIO request (calls the callback and frees the ACB).
 */
static void qemu_laio_process_completion(struct qemu_laio_state *s,
    struct qemu_laiocb *laiocb)
{
    int ret;

    s->count--;

    ret = laiocb->ret;
    if (ret != -ECANCELED) {
        if (ret == laiocb->nbytes) {
            ret = 0;
        } else if (ret >= 0) {
            /* Short reads mean EOF, pad with zeros. */
            if (laiocb->is_read) {
                qemu_iovec_memset_skip(laiocb->qiov, 0,
                    laiocb->qiov->size - ret, ret);
            } else {
                ret = -EINVAL;
            }
        }

        laiocb->common.cb(laiocb->common.opaque, ret);
    }

    qemu_aio_release(laiocb);
}
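
/*
 * Completion handler registered on the eventfd.  Each completed iocb bumps
 * the eventfd counter (see io_set_eventfd() in laio_submit), so the 8-byte
 * value read below is the number of completions pending; it is passed to
 * io_getevents() as the minimum number of events to reap.  The zeroed
 * timespec keeps io_getevents() from blocking, and the outer loop runs
 * until the non-blocking eventfd read fails with EAGAIN.
 */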
static void qemu_laio_completion_cb(void *opaque)
{
    struct qemu_laio_state *s = opaque;

    while (1) {
        struct io_event events[MAX_EVENTS];
        uint64_t val;
        ssize_t ret;
        struct timespec ts = { 0 };
        int nevents, i;

        do {
            ret = read(s->efd, &val, sizeof(val));
        } while (ret == -1 && errno == EINTR);

        if (ret == -1 && errno == EAGAIN)
            break;

        if (ret != 8)
            break;

        do {
            nevents = io_getevents(s->ctx, val, MAX_EVENTS, events, &ts);
        } while (nevents == -EINTR);

        for (i = 0; i < nevents; i++) {
            struct iocb *iocb = events[i].obj;
            struct qemu_laiocb *laiocb =
                    container_of(iocb, struct qemu_laiocb, iocb);

            laiocb->ret = io_event_ret(&events[i]);
            qemu_laio_process_completion(s, laiocb);
        }
    }
}
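
/*
 * Flush callback for qemu_aio_wait(): non-zero while this context still
 * has requests in flight, so the event loop keeps polling our eventfd.
 */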
static int qemu_laio_flush_cb(void *opaque)
{
    struct qemu_laio_state *s = opaque;

    return (s->count > 0) ? 1 : 0;
}

static void laio_cancel(BlockDriverAIOCB *blockacb)
{
    struct qemu_laiocb *laiocb = (struct qemu_laiocb *)blockacb;
    struct io_event event;
    int ret;

    if (laiocb->ret != -EINPROGRESS)
        return;

    /*
     * Note that as of Linux 2.6.31 neither the block device code nor any
     * filesystem implements cancellation of AIO requests.
     * Thus the polling loop below is the normal code path.
     */
    ret = io_cancel(laiocb->ctx->ctx, &laiocb->iocb, &event);
    if (ret == 0) {
        laiocb->ret = -ECANCELED;
        return;
    }

    /*
     * We have to wait for the iocb to finish.
     *
     * The only way to get the iocb status update is by polling the io context.
     * We might be able to do this slightly more efficiently by removing the
     * O_NONBLOCK flag.
     */
    while (laiocb->ret == -EINPROGRESS)
        qemu_laio_completion_cb(laiocb->ctx);
}

static AIOPool laio_pool = {
    .aiocb_size = sizeof(struct qemu_laiocb),
    .cancel     = laio_cancel,
};
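
/*
 * Submit one vectored read or write.  On failure the request is dropped
 * and NULL is returned; per the comment above MAX_EVENTS, an EAGAIN from
 * io_submit() thus ends up reported to the guest as an I/O error.
 */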
BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
        BlockDriverCompletionFunc *cb, void *opaque, int type)
{
    struct qemu_laio_state *s = aio_ctx;
    struct qemu_laiocb *laiocb;
    struct iocb *iocbs;
    off_t offset = sector_num * 512;

    laiocb = qemu_aio_get(&laio_pool, bs, cb, opaque);
    laiocb->nbytes = nb_sectors * 512;
    laiocb->ctx = s;
    laiocb->ret = -EINPROGRESS;
    laiocb->is_read = (type == QEMU_AIO_READ);
    laiocb->qiov = qiov;

    iocbs = &laiocb->iocb;

    switch (type) {
    case QEMU_AIO_WRITE:
        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
    case QEMU_AIO_READ:
        io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
        break;
    /* Currently the Linux kernel does not support other operations */
    default:
        fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
                __func__, type);
        goto out_free_aiocb;
    }
    io_set_eventfd(&laiocb->iocb, s->efd);
    s->count++;

    if (io_submit(s->ctx, 1, &iocbs) < 0)
        goto out_dec_count;
    return &laiocb->common;

out_dec_count:
    s->count--;
out_free_aiocb:
    qemu_aio_release(laiocb);
    return NULL;
}
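
/*
 * Usage sketch (illustrative; the surrounding variable names are assumed,
 * not defined in this file).  A caller such as the raw POSIX block driver
 * holds the opaque handle returned by laio_init() and submits requests
 * along these lines:
 *
 *     BlockDriverAIOCB *acb;
 *
 *     acb = laio_submit(bs, aio_ctx, fd, sector_num, qiov, nb_sectors,
 *                       completion_cb, opaque, QEMU_AIO_READ);
 *     if (acb == NULL) {
 *         // treat as an I/O error: the queue is full or the request
 *         // type is unsupported
 *     }
 */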

void *laio_init(void)
{
    struct qemu_laio_state *s;

    s = g_malloc0(sizeof(*s));
    s->efd = eventfd(0, 0);
    if (s->efd == -1)
        goto out_free_state;
    fcntl(s->efd, F_SETFL, O_NONBLOCK);

    if (io_setup(MAX_EVENTS, &s->ctx) != 0)
        goto out_close_efd;

    qemu_aio_set_fd_handler(s->efd, qemu_laio_completion_cb, NULL,
        qemu_laio_flush_cb, s);

    return s;

out_close_efd:
    close(s->efd);
out_free_state:
    g_free(s);
    return NULL;
}
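
/*
 * Usage sketch (illustrative; the fallback policy is an assumption, not
 * something this file dictates).  A block driver would probe for native
 * AIO support at open time:
 *
 *     void *aio_ctx = laio_init();
 *     if (aio_ctx == NULL) {
 *         // e.g. fall back to the thread-pool (paio_*) implementation
 *         // declared in block/raw-posix-aio.h
 *     }
 */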