aio-posix.c

/*
 * QEMU aio implementation
 *
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "block/block.h"
#include "block/thread-pool.h"
#include "qemu/main-loop.h"
#include "qemu/rcu.h"
#include "qemu/rcu_queue.h"
#include "qemu/sockets.h"
#include "qemu/cutils.h"
#include "trace.h"
#include "aio-posix.h"

/* Stop userspace polling on a handler if it isn't active for some time */
#define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND)

bool aio_poll_disabled(AioContext *ctx)
{
    return qatomic_read(&ctx->poll_disable_cnt);
}
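
/*
 * Queue @node on @ready_list so that its callbacks run during the next
 * dispatch.  @revents records which G_IO_* events fired.  It is safe to call
 * this while the node is already on a nested caller's ready list.
 */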
void aio_add_ready_handler(AioHandlerList *ready_list,
                           AioHandler *node,
                           int revents)
{
    QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */
    node->pfd.revents = revents;
    QLIST_INSERT_HEAD(ready_list, node, node_ready);
}

static void aio_add_poll_ready_handler(AioHandlerList *ready_list,
                                       AioHandler *node)
{
    QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */
    node->poll_ready = true;
    QLIST_INSERT_HEAD(ready_list, node, node_ready);
}
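
/* Return the handler registered for @fd, skipping nodes already marked
 * deleted, or NULL if there is none.
 */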
static AioHandler *find_aio_handler(AioContext *ctx, int fd)
{
    AioHandler *node;

    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
        if (node->pfd.fd == fd) {
            if (!QLIST_IS_INSERTED(node, node_deleted)) {
                return node;
            }
        }
    }

    return NULL;
}
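
/*
 * Unregister @node.  Returns true if the caller must free it immediately;
 * false means it was only marked deleted and will be freed later by
 * aio_free_deleted_handlers() once no one is walking the handler list.
 */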
static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
{
    /* If the GSource is in the process of being destroyed then
     * g_source_remove_poll() causes an assertion failure.  Skip
     * removal in that case, because glib cleans up its state during
     * destruction anyway.
     */
    if (!g_source_is_destroyed(&ctx->source)) {
        g_source_remove_poll(&ctx->source, &node->pfd);
    }

    node->pfd.revents = 0;
    node->poll_ready = false;

    /* If the fd monitor has already marked it deleted, leave it alone */
    if (QLIST_IS_INSERTED(node, node_deleted)) {
        return false;
    }

    /* If a read is in progress, just mark the node as deleted */
    if (qemu_lockcnt_count(&ctx->list_lock)) {
        QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
        return false;
    }
    /* Otherwise, delete it for real.  We can't just mark it as
     * deleted because deleted nodes are only cleaned up while
     * no one is walking the handlers list.
     */
    QLIST_SAFE_REMOVE(node, node_poll);
    QLIST_REMOVE(node, node);
    return true;
}
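
/*
 * Register, update, or remove the handlers for @fd.  Passing NULL for
 * io_read, io_write, and io_poll removes the handler.  An existing node is
 * never modified in place: a new node is inserted and the old one is
 * removed, so concurrent walkers always see a consistent handler.
 */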
void aio_set_fd_handler(AioContext *ctx,
                        int fd,
                        IOHandler *io_read,
                        IOHandler *io_write,
                        AioPollFn *io_poll,
                        IOHandler *io_poll_ready,
                        void *opaque)
{
    AioHandler *node;
    AioHandler *new_node = NULL;
    bool is_new = false;
    bool deleted = false;
    int poll_disable_change;

    if (io_poll && !io_poll_ready) {
        io_poll = NULL; /* polling only makes sense if there is a handler */
    }

    qemu_lockcnt_lock(&ctx->list_lock);

    node = find_aio_handler(ctx, fd);

    /* Are we deleting the fd handler? */
    if (!io_read && !io_write && !io_poll) {
        if (node == NULL) {
            qemu_lockcnt_unlock(&ctx->list_lock);
            return;
        }
        /* Clean events in order to unregister fd from the ctx epoll. */
        node->pfd.events = 0;

        poll_disable_change = -!node->io_poll;
    } else {
        poll_disable_change = !io_poll - (node && !node->io_poll);
        if (node == NULL) {
            is_new = true;
        }
        /* Alloc and insert if it's not already there */
        new_node = g_new0(AioHandler, 1);

        /* Update handler with latest information */
        new_node->io_read = io_read;
        new_node->io_write = io_write;
        new_node->io_poll = io_poll;
        new_node->io_poll_ready = io_poll_ready;
        new_node->opaque = opaque;

        if (is_new) {
            new_node->pfd.fd = fd;
        } else {
            new_node->pfd = node->pfd;
        }
        g_source_add_poll(&ctx->source, &new_node->pfd);

        new_node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
        new_node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);

        QLIST_INSERT_HEAD_RCU(&ctx->aio_handlers, new_node, node);
    }

    /* No need to order poll_disable_cnt writes against other updates;
     * the counter is only used to avoid wasting time and latency on
     * iterated polling when the system call will be ultimately necessary.
     * Changing handlers is a rare event, and a little wasted polling until
     * the aio_notify below is not an issue.
     */
    qatomic_set(&ctx->poll_disable_cnt,
                qatomic_read(&ctx->poll_disable_cnt) + poll_disable_change);

    ctx->fdmon_ops->update(ctx, node, new_node);
    if (node) {
        deleted = aio_remove_fd_handler(ctx, node);
    }
    qemu_lockcnt_unlock(&ctx->list_lock);
    aio_notify(ctx);

    if (deleted) {
        g_free(node);
    }
}

static void aio_set_fd_poll(AioContext *ctx, int fd,
                            IOHandler *io_poll_begin,
                            IOHandler *io_poll_end)
{
    AioHandler *node = find_aio_handler(ctx, fd);

    if (!node) {
        return;
    }

    node->io_poll_begin = io_poll_begin;
    node->io_poll_end = io_poll_end;
}

void aio_set_event_notifier(AioContext *ctx,
                            EventNotifier *notifier,
                            EventNotifierHandler *io_read,
                            AioPollFn *io_poll,
                            EventNotifierHandler *io_poll_ready)
{
    aio_set_fd_handler(ctx, event_notifier_get_fd(notifier),
                       (IOHandler *)io_read, NULL, io_poll,
                       (IOHandler *)io_poll_ready, notifier);
}

void aio_set_event_notifier_poll(AioContext *ctx,
                                 EventNotifier *notifier,
                                 EventNotifierHandler *io_poll_begin,
                                 EventNotifierHandler *io_poll_end)
{
    aio_set_fd_poll(ctx, event_notifier_get_fd(notifier),
                    (IOHandler *)io_poll_begin,
                    (IOHandler *)io_poll_end);
}
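
/*
 * Enter or leave userspace polling mode, invoking each poll handler's
 * io_poll_begin()/io_poll_end() callback.  Does nothing if the state is
 * unchanged.  When leaving poll mode, every handler is polled one final
 * time; handlers that became ready are queued on @ready_list and true is
 * returned.
 */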
static bool poll_set_started(AioContext *ctx, AioHandlerList *ready_list,
                             bool started)
{
    AioHandler *node;
    bool progress = false;

    if (started == ctx->poll_started) {
        return false;
    }

    ctx->poll_started = started;

    qemu_lockcnt_inc(&ctx->list_lock);
    QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
        IOHandler *fn;

        if (QLIST_IS_INSERTED(node, node_deleted)) {
            continue;
        }

        if (started) {
            fn = node->io_poll_begin;
        } else {
            fn = node->io_poll_end;
        }

        if (fn) {
            fn(node->opaque);
        }

        /* Poll one last time in case ->io_poll_end() raced with the event */
        if (!started && node->io_poll(node->opaque)) {
            aio_add_poll_ready_handler(ready_list, node);
            progress = true;
        }
    }
    qemu_lockcnt_dec(&ctx->list_lock);

    return progress;
}

bool aio_prepare(AioContext *ctx)
{
    AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);

    /* Poll mode cannot be used with glib's event loop, disable it. */
    poll_set_started(ctx, &ready_list, false);
    /* TODO what to do with this list? */

    return false;
}
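
/*
 * Return true if at least one handler has pending revents that
 * aio_dispatch() would act on.
 */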
bool aio_pending(AioContext *ctx)
{
    AioHandler *node;
    bool result = false;

    /*
     * We have to walk very carefully in case aio_set_fd_handler is
     * called while we're walking.
     */
    qemu_lockcnt_inc(&ctx->list_lock);

    QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
        int revents;

        /* TODO should this check poll ready? */
        revents = node->pfd.revents & node->pfd.events;
        if (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR) && node->io_read) {
            result = true;
            break;
        }
        if (revents & (G_IO_OUT | G_IO_ERR) && node->io_write) {
            result = true;
            break;
        }
    }
    qemu_lockcnt_dec(&ctx->list_lock);

    return result;
}
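
/*
 * Free handlers that were marked deleted while the list was being walked.
 * Nothing is freed if another walker is still active (nested aio_poll()).
 */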
static void aio_free_deleted_handlers(AioContext *ctx)
{
    AioHandler *node;

    if (QLIST_EMPTY_RCU(&ctx->deleted_aio_handlers)) {
        return;
    }
    if (!qemu_lockcnt_dec_if_lock(&ctx->list_lock)) {
        return; /* we are nested, let the parent do the freeing */
    }

    while ((node = QLIST_FIRST_RCU(&ctx->deleted_aio_handlers))) {
        QLIST_REMOVE(node, node);
        QLIST_REMOVE(node, node_deleted);
        QLIST_SAFE_REMOVE(node, node_poll);
        g_free(node);
    }

    qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
}
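
/*
 * Invoke the callbacks of a single handler: io_poll_ready() if the poll
 * handler fired and no fd revents are pending, otherwise io_read() and
 * io_write() for the pending revents.  Returns true if progress was made;
 * the ctx->notifier dummy handler does not count as progress.
 */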
static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node)
{
    bool progress = false;
    bool poll_ready;
    int revents;

    revents = node->pfd.revents & node->pfd.events;
    node->pfd.revents = 0;

    poll_ready = node->poll_ready;
    node->poll_ready = false;

    /*
     * Start polling AioHandlers when they become ready because activity is
     * likely to continue.  Note that starvation is theoretically possible when
     * fdmon_supports_polling(), but only until the fd fires for the first
     * time.
     */
    if (!QLIST_IS_INSERTED(node, node_deleted) &&
        !QLIST_IS_INSERTED(node, node_poll) &&
        node->io_poll) {
        trace_poll_add(ctx, node, node->pfd.fd, revents);
        if (ctx->poll_started && node->io_poll_begin) {
            node->io_poll_begin(node->opaque);
        }
        QLIST_INSERT_HEAD(&ctx->poll_aio_handlers, node, node_poll);
    }
    if (!QLIST_IS_INSERTED(node, node_deleted) &&
        poll_ready && revents == 0 && node->io_poll_ready) {
        /*
         * Remove temporarily to avoid infinite loops when ->io_poll_ready()
         * calls aio_poll() before clearing the condition that made the poll
         * handler become ready.
         */
        QLIST_SAFE_REMOVE(node, node_poll);

        node->io_poll_ready(node->opaque);

        if (!QLIST_IS_INSERTED(node, node_poll)) {
            QLIST_INSERT_HEAD(&ctx->poll_aio_handlers, node, node_poll);
        }

        /*
         * Return early since revents was zero.  aio_notify() does not count as
         * progress.
         */
        return node->opaque != &ctx->notifier;
    }

    if (!QLIST_IS_INSERTED(node, node_deleted) &&
        (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
        node->io_read) {
        node->io_read(node->opaque);

        /* aio_notify() does not count as progress */
        if (node->opaque != &ctx->notifier) {
            progress = true;
        }
    }
    if (!QLIST_IS_INSERTED(node, node_deleted) &&
        (revents & (G_IO_OUT | G_IO_ERR)) &&
        node->io_write) {
        node->io_write(node->opaque);
        progress = true;
    }

    return progress;
}

/*
 * If we have a list of ready handlers then this is more efficient than
 * scanning all handlers with aio_dispatch_handlers().
 */
static bool aio_dispatch_ready_handlers(AioContext *ctx,
                                        AioHandlerList *ready_list)
{
    bool progress = false;
    AioHandler *node;

    while ((node = QLIST_FIRST(ready_list))) {
        QLIST_REMOVE(node, node_ready);
        progress = aio_dispatch_handler(ctx, node) || progress;
    }

    return progress;
}

/* Slower than aio_dispatch_ready_handlers() but only used via glib */
static bool aio_dispatch_handlers(AioContext *ctx)
{
    AioHandler *node, *tmp;
    bool progress = false;

    QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) {
        progress = aio_dispatch_handler(ctx, node) || progress;
    }

    return progress;
}

void aio_dispatch(AioContext *ctx)
{
    qemu_lockcnt_inc(&ctx->list_lock);
    aio_bh_poll(ctx);
    aio_dispatch_handlers(ctx);
    aio_free_deleted_handlers(ctx);
    qemu_lockcnt_dec(&ctx->list_lock);

    timerlistgroup_run_timers(&ctx->tlg);
}
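
/*
 * Run every registered ->io_poll() handler once.  Handlers that report
 * readiness are queued on @ready_list and *timeout is reset to 0 so that
 * the caller re-evaluates the polling deadline.
 */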
static bool run_poll_handlers_once(AioContext *ctx,
                                   AioHandlerList *ready_list,
                                   int64_t now,
                                   int64_t *timeout)
{
    bool progress = false;
    AioHandler *node;
    AioHandler *tmp;

    QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
        if (node->io_poll(node->opaque)) {
            aio_add_poll_ready_handler(ready_list, node);

            node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;

            /*
             * Polling was successful, exit try_poll_mode immediately
             * to adjust the next polling time.
             */
            *timeout = 0;
            if (node->opaque != &ctx->notifier) {
                progress = true;
            }
        }

        /* Caller handles freeing deleted nodes.  Don't do it here. */
    }

    return progress;
}

static bool fdmon_supports_polling(AioContext *ctx)
{
    return ctx->fdmon_ops->need_wait != aio_poll_disabled;
}
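
/*
 * Stop polling handlers that have not become ready within
 * POLL_IDLE_INTERVAL_NS so that idle fds do not keep burning CPU in the
 * polling loop.  Only done when the fd monitor supports userspace polling.
 */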
static bool remove_idle_poll_handlers(AioContext *ctx,
                                      AioHandlerList *ready_list,
                                      int64_t now)
{
    AioHandler *node;
    AioHandler *tmp;
    bool progress = false;

    /*
     * File descriptor monitoring implementations without userspace polling
     * support suffer from starvation when a subset of handlers is polled
     * because fds will not be processed in a timely fashion.  Don't remove
     * idle poll handlers.
     */
    if (!fdmon_supports_polling(ctx)) {
        return false;
    }

    QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
        if (node->poll_idle_timeout == 0LL) {
            node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
        } else if (now >= node->poll_idle_timeout) {
            trace_poll_remove(ctx, node, node->pfd.fd);
            node->poll_idle_timeout = 0LL;
            QLIST_SAFE_REMOVE(node, node_poll);
            if (ctx->poll_started && node->io_poll_end) {
                node->io_poll_end(node->opaque);

                /*
                 * Final poll in case ->io_poll_end() races with an event.
                 * Never mind about re-adding the handler in the rare case
                 * where this causes progress.
                 */
                if (node->io_poll(node->opaque)) {
                    aio_add_poll_ready_handler(ready_list, node);
                    progress = true;
                }
            }
        }
    }

    return progress;
}

/* run_poll_handlers:
 * @ctx: the AioContext
 * @ready_list: the list to place ready handlers on
 * @max_ns: maximum time to poll for, in nanoseconds
 *
 * Polls for a given time.
 *
 * Note that the caller must have incremented ctx->list_lock.
 *
 * Returns: true if progress was made, false otherwise
 */
static bool run_poll_handlers(AioContext *ctx, AioHandlerList *ready_list,
                              int64_t max_ns, int64_t *timeout)
{
    bool progress;
    int64_t start_time, elapsed_time;

    assert(qemu_lockcnt_count(&ctx->list_lock) > 0);

    trace_run_poll_handlers_begin(ctx, max_ns, *timeout);

    /*
     * Optimization: ->io_poll() handlers often contain RCU read critical
     * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock()
     * -> rcu_read_lock() -> ... sequences with expensive memory
     * synchronization primitives.  Make the entire polling loop an RCU
     * critical section because nested rcu_read_lock()/rcu_read_unlock() calls
     * are cheap.
     */
    RCU_READ_LOCK_GUARD();

    start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    do {
        progress = run_poll_handlers_once(ctx, ready_list,
                                          start_time, timeout);
        elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time;
        max_ns = qemu_soonest_timeout(*timeout, max_ns);
        assert(!(max_ns && progress));
    } while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx));

    if (remove_idle_poll_handlers(ctx, ready_list,
                                  start_time + elapsed_time)) {
        *timeout = 0;
        progress = true;
    }

    /* If time has passed with no successful polling, adjust *timeout to
     * keep the same ending time.
     */
    if (*timeout != -1) {
        *timeout -= MIN(*timeout, elapsed_time);
    }

    trace_run_poll_handlers_end(ctx, progress, *timeout);

    return progress;
}

/* try_poll_mode:
 * @ctx: the AioContext
 * @ready_list: list to add handlers that need to be run
 * @timeout: timeout for blocking wait, computed by the caller and updated if
 *           polling succeeds.
 *
 * Note that the caller must have incremented ctx->list_lock.
 *
 * Returns: true if progress was made, false otherwise
 */
static bool try_poll_mode(AioContext *ctx, AioHandlerList *ready_list,
                          int64_t *timeout)
{
    int64_t max_ns;

    if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) {
        return false;
    }

    max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
    if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
        /*
         * Enable poll mode.  It pairs with the poll_set_started() in
         * aio_poll() which disables poll mode.
         */
        poll_set_started(ctx, ready_list, true);

        if (run_poll_handlers(ctx, ready_list, max_ns, timeout)) {
            return true;
        }
    }

    return false;
}
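
/*
 * Run one iteration of the event loop: try userspace polling first, fall
 * back to the fd monitor's wait() when a blocking timeout is needed, then
 * dispatch bottom halves, ready handlers, and timers.  The polling window
 * (ctx->poll_ns) is grown or shrunk based on how long this iteration
 * blocked.
 */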
bool aio_poll(AioContext *ctx, bool blocking)
{
    AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
    bool progress;
    bool use_notify_me;
    int64_t timeout;
    int64_t start = 0;

    /*
     * There cannot be two concurrent aio_poll calls for the same AioContext (or
     * an aio_poll concurrent with a GSource prepare/check/dispatch callback).
     * We rely on this below to avoid slow locked accesses to ctx->notify_me.
     *
     * aio_poll() may only be called in the AioContext's thread.  iohandler_ctx
     * is special in that it runs in the main thread, but that thread's context
     * is qemu_aio_context.
     */
    assert(in_aio_context_home_thread(ctx == iohandler_get_aio_context() ?
                                      qemu_get_aio_context() : ctx));

    qemu_lockcnt_inc(&ctx->list_lock);

    if (ctx->poll_max_ns) {
        start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    }

    timeout = blocking ? aio_compute_timeout(ctx) : 0;
    progress = try_poll_mode(ctx, &ready_list, &timeout);
    assert(!(timeout && progress));

    /*
     * aio_notify can avoid the expensive event_notifier_set if
     * everything (file descriptors, bottom halves, timers) will
     * be re-evaluated before the next blocking poll().  This is
     * already true when aio_poll is called with blocking == false;
     * if blocking == true, it is only true after poll() returns,
     * so disable the optimization now.
     */
    use_notify_me = timeout != 0;
    if (use_notify_me) {
        qatomic_set(&ctx->notify_me, qatomic_read(&ctx->notify_me) + 2);
        /*
         * Write ctx->notify_me before reading ctx->notified.  Pairs with
         * smp_mb in aio_notify().
         */
        smp_mb();

        /* Don't block if aio_notify() was called */
        if (qatomic_read(&ctx->notified)) {
            timeout = 0;
        }
    }

    /* If polling is allowed, non-blocking aio_poll does not need the
     * system call---a single round of run_poll_handlers_once suffices.
     */
    if (timeout || ctx->fdmon_ops->need_wait(ctx)) {
        /*
         * Disable poll mode.  poll mode should be disabled before the call
         * of ctx->fdmon_ops->wait() so that guest's notification can wake
         * up IO threads when some work becomes pending.  It is essential to
         * avoid hangs or unnecessary latency.
         */
        if (poll_set_started(ctx, &ready_list, false)) {
            timeout = 0;
            progress = true;
        }

        ctx->fdmon_ops->wait(ctx, &ready_list, timeout);
    }

    if (use_notify_me) {
        /* Finish the poll before clearing the flag.  */
        qatomic_store_release(&ctx->notify_me,
                              qatomic_read(&ctx->notify_me) - 2);
    }

    aio_notify_accept(ctx);

    /* Adjust polling time */
    if (ctx->poll_max_ns) {
        int64_t block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start;

        if (block_ns <= ctx->poll_ns) {
            /* This is the sweet spot, no adjustment needed */
        } else if (block_ns > ctx->poll_max_ns) {
            /* We'd have to poll for too long, poll less */
            int64_t old = ctx->poll_ns;

            if (ctx->poll_shrink) {
                ctx->poll_ns /= ctx->poll_shrink;
            } else {
                ctx->poll_ns = 0;
            }

            trace_poll_shrink(ctx, old, ctx->poll_ns);
        } else if (ctx->poll_ns < ctx->poll_max_ns &&
                   block_ns < ctx->poll_max_ns) {
            /* There is room to grow, poll longer */
            int64_t old = ctx->poll_ns;
            int64_t grow = ctx->poll_grow;

            if (grow == 0) {
                grow = 2;
            }

            if (ctx->poll_ns) {
                ctx->poll_ns *= grow;
            } else {
                ctx->poll_ns = 4000; /* start polling at 4 microseconds */
            }

            if (ctx->poll_ns > ctx->poll_max_ns) {
                ctx->poll_ns = ctx->poll_max_ns;
            }

            trace_poll_grow(ctx, old, ctx->poll_ns);
        }
    }

    progress |= aio_bh_poll(ctx);
    progress |= aio_dispatch_ready_handlers(ctx, &ready_list);

    aio_free_deleted_handlers(ctx);

    qemu_lockcnt_dec(&ctx->list_lock);

    progress |= timerlistgroup_run_timers(&ctx->tlg);

    return progress;
}
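
/*
 * Select the fd monitoring implementation: fdmon_poll_ops is the default,
 * io_uring is preferred when it can be set up, otherwise epoll is tried.
 */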
void aio_context_setup(AioContext *ctx)
{
    ctx->fdmon_ops = &fdmon_poll_ops;
    ctx->epollfd = -1;

    /* Use the fastest fd monitoring implementation if available */
    if (fdmon_io_uring_setup(ctx)) {
        return;
    }

    fdmon_epoll_setup(ctx);
}

void aio_context_destroy(AioContext *ctx)
{
    fdmon_io_uring_destroy(ctx);
    fdmon_epoll_disable(ctx);
    aio_free_deleted_handlers(ctx);
}

void aio_context_use_g_source(AioContext *ctx)
{
    /*
     * Disable io_uring when the glib main loop is used because it doesn't
     * support mixed glib/aio_poll() usage.  It relies on aio_poll() being
     * called regularly so that changes to the monitored file descriptors are
     * submitted, otherwise a list of pending fd handlers builds up.
     */
    fdmon_io_uring_destroy(ctx);
    aio_free_deleted_handlers(ctx);
}

void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
                                 int64_t grow, int64_t shrink, Error **errp)
{
    /* No thread synchronization here, it doesn't matter if an incorrect value
     * is used once.
     */
    ctx->poll_max_ns = max_ns;
    ctx->poll_ns = 0;
    ctx->poll_grow = grow;
    ctx->poll_shrink = shrink;

    aio_notify(ctx);
}

void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch)
{
    /*
     * No thread synchronization here, it doesn't matter if an incorrect value
     * is used once.
     */
    ctx->aio_max_batch = max_batch;

    aio_notify(ctx);
}