2
0

aio-posix.c 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800
  1. /*
  2. * QEMU aio implementation
  3. *
  4. * Copyright IBM, Corp. 2008
  5. *
  6. * Authors:
  7. * Anthony Liguori <aliguori@us.ibm.com>
  8. *
  9. * This work is licensed under the terms of the GNU GPL, version 2. See
  10. * the COPYING file in the top-level directory.
  11. *
  12. * Contributions after 2012-01-13 are licensed under the terms of the
  13. * GNU GPL, version 2 or (at your option) any later version.
  14. */
  15. #include "qemu/osdep.h"
  16. #include "block/block.h"
  17. #include "block/thread-pool.h"
  18. #include "qemu/main-loop.h"
  19. #include "qemu/rcu.h"
  20. #include "qemu/rcu_queue.h"
  21. #include "qemu/sockets.h"
  22. #include "qemu/cutils.h"
  23. #include "trace.h"
  24. #include "aio-posix.h"
  /*
   * How long a handler may go without a successful poll before userspace
   * polling is stopped for it.
   */
  #define POLL_IDLE_INTERVAL_NS (NANOSECONDS_PER_SECOND * 7)
  27. bool aio_poll_disabled(AioContext *ctx)
  28. {
  29. return qatomic_read(&ctx->poll_disable_cnt);
  30. }
  31. void aio_add_ready_handler(AioHandlerList *ready_list,
  32. AioHandler *node,
  33. int revents)
  34. {
  35. QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */
  36. node->pfd.revents = revents;
  37. QLIST_INSERT_HEAD(ready_list, node, node_ready);
  38. }
  39. static void aio_add_poll_ready_handler(AioHandlerList *ready_list,
  40. AioHandler *node)
  41. {
  42. QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */
  43. node->poll_ready = true;
  44. QLIST_INSERT_HEAD(ready_list, node, node_ready);
  45. }
  46. static AioHandler *find_aio_handler(AioContext *ctx, int fd)
  47. {
  48. AioHandler *node;
  49. QLIST_FOREACH(node, &ctx->aio_handlers, node) {
  50. if (node->pfd.fd == fd) {
  51. if (!QLIST_IS_INSERTED(node, node_deleted)) {
  52. return node;
  53. }
  54. }
  55. }
  56. return NULL;
  57. }
  58. static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
  59. {
  60. /* If the GSource is in the process of being destroyed then
  61. * g_source_remove_poll() causes an assertion failure. Skip
  62. * removal in that case, because glib cleans up its state during
  63. * destruction anyway.
  64. */
  65. if (!g_source_is_destroyed(&ctx->source)) {
  66. g_source_remove_poll(&ctx->source, &node->pfd);
  67. }
  68. node->pfd.revents = 0;
  69. node->poll_ready = false;
  70. /* If the fd monitor has already marked it deleted, leave it alone */
  71. if (QLIST_IS_INSERTED(node, node_deleted)) {
  72. return false;
  73. }
  74. /* If a read is in progress, just mark the node as deleted */
  75. if (qemu_lockcnt_count(&ctx->list_lock)) {
  76. QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
  77. return false;
  78. }
  79. /* Otherwise, delete it for real. We can't just mark it as
  80. * deleted because deleted nodes are only cleaned up while
  81. * no one is walking the handlers list.
  82. */
  83. QLIST_SAFE_REMOVE(node, node_poll);
  84. QLIST_REMOVE(node, node);
  85. return true;
  86. }
  87. void aio_set_fd_handler(AioContext *ctx,
  88. int fd,
  89. bool is_external,
  90. IOHandler *io_read,
  91. IOHandler *io_write,
  92. AioPollFn *io_poll,
  93. IOHandler *io_poll_ready,
  94. void *opaque)
  95. {
  96. AioHandler *node;
  97. AioHandler *new_node = NULL;
  98. bool is_new = false;
  99. bool deleted = false;
  100. int poll_disable_change;
  101. if (io_poll && !io_poll_ready) {
  102. io_poll = NULL; /* polling only makes sense if there is a handler */
  103. }
  104. qemu_lockcnt_lock(&ctx->list_lock);
  105. node = find_aio_handler(ctx, fd);
  106. /* Are we deleting the fd handler? */
  107. if (!io_read && !io_write && !io_poll) {
  108. if (node == NULL) {
  109. qemu_lockcnt_unlock(&ctx->list_lock);
  110. return;
  111. }
  112. /* Clean events in order to unregister fd from the ctx epoll. */
  113. node->pfd.events = 0;
  114. poll_disable_change = -!node->io_poll;
  115. } else {
  116. poll_disable_change = !io_poll - (node && !node->io_poll);
  117. if (node == NULL) {
  118. is_new = true;
  119. }
  120. /* Alloc and insert if it's not already there */
  121. new_node = g_new0(AioHandler, 1);
  122. /* Update handler with latest information */
  123. new_node->io_read = io_read;
  124. new_node->io_write = io_write;
  125. new_node->io_poll = io_poll;
  126. new_node->io_poll_ready = io_poll_ready;
  127. new_node->opaque = opaque;
  128. new_node->is_external = is_external;
  129. if (is_new) {
  130. new_node->pfd.fd = fd;
  131. } else {
  132. new_node->pfd = node->pfd;
  133. }
  134. g_source_add_poll(&ctx->source, &new_node->pfd);
  135. new_node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
  136. new_node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
  137. QLIST_INSERT_HEAD_RCU(&ctx->aio_handlers, new_node, node);
  138. }
  139. /* No need to order poll_disable_cnt writes against other updates;
  140. * the counter is only used to avoid wasting time and latency on
  141. * iterated polling when the system call will be ultimately necessary.
  142. * Changing handlers is a rare event, and a little wasted polling until
  143. * the aio_notify below is not an issue.
  144. */
  145. qatomic_set(&ctx->poll_disable_cnt,
  146. qatomic_read(&ctx->poll_disable_cnt) + poll_disable_change);
  147. ctx->fdmon_ops->update(ctx, node, new_node);
  148. if (node) {
  149. deleted = aio_remove_fd_handler(ctx, node);
  150. }
  151. qemu_lockcnt_unlock(&ctx->list_lock);
  152. aio_notify(ctx);
  153. if (deleted) {
  154. g_free(node);
  155. }
  156. }
  157. static void aio_set_fd_poll(AioContext *ctx, int fd,
  158. IOHandler *io_poll_begin,
  159. IOHandler *io_poll_end)
  160. {
  161. AioHandler *node = find_aio_handler(ctx, fd);
  162. if (!node) {
  163. return;
  164. }
  165. node->io_poll_begin = io_poll_begin;
  166. node->io_poll_end = io_poll_end;
  167. }
  168. void aio_set_event_notifier(AioContext *ctx,
  169. EventNotifier *notifier,
  170. bool is_external,
  171. EventNotifierHandler *io_read,
  172. AioPollFn *io_poll,
  173. EventNotifierHandler *io_poll_ready)
  174. {
  175. aio_set_fd_handler(ctx, event_notifier_get_fd(notifier), is_external,
  176. (IOHandler *)io_read, NULL, io_poll,
  177. (IOHandler *)io_poll_ready, notifier);
  178. }
  179. void aio_set_event_notifier_poll(AioContext *ctx,
  180. EventNotifier *notifier,
  181. EventNotifierHandler *io_poll_begin,
  182. EventNotifierHandler *io_poll_end)
  183. {
  184. aio_set_fd_poll(ctx, event_notifier_get_fd(notifier),
  185. (IOHandler *)io_poll_begin,
  186. (IOHandler *)io_poll_end);
  187. }
  188. static bool poll_set_started(AioContext *ctx, AioHandlerList *ready_list,
  189. bool started)
  190. {
  191. AioHandler *node;
  192. bool progress = false;
  193. if (started == ctx->poll_started) {
  194. return false;
  195. }
  196. ctx->poll_started = started;
  197. qemu_lockcnt_inc(&ctx->list_lock);
  198. QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
  199. IOHandler *fn;
  200. if (QLIST_IS_INSERTED(node, node_deleted)) {
  201. continue;
  202. }
  203. if (started) {
  204. fn = node->io_poll_begin;
  205. } else {
  206. fn = node->io_poll_end;
  207. }
  208. if (fn) {
  209. fn(node->opaque);
  210. }
  211. /* Poll one last time in case ->io_poll_end() raced with the event */
  212. if (!started && node->io_poll(node->opaque)) {
  213. aio_add_poll_ready_handler(ready_list, node);
  214. progress = true;
  215. }
  216. }
  217. qemu_lockcnt_dec(&ctx->list_lock);
  218. return progress;
  219. }
  220. bool aio_prepare(AioContext *ctx)
  221. {
  222. AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
  223. /* Poll mode cannot be used with glib's event loop, disable it. */
  224. poll_set_started(ctx, &ready_list, false);
  225. /* TODO what to do with this list? */
  226. return false;
  227. }
  228. bool aio_pending(AioContext *ctx)
  229. {
  230. AioHandler *node;
  231. bool result = false;
  232. /*
  233. * We have to walk very carefully in case aio_set_fd_handler is
  234. * called while we're walking.
  235. */
  236. qemu_lockcnt_inc(&ctx->list_lock);
  237. QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
  238. int revents;
  239. /* TODO should this check poll ready? */
  240. revents = node->pfd.revents & node->pfd.events;
  241. if (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR) && node->io_read &&
  242. aio_node_check(ctx, node->is_external)) {
  243. result = true;
  244. break;
  245. }
  246. if (revents & (G_IO_OUT | G_IO_ERR) && node->io_write &&
  247. aio_node_check(ctx, node->is_external)) {
  248. result = true;
  249. break;
  250. }
  251. }
  252. qemu_lockcnt_dec(&ctx->list_lock);
  253. return result;
  254. }
  255. static void aio_free_deleted_handlers(AioContext *ctx)
  256. {
  257. AioHandler *node;
  258. if (QLIST_EMPTY_RCU(&ctx->deleted_aio_handlers)) {
  259. return;
  260. }
  261. if (!qemu_lockcnt_dec_if_lock(&ctx->list_lock)) {
  262. return; /* we are nested, let the parent do the freeing */
  263. }
  264. while ((node = QLIST_FIRST_RCU(&ctx->deleted_aio_handlers))) {
  265. QLIST_REMOVE(node, node);
  266. QLIST_REMOVE(node, node_deleted);
  267. QLIST_SAFE_REMOVE(node, node_poll);
  268. g_free(node);
  269. }
  270. qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
  271. }
  272. static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node)
  273. {
  274. bool progress = false;
  275. bool poll_ready;
  276. int revents;
  277. revents = node->pfd.revents & node->pfd.events;
  278. node->pfd.revents = 0;
  279. poll_ready = node->poll_ready;
  280. node->poll_ready = false;
  281. /*
  282. * Start polling AioHandlers when they become ready because activity is
  283. * likely to continue. Note that starvation is theoretically possible when
  284. * fdmon_supports_polling(), but only until the fd fires for the first
  285. * time.
  286. */
  287. if (!QLIST_IS_INSERTED(node, node_deleted) &&
  288. !QLIST_IS_INSERTED(node, node_poll) &&
  289. node->io_poll) {
  290. trace_poll_add(ctx, node, node->pfd.fd, revents);
  291. if (ctx->poll_started && node->io_poll_begin) {
  292. node->io_poll_begin(node->opaque);
  293. }
  294. QLIST_INSERT_HEAD(&ctx->poll_aio_handlers, node, node_poll);
  295. }
  296. if (!QLIST_IS_INSERTED(node, node_deleted) &&
  297. poll_ready && revents == 0 &&
  298. aio_node_check(ctx, node->is_external) &&
  299. node->io_poll_ready) {
  300. /*
  301. * Remove temporarily to avoid infinite loops when ->io_poll_ready()
  302. * calls aio_poll() before clearing the condition that made the poll
  303. * handler become ready.
  304. */
  305. QLIST_SAFE_REMOVE(node, node_poll);
  306. node->io_poll_ready(node->opaque);
  307. if (!QLIST_IS_INSERTED(node, node_poll)) {
  308. QLIST_INSERT_HEAD(&ctx->poll_aio_handlers, node, node_poll);
  309. }
  310. /*
  311. * Return early since revents was zero. aio_notify() does not count as
  312. * progress.
  313. */
  314. return node->opaque != &ctx->notifier;
  315. }
  316. if (!QLIST_IS_INSERTED(node, node_deleted) &&
  317. (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
  318. aio_node_check(ctx, node->is_external) &&
  319. node->io_read) {
  320. node->io_read(node->opaque);
  321. /* aio_notify() does not count as progress */
  322. if (node->opaque != &ctx->notifier) {
  323. progress = true;
  324. }
  325. }
  326. if (!QLIST_IS_INSERTED(node, node_deleted) &&
  327. (revents & (G_IO_OUT | G_IO_ERR)) &&
  328. aio_node_check(ctx, node->is_external) &&
  329. node->io_write) {
  330. node->io_write(node->opaque);
  331. progress = true;
  332. }
  333. return progress;
  334. }
  335. /*
  336. * If we have a list of ready handlers then this is more efficient than
  337. * scanning all handlers with aio_dispatch_handlers().
  338. */
  339. static bool aio_dispatch_ready_handlers(AioContext *ctx,
  340. AioHandlerList *ready_list)
  341. {
  342. bool progress = false;
  343. AioHandler *node;
  344. while ((node = QLIST_FIRST(ready_list))) {
  345. QLIST_REMOVE(node, node_ready);
  346. progress = aio_dispatch_handler(ctx, node) || progress;
  347. }
  348. return progress;
  349. }
  350. /* Slower than aio_dispatch_ready_handlers() but only used via glib */
  351. static bool aio_dispatch_handlers(AioContext *ctx)
  352. {
  353. AioHandler *node, *tmp;
  354. bool progress = false;
  355. QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) {
  356. progress = aio_dispatch_handler(ctx, node) || progress;
  357. }
  358. return progress;
  359. }
  360. void aio_dispatch(AioContext *ctx)
  361. {
  362. qemu_lockcnt_inc(&ctx->list_lock);
  363. aio_bh_poll(ctx);
  364. aio_dispatch_handlers(ctx);
  365. aio_free_deleted_handlers(ctx);
  366. qemu_lockcnt_dec(&ctx->list_lock);
  367. timerlistgroup_run_timers(&ctx->tlg);
  368. }
  369. static bool run_poll_handlers_once(AioContext *ctx,
  370. AioHandlerList *ready_list,
  371. int64_t now,
  372. int64_t *timeout)
  373. {
  374. bool progress = false;
  375. AioHandler *node;
  376. AioHandler *tmp;
  377. QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
  378. if (aio_node_check(ctx, node->is_external) &&
  379. node->io_poll(node->opaque)) {
  380. aio_add_poll_ready_handler(ready_list, node);
  381. node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
  382. /*
  383. * Polling was successful, exit try_poll_mode immediately
  384. * to adjust the next polling time.
  385. */
  386. *timeout = 0;
  387. if (node->opaque != &ctx->notifier) {
  388. progress = true;
  389. }
  390. }
  391. /* Caller handles freeing deleted nodes. Don't do it here. */
  392. }
  393. return progress;
  394. }
  395. static bool fdmon_supports_polling(AioContext *ctx)
  396. {
  397. return ctx->fdmon_ops->need_wait != aio_poll_disabled;
  398. }
  399. static bool remove_idle_poll_handlers(AioContext *ctx,
  400. AioHandlerList *ready_list,
  401. int64_t now)
  402. {
  403. AioHandler *node;
  404. AioHandler *tmp;
  405. bool progress = false;
  406. /*
  407. * File descriptor monitoring implementations without userspace polling
  408. * support suffer from starvation when a subset of handlers is polled
  409. * because fds will not be processed in a timely fashion. Don't remove
  410. * idle poll handlers.
  411. */
  412. if (!fdmon_supports_polling(ctx)) {
  413. return false;
  414. }
  415. QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
  416. if (node->poll_idle_timeout == 0LL) {
  417. node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
  418. } else if (now >= node->poll_idle_timeout) {
  419. trace_poll_remove(ctx, node, node->pfd.fd);
  420. node->poll_idle_timeout = 0LL;
  421. QLIST_SAFE_REMOVE(node, node_poll);
  422. if (ctx->poll_started && node->io_poll_end) {
  423. node->io_poll_end(node->opaque);
  424. /*
  425. * Final poll in case ->io_poll_end() races with an event.
  426. * Nevermind about re-adding the handler in the rare case where
  427. * this causes progress.
  428. */
  429. if (node->io_poll(node->opaque)) {
  430. aio_add_poll_ready_handler(ready_list, node);
  431. progress = true;
  432. }
  433. }
  434. }
  435. }
  436. return progress;
  437. }
  438. /* run_poll_handlers:
  439. * @ctx: the AioContext
  440. * @ready_list: the list to place ready handlers on
  441. * @max_ns: maximum time to poll for, in nanoseconds
  442. *
  443. * Polls for a given time.
  444. *
  445. * Note that the caller must have incremented ctx->list_lock.
  446. *
  447. * Returns: true if progress was made, false otherwise
  448. */
  449. static bool run_poll_handlers(AioContext *ctx, AioHandlerList *ready_list,
  450. int64_t max_ns, int64_t *timeout)
  451. {
  452. bool progress;
  453. int64_t start_time, elapsed_time;
  454. assert(qemu_lockcnt_count(&ctx->list_lock) > 0);
  455. trace_run_poll_handlers_begin(ctx, max_ns, *timeout);
  456. /*
  457. * Optimization: ->io_poll() handlers often contain RCU read critical
  458. * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock()
  459. * -> rcu_read_lock() -> ... sequences with expensive memory
  460. * synchronization primitives. Make the entire polling loop an RCU
  461. * critical section because nested rcu_read_lock()/rcu_read_unlock() calls
  462. * are cheap.
  463. */
  464. RCU_READ_LOCK_GUARD();
  465. start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
  466. do {
  467. progress = run_poll_handlers_once(ctx, ready_list,
  468. start_time, timeout);
  469. elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time;
  470. max_ns = qemu_soonest_timeout(*timeout, max_ns);
  471. assert(!(max_ns && progress));
  472. } while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx));
  473. if (remove_idle_poll_handlers(ctx, ready_list,
  474. start_time + elapsed_time)) {
  475. *timeout = 0;
  476. progress = true;
  477. }
  478. /* If time has passed with no successful polling, adjust *timeout to
  479. * keep the same ending time.
  480. */
  481. if (*timeout != -1) {
  482. *timeout -= MIN(*timeout, elapsed_time);
  483. }
  484. trace_run_poll_handlers_end(ctx, progress, *timeout);
  485. return progress;
  486. }
  487. /* try_poll_mode:
  488. * @ctx: the AioContext
  489. * @ready_list: list to add handlers that need to be run
  490. * @timeout: timeout for blocking wait, computed by the caller and updated if
  491. * polling succeeds.
  492. *
  493. * Note that the caller must have incremented ctx->list_lock.
  494. *
  495. * Returns: true if progress was made, false otherwise
  496. */
  497. static bool try_poll_mode(AioContext *ctx, AioHandlerList *ready_list,
  498. int64_t *timeout)
  499. {
  500. int64_t max_ns;
  501. if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) {
  502. return false;
  503. }
  504. max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
  505. if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
  506. /*
  507. * Enable poll mode. It pairs with the poll_set_started() in
  508. * aio_poll() which disables poll mode.
  509. */
  510. poll_set_started(ctx, ready_list, true);
  511. if (run_poll_handlers(ctx, ready_list, max_ns, timeout)) {
  512. return true;
  513. }
  514. }
  515. return false;
  516. }
  517. bool aio_poll(AioContext *ctx, bool blocking)
  518. {
  519. AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
  520. bool progress;
  521. bool use_notify_me;
  522. int64_t timeout;
  523. int64_t start = 0;
  524. /*
  525. * There cannot be two concurrent aio_poll calls for the same AioContext (or
  526. * an aio_poll concurrent with a GSource prepare/check/dispatch callback).
  527. * We rely on this below to avoid slow locked accesses to ctx->notify_me.
  528. *
  529. * aio_poll() may only be called in the AioContext's thread. iohandler_ctx
  530. * is special in that it runs in the main thread, but that thread's context
  531. * is qemu_aio_context.
  532. */
  533. assert(in_aio_context_home_thread(ctx == iohandler_get_aio_context() ?
  534. qemu_get_aio_context() : ctx));
  535. qemu_lockcnt_inc(&ctx->list_lock);
  536. if (ctx->poll_max_ns) {
  537. start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
  538. }
  539. timeout = blocking ? aio_compute_timeout(ctx) : 0;
  540. progress = try_poll_mode(ctx, &ready_list, &timeout);
  541. assert(!(timeout && progress));
  542. /*
  543. * aio_notify can avoid the expensive event_notifier_set if
  544. * everything (file descriptors, bottom halves, timers) will
  545. * be re-evaluated before the next blocking poll(). This is
  546. * already true when aio_poll is called with blocking == false;
  547. * if blocking == true, it is only true after poll() returns,
  548. * so disable the optimization now.
  549. */
  550. use_notify_me = timeout != 0;
  551. if (use_notify_me) {
  552. qatomic_set(&ctx->notify_me, qatomic_read(&ctx->notify_me) + 2);
  553. /*
  554. * Write ctx->notify_me before reading ctx->notified. Pairs with
  555. * smp_mb in aio_notify().
  556. */
  557. smp_mb();
  558. /* Don't block if aio_notify() was called */
  559. if (qatomic_read(&ctx->notified)) {
  560. timeout = 0;
  561. }
  562. }
  563. /* If polling is allowed, non-blocking aio_poll does not need the
  564. * system call---a single round of run_poll_handlers_once suffices.
  565. */
  566. if (timeout || ctx->fdmon_ops->need_wait(ctx)) {
  567. /*
  568. * Disable poll mode. poll mode should be disabled before the call
  569. * of ctx->fdmon_ops->wait() so that guest's notification can wake
  570. * up IO threads when some work becomes pending. It is essential to
  571. * avoid hangs or unnecessary latency.
  572. */
  573. if (poll_set_started(ctx, &ready_list, false)) {
  574. timeout = 0;
  575. progress = true;
  576. }
  577. ctx->fdmon_ops->wait(ctx, &ready_list, timeout);
  578. }
  579. if (use_notify_me) {
  580. /* Finish the poll before clearing the flag. */
  581. qatomic_store_release(&ctx->notify_me,
  582. qatomic_read(&ctx->notify_me) - 2);
  583. }
  584. aio_notify_accept(ctx);
  585. /* Adjust polling time */
  586. if (ctx->poll_max_ns) {
  587. int64_t block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start;
  588. if (block_ns <= ctx->poll_ns) {
  589. /* This is the sweet spot, no adjustment needed */
  590. } else if (block_ns > ctx->poll_max_ns) {
  591. /* We'd have to poll for too long, poll less */
  592. int64_t old = ctx->poll_ns;
  593. if (ctx->poll_shrink) {
  594. ctx->poll_ns /= ctx->poll_shrink;
  595. } else {
  596. ctx->poll_ns = 0;
  597. }
  598. trace_poll_shrink(ctx, old, ctx->poll_ns);
  599. } else if (ctx->poll_ns < ctx->poll_max_ns &&
  600. block_ns < ctx->poll_max_ns) {
  601. /* There is room to grow, poll longer */
  602. int64_t old = ctx->poll_ns;
  603. int64_t grow = ctx->poll_grow;
  604. if (grow == 0) {
  605. grow = 2;
  606. }
  607. if (ctx->poll_ns) {
  608. ctx->poll_ns *= grow;
  609. } else {
  610. ctx->poll_ns = 4000; /* start polling at 4 microseconds */
  611. }
  612. if (ctx->poll_ns > ctx->poll_max_ns) {
  613. ctx->poll_ns = ctx->poll_max_ns;
  614. }
  615. trace_poll_grow(ctx, old, ctx->poll_ns);
  616. }
  617. }
  618. progress |= aio_bh_poll(ctx);
  619. progress |= aio_dispatch_ready_handlers(ctx, &ready_list);
  620. aio_free_deleted_handlers(ctx);
  621. qemu_lockcnt_dec(&ctx->list_lock);
  622. progress |= timerlistgroup_run_timers(&ctx->tlg);
  623. return progress;
  624. }
  625. void aio_context_setup(AioContext *ctx)
  626. {
  627. ctx->fdmon_ops = &fdmon_poll_ops;
  628. ctx->epollfd = -1;
  629. /* Use the fastest fd monitoring implementation if available */
  630. if (fdmon_io_uring_setup(ctx)) {
  631. return;
  632. }
  633. fdmon_epoll_setup(ctx);
  634. }
  635. void aio_context_destroy(AioContext *ctx)
  636. {
  637. fdmon_io_uring_destroy(ctx);
  638. fdmon_epoll_disable(ctx);
  639. aio_free_deleted_handlers(ctx);
  640. }
  641. void aio_context_use_g_source(AioContext *ctx)
  642. {
  643. /*
  644. * Disable io_uring when the glib main loop is used because it doesn't
  645. * support mixed glib/aio_poll() usage. It relies on aio_poll() being
  646. * called regularly so that changes to the monitored file descriptors are
  647. * submitted, otherwise a list of pending fd handlers builds up.
  648. */
  649. fdmon_io_uring_destroy(ctx);
  650. aio_free_deleted_handlers(ctx);
  651. }
  652. void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
  653. int64_t grow, int64_t shrink, Error **errp)
  654. {
  655. /* No thread synchronization here, it doesn't matter if an incorrect value
  656. * is used once.
  657. */
  658. ctx->poll_max_ns = max_ns;
  659. ctx->poll_ns = 0;
  660. ctx->poll_grow = grow;
  661. ctx->poll_shrink = shrink;
  662. aio_notify(ctx);
  663. }
  664. void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch,
  665. Error **errp)
  666. {
  667. /*
  668. * No thread synchronization here, it doesn't matter if an incorrect value
  669. * is used once.
  670. */
  671. ctx->aio_max_batch = max_batch;
  672. aio_notify(ctx);
  673. }