/* SPDX-License-Identifier: LGPL-2.1-or-later */
/*
 * libblkio BlockDriver
 *
 * Copyright Red Hat, Inc.
 *
 * Author:
 *   Stefan Hajnoczi <stefanha@redhat.com>
 */

#include "qemu/osdep.h"
#include <blkio.h>
#include "block/block_int.h"
#include "exec/memory.h"
#include "exec/cpu-common.h" /* for qemu_ram_get_fd() */
#include "qemu/defer-call.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qapi/qmp/qdict.h"
#include "qemu/module.h"
#include "system/block-backend.h"
#include "exec/memory.h" /* for ram_block_discard_disable() */

#include "block/block-io.h"

/*
 * Allocated bounce buffers are kept in a list sorted by buffer address.
 */
typedef struct BlkioBounceBuf {
    QLIST_ENTRY(BlkioBounceBuf) next;

    /* The bounce buffer */
    struct iovec buf;
} BlkioBounceBuf;

typedef struct {
    /*
     * libblkio is not thread-safe so this lock protects ->blkio and
     * ->blkioq.
     */
    QemuMutex blkio_lock;
    struct blkio *blkio;
    struct blkioq *blkioq; /* make this multi-queue in the future... */
    int completion_fd;

    /*
     * Polling fetches the next completion into this field.
     *
     * No lock is necessary since only one thread calls aio_poll() and invokes
     * fd and poll handlers.
     */
    struct blkio_completion poll_completion;

    /*
     * Protects ->bounce_pool, ->bounce_bufs, ->bounce_available.
     *
     * Lock ordering: ->bounce_lock before ->blkio_lock.
     */
    CoMutex bounce_lock;

    /* Bounce buffer pool */
    struct blkio_mem_region bounce_pool;

    /* Sorted list of allocated bounce buffers */
    QLIST_HEAD(, BlkioBounceBuf) bounce_bufs;

    /* Queue for coroutines waiting for bounce buffer space */
    CoQueue bounce_available;

    /* The value of the "mem-region-alignment" property */
    uint64_t mem_region_alignment;

    /* Must all I/O buffers reside in mapped blkio_mem_regions? */
    bool needs_mem_regions;

    /* Are file descriptors necessary for blkio_mem_regions? */
    bool needs_mem_region_fd;

    /*
     * Does mapping a blkio_mem_region pin its pages, making
     * madvise(MADV_DONTNEED)-style operations unavailable?
     */
    bool may_pin_mem_regions;
} BDRVBlkioState;
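
/*
 * Lock ordering in practice: blkio_alloc_bounce_buffer() holds ->bounce_lock
 * while calling blkio_resize_bounce_pool(), which takes ->blkio_lock, so the
 * ordering documented above falls out of the bounce buffer slow path.
 */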

/* Called with s->bounce_lock held */
static int blkio_resize_bounce_pool(BDRVBlkioState *s, int64_t bytes)
{
    /* There can be no allocated bounce buffers during resize */
    assert(QLIST_EMPTY(&s->bounce_bufs));

    /* Pad size to reduce frequency of resize calls */
    bytes += 128 * 1024;

    /* Align the pool size to avoid blkio_alloc_mem_region() failure */
    bytes = QEMU_ALIGN_UP(bytes, s->mem_region_alignment);

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        int ret;

        if (s->bounce_pool.addr) {
            blkio_unmap_mem_region(s->blkio, &s->bounce_pool);
            blkio_free_mem_region(s->blkio, &s->bounce_pool);
            memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
        }

        /* Automatically freed when s->blkio is destroyed */
        ret = blkio_alloc_mem_region(s->blkio, &s->bounce_pool, bytes);
        if (ret < 0) {
            return ret;
        }

        ret = blkio_map_mem_region(s->blkio, &s->bounce_pool);
        if (ret < 0) {
            blkio_free_mem_region(s->blkio, &s->bounce_pool);
            memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
            return ret;
        }
    }

    return 0;
}

/* Called with s->bounce_lock held */
static bool
blkio_do_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
                             int64_t bytes)
{
    void *addr = s->bounce_pool.addr;
    BlkioBounceBuf *cur = NULL;
    BlkioBounceBuf *prev = NULL;
    ptrdiff_t space;

    /*
     * This is just a linear search over the holes between requests. An
     * efficient allocator would be nice.
     */
    QLIST_FOREACH(cur, &s->bounce_bufs, next) {
        space = cur->buf.iov_base - addr;
        if (bytes <= space) {
            QLIST_INSERT_BEFORE(cur, bounce, next);
            bounce->buf.iov_base = addr;
            bounce->buf.iov_len = bytes;
            return true;
        }

        addr = cur->buf.iov_base + cur->buf.iov_len;
        prev = cur;
    }

    /* Is there space after the last request? */
    space = s->bounce_pool.addr + s->bounce_pool.len - addr;
    if (bytes > space) {
        return false;
    }

    if (prev) {
        QLIST_INSERT_AFTER(prev, bounce, next);
    } else {
        QLIST_INSERT_HEAD(&s->bounce_bufs, bounce, next);
    }

    bounce->buf.iov_base = addr;
    bounce->buf.iov_len = bytes;
    return true;
}
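
/*
 * Worked example of the first-fit search above: with a 16 KB pool and
 * buffers allocated at [0 KB, 4 KB) and [8 KB, 12 KB), a 4 KB request fits
 * the hole at [4 KB, 8 KB) and is inserted before the second list entry. An
 * 8 KB request fits neither that hole nor the 4 KB tail after the last
 * buffer, so the search fails and the caller waits for buffers to be freed.
 */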

static int coroutine_fn
blkio_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
                          int64_t bytes)
{
    /*
     * Ensure fairness: first time around we join the back of the queue,
     * subsequently we join the front so we don't lose our place.
     */
    CoQueueWaitFlags wait_flags = 0;

    QEMU_LOCK_GUARD(&s->bounce_lock);

    /* Ensure fairness: don't even try if other requests are already waiting */
    if (!qemu_co_queue_empty(&s->bounce_available)) {
        qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
                                 wait_flags);
        wait_flags = CO_QUEUE_WAIT_FRONT;
    }

    while (true) {
        if (blkio_do_alloc_bounce_buffer(s, bounce, bytes)) {
            /* Kick the next queued request since there may be space */
            qemu_co_queue_next(&s->bounce_available);
            return 0;
        }

        /*
         * If there are no in-flight requests then the pool was simply too
         * small.
         */
        if (QLIST_EMPTY(&s->bounce_bufs)) {
            bool ok;
            int ret;

            ret = blkio_resize_bounce_pool(s, bytes);
            if (ret < 0) {
                /* Kick the next queued request since that may fail too */
                qemu_co_queue_next(&s->bounce_available);
                return ret;
            }

            ok = blkio_do_alloc_bounce_buffer(s, bounce, bytes);
            assert(ok); /* must have space this time */
            return 0;
        }

        qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
                                 wait_flags);
        wait_flags = CO_QUEUE_WAIT_FRONT;
    }
}

static void coroutine_fn blkio_free_bounce_buffer(BDRVBlkioState *s,
                                                  BlkioBounceBuf *bounce)
{
    QEMU_LOCK_GUARD(&s->bounce_lock);

    QLIST_REMOVE(bounce, next);

    /* Wake up waiting coroutines since space may now be available */
    qemu_co_queue_next(&s->bounce_available);
}

/* For async to .bdrv_co_*() conversion */
typedef struct {
    Coroutine *coroutine;
    int ret;
} BlkioCoData;

static void blkio_completion_fd_read(void *opaque)
{
    BlockDriverState *bs = opaque;
    BDRVBlkioState *s = bs->opaque;
    uint64_t val;
    int ret;

    /* Polling may have already fetched a completion */
    if (s->poll_completion.user_data != NULL) {
        BlkioCoData *cod = s->poll_completion.user_data;
        cod->ret = s->poll_completion.ret;

        /* Clear it in case aio_co_wake() enters a nested event loop */
        s->poll_completion.user_data = NULL;

        aio_co_wake(cod->coroutine);
    }

    /* Reset completion fd status */
    ret = read(s->completion_fd, &val, sizeof(val));

    /* Ignore errors, there's nothing we can do */
    (void)ret;

    /*
     * Reading one completion at a time makes nested event loop re-entrancy
     * simple. Change this loop to get multiple completions in one go if it
     * becomes a performance bottleneck.
     */
    while (true) {
        struct blkio_completion completion;

        WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
            ret = blkioq_do_io(s->blkioq, &completion, 0, 1, NULL);
        }
        if (ret != 1) {
            break;
        }

        BlkioCoData *cod = completion.user_data;
        cod->ret = completion.ret;
        aio_co_wake(cod->coroutine);
    }
}

static bool blkio_completion_fd_poll(void *opaque)
{
    BlockDriverState *bs = opaque;
    BDRVBlkioState *s = bs->opaque;
    int ret;

    /* Just in case we already fetched a completion */
    if (s->poll_completion.user_data != NULL) {
        return true;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkioq_do_io(s->blkioq, &s->poll_completion, 0, 1, NULL);
    }
    return ret == 1;
}

static void blkio_completion_fd_poll_ready(void *opaque)
{
    blkio_completion_fd_read(opaque);
}
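
/*
 * How these handlers cooperate: in polling mode the event loop calls
 * blkio_completion_fd_poll() instead of sleeping in the OS poll syscall.
 * When it returns true, blkio_completion_fd_poll_ready() runs without
 * waiting for the completion fd to become readable, which is why a
 * completion fetched while polling is stashed in s->poll_completion for
 * blkio_completion_fd_read() to consume first.
 */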

static void blkio_attach_aio_context(BlockDriverState *bs,
                                     AioContext *new_context)
{
    BDRVBlkioState *s = bs->opaque;

    aio_set_fd_handler(new_context, s->completion_fd,
                       blkio_completion_fd_read, NULL,
                       blkio_completion_fd_poll,
                       blkio_completion_fd_poll_ready, bs);
}

static void blkio_detach_aio_context(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    aio_set_fd_handler(bdrv_get_aio_context(bs), s->completion_fd, NULL, NULL,
                       NULL, NULL, NULL);
}

/*
 * Called by defer_call_end() or immediately if not in a deferred section.
 * Called without blkio_lock.
 */
static void blkio_deferred_fn(void *opaque)
{
    BDRVBlkioState *s = opaque;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_do_io(s->blkioq, NULL, 0, 0, NULL);
    }
}

/*
 * Schedule I/O submission after enqueuing a new request. Called without
 * blkio_lock.
 */
static void blkio_submit_io(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    defer_call(blkio_deferred_fn, s);
}
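
/*
 * Why defer_call(): when the caller is inside a defer_call_begin()/
 * defer_call_end() section, e.g. while virtio-blk processes a batch of
 * virtqueue requests, multiple blkio_submit_io() calls collapse into a
 * single blkioq_do_io() submission, amortizing its cost across requests.
 */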

static int coroutine_fn
blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_discard(s->blkioq, offset, bytes, &cod, 0);
    }

    blkio_submit_io(bs);
    qemu_coroutine_yield();
    return cod.ret;
}
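
/*
 * The remaining request functions follow the same pattern as
 * blkio_co_pdiscard() above: enqueue the request under blkio_lock with a
 * BlkioCoData cookie as user_data, schedule submission, and yield. The
 * completion fd handler fills in cod.ret and wakes the coroutine via
 * aio_co_wake().
 */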

static int coroutine_fn
blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
                QEMUIOVector *qiov, BdrvRequestFlags flags)
{
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    BDRVBlkioState *s = bs->opaque;
    bool use_bounce_buffer =
        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
    BlkioBounceBuf bounce;
    struct iovec *iov = qiov->iov;
    int iovcnt = qiov->niov;

    if (use_bounce_buffer) {
        int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
        if (ret < 0) {
            return ret;
        }

        iov = &bounce.buf;
        iovcnt = 1;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0);
    }

    blkio_submit_io(bs);
    qemu_coroutine_yield();

    if (use_bounce_buffer) {
        if (cod.ret == 0) {
            qemu_iovec_from_buf(qiov, 0,
                                bounce.buf.iov_base,
                                bounce.buf.iov_len);
        }

        blkio_free_bounce_buffer(s, &bounce);
    }

    return cod.ret;
}

static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset,
                                         int64_t bytes, QEMUIOVector *qiov,
                                         BdrvRequestFlags flags)
{
    uint32_t blkio_flags = (flags & BDRV_REQ_FUA) ? BLKIO_REQ_FUA : 0;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    BDRVBlkioState *s = bs->opaque;
    bool use_bounce_buffer =
        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
    BlkioBounceBuf bounce;
    struct iovec *iov = qiov->iov;
    int iovcnt = qiov->niov;

    if (use_bounce_buffer) {
        int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
        if (ret < 0) {
            return ret;
        }

        qemu_iovec_to_buf(qiov, 0, bounce.buf.iov_base, bytes);
        iov = &bounce.buf;
        iovcnt = 1;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags);
    }

    blkio_submit_io(bs);
    qemu_coroutine_yield();

    if (use_bounce_buffer) {
        blkio_free_bounce_buffer(s, &bounce);
    }

    return cod.ret;
}

static int coroutine_fn blkio_co_flush(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_flush(s->blkioq, &cod, 0);
    }

    blkio_submit_io(bs);
    qemu_coroutine_yield();
    return cod.ret;
}

static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs,
                                               int64_t offset, int64_t bytes,
                                               BdrvRequestFlags flags)
{
    BDRVBlkioState *s = bs->opaque;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    uint32_t blkio_flags = 0;

    if (flags & BDRV_REQ_FUA) {
        blkio_flags |= BLKIO_REQ_FUA;
    }
    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
        blkio_flags |= BLKIO_REQ_NO_UNMAP;
    }
    if (flags & BDRV_REQ_NO_FALLBACK) {
        blkio_flags |= BLKIO_REQ_NO_FALLBACK;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags);
    }

    blkio_submit_io(bs);
    qemu_coroutine_yield();
    return cod.ret;
}

typedef enum {
    BMRR_OK,
    BMRR_SKIP,
    BMRR_FAIL,
} BlkioMemRegionResult;

/*
 * Produce a struct blkio_mem_region for a given address and size.
 *
 * This function produces identical results when called multiple times with the
 * same arguments. This property is necessary because blkio_unmap_mem_region()
 * must receive the same struct blkio_mem_region field values that were passed
 * to blkio_map_mem_region().
 */
static BlkioMemRegionResult
blkio_mem_region_from_host(BlockDriverState *bs,
                           void *host, size_t size,
                           struct blkio_mem_region *region,
                           Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    int fd = -1;
    ram_addr_t fd_offset = 0;

    if (((uintptr_t)host | size) % s->mem_region_alignment) {
        error_setg(errp, "unaligned buf %p with size %zu", host, size);
        return BMRR_FAIL;
    }

    /* Attempt to find the fd for the underlying memory */
    if (s->needs_mem_region_fd) {
        RAMBlock *ram_block;
        RAMBlock *end_block;
        ram_addr_t offset;

        /*
         * bdrv_register_buf() is called with the BQL held so mr lives at least
         * until this function returns.
         */
        ram_block = qemu_ram_block_from_host(host, false, &fd_offset);
        if (ram_block) {
            fd = qemu_ram_get_fd(ram_block);
        }
        if (fd == -1) {
            /*
             * Ideally every RAMBlock would have an fd. pc-bios and other
             * things don't. Luckily they are usually not I/O buffers and we
             * can just ignore them.
             */
            return BMRR_SKIP;
        }

        /* Make sure the fd covers the entire range */
        end_block = qemu_ram_block_from_host(host + size - 1, false, &offset);
        if (ram_block != end_block) {
            error_setg(errp, "registered buffer at %p with size %zu extends "
                       "beyond RAMBlock", host, size);
            return BMRR_FAIL;
        }
    }

    *region = (struct blkio_mem_region){
        .addr = host,
        .len = size,
        .fd = fd,
        .fd_offset = fd_offset,
    };
    return BMRR_OK;
}
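
/*
 * Two details worth noting: the alignment check ORs the address and size
 * together so a single modulo catches misalignment of either value, which
 * assumes "mem-region-alignment" is a power of two. The fd_offset reported
 * by qemu_ram_block_from_host() is the offset of `host` within the
 * RAMBlock's backing fd, so that drivers which need fds (e.g. vhost-user)
 * can map the same pages via the fd rather than via this process's address
 * space.
 */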

static bool blkio_register_buf(BlockDriverState *bs, void *host, size_t size,
                               Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    struct blkio_mem_region region;
    BlkioMemRegionResult region_result;
    int ret;

    /*
     * Mapping memory regions conflicts with RAM discard (virtio-mem) when
     * there is pinning, so only do it when necessary.
     */
    if (!s->needs_mem_regions && s->may_pin_mem_regions) {
        return true;
    }

    region_result = blkio_mem_region_from_host(bs, host, size, &region, errp);
    if (region_result == BMRR_SKIP) {
        return true;
    } else if (region_result != BMRR_OK) {
        return false;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkio_map_mem_region(s->blkio, &region);
    }

    if (ret < 0) {
        error_setg(errp, "Failed to add blkio mem region %p with size %zu: %s",
                   host, size, blkio_get_error_msg());
        return false;
    }
    return true;
}

static void blkio_unregister_buf(BlockDriverState *bs, void *host, size_t size)
{
    BDRVBlkioState *s = bs->opaque;
    struct blkio_mem_region region;

    /* See blkio_register_buf() */
    if (!s->needs_mem_regions && s->may_pin_mem_regions) {
        return;
    }

    if (blkio_mem_region_from_host(bs, host, size, &region, NULL) != BMRR_OK) {
        return;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkio_unmap_mem_region(s->blkio, &region);
    }
}

static int blkio_io_uring_connect(BlockDriverState *bs, QDict *options,
                                  int flags, Error **errp)
{
    const char *filename = qdict_get_str(options, "filename");
    BDRVBlkioState *s = bs->opaque;
    int ret;

    ret = blkio_set_str(s->blkio, "path", filename);
    qdict_del(options, "filename");
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to set path: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (flags & BDRV_O_NOCACHE) {
        ret = blkio_set_bool(s->blkio, "direct", true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set direct: %s",
                             blkio_get_error_msg());
            return ret;
        }
    }

    ret = blkio_connect(s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_connect failed: %s",
                         blkio_get_error_msg());
        return ret;
    }

    return 0;
}

static int blkio_nvme_io_uring_connect(BlockDriverState *bs, QDict *options,
                                       int flags, Error **errp)
{
    const char *path = qdict_get_try_str(options, "path");
    BDRVBlkioState *s = bs->opaque;
    int ret;

    if (!path) {
        error_setg(errp, "missing 'path' option");
        return -EINVAL;
    }

    ret = blkio_set_str(s->blkio, "path", path);
    qdict_del(options, "path");
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to set path: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (!(flags & BDRV_O_NOCACHE)) {
        error_setg(errp, "cache.direct=off is not supported");
        return -EINVAL;
    }

    ret = blkio_connect(s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_connect failed: %s",
                         blkio_get_error_msg());
        return ret;
    }

    return 0;
}

static int blkio_virtio_blk_connect(BlockDriverState *bs, QDict *options,
                                    int flags, Error **errp)
{
    const char *path = qdict_get_try_str(options, "path");
    BDRVBlkioState *s = bs->opaque;
    bool fd_supported = false;
    int fd = -1, ret;

    if (!path) {
        error_setg(errp, "missing 'path' option");
        return -EINVAL;
    }

    if (!(flags & BDRV_O_NOCACHE)) {
        error_setg(errp, "cache.direct=off is not supported");
        return -EINVAL;
    }

    if (blkio_set_int(s->blkio, "fd", -1) == 0) {
        fd_supported = true;
    }

    /*
     * If the libblkio driver supports fd passing, always use qemu_open() to
     * open the `path`, so we can handle fd passing from the management layer
     * through the "/dev/fdset/N" special path.
     */
    if (fd_supported) {
        /*
         * `path` can contain the path of a character device
         * (e.g. /dev/vhost-vdpa-0 or /dev/vfio/vfio) or a unix socket.
         *
         * Always open it with O_RDWR, even when BDRV_O_RDWR is not set in the
         * open flags, because otherwise ioctl commands exchanged over the fd
         * would fail.
         *
         * To open the device read-only, blkio_open() sets the `read-only`
         * property of the libblkio driver instead.
         */
        fd = qemu_open(path, O_RDWR, NULL);
        if (fd < 0) {
            /*
             * qemu_open() can fail if the user specifies a path that is not
             * a file or device, for example a Unix domain socket for the
             * virtio-blk-vhost-user driver. In such cases let libblkio open
             * the path directly.
             */
            fd_supported = false;
        } else {
            ret = blkio_set_int(s->blkio, "fd", fd);
            if (ret < 0) {
                fd_supported = false;
                qemu_close(fd);
                fd = -1;
            }
        }
    }

    if (!fd_supported) {
        ret = blkio_set_str(s->blkio, "path", path);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set path: %s",
                             blkio_get_error_msg());
            return ret;
        }
    }

    ret = blkio_connect(s->blkio);
    if (ret < 0 && fd >= 0) {
        /* Failed to give the FD to libblkio, close it */
        qemu_close(fd);
        fd = -1;
    }

    /*
     * Before https://gitlab.com/libblkio/libblkio/-/merge_requests/208
     * (libblkio <= v1.3.0), setting the `fd` property was not enough to check
     * whether the driver supports it: blkio_connect() would fail with
     * -EINVAL. Cover this scenario by retrying blkio_connect() with `path`
     * set directly.
     */
    if (fd_supported && ret == -EINVAL) {
        /*
         * Clear the `fd` property we set previously by setting it to -1.
         */
        ret = blkio_set_int(s->blkio, "fd", -1);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set fd: %s",
                             blkio_get_error_msg());
            return ret;
        }

        ret = blkio_set_str(s->blkio, "path", path);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set path: %s",
                             blkio_get_error_msg());
            return ret;
        }

        ret = blkio_connect(s->blkio);
    }

    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_connect failed: %s",
                         blkio_get_error_msg());
        return ret;
    }

    qdict_del(options, "path");
    return 0;
}
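
/*
 * Background for blkio_open() below: a libblkio instance moves through the
 * states created -> connected -> started. Configuration properties such as
 * "path", "fd", and "read-only" must be set before blkio_connect(), while
 * properties like "needs-mem-regions" and "mem-region-alignment" are queried
 * after connecting, and queues and "capacity" become usable only after
 * blkio_start().
 */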

static int blkio_open(BlockDriverState *bs, QDict *options, int flags,
                      Error **errp)
{
    const char *blkio_driver = bs->drv->protocol_name;
    BDRVBlkioState *s = bs->opaque;
    int ret;

    ret = blkio_create(blkio_driver, &s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_create failed: %s",
                         blkio_get_error_msg());
        return ret;
    }

    if (!(flags & BDRV_O_RDWR)) {
        ret = blkio_set_bool(s->blkio, "read-only", true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set read-only: %s",
                             blkio_get_error_msg());
            blkio_destroy(&s->blkio);
            return ret;
        }
    }

    if (strcmp(blkio_driver, "io_uring") == 0) {
        ret = blkio_io_uring_connect(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, "nvme-io_uring") == 0) {
        ret = blkio_nvme_io_uring_connect(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, "virtio-blk-vfio-pci") == 0) {
        ret = blkio_virtio_blk_connect(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, "virtio-blk-vhost-user") == 0) {
        ret = blkio_virtio_blk_connect(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, "virtio-blk-vhost-vdpa") == 0) {
        ret = blkio_virtio_blk_connect(bs, options, flags, errp);
    } else {
        g_assert_not_reached();
    }
    if (ret < 0) {
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "needs-mem-regions",
                         &s->needs_mem_regions);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get needs-mem-regions: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "needs-mem-region-fd",
                         &s->needs_mem_region_fd);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get needs-mem-region-fd: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_uint64(s->blkio,
                           "mem-region-alignment",
                           &s->mem_region_alignment);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get mem-region-alignment: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "may-pin-mem-regions",
                         &s->may_pin_mem_regions);
    if (ret < 0) {
        /* Be conservative (assume pinning) if the property is not supported */
        s->may_pin_mem_regions = s->needs_mem_regions;
    }

    /*
     * Pinned memory regions conflict with RAM discard, so disable RAM
     * discard while this driver may pin memory. Features that rely on it,
     * such as virtio-mem, will then refuse to activate rather than
     * misbehave.
     */
    if (s->may_pin_mem_regions) {
        ret = ram_block_discard_disable(true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "ram_block_discard_disable() failed");
            blkio_destroy(&s->blkio);
            return ret;
        }
    }

    ret = blkio_start(s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_start failed: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        if (s->may_pin_mem_regions) {
            ram_block_discard_disable(false);
        }
        return ret;
    }

    bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF;
    bs->supported_zero_flags = BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK;
#ifdef CONFIG_BLKIO_WRITE_ZEROS_FUA
    bs->supported_zero_flags |= BDRV_REQ_FUA;
#endif

    qemu_mutex_init(&s->blkio_lock);
    qemu_co_mutex_init(&s->bounce_lock);
    qemu_co_queue_init(&s->bounce_available);
    QLIST_INIT(&s->bounce_bufs);
    s->blkioq = blkio_get_queue(s->blkio, 0);
    s->completion_fd = blkioq_get_completion_fd(s->blkioq);
    blkioq_set_completion_fd_enabled(s->blkioq, true);

    blkio_attach_aio_context(bs, bdrv_get_aio_context(bs));
    return 0;
}

static void blkio_close(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    /* There is no destroy() API for s->bounce_lock */

    qemu_mutex_destroy(&s->blkio_lock);
    blkio_detach_aio_context(bs);
    blkio_destroy(&s->blkio);

    if (s->may_pin_mem_regions) {
        ram_block_discard_disable(false);
    }
}

static int64_t coroutine_fn blkio_co_getlength(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;
    uint64_t capacity;
    int ret;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        ret = blkio_get_uint64(s->blkio, "capacity", &capacity);
    }
    if (ret < 0) {
        /* Propagate the negative errno returned by libblkio */
        return ret;
    }

    return capacity;
}

static int coroutine_fn blkio_truncate(BlockDriverState *bs, int64_t offset,
                                       bool exact, PreallocMode prealloc,
                                       BdrvRequestFlags flags, Error **errp)
{
    int64_t current_length;

    if (prealloc != PREALLOC_MODE_OFF) {
        error_setg(errp, "Unsupported preallocation mode '%s'",
                   PreallocMode_str(prealloc));
        return -ENOTSUP;
    }

    current_length = blkio_co_getlength(bs);

    if (offset > current_length) {
        error_setg(errp, "Cannot grow device");
        return -EINVAL;
    } else if (exact && offset != current_length) {
        error_setg(errp, "Cannot resize device");
        return -ENOTSUP;
    }

    return 0;
}

static int coroutine_fn
blkio_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    return 0;
}

static void blkio_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    QEMU_LOCK_GUARD(&s->blkio_lock);
    int value;
    int ret;

    ret = blkio_get_int(s->blkio, "request-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"request-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.request_alignment = value;
    if (bs->bl.request_alignment < 1 ||
        bs->bl.request_alignment >= INT_MAX ||
        !is_power_of_2(bs->bl.request_alignment)) {
        error_setg(errp, "invalid \"request-alignment\" value %" PRIu32 ", "
                   "must be a power of 2 less than INT_MAX",
                   bs->bl.request_alignment);
        return;
    }

    ret = blkio_get_int(s->blkio, "optimal-io-size", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"optimal-io-size\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.opt_transfer = value;
    if (bs->bl.opt_transfer > INT_MAX ||
        (bs->bl.opt_transfer % bs->bl.request_alignment)) {
        error_setg(errp, "invalid \"optimal-io-size\" value %" PRIu32 ", must "
                   "be a multiple of %" PRIu32, bs->bl.opt_transfer,
                   bs->bl.request_alignment);
        return;
    }

    ret = blkio_get_int(s->blkio, "max-transfer", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"max-transfer\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.max_transfer = value;
    if ((bs->bl.max_transfer % bs->bl.request_alignment) ||
        (bs->bl.opt_transfer && (bs->bl.max_transfer % bs->bl.opt_transfer))) {
        error_setg(errp, "invalid \"max-transfer\" value %" PRIu32 ", must be "
                   "a multiple of %" PRIu32 " and %" PRIu32 " (if non-zero)",
                   bs->bl.max_transfer, bs->bl.request_alignment,
                   bs->bl.opt_transfer);
        return;
    }

    ret = blkio_get_int(s->blkio, "buf-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"buf-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"buf-alignment\" value %d, must be "
                   "positive", value);
        return;
    }
    bs->bl.min_mem_alignment = value;

    ret = blkio_get_int(s->blkio, "optimal-buf-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get \"optimal-buf-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"optimal-buf-alignment\" value %d, "
                   "must be positive", value);
        return;
    }
    bs->bl.opt_mem_alignment = value;

    ret = blkio_get_int(s->blkio, "max-segments", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"max-segments\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"max-segments\" value %d, must be positive",
                   value);
        return;
    }
    bs->bl.max_iov = value;
}

/*
 * TODO
 * Missing libblkio APIs:
 * - block_status
 * - co_invalidate_cache
 *
 * Out of scope?
 * - create
 * - truncate
 */

/*
 * Do not include .format_name and .protocol_name because module_block.py
 * does not parse macros in the source code.
 */
#define BLKIO_DRIVER_COMMON \
    .instance_size           = sizeof(BDRVBlkioState), \
    .bdrv_open               = blkio_open, \
    .bdrv_close              = blkio_close, \
    .bdrv_co_getlength       = blkio_co_getlength, \
    .bdrv_co_truncate        = blkio_truncate, \
    .bdrv_co_get_info        = blkio_co_get_info, \
    .bdrv_attach_aio_context = blkio_attach_aio_context, \
    .bdrv_detach_aio_context = blkio_detach_aio_context, \
    .bdrv_co_pdiscard        = blkio_co_pdiscard, \
    .bdrv_co_preadv          = blkio_co_preadv, \
    .bdrv_co_pwritev         = blkio_co_pwritev, \
    .bdrv_co_flush_to_disk   = blkio_co_flush, \
    .bdrv_co_pwrite_zeroes   = blkio_co_pwrite_zeroes, \
    .bdrv_refresh_limits     = blkio_refresh_limits, \
    .bdrv_register_buf       = blkio_register_buf, \
    .bdrv_unregister_buf     = blkio_unregister_buf,

/*
 * Use the same .format_name and .protocol_name as the libblkio driver name for
 * consistency.
 */

static BlockDriver bdrv_io_uring = {
    .format_name         = "io_uring",
    .protocol_name       = "io_uring",
    .bdrv_needs_filename = true,
    BLKIO_DRIVER_COMMON
};

static BlockDriver bdrv_nvme_io_uring = {
    .format_name   = "nvme-io_uring",
    .protocol_name = "nvme-io_uring",
    BLKIO_DRIVER_COMMON
};

static BlockDriver bdrv_virtio_blk_vfio_pci = {
    .format_name   = "virtio-blk-vfio-pci",
    .protocol_name = "virtio-blk-vfio-pci",
    BLKIO_DRIVER_COMMON
};

static BlockDriver bdrv_virtio_blk_vhost_user = {
    .format_name   = "virtio-blk-vhost-user",
    .protocol_name = "virtio-blk-vhost-user",
    BLKIO_DRIVER_COMMON
};

static BlockDriver bdrv_virtio_blk_vhost_vdpa = {
    .format_name   = "virtio-blk-vhost-vdpa",
    .protocol_name = "virtio-blk-vhost-vdpa",
    BLKIO_DRIVER_COMMON
};
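
/*
 * Example invocations (illustrative, based on the option names handled by
 * the connect functions above; io_uring takes "filename", the others take
 * "path", and the nvme-io_uring and virtio-blk drivers require
 * cache.direct=on):
 *
 *   --blockdev driver=io_uring,node-name=drive0,filename=test.img
 *   --blockdev driver=virtio-blk-vhost-vdpa,node-name=drive0,\
 *              path=/dev/vhost-vdpa-0,cache.direct=on
 */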

static void bdrv_blkio_init(void)
{
    bdrv_register(&bdrv_io_uring);
    bdrv_register(&bdrv_nvme_io_uring);
    bdrv_register(&bdrv_virtio_blk_vfio_pci);
    bdrv_register(&bdrv_virtio_blk_vhost_user);
    bdrv_register(&bdrv_virtio_blk_vhost_vdpa);
}

block_init(bdrv_blkio_init);