vhost-user-blk.c

/*
 * vhost-user-blk sample application
 *
 * Copyright (c) 2017 Intel Corporation. All rights reserved.
 *
 * Author:
 *  Changpeng Liu <changpeng.liu@intel.com>
 *
 * This work is based on the "vhost-user-scsi" sample and "virtio-blk" driver
 * implementation by:
 *  Felipe Franciosi <felipe@nutanix.com>
 *  Anthony Liguori <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 only.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include "standard-headers/linux/virtio_blk.h"
#include "contrib/libvhost-user/libvhost-user-glib.h"
#include "contrib/libvhost-user/libvhost-user.h"

#if defined(__linux__)
#include <linux/fs.h>
#include <sys/ioctl.h>
#endif

enum {
    VHOST_USER_BLK_MAX_QUEUES = 8,
};

struct virtio_blk_inhdr {
    unsigned char status;
};

/* vhost user block device */
typedef struct VubDev {
    VugDev parent;
    int blk_fd;
    struct virtio_blk_config blkcfg;
    bool enable_ro;
    char *blk_name;
    GMainLoop *loop;
} VubDev;

typedef struct VubReq {
    VuVirtqElement *elem;
    int64_t sector_num;
    size_t size;
    struct virtio_blk_inhdr *in;
    struct virtio_blk_outhdr *out;
    VubDev *vdev_blk;
    struct VuVirtq *vq;
} VubReq;

/* refer util/iov.c */
static size_t vub_iov_size(const struct iovec *iov,
                           const unsigned int iov_cnt)
{
    size_t len;
    unsigned int i;

    len = 0;
    for (i = 0; i < iov_cnt; i++) {
        len += iov[i].iov_len;
    }
    return len;
}

static size_t vub_iov_to_buf(const struct iovec *iov,
                             const unsigned int iov_cnt, void *buf)
{
    size_t len;
    unsigned int i;

    len = 0;
    for (i = 0; i < iov_cnt; i++) {
        memcpy(buf + len, iov[i].iov_base, iov[i].iov_len);
        len += iov[i].iov_len;
    }
    return len;
}
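
/*
 * Panic callback registered with libvhost-user: log the reason and quit
 * the GLib main loop so the process can exit cleanly.
 */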
static void vub_panic_cb(VuDev *vu_dev, const char *buf)
{
    VugDev *gdev;
    VubDev *vdev_blk;

    assert(vu_dev);

    gdev = container_of(vu_dev, VugDev, parent);
    vdev_blk = container_of(gdev, VubDev, parent);
    if (buf) {
        g_warning("vu_panic: %s", buf);
    }

    g_main_loop_quit(vdev_blk->loop);
}
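
/*
 * Complete a request: push the element onto the used ring with the payload
 * size plus the one-byte status written through req->in, then notify the
 * virtqueue.
 */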
static void vub_req_complete(VubReq *req)
{
    VugDev *gdev = &req->vdev_blk->parent;
    VuDev *vu_dev = &gdev->parent;

    /* IO size with 1 extra status byte */
    vu_queue_push(vu_dev, req->vq, req->elem,
                  req->size + 1);
    vu_queue_notify(vu_dev, req->vq);

    if (req->elem) {
        free(req->elem);
    }

    g_free(req);
}
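
/*
 * Open the backing file or block device.  When the write cache is disabled,
 * O_DIRECT is used so that writes bypass the host page cache.
 */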
static int vub_open(const char *file_name, bool wce)
{
    int fd;
    int flags = O_RDWR;

    if (!wce) {
        flags |= O_DIRECT;
    }

    fd = open(file_name, flags);
    if (fd < 0) {
        fprintf(stderr, "Cannot open file %s, %s\n", file_name,
                strerror(errno));
        return -1;
    }

    return fd;
}
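
/* Read/write helpers: 512-byte sectors, issued with preadv()/pwritev(). */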
static ssize_t
vub_readv(VubReq *req, struct iovec *iov, uint32_t iovcnt)
{
    VubDev *vdev_blk = req->vdev_blk;
    ssize_t rc;

    if (!iovcnt) {
        fprintf(stderr, "Invalid Read IOV count\n");
        return -1;
    }

    req->size = vub_iov_size(iov, iovcnt);
    rc = preadv(vdev_blk->blk_fd, iov, iovcnt, req->sector_num * 512);
    if (rc < 0) {
        fprintf(stderr, "%s, Sector %"PRIu64", Size %lu failed with %s\n",
                vdev_blk->blk_name, req->sector_num, req->size,
                strerror(errno));
        return -1;
    }

    return rc;
}

static ssize_t
vub_writev(VubReq *req, struct iovec *iov, uint32_t iovcnt)
{
    VubDev *vdev_blk = req->vdev_blk;
    ssize_t rc;

    if (!iovcnt) {
        fprintf(stderr, "Invalid Write IOV count\n");
        return -1;
    }

    req->size = vub_iov_size(iov, iovcnt);
    rc = pwritev(vdev_blk->blk_fd, iov, iovcnt, req->sector_num * 512);
    if (rc < 0) {
        fprintf(stderr, "%s, Sector %"PRIu64", Size %lu failed with %s\n",
                vdev_blk->blk_name, req->sector_num, req->size,
                strerror(errno));
        return -1;
    }

    return rc;
}
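
/*
 * Handle DISCARD and WRITE_ZEROES: copy the single
 * virtio_blk_discard_write_zeroes descriptor out of the request payload and
 * translate it into a BLKDISCARD or BLKZEROOUT ioctl on the backing device
 * (sector/num_sectors become a byte range).  Returns -1 on non-Linux hosts
 * or when the ioctl fails.
 */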
static int
vub_discard_write_zeroes(VubReq *req, struct iovec *iov, uint32_t iovcnt,
                         uint32_t type)
{
    struct virtio_blk_discard_write_zeroes *desc;
    ssize_t size;
    void *buf;

    size = vub_iov_size(iov, iovcnt);
    if (size != sizeof(*desc)) {
        fprintf(stderr, "Invalid size %ld, expect %ld\n", size, sizeof(*desc));
        return -1;
    }
    buf = g_new0(char, size);
    vub_iov_to_buf(iov, iovcnt, buf);

#if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
    VubDev *vdev_blk = req->vdev_blk;
    desc = (struct virtio_blk_discard_write_zeroes *)buf;
    uint64_t range[2] = { le64toh(desc->sector) << 9,
                          le32toh(desc->num_sectors) << 9 };
    if (type == VIRTIO_BLK_T_DISCARD) {
        if (ioctl(vdev_blk->blk_fd, BLKDISCARD, range) == 0) {
            g_free(buf);
            return 0;
        }
    } else if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
        if (ioctl(vdev_blk->blk_fd, BLKZEROOUT, range) == 0) {
            g_free(buf);
            return 0;
        }
    }
#endif

    g_free(buf);
    return -1;
}

static void
vub_flush(VubReq *req)
{
    VubDev *vdev_blk = req->vdev_blk;

    fdatasync(vdev_blk->blk_fd);
}
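
/*
 * Pop one element from the virtqueue and service it.  A virtio-blk request
 * carries a virtio_blk_outhdr in its first out descriptor, the data buffers
 * in between, and a one-byte virtio_blk_inhdr status in its last in
 * descriptor.  Returns -1 when the queue is empty or the request is
 * malformed.
 */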
static int vub_virtio_process_req(VubDev *vdev_blk,
                                  VuVirtq *vq)
{
    VugDev *gdev = &vdev_blk->parent;
    VuDev *vu_dev = &gdev->parent;
    VuVirtqElement *elem;
    uint32_t type;
    unsigned in_num;
    unsigned out_num;
    VubReq *req;

    elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement) + sizeof(VubReq));
    if (!elem) {
        return -1;
    }

    /* refer to hw/block/virtio_blk.c */
    if (elem->out_num < 1 || elem->in_num < 1) {
        fprintf(stderr, "virtio-blk request missing headers\n");
        free(elem);
        return -1;
    }

    req = g_new0(VubReq, 1);
    req->vdev_blk = vdev_blk;
    req->vq = vq;
    req->elem = elem;

    in_num = elem->in_num;
    out_num = elem->out_num;

    /* don't support VIRTIO_F_ANY_LAYOUT and virtio 1.0 only */
    if (elem->out_sg[0].iov_len < sizeof(struct virtio_blk_outhdr)) {
        fprintf(stderr, "Invalid outhdr size\n");
        goto err;
    }
    req->out = (struct virtio_blk_outhdr *)elem->out_sg[0].iov_base;
    out_num--;

    if (elem->in_sg[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
        fprintf(stderr, "Invalid inhdr size\n");
        goto err;
    }
    req->in = (struct virtio_blk_inhdr *)elem->in_sg[in_num - 1].iov_base;
    in_num--;

    type = le32toh(req->out->type);
    switch (type & ~VIRTIO_BLK_T_BARRIER) {
    case VIRTIO_BLK_T_IN:
    case VIRTIO_BLK_T_OUT: {
        ssize_t ret = 0;
        bool is_write = type & VIRTIO_BLK_T_OUT;
        req->sector_num = le64toh(req->out->sector);
        if (is_write) {
            ret = vub_writev(req, &elem->out_sg[1], out_num);
        } else {
            ret = vub_readv(req, &elem->in_sg[0], in_num);
        }
        if (ret >= 0) {
            req->in->status = VIRTIO_BLK_S_OK;
        } else {
            req->in->status = VIRTIO_BLK_S_IOERR;
        }
        vub_req_complete(req);
        break;
    }
    case VIRTIO_BLK_T_FLUSH:
        vub_flush(req);
        req->in->status = VIRTIO_BLK_S_OK;
        vub_req_complete(req);
        break;
    case VIRTIO_BLK_T_GET_ID: {
        size_t size = MIN(vub_iov_size(&elem->in_sg[0], in_num),
                          VIRTIO_BLK_ID_BYTES);
        snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk");
        req->in->status = VIRTIO_BLK_S_OK;
        req->size = elem->in_sg[0].iov_len;
        vub_req_complete(req);
        break;
    }
    case VIRTIO_BLK_T_DISCARD:
    case VIRTIO_BLK_T_WRITE_ZEROES: {
        int rc;
        rc = vub_discard_write_zeroes(req, &elem->out_sg[1], out_num, type);
        if (rc == 0) {
            req->in->status = VIRTIO_BLK_S_OK;
        } else {
            req->in->status = VIRTIO_BLK_S_IOERR;
        }
        vub_req_complete(req);
        break;
    }
    default:
        req->in->status = VIRTIO_BLK_S_UNSUPP;
        vub_req_complete(req);
        break;
    }

    return 0;

err:
    free(elem);
    g_free(req);
    return -1;
}
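
/* Virtqueue handler: keep servicing requests until the queue is drained. */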
static void vub_process_vq(VuDev *vu_dev, int idx)
{
    VugDev *gdev;
    VubDev *vdev_blk;
    VuVirtq *vq;
    int ret;

    gdev = container_of(vu_dev, VugDev, parent);
    vdev_blk = container_of(gdev, VubDev, parent);
    assert(vdev_blk);

    vq = vu_get_queue(vu_dev, idx);
    assert(vq);

    while (1) {
        ret = vub_virtio_process_req(vdev_blk, vq);
        if (ret) {
            break;
        }
    }
}

static void vub_queue_set_started(VuDev *vu_dev, int idx, bool started)
{
    VuVirtq *vq;

    assert(vu_dev);

    vq = vu_get_queue(vu_dev, idx);
    vu_set_queue_handler(vu_dev, vq, started ? vub_process_vq : NULL);
}
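
/*
 * Advertise the virtio-blk feature bits this backend implements.  DISCARD
 * and WRITE_ZEROES are only offered on Linux hosts, and RO only when the
 * device was started read-only.
 */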
static uint64_t
vub_get_features(VuDev *dev)
{
    uint64_t features;
    VugDev *gdev;
    VubDev *vdev_blk;

    gdev = container_of(dev, VugDev, parent);
    vdev_blk = container_of(gdev, VubDev, parent);

    features = 1ull << VIRTIO_BLK_F_SIZE_MAX |
               1ull << VIRTIO_BLK_F_SEG_MAX |
               1ull << VIRTIO_BLK_F_TOPOLOGY |
               1ull << VIRTIO_BLK_F_BLK_SIZE |
               1ull << VIRTIO_BLK_F_FLUSH |
               #if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
               1ull << VIRTIO_BLK_F_DISCARD |
               1ull << VIRTIO_BLK_F_WRITE_ZEROES |
               #endif
               1ull << VIRTIO_BLK_F_CONFIG_WCE |
               1ull << VIRTIO_F_VERSION_1 |
               1ull << VHOST_USER_F_PROTOCOL_FEATURES;

    if (vdev_blk->enable_ro) {
        features |= 1ull << VIRTIO_BLK_F_RO;
    }

    return features;
}

static uint64_t
vub_get_protocol_features(VuDev *dev)
{
    return 1ull << VHOST_USER_PROTOCOL_F_CONFIG |
           1ull << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD;
}
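
/* Config space read handler: return the cached virtio_blk_config. */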
static int
vub_get_config(VuDev *vu_dev, uint8_t *config, uint32_t len)
{
    VugDev *gdev;
    VubDev *vdev_blk;

    gdev = container_of(vu_dev, VugDev, parent);
    vdev_blk = container_of(gdev, VubDev, parent);
    memcpy(config, &vdev_blk->blkcfg, len);

    return 0;
}
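
/*
 * Config space write handler: only the write-cache-enable byte may change.
 * Toggling it reopens the backing file so O_DIRECT matches the new policy.
 */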
static int
vub_set_config(VuDev *vu_dev, const uint8_t *data,
               uint32_t offset, uint32_t size, uint32_t flags)
{
    VugDev *gdev;
    VubDev *vdev_blk;
    uint8_t wce;
    int fd;

    /* don't support live migration */
    if (flags != VHOST_SET_CONFIG_TYPE_MASTER) {
        return -1;
    }

    gdev = container_of(vu_dev, VugDev, parent);
    vdev_blk = container_of(gdev, VubDev, parent);

    if (offset != offsetof(struct virtio_blk_config, wce) ||
        size != 1) {
        return -1;
    }

    wce = *data;
    if (wce == vdev_blk->blkcfg.wce) {
        /* nothing to do if the setting is unchanged */
        return 0;
    }

    vdev_blk->blkcfg.wce = wce;
    fprintf(stdout, "Write Cache Policy Changed\n");
    if (vdev_blk->blk_fd >= 0) {
        close(vdev_blk->blk_fd);
        vdev_blk->blk_fd = -1;
    }

    fd = vub_open(vdev_blk->blk_name, wce);
    if (fd < 0) {
        fprintf(stderr, "Failed to open block device %s\n",
                vdev_blk->blk_name);
        vdev_blk->blk_fd = -1;
        return -1;
    }

    vdev_blk->blk_fd = fd;

    return 0;
}

static const VuDevIface vub_iface = {
    .get_features = vub_get_features,
    .queue_set_started = vub_queue_set_started,
    .get_protocol_features = vub_get_protocol_features,
    .get_config = vub_get_config,
    .set_config = vub_set_config,
};
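
/* Create a listening UNIX domain socket for the vhost-user connection. */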
static int unix_sock_new(char *unix_fn)
{
    int sock;
    struct sockaddr_un un;
    size_t len;

    assert(unix_fn);

    sock = socket(AF_UNIX, SOCK_STREAM, 0);
    if (sock <= 0) {
        perror("socket");
        return -1;
    }

    un.sun_family = AF_UNIX;
    (void)snprintf(un.sun_path, sizeof(un.sun_path), "%s", unix_fn);
    len = sizeof(un.sun_family) + strlen(un.sun_path);

    (void)unlink(unix_fn);
    if (bind(sock, (struct sockaddr *)&un, len) < 0) {
        perror("bind");
        goto fail;
    }

    if (listen(sock, 1) < 0) {
        perror("listen");
        goto fail;
    }

    return sock;

fail:
    (void)close(sock);
    return -1;
}

static void vub_free(struct VubDev *vdev_blk)
{
    if (!vdev_blk) {
        return;
    }

    g_main_loop_unref(vdev_blk->loop);
    if (vdev_blk->blk_fd >= 0) {
        close(vdev_blk->blk_fd);
    }
    g_free(vdev_blk);
}

static uint32_t
vub_get_blocksize(int fd)
{
    uint32_t blocksize = 512;

#if defined(__linux__) && defined(BLKSSZGET)
    if (ioctl(fd, BLKSSZGET, &blocksize) == 0) {
        return blocksize;
    }
#endif

    return blocksize;
}
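
/*
 * Fill virtio_blk_config from the backing file: capacity in 512-byte sectors
 * derived from the file size, logical block size via BLKSSZGET where
 * available, plus fixed queue and discard/write-zeroes limits.
 */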
static void
vub_initialize_config(int fd, struct virtio_blk_config *config)
{
    off64_t capacity;

    capacity = lseek64(fd, 0, SEEK_END);
    config->capacity = capacity >> 9;
    config->blk_size = vub_get_blocksize(fd);
    config->size_max = 65536;
    config->seg_max = 128 - 2;
    config->min_io_size = 1;
    config->opt_io_size = 1;
    config->num_queues = 1;
#if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
    config->max_discard_sectors = 32768;
    config->max_discard_seg = 1;
    config->discard_sector_alignment = config->blk_size >> 9;
    config->max_write_zeroes_sectors = 32768;
    config->max_write_zeroes_seg = 1;
#endif
}
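
/* Allocate the device state and open the backing file (write cache off). */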
static VubDev *
vub_new(char *blk_file)
{
    VubDev *vdev_blk;

    vdev_blk = g_new0(VubDev, 1);
    vdev_blk->loop = g_main_loop_new(NULL, FALSE);
    vdev_blk->blk_fd = vub_open(blk_file, 0);
    if (vdev_blk->blk_fd < 0) {
        fprintf(stderr, "Failed to open block device %s\n", blk_file);
        vub_free(vdev_blk);
        return NULL;
    }
    vdev_blk->enable_ro = false;
    vdev_blk->blkcfg.wce = 0;
    vdev_blk->blk_name = blk_file;

    /* fill virtio_blk_config with block parameters */
    vub_initialize_config(vdev_blk->blk_fd, &vdev_blk->blkcfg);

    return vdev_blk;
}
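
/*
 * Usage: vhost-user-blk -b <block device or file> -s <UNIX socket path> [-r]
 *
 * The process accepts a single vhost-user connection on the socket and
 * serves it until the main loop exits.  A matching QEMU invocation would
 * look roughly like the following (paths and sizes are placeholders):
 *
 *   qemu-system-x86_64 ... \
 *       -object memory-backend-memfd,id=mem,size=1G,share=on \
 *       -numa node,memdev=mem \
 *       -chardev socket,id=char0,path=/path/to/socket \
 *       -device vhost-user-blk-pci,chardev=char0
 */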
int main(int argc, char **argv)
{
    int opt;
    char *unix_socket = NULL;
    char *blk_file = NULL;
    bool enable_ro = false;
    int lsock = -1, csock = -1;
    VubDev *vdev_blk = NULL;

    while ((opt = getopt(argc, argv, "b:rs:h")) != -1) {
        switch (opt) {
        case 'b':
            blk_file = g_strdup(optarg);
            break;
        case 's':
            unix_socket = g_strdup(optarg);
            break;
        case 'r':
            enable_ro = true;
            break;
        case 'h':
        default:
            printf("Usage: %s -b <block device or file>"
                   " -s <UNIX domain socket path> [-r (read-only)] [-h]\n",
                   argv[0]);
            return 0;
        }
    }

    if (!unix_socket || !blk_file) {
        printf("Usage: %s -b <block device or file>"
               " -s <UNIX domain socket path> [-r (read-only)] [-h]\n",
               argv[0]);
        return -1;
    }

    lsock = unix_sock_new(unix_socket);
    if (lsock < 0) {
        goto err;
    }

    csock = accept(lsock, (void *)0, (void *)0);
    if (csock < 0) {
        fprintf(stderr, "Accept error %s\n", strerror(errno));
        goto err;
    }

    vdev_blk = vub_new(blk_file);
    if (!vdev_blk) {
        goto err;
    }
    if (enable_ro) {
        vdev_blk->enable_ro = true;
    }

    if (!vug_init(&vdev_blk->parent, VHOST_USER_BLK_MAX_QUEUES, csock,
                  vub_panic_cb, &vub_iface)) {
        fprintf(stderr, "Failed to initialize libvhost-user-glib\n");
        goto err;
    }

    g_main_loop_run(vdev_blk->loop);

    vug_deinit(&vdev_blk->parent);

err:
    vub_free(vdev_blk);
    if (csock >= 0) {
        close(csock);
    }
    if (lsock >= 0) {
        close(lsock);
    }
    g_free(unix_socket);
    g_free(blk_file);

    return 0;
}