vhost-user-blk.c

/*
 * vhost-user-blk sample application
 *
 * Copyright (c) 2017 Intel Corporation. All rights reserved.
 *
 * Author:
 *  Changpeng Liu <changpeng.liu@intel.com>
 *
 * This work is based on the "vhost-user-scsi" sample and "virtio-blk" driver
 * implementation by:
 *  Felipe Franciosi <felipe@nutanix.com>
 *  Anthony Liguori <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 only.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include "qemu/bswap.h"
#include "standard-headers/linux/virtio_blk.h"
#include "libvhost-user-glib.h"

#if defined(__linux__)
#include <linux/fs.h>
#include <sys/ioctl.h>
#endif

enum {
    VHOST_USER_BLK_MAX_QUEUES = 8,
};

struct virtio_blk_inhdr {
    unsigned char status;
};

/* vhost user block device */
typedef struct VubDev {
    VugDev parent;
    int blk_fd;
    struct virtio_blk_config blkcfg;
    bool enable_ro;
    char *blk_name;
    GMainLoop *loop;
} VubDev;
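
/*
 * Per-request state: the descriptor chain popped from the virtqueue plus
 * pointers into its request header (out) and status byte (in).
 */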
typedef struct VubReq {
    VuVirtqElement *elem;
    int64_t sector_num;
    size_t size;
    struct virtio_blk_inhdr *in;
    struct virtio_blk_outhdr *out;
    VubDev *vdev_blk;
    struct VuVirtq *vq;
} VubReq;

/* refer to util/iov.c */
static size_t vub_iov_size(const struct iovec *iov,
                           const unsigned int iov_cnt)
{
    size_t len;
    unsigned int i;

    len = 0;
    for (i = 0; i < iov_cnt; i++) {
        len += iov[i].iov_len;
    }
    return len;
}

static size_t vub_iov_to_buf(const struct iovec *iov,
                             const unsigned int iov_cnt, void *buf)
{
    size_t len;
    unsigned int i;

    len = 0;
    for (i = 0; i < iov_cnt; i++) {
        memcpy(buf + len, iov[i].iov_base, iov[i].iov_len);
        len += iov[i].iov_len;
    }
    return len;
}
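
/*
 * Fatal error callback registered with libvhost-user: log the message and
 * quit the GLib main loop so main() can tear the device down.
 */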
static void vub_panic_cb(VuDev *vu_dev, const char *buf)
{
    VugDev *gdev;
    VubDev *vdev_blk;

    assert(vu_dev);

    gdev = container_of(vu_dev, VugDev, parent);
    vdev_blk = container_of(gdev, VubDev, parent);
    if (buf) {
        g_warning("vu_panic: %s", buf);
    }

    g_main_loop_quit(vdev_blk->loop);
}
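
/*
 * Complete a request: push the element onto the used ring, reporting the
 * I/O size plus the one-byte status footer, then notify the driver.
 */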
static void vub_req_complete(VubReq *req)
{
    VugDev *gdev = &req->vdev_blk->parent;
    VuDev *vu_dev = &gdev->parent;

    /* IO size with 1 extra status byte */
    vu_queue_push(vu_dev, req->vq, req->elem,
                  req->size + 1);
    vu_queue_notify(vu_dev, req->vq);

    g_free(req->elem);
    g_free(req);
}
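
/*
 * Open the backing file or block device. When the write cache is disabled,
 * O_DIRECT is used so completed writes are not left sitting in the host
 * page cache.
 */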
static int vub_open(const char *file_name, bool wce)
{
    int fd;
    int flags = O_RDWR;

    if (!wce) {
        flags |= O_DIRECT;
    }

    fd = open(file_name, flags);
    if (fd < 0) {
        fprintf(stderr, "Cannot open file %s, %s\n", file_name,
                strerror(errno));
        return -1;
    }

    return fd;
}
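
/*
 * Read/write helpers: virtio-blk sector numbers are always in units of
 * 512 bytes, independent of the backing device's logical block size, so
 * the byte offset is sector_num * 512.
 */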
static ssize_t
vub_readv(VubReq *req, struct iovec *iov, uint32_t iovcnt)
{
    VubDev *vdev_blk = req->vdev_blk;
    ssize_t rc;

    if (!iovcnt) {
        fprintf(stderr, "Invalid Read IOV count\n");
        return -1;
    }

    req->size = vub_iov_size(iov, iovcnt);
    rc = preadv(vdev_blk->blk_fd, iov, iovcnt, req->sector_num * 512);
    if (rc < 0) {
        fprintf(stderr, "%s, Sector %"PRIu64", Size %zu failed with %s\n",
                vdev_blk->blk_name, req->sector_num, req->size,
                strerror(errno));
        return -1;
    }

    return rc;
}

static ssize_t
vub_writev(VubReq *req, struct iovec *iov, uint32_t iovcnt)
{
    VubDev *vdev_blk = req->vdev_blk;
    ssize_t rc;

    if (!iovcnt) {
        fprintf(stderr, "Invalid Write IOV count\n");
        return -1;
    }

    req->size = vub_iov_size(iov, iovcnt);
    rc = pwritev(vdev_blk->blk_fd, iov, iovcnt, req->sector_num * 512);
    if (rc < 0) {
        fprintf(stderr, "%s, Sector %"PRIu64", Size %zu failed with %s\n",
                vdev_blk->blk_name, req->sector_num, req->size,
                strerror(errno));
        return -1;
    }

    return rc;
}
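
/*
 * DISCARD / WRITE ZEROES: the data descriptor carries a
 * virtio_blk_discard_write_zeroes segment; on Linux it is forwarded to the
 * backing block device as a BLKDISCARD or BLKZEROOUT ioctl, whose argument
 * is a {start, length} byte range.
 */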
static int
vub_discard_write_zeroes(VubReq *req, struct iovec *iov, uint32_t iovcnt,
                         uint32_t type)
{
    struct virtio_blk_discard_write_zeroes *desc;
    ssize_t size;
    void *buf;

    size = vub_iov_size(iov, iovcnt);
    if (size != sizeof(*desc)) {
        fprintf(stderr, "Invalid size %zd, expected %zu\n",
                size, sizeof(*desc));
        return -1;
    }
    buf = g_new0(char, size);
    vub_iov_to_buf(iov, iovcnt, buf);

#if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
    VubDev *vdev_blk = req->vdev_blk;
    desc = buf;
    uint64_t range[2] = { le64_to_cpu(desc->sector) << 9,
                          (uint64_t)le32_to_cpu(desc->num_sectors) << 9 };
    if (type == VIRTIO_BLK_T_DISCARD) {
        if (ioctl(vdev_blk->blk_fd, BLKDISCARD, range) == 0) {
            g_free(buf);
            return 0;
        }
    } else if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
        if (ioctl(vdev_blk->blk_fd, BLKZEROOUT, range) == 0) {
            g_free(buf);
            return 0;
        }
    }
#endif

    g_free(buf);
    return -1;
}

static void
vub_flush(VubReq *req)
{
    VubDev *vdev_blk = req->vdev_blk;

    fdatasync(vdev_blk->blk_fd);
}
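
/*
 * Pop and handle a single request. Without VIRTIO_F_ANY_LAYOUT the layout
 * is fixed: the virtio_blk_outhdr sits in the first out descriptor, the
 * one-byte status (virtio_blk_inhdr) in the last in descriptor, and any
 * data payload in between.
 */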
static int vub_virtio_process_req(VubDev *vdev_blk,
                                  VuVirtq *vq)
{
    VugDev *gdev = &vdev_blk->parent;
    VuDev *vu_dev = &gdev->parent;
    VuVirtqElement *elem;
    uint32_t type;
    unsigned in_num;
    unsigned out_num;
    VubReq *req;

    elem = vu_queue_pop(vu_dev, vq, sizeof(VuVirtqElement) + sizeof(VubReq));
    if (!elem) {
        return -1;
    }

    /* refer to hw/block/virtio-blk.c */
    if (elem->out_num < 1 || elem->in_num < 1) {
        fprintf(stderr, "virtio-blk request missing headers\n");
        g_free(elem);
        return -1;
    }

    req = g_new0(VubReq, 1);
    req->vdev_blk = vdev_blk;
    req->vq = vq;
    req->elem = elem;

    in_num = elem->in_num;
    out_num = elem->out_num;

    /* VIRTIO_F_ANY_LAYOUT is not supported; only the virtio 1.0 layout is */
    if (elem->out_sg[0].iov_len < sizeof(struct virtio_blk_outhdr)) {
        fprintf(stderr, "Invalid outhdr size\n");
        goto err;
    }
    req->out = (struct virtio_blk_outhdr *)elem->out_sg[0].iov_base;
    out_num--;

    if (elem->in_sg[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
        fprintf(stderr, "Invalid inhdr size\n");
        goto err;
    }
    req->in = (struct virtio_blk_inhdr *)elem->in_sg[in_num - 1].iov_base;
    in_num--;

    type = le32_to_cpu(req->out->type);
    switch (type & ~VIRTIO_BLK_T_BARRIER) {
    case VIRTIO_BLK_T_IN:
    case VIRTIO_BLK_T_OUT: {
        ssize_t ret = 0;
        bool is_write = type & VIRTIO_BLK_T_OUT;
        req->sector_num = le64_to_cpu(req->out->sector);
        if (is_write) {
            ret = vub_writev(req, &elem->out_sg[1], out_num);
        } else {
            ret = vub_readv(req, &elem->in_sg[0], in_num);
        }
        if (ret >= 0) {
            req->in->status = VIRTIO_BLK_S_OK;
        } else {
            req->in->status = VIRTIO_BLK_S_IOERR;
        }
        vub_req_complete(req);
        break;
    }
    case VIRTIO_BLK_T_FLUSH:
        vub_flush(req);
        req->in->status = VIRTIO_BLK_S_OK;
        vub_req_complete(req);
        break;
    case VIRTIO_BLK_T_GET_ID: {
        size_t size = MIN(vub_iov_size(&elem->in_sg[0], in_num),
                          VIRTIO_BLK_ID_BYTES);
        snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk");
        req->in->status = VIRTIO_BLK_S_OK;
        req->size = elem->in_sg[0].iov_len;
        vub_req_complete(req);
        break;
    }
    case VIRTIO_BLK_T_DISCARD:
    case VIRTIO_BLK_T_WRITE_ZEROES: {
        int rc;
        rc = vub_discard_write_zeroes(req, &elem->out_sg[1], out_num, type);
        if (rc == 0) {
            req->in->status = VIRTIO_BLK_S_OK;
        } else {
            req->in->status = VIRTIO_BLK_S_IOERR;
        }
        vub_req_complete(req);
        break;
    }
    default:
        req->in->status = VIRTIO_BLK_S_UNSUPP;
        vub_req_complete(req);
        break;
    }

    return 0;

err:
    g_free(elem);
    g_free(req);
    return -1;
}
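
/* Queue kick handler: keep processing until the virtqueue is drained. */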
static void vub_process_vq(VuDev *vu_dev, int idx)
{
    VugDev *gdev;
    VubDev *vdev_blk;
    VuVirtq *vq;
    int ret;

    gdev = container_of(vu_dev, VugDev, parent);
    vdev_blk = container_of(gdev, VubDev, parent);
    assert(vdev_blk);

    vq = vu_get_queue(vu_dev, idx);
    assert(vq);

    while (1) {
        ret = vub_virtio_process_req(vdev_blk, vq);
        if (ret) {
            break;
        }
    }
}

static void vub_queue_set_started(VuDev *vu_dev, int idx, bool started)
{
    VuVirtq *vq;

    assert(vu_dev);

    vq = vu_get_queue(vu_dev, idx);
    vu_set_queue_handler(vu_dev, vq, started ? vub_process_vq : NULL);
}
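
/*
 * Virtio feature bits offered to the driver; DISCARD/WRITE_ZEROES are only
 * offered where the Linux ioctls that implement them are available.
 */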
static uint64_t
vub_get_features(VuDev *dev)
{
    uint64_t features;
    VugDev *gdev;
    VubDev *vdev_blk;

    gdev = container_of(dev, VugDev, parent);
    vdev_blk = container_of(gdev, VubDev, parent);

    features = 1ull << VIRTIO_BLK_F_SIZE_MAX |
               1ull << VIRTIO_BLK_F_SEG_MAX |
               1ull << VIRTIO_BLK_F_TOPOLOGY |
               1ull << VIRTIO_BLK_F_BLK_SIZE |
               1ull << VIRTIO_BLK_F_FLUSH |
#if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
               1ull << VIRTIO_BLK_F_DISCARD |
               1ull << VIRTIO_BLK_F_WRITE_ZEROES |
#endif
               1ull << VIRTIO_BLK_F_CONFIG_WCE;

    if (vdev_blk->enable_ro) {
        features |= 1ull << VIRTIO_BLK_F_RO;
    }

    return features;
}
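
/*
 * vhost-user protocol features: F_CONFIG lets the front-end read and write
 * the virtio config space through this backend, F_INFLIGHT_SHMFD enables
 * the shared inflight-request tracking region.
 */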
static uint64_t
vub_get_protocol_features(VuDev *dev)
{
    return 1ull << VHOST_USER_PROTOCOL_F_CONFIG |
           1ull << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD;
}

static int
vub_get_config(VuDev *vu_dev, uint8_t *config, uint32_t len)
{
    VugDev *gdev;
    VubDev *vdev_blk;

    if (len > sizeof(struct virtio_blk_config)) {
        return -1;
    }

    gdev = container_of(vu_dev, VugDev, parent);
    vdev_blk = container_of(gdev, VubDev, parent);
    memcpy(config, &vdev_blk->blkcfg, len);
    return 0;
}
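
/*
 * Config-space write handler. Only the single-byte write-cache enable (wce)
 * field may change; toggling it reopens the backing file so the O_DIRECT
 * flag matches the new cache policy.
 */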
static int
vub_set_config(VuDev *vu_dev, const uint8_t *data,
               uint32_t offset, uint32_t size, uint32_t flags)
{
    VugDev *gdev;
    VubDev *vdev_blk;
    uint8_t wce;
    int fd;

    /* don't support live migration */
    if (flags != VHOST_SET_CONFIG_TYPE_FRONTEND) {
        return -1;
    }

    gdev = container_of(vu_dev, VugDev, parent);
    vdev_blk = container_of(gdev, VubDev, parent);

    if (offset != offsetof(struct virtio_blk_config, wce) ||
        size != 1) {
        return -1;
    }

    wce = *data;
    if (wce == vdev_blk->blkcfg.wce) {
        /* nothing to do: same as the current configuration */
        return 0;
    }

    vdev_blk->blkcfg.wce = wce;
    fprintf(stdout, "Write Cache Policy Changed\n");
    if (vdev_blk->blk_fd >= 0) {
        close(vdev_blk->blk_fd);
        vdev_blk->blk_fd = -1;
    }

    fd = vub_open(vdev_blk->blk_name, wce);
    if (fd < 0) {
        fprintf(stderr, "Failed to open block device %s\n",
                vdev_blk->blk_name);
        vdev_blk->blk_fd = -1;
        return -1;
    }

    vdev_blk->blk_fd = fd;
    return 0;
}

static const VuDevIface vub_iface = {
    .get_features = vub_get_features,
    .queue_set_started = vub_queue_set_started,
    .get_protocol_features = vub_get_protocol_features,
    .get_config = vub_get_config,
    .set_config = vub_set_config,
};
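
/* Create a listening UNIX domain socket at unix_fn, removing any stale path. */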
static int unix_sock_new(char *unix_fn)
{
    int sock;
    struct sockaddr_un un;

    assert(unix_fn);

    sock = socket(AF_UNIX, SOCK_STREAM, 0);
    if (sock < 0) {
        perror("socket");
        return -1;
    }

    un.sun_family = AF_UNIX;
    (void)snprintf(un.sun_path, sizeof(un.sun_path), "%s", unix_fn);

    (void)unlink(unix_fn);
    if (bind(sock, (struct sockaddr *)&un, sizeof(un)) < 0) {
        perror("bind");
        goto fail;
    }

    if (listen(sock, 1) < 0) {
        perror("listen");
        goto fail;
    }

    return sock;

fail:
    (void)close(sock);

    return -1;
}

static void vub_free(struct VubDev *vdev_blk)
{
    if (!vdev_blk) {
        return;
    }

    g_main_loop_unref(vdev_blk->loop);
    if (vdev_blk->blk_fd >= 0) {
        close(vdev_blk->blk_fd);
    }
    g_free(vdev_blk);
}

static uint32_t
vub_get_blocksize(int fd)
{
    uint32_t blocksize = 512;

#if defined(__linux__) && defined(BLKSSZGET)
    if (ioctl(fd, BLKSSZGET, &blocksize) == 0) {
        return blocksize;
    }
#endif

    return blocksize;
}
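
/*
 * Fill virtio_blk_config from the backing file: capacity is reported in
 * 512-byte sectors (size >> 9) regardless of the logical block size
 * returned by BLKSSZGET.
 */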
static void
vub_initialize_config(int fd, struct virtio_blk_config *config)
{
    off_t capacity;

    capacity = lseek(fd, 0, SEEK_END);
    config->capacity = capacity >> 9;
    config->blk_size = vub_get_blocksize(fd);
    config->size_max = 65536;
    config->seg_max = 128 - 2;
    config->min_io_size = 1;
    config->opt_io_size = 1;
    config->num_queues = 1;
#if defined(__linux__) && defined(BLKDISCARD) && defined(BLKZEROOUT)
    config->max_discard_sectors = 32768;
    config->max_discard_seg = 1;
    config->discard_sector_alignment = config->blk_size >> 9;
    config->max_write_zeroes_sectors = 32768;
    config->max_write_zeroes_seg = 1;
#endif
}

static VubDev *
vub_new(char *blk_file)
{
    VubDev *vdev_blk;

    vdev_blk = g_new0(VubDev, 1);
    vdev_blk->loop = g_main_loop_new(NULL, FALSE);
    vdev_blk->blk_fd = vub_open(blk_file, 0);
    if (vdev_blk->blk_fd < 0) {
        fprintf(stderr, "Failed to open block device %s\n", blk_file);
        vub_free(vdev_blk);
        return NULL;
    }
    vdev_blk->enable_ro = false;
    vdev_blk->blkcfg.wce = 0;
    vdev_blk->blk_name = blk_file;

    /* fill virtio_blk_config with block parameters */
    vub_initialize_config(vdev_blk->blk_fd, &vdev_blk->blkcfg);

    return vdev_blk;
}

static int opt_fdnum = -1;
static char *opt_socket_path;
static char *opt_blk_file;
static gboolean opt_print_caps;
static gboolean opt_read_only;

static GOptionEntry entries[] = {
    { "print-capabilities", 'c', 0, G_OPTION_ARG_NONE, &opt_print_caps,
      "Print capabilities", NULL },
    { "fd", 'f', 0, G_OPTION_ARG_INT, &opt_fdnum,
      "Use inherited fd socket", "FDNUM" },
    { "socket-path", 's', 0, G_OPTION_ARG_FILENAME, &opt_socket_path,
      "Use UNIX socket path", "PATH" },
    { "blk-file", 'b', 0, G_OPTION_ARG_FILENAME, &opt_blk_file,
      "block device or file path", "PATH" },
    { "read-only", 'r', 0, G_OPTION_ARG_NONE, &opt_read_only,
      "Enable read-only", NULL },
    { NULL, },
};
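
/*
 * Parse options, obtain the vhost-user socket (either by listening on
 * --socket-path or from an inherited --fd), accept one front-end connection,
 * then serve it with libvhost-user-glib until the main loop exits.
 */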
int main(int argc, char **argv)
{
    int lsock = -1, csock = -1;
    VubDev *vdev_blk = NULL;
    GError *error = NULL;
    GOptionContext *context;

    context = g_option_context_new(NULL);
    g_option_context_add_main_entries(context, entries, NULL);
    if (!g_option_context_parse(context, &argc, &argv, &error)) {
        g_printerr("Option parsing failed: %s\n", error->message);
        exit(EXIT_FAILURE);
    }
    if (opt_print_caps) {
        g_print("{\n");
        g_print("  \"type\": \"block\",\n");
        g_print("  \"features\": [\n");
        g_print("    \"read-only\",\n");
        g_print("    \"blk-file\"\n");
        g_print("  ]\n");
        g_print("}\n");
        exit(EXIT_SUCCESS);
    }

    if (!opt_blk_file) {
        g_print("%s\n", g_option_context_get_help(context, true, NULL));
        exit(EXIT_FAILURE);
    }

    if (opt_socket_path) {
        lsock = unix_sock_new(opt_socket_path);
        if (lsock < 0) {
            exit(EXIT_FAILURE);
        }
    } else if (opt_fdnum < 0) {
        g_print("%s\n", g_option_context_get_help(context, true, NULL));
        exit(EXIT_FAILURE);
    } else {
        lsock = opt_fdnum;
    }

    csock = accept(lsock, NULL, NULL);
    if (csock < 0) {
        g_printerr("Accept error %s\n", strerror(errno));
        exit(EXIT_FAILURE);
    }

    vdev_blk = vub_new(opt_blk_file);
    if (!vdev_blk) {
        exit(EXIT_FAILURE);
    }
    if (opt_read_only) {
        vdev_blk->enable_ro = true;
    }

    if (!vug_init(&vdev_blk->parent, VHOST_USER_BLK_MAX_QUEUES, csock,
                  vub_panic_cb, &vub_iface)) {
        g_printerr("Failed to initialize libvhost-user-glib\n");
        exit(EXIT_FAILURE);
    }

    g_main_loop_run(vdev_blk->loop);
    g_option_context_free(context);
    vug_deinit(&vdev_blk->parent);
    vub_free(vdev_blk); /* drops the GMainLoop reference */
    if (csock >= 0) {
        close(csock);
    }
    if (lsock >= 0) {
        close(lsock);
    }
    g_free(opt_socket_path);
    g_free(opt_blk_file);

    return 0;
}