/*
 * xen paravirt block device backend
 *
 * (c) Gerd Hoffmann <kraxel@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; under version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <unistd.h>
#include <signal.h>
#include <inttypes.h>
#include <time.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <xs.h>
#include <xenctrl.h>
#include <xen/io/xenbus.h>

#include "hw.h"
#include "block_int.h"
#include "qemu-char.h"
#include "xen_blkif.h"
#include "xen_backend.h"
#include "blockdev.h"

/* ------------------------------------------------------------- */

static int syncwrite    = 0;
static int batch_maps   = 0;

static int max_requests = 32;
static int use_aio      = 1;

/* ------------------------------------------------------------- */

#define BLOCK_SIZE  512
#define IOCB_COUNT  (BLKIF_MAX_SEGMENTS_PER_REQUEST + 2)

struct ioreq {
    blkif_request_t     req;
    int16_t             status;

    /* parsed request */
    off_t               start;
    QEMUIOVector        v;
    int                 presync;
    int                 postsync;

    /* grant mapping */
    uint32_t            domids[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    uint32_t            refs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    int                 prot;
    void                *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
    void                *pages;

    /* aio status */
    int                 aio_inflight;
    int                 aio_errors;

    struct XenBlkDev    *blkdev;
    QLIST_ENTRY(ioreq)  list;
    BlockAcctCookie     acct;
};

struct XenBlkDev {
    struct XenDevice    xendev;  /* must be first */
    char                *params;
    char                *mode;
    char                *type;
    char                *dev;
    char                *devtype;
    const char          *fileproto;
    const char          *filename;
    int                 ring_ref;
    void                *sring;
    int64_t             file_blk;
    int64_t             file_size;
    int                 protocol;
    blkif_back_rings_t  rings;
    int                 more_work;
    int                 cnt_map;

    /* request lists */
    QLIST_HEAD(inflight_head, ioreq) inflight;
    QLIST_HEAD(finished_head, ioreq) finished;
    QLIST_HEAD(freelist_head, ioreq) freelist;
    int                 requests_total;
    int                 requests_inflight;
    int                 requests_finished;

    /* qemu block driver */
    DriveInfo           *dinfo;
    BlockDriverState    *bs;
    QEMUBH              *bh;
};

/* ------------------------------------------------------------- */

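/* grab a request slot: reuse one from the freelist or allocate a new one,
 * capped at max_requests in flight */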
static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
{
    struct ioreq *ioreq = NULL;

    if (QLIST_EMPTY(&blkdev->freelist)) {
        if (blkdev->requests_total >= max_requests) {
            goto out;
        }
        /* allocate new struct */
        ioreq = g_malloc0(sizeof(*ioreq));
        ioreq->blkdev = blkdev;
        blkdev->requests_total++;
        qemu_iovec_init(&ioreq->v, BLKIF_MAX_SEGMENTS_PER_REQUEST);
    } else {
        /* get one from freelist */
        ioreq = QLIST_FIRST(&blkdev->freelist);
        QLIST_REMOVE(ioreq, list);
        qemu_iovec_reset(&ioreq->v);
    }
    QLIST_INSERT_HEAD(&blkdev->inflight, ioreq, list);
    blkdev->requests_inflight++;

out:
    return ioreq;
}

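/* move a completed request from the inflight list to the finished list */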
static void ioreq_finish(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;

    QLIST_REMOVE(ioreq, list);
    QLIST_INSERT_HEAD(&blkdev->finished, ioreq, list);
    blkdev->requests_inflight--;
    blkdev->requests_finished++;
}

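/* reset a finished request and put it back on the freelist for reuse */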
static void ioreq_release(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;

    QLIST_REMOVE(ioreq, list);
    memset(ioreq, 0, sizeof(*ioreq));
    ioreq->blkdev = blkdev;
    QLIST_INSERT_HEAD(&blkdev->freelist, ioreq, list);
    blkdev->requests_finished--;
}

/*
 * translate request into iovec + start offset
 * do sanity checks along the way
 */
static int ioreq_parse(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;
    uintptr_t mem;
    size_t len;
    int i;

    xen_be_printf(&blkdev->xendev, 3,
                  "op %d, nr %d, handle %d, id %" PRId64 ", sector %" PRId64 "\n",
                  ioreq->req.operation, ioreq->req.nr_segments,
                  ioreq->req.handle, ioreq->req.id, ioreq->req.sector_number);
    switch (ioreq->req.operation) {
    case BLKIF_OP_READ:
        ioreq->prot = PROT_WRITE; /* to memory */
        break;
    case BLKIF_OP_WRITE_BARRIER:
        if (!ioreq->req.nr_segments) {
            ioreq->presync = 1;
            return 0;
        }
        if (!syncwrite) {
            ioreq->presync = ioreq->postsync = 1;
        }
        /* fall through */
    case BLKIF_OP_WRITE:
        ioreq->prot = PROT_READ; /* from memory */
        if (syncwrite) {
            ioreq->postsync = 1;
        }
        break;
    default:
        xen_be_printf(&blkdev->xendev, 0, "error: unknown operation (%d)\n",
                      ioreq->req.operation);
        goto err;
    }

    if (ioreq->req.operation != BLKIF_OP_READ && blkdev->mode[0] != 'w') {
        xen_be_printf(&blkdev->xendev, 0, "error: write req for ro device\n");
        goto err;
    }

    ioreq->start = ioreq->req.sector_number * blkdev->file_blk;
    for (i = 0; i < ioreq->req.nr_segments; i++) {
        if (i == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
            xen_be_printf(&blkdev->xendev, 0, "error: nr_segments too big\n");
            goto err;
        }
        if (ioreq->req.seg[i].first_sect > ioreq->req.seg[i].last_sect) {
            xen_be_printf(&blkdev->xendev, 0, "error: first > last sector\n");
            goto err;
        }
        if (ioreq->req.seg[i].last_sect * BLOCK_SIZE >= XC_PAGE_SIZE) {
            xen_be_printf(&blkdev->xendev, 0, "error: page crossing\n");
            goto err;
        }

        ioreq->domids[i] = blkdev->xendev.dom;
        ioreq->refs[i] = ioreq->req.seg[i].gref;

        mem = ioreq->req.seg[i].first_sect * blkdev->file_blk;
        len = (ioreq->req.seg[i].last_sect - ioreq->req.seg[i].first_sect + 1) * blkdev->file_blk;
        qemu_iovec_add(&ioreq->v, (void*)mem, len);
    }
    if (ioreq->start + ioreq->v.size > blkdev->file_size) {
        xen_be_printf(&blkdev->xendev, 0, "error: access beyond end of file\n");
        goto err;
    }
    return 0;

err:
    ioreq->status = BLKIF_RSP_ERROR;
    return -1;
}

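/* drop the grant mappings backing this request (batched or per page) */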
static void ioreq_unmap(struct ioreq *ioreq)
{
    XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
    int i;

    if (ioreq->v.niov == 0) {
        return;
    }
    if (batch_maps) {
        if (!ioreq->pages) {
            return;
        }
        if (xc_gnttab_munmap(gnt, ioreq->pages, ioreq->v.niov) != 0) {
            xen_be_printf(&ioreq->blkdev->xendev, 0, "xc_gnttab_munmap failed: %s\n",
                          strerror(errno));
        }
        ioreq->blkdev->cnt_map -= ioreq->v.niov;
        ioreq->pages = NULL;
    } else {
        for (i = 0; i < ioreq->v.niov; i++) {
            if (!ioreq->page[i]) {
                continue;
            }
            if (xc_gnttab_munmap(gnt, ioreq->page[i], 1) != 0) {
                xen_be_printf(&ioreq->blkdev->xendev, 0, "xc_gnttab_munmap failed: %s\n",
                              strerror(errno));
            }
            ioreq->blkdev->cnt_map--;
            ioreq->page[i] = NULL;
        }
    }
}

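/* map the frontend's grant refs so the segment buffers are directly
 * addressable, filling in the iovec base pointers */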
static int ioreq_map(struct ioreq *ioreq)
{
    XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
    int i;

    if (ioreq->v.niov == 0) {
        return 0;
    }
    if (batch_maps) {
        ioreq->pages = xc_gnttab_map_grant_refs
            (gnt, ioreq->v.niov, ioreq->domids, ioreq->refs, ioreq->prot);
        if (ioreq->pages == NULL) {
            xen_be_printf(&ioreq->blkdev->xendev, 0,
                          "can't map %d grant refs (%s, %d maps)\n",
                          ioreq->v.niov, strerror(errno), ioreq->blkdev->cnt_map);
            return -1;
        }
        for (i = 0; i < ioreq->v.niov; i++) {
            ioreq->v.iov[i].iov_base = ioreq->pages + i * XC_PAGE_SIZE +
                (uintptr_t)ioreq->v.iov[i].iov_base;
        }
        ioreq->blkdev->cnt_map += ioreq->v.niov;
    } else {
        for (i = 0; i < ioreq->v.niov; i++) {
            ioreq->page[i] = xc_gnttab_map_grant_ref
                (gnt, ioreq->domids[i], ioreq->refs[i], ioreq->prot);
            if (ioreq->page[i] == NULL) {
                xen_be_printf(&ioreq->blkdev->xendev, 0,
                              "can't map grant ref %d (%s, %d maps)\n",
                              ioreq->refs[i], strerror(errno), ioreq->blkdev->cnt_map);
                ioreq_unmap(ioreq);
                return -1;
            }
            ioreq->v.iov[i].iov_base = ioreq->page[i] + (uintptr_t)ioreq->v.iov[i].iov_base;
            ioreq->blkdev->cnt_map++;
        }
    }
    return 0;
}

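/* synchronous I/O path: serve the request with blocking bdrv_read/bdrv_write
 * calls, honoring pre/post barrier flushes */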
static int ioreq_runio_qemu_sync(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;
    int i, rc;
    off_t pos;

    if (ioreq->req.nr_segments && ioreq_map(ioreq) == -1) {
        goto err_no_map;
    }
    if (ioreq->presync) {
        bdrv_flush(blkdev->bs);
    }

    switch (ioreq->req.operation) {
    case BLKIF_OP_READ:
        pos = ioreq->start;
        for (i = 0; i < ioreq->v.niov; i++) {
            rc = bdrv_read(blkdev->bs, pos / BLOCK_SIZE,
                           ioreq->v.iov[i].iov_base,
                           ioreq->v.iov[i].iov_len / BLOCK_SIZE);
            if (rc != 0) {
                xen_be_printf(&blkdev->xendev, 0, "rd I/O error (%p, len %zd)\n",
                              ioreq->v.iov[i].iov_base,
                              ioreq->v.iov[i].iov_len);
                goto err;
            }
            pos += ioreq->v.iov[i].iov_len;
        }
        break;
    case BLKIF_OP_WRITE:
    case BLKIF_OP_WRITE_BARRIER:
        if (!ioreq->req.nr_segments) {
            break;
        }
        pos = ioreq->start;
        for (i = 0; i < ioreq->v.niov; i++) {
            rc = bdrv_write(blkdev->bs, pos / BLOCK_SIZE,
                            ioreq->v.iov[i].iov_base,
                            ioreq->v.iov[i].iov_len / BLOCK_SIZE);
            if (rc != 0) {
                xen_be_printf(&blkdev->xendev, 0, "wr I/O error (%p, len %zd)\n",
                              ioreq->v.iov[i].iov_base,
                              ioreq->v.iov[i].iov_len);
                goto err;
            }
            pos += ioreq->v.iov[i].iov_len;
        }
        break;
    default:
        /* unknown operation (shouldn't happen -- parse catches this) */
        goto err;
    }

    if (ioreq->postsync) {
        bdrv_flush(blkdev->bs);
    }
    ioreq->status = BLKIF_RSP_OKAY;

    ioreq_unmap(ioreq);
    ioreq_finish(ioreq);
    return 0;

err:
    ioreq_unmap(ioreq);
err_no_map:
    ioreq_finish(ioreq);
    ioreq->status = BLKIF_RSP_ERROR;
    return -1;
}

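/* aio completion callback, shared by all submissions of a request; the last
 * completion sets the status, unmaps the grants and kicks the bottom half */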
static void qemu_aio_complete(void *opaque, int ret)
{
    struct ioreq *ioreq = opaque;

    if (ret != 0) {
        xen_be_printf(&ioreq->blkdev->xendev, 0, "%s I/O error\n",
                      ioreq->req.operation == BLKIF_OP_READ ? "read" : "write");
        ioreq->aio_errors++;
    }

    ioreq->aio_inflight--;
    if (ioreq->aio_inflight > 0) {
        return;
    }

    ioreq->status = ioreq->aio_errors ? BLKIF_RSP_ERROR : BLKIF_RSP_OKAY;
    ioreq_unmap(ioreq);
    ioreq_finish(ioreq);
    bdrv_acct_done(ioreq->blkdev->bs, &ioreq->acct);
    qemu_bh_schedule(ioreq->blkdev->bh);
}

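/* asynchronous I/O path: submit via bdrv_aio_readv/writev and let
 * qemu_aio_complete() finish the request once everything has returned */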
static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;

    if (ioreq->req.nr_segments && ioreq_map(ioreq) == -1) {
        goto err_no_map;
    }

    ioreq->aio_inflight++;
    if (ioreq->presync) {
        bdrv_flush(blkdev->bs); /* FIXME: aio_flush() ??? */
    }

    switch (ioreq->req.operation) {
    case BLKIF_OP_READ:
        bdrv_acct_start(blkdev->bs, &ioreq->acct, ioreq->v.size, BDRV_ACCT_READ);
        ioreq->aio_inflight++;
        bdrv_aio_readv(blkdev->bs, ioreq->start / BLOCK_SIZE,
                       &ioreq->v, ioreq->v.size / BLOCK_SIZE,
                       qemu_aio_complete, ioreq);
        break;
    case BLKIF_OP_WRITE:
    case BLKIF_OP_WRITE_BARRIER:
        if (!ioreq->req.nr_segments) {
            break;
        }
        bdrv_acct_start(blkdev->bs, &ioreq->acct, ioreq->v.size, BDRV_ACCT_WRITE);
        ioreq->aio_inflight++;
        bdrv_aio_writev(blkdev->bs, ioreq->start / BLOCK_SIZE,
                        &ioreq->v, ioreq->v.size / BLOCK_SIZE,
                        qemu_aio_complete, ioreq);
        break;
    default:
        /* unknown operation (shouldn't happen -- parse catches this) */
        goto err;
    }

    if (ioreq->postsync) {
        bdrv_flush(blkdev->bs); /* FIXME: aio_flush() ??? */
    }
    qemu_aio_complete(ioreq, 0);
    return 0;

err:
    ioreq_unmap(ioreq);
err_no_map:
    ioreq_finish(ioreq);
    ioreq->status = BLKIF_RSP_ERROR;
    return -1;
}

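/* put one response on the shared ring (in the frontend's ABI) and report
 * whether the frontend needs an event channel notification */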
static int blk_send_response_one(struct ioreq *ioreq)
{
    struct XenBlkDev *blkdev = ioreq->blkdev;
    int send_notify = 0;
    int have_requests = 0;
    blkif_response_t resp;
    void *dst;

    resp.id = ioreq->req.id;
    resp.operation = ioreq->req.operation;
    resp.status = ioreq->status;

    /* Place on the response ring for the relevant domain. */
    switch (blkdev->protocol) {
    case BLKIF_PROTOCOL_NATIVE:
        dst = RING_GET_RESPONSE(&blkdev->rings.native, blkdev->rings.native.rsp_prod_pvt);
        break;
    case BLKIF_PROTOCOL_X86_32:
        dst = RING_GET_RESPONSE(&blkdev->rings.x86_32_part,
                                blkdev->rings.x86_32_part.rsp_prod_pvt);
        break;
    case BLKIF_PROTOCOL_X86_64:
        dst = RING_GET_RESPONSE(&blkdev->rings.x86_64_part,
                                blkdev->rings.x86_64_part.rsp_prod_pvt);
        break;
    default:
        dst = NULL;
    }
    memcpy(dst, &resp, sizeof(resp));
    blkdev->rings.common.rsp_prod_pvt++;

    RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blkdev->rings.common, send_notify);
    if (blkdev->rings.common.rsp_prod_pvt == blkdev->rings.common.req_cons) {
        /*
         * Tail check for pending requests. Allows frontend to avoid
         * notifications if requests are already in flight (lower
         * overheads and promotes batching).
         */
        RING_FINAL_CHECK_FOR_REQUESTS(&blkdev->rings.common, have_requests);
    } else if (RING_HAS_UNCONSUMED_REQUESTS(&blkdev->rings.common)) {
        have_requests = 1;
    }

    if (have_requests) {
        blkdev->more_work++;
    }
    return send_notify;
}

/* walk finished list, send outstanding responses, free requests */
static void blk_send_response_all(struct XenBlkDev *blkdev)
{
    struct ioreq *ioreq;
    int send_notify = 0;

    while (!QLIST_EMPTY(&blkdev->finished)) {
        ioreq = QLIST_FIRST(&blkdev->finished);
        send_notify += blk_send_response_one(ioreq);
        ioreq_release(ioreq);
    }
    if (send_notify) {
        xen_be_send_notify(&blkdev->xendev);
    }
}

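/* copy one request off the shared ring, converting from the 32/64-bit
 * layouts to the native blkif_request_t if needed */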
static int blk_get_request(struct XenBlkDev *blkdev, struct ioreq *ioreq, RING_IDX rc)
{
    switch (blkdev->protocol) {
    case BLKIF_PROTOCOL_NATIVE:
        memcpy(&ioreq->req, RING_GET_REQUEST(&blkdev->rings.native, rc),
               sizeof(ioreq->req));
        break;
    case BLKIF_PROTOCOL_X86_32:
        blkif_get_x86_32_req(&ioreq->req,
                             RING_GET_REQUEST(&blkdev->rings.x86_32_part, rc));
        break;
    case BLKIF_PROTOCOL_X86_64:
        blkif_get_x86_64_req(&ioreq->req,
                             RING_GET_REQUEST(&blkdev->rings.x86_64_part, rc));
        break;
    }
    return 0;
}

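/* main request loop, run from the bottom half: drain the ring, parse and
 * submit requests, reschedule if work is left over */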
static void blk_handle_requests(struct XenBlkDev *blkdev)
{
    RING_IDX rc, rp;
    struct ioreq *ioreq;

    blkdev->more_work = 0;

    rc = blkdev->rings.common.req_cons;
    rp = blkdev->rings.common.sring->req_prod;
    xen_rmb(); /* Ensure we see queued requests up to 'rp'. */

    if (use_aio) {
        blk_send_response_all(blkdev);
    }
    while (rc != rp) {
        /* pull request from ring */
        if (RING_REQUEST_CONS_OVERFLOW(&blkdev->rings.common, rc)) {
            break;
        }
        ioreq = ioreq_start(blkdev);
        if (ioreq == NULL) {
            blkdev->more_work++;
            break;
        }
        blk_get_request(blkdev, ioreq, rc);
        blkdev->rings.common.req_cons = ++rc;

        /* parse them */
        if (ioreq_parse(ioreq) != 0) {
            if (blk_send_response_one(ioreq)) {
                xen_be_send_notify(&blkdev->xendev);
            }
            ioreq_release(ioreq);
            continue;
        }

        if (use_aio) {
            /* run i/o in aio mode */
            ioreq_runio_qemu_aio(ioreq);
        } else {
            /* run i/o in sync mode */
            ioreq_runio_qemu_sync(ioreq);
        }
    }
    if (!use_aio) {
        blk_send_response_all(blkdev);
    }

    if (blkdev->more_work && blkdev->requests_inflight < max_requests) {
        qemu_bh_schedule(blkdev->bh);
    }
}

/* ------------------------------------------------------------- */

static void blk_bh(void *opaque)
{
    struct XenBlkDev *blkdev = opaque;
    blk_handle_requests(blkdev);
}

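/* per-device allocation hook: init request lists and the bottom half;
 * use batched grant maps unless emulating xen */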
static void blk_alloc(struct XenDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);

    QLIST_INIT(&blkdev->inflight);
    QLIST_INIT(&blkdev->finished);
    QLIST_INIT(&blkdev->freelist);
    blkdev->bh = qemu_bh_new(blk_bh, blkdev);
    if (xen_mode != XEN_EMULATE) {
        batch_maps = 1;
    }
}

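/* read the backend config from xenstore, open (or look up) the block device
 * and publish size/feature info for the frontend */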
static int blk_init(struct XenDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
    int index, qflags, info = 0;

    /* read xenstore entries */
    if (blkdev->params == NULL) {
        char *h = NULL;
        blkdev->params = xenstore_read_be_str(&blkdev->xendev, "params");
        if (blkdev->params != NULL) {
            h = strchr(blkdev->params, ':');
        }
        if (h != NULL) {
            blkdev->fileproto = blkdev->params;
            blkdev->filename = h+1;
            *h = 0;
        } else {
            blkdev->fileproto = "<unset>";
            blkdev->filename = blkdev->params;
        }
    }
    if (!strcmp("aio", blkdev->fileproto)) {
        blkdev->fileproto = "raw";
    }
    if (blkdev->mode == NULL) {
        blkdev->mode = xenstore_read_be_str(&blkdev->xendev, "mode");
    }
    if (blkdev->type == NULL) {
        blkdev->type = xenstore_read_be_str(&blkdev->xendev, "type");
    }
    if (blkdev->dev == NULL) {
        blkdev->dev = xenstore_read_be_str(&blkdev->xendev, "dev");
    }
    if (blkdev->devtype == NULL) {
        blkdev->devtype = xenstore_read_be_str(&blkdev->xendev, "device-type");
    }

    /* do we have all we need? */
    if (blkdev->params == NULL ||
        blkdev->mode == NULL   ||
        blkdev->type == NULL   ||
        blkdev->dev == NULL) {
        goto out_error;
    }

    /* read-only ? */
    if (strcmp(blkdev->mode, "w") == 0) {
        qflags = BDRV_O_RDWR;
    } else {
        qflags = 0;
        info |= VDISK_READONLY;
    }

    /* cdrom ? */
    if (blkdev->devtype && !strcmp(blkdev->devtype, "cdrom")) {
        info |= VDISK_CDROM;
    }

    /* init qemu block driver */
    index = (blkdev->xendev.dev - 202 * 256) / 16;
    blkdev->dinfo = drive_get(IF_XEN, 0, index);
    if (!blkdev->dinfo) {
        /* setup via xenbus -> create new block driver instance */
        xen_be_printf(&blkdev->xendev, 2, "create new bdrv (xenbus setup)\n");
        blkdev->bs = bdrv_new(blkdev->dev);
        if (blkdev->bs) {
            if (bdrv_open(blkdev->bs, blkdev->filename, qflags,
                          bdrv_find_whitelisted_format(blkdev->fileproto)) != 0) {
                bdrv_delete(blkdev->bs);
                blkdev->bs = NULL;
            }
        }
        if (!blkdev->bs) {
            goto out_error;
        }
    } else {
        /* setup via qemu cmdline -> already setup for us */
        xen_be_printf(&blkdev->xendev, 2, "get configured bdrv (cmdline setup)\n");
        blkdev->bs = blkdev->dinfo->bdrv;
    }
    bdrv_attach_dev_nofail(blkdev->bs, blkdev);
    blkdev->file_blk = BLOCK_SIZE;
    blkdev->file_size = bdrv_getlength(blkdev->bs);
    if (blkdev->file_size < 0) {
        xen_be_printf(&blkdev->xendev, 1, "bdrv_getlength: %d (%s) | drv %s\n",
                      (int)blkdev->file_size, strerror(-blkdev->file_size),
                      blkdev->bs->drv ? blkdev->bs->drv->format_name : "-");
        blkdev->file_size = 0;
    }

    xen_be_printf(xendev, 1, "type \"%s\", fileproto \"%s\", filename \"%s\","
                  " size %" PRId64 " (%" PRId64 " MB)\n",
                  blkdev->type, blkdev->fileproto, blkdev->filename,
                  blkdev->file_size, blkdev->file_size >> 20);

    /* fill info */
    xenstore_write_be_int(&blkdev->xendev, "feature-barrier", 1);
    xenstore_write_be_int(&blkdev->xendev, "info", info);
    xenstore_write_be_int(&blkdev->xendev, "sector-size", blkdev->file_blk);
    xenstore_write_be_int(&blkdev->xendev, "sectors",
                          blkdev->file_size / blkdev->file_blk);
    return 0;

out_error:
    g_free(blkdev->params);
    blkdev->params = NULL;
    g_free(blkdev->mode);
    blkdev->mode = NULL;
    g_free(blkdev->type);
    blkdev->type = NULL;
    g_free(blkdev->dev);
    blkdev->dev = NULL;
    g_free(blkdev->devtype);
    blkdev->devtype = NULL;
    return -1;
}

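/* connect to the frontend: read ring-ref and event-channel, map the shared
 * ring in the negotiated ABI, bind the event channel */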
static int blk_connect(struct XenDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);

    if (xenstore_read_fe_int(&blkdev->xendev, "ring-ref", &blkdev->ring_ref) == -1) {
        return -1;
    }
    if (xenstore_read_fe_int(&blkdev->xendev, "event-channel",
                             &blkdev->xendev.remote_port) == -1) {
        return -1;
    }

    blkdev->protocol = BLKIF_PROTOCOL_NATIVE;
    if (blkdev->xendev.protocol) {
        if (strcmp(blkdev->xendev.protocol, XEN_IO_PROTO_ABI_X86_32) == 0) {
            blkdev->protocol = BLKIF_PROTOCOL_X86_32;
        }
        if (strcmp(blkdev->xendev.protocol, XEN_IO_PROTO_ABI_X86_64) == 0) {
            blkdev->protocol = BLKIF_PROTOCOL_X86_64;
        }
    }

    blkdev->sring = xc_gnttab_map_grant_ref(blkdev->xendev.gnttabdev,
                                            blkdev->xendev.dom,
                                            blkdev->ring_ref,
                                            PROT_READ | PROT_WRITE);
    if (!blkdev->sring) {
        return -1;
    }
    blkdev->cnt_map++;

    switch (blkdev->protocol) {
    case BLKIF_PROTOCOL_NATIVE:
    {
        blkif_sring_t *sring_native = blkdev->sring;
        BACK_RING_INIT(&blkdev->rings.native, sring_native, XC_PAGE_SIZE);
        break;
    }
    case BLKIF_PROTOCOL_X86_32:
    {
        blkif_x86_32_sring_t *sring_x86_32 = blkdev->sring;
        BACK_RING_INIT(&blkdev->rings.x86_32_part, sring_x86_32, XC_PAGE_SIZE);
        break;
    }
    case BLKIF_PROTOCOL_X86_64:
    {
        blkif_x86_64_sring_t *sring_x86_64 = blkdev->sring;
        BACK_RING_INIT(&blkdev->rings.x86_64_part, sring_x86_64, XC_PAGE_SIZE);
        break;
    }
    }

    xen_be_bind_evtchn(&blkdev->xendev);

    xen_be_printf(&blkdev->xendev, 1, "ok: proto %s, ring-ref %d, "
                  "remote port %d, local port %d\n",
                  blkdev->xendev.protocol, blkdev->ring_ref,
                  blkdev->xendev.remote_port, blkdev->xendev.local_port);
    return 0;
}

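/* tear down the connection: close the block device if we created it,
 * unbind the event channel, unmap the shared ring */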
static void blk_disconnect(struct XenDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);

    if (blkdev->bs) {
        if (!blkdev->dinfo) {
            /* close/delete only if we created it ourself */
            bdrv_close(blkdev->bs);
            bdrv_delete(blkdev->bs);
        }
        blkdev->bs = NULL;
    }
    xen_be_unbind_evtchn(&blkdev->xendev);

    if (blkdev->sring) {
        xc_gnttab_munmap(blkdev->xendev.gnttabdev, blkdev->sring, 1);
        blkdev->cnt_map--;
        blkdev->sring = NULL;
    }
}

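/* final cleanup: free cached ioreq structs, xenstore strings and the bottom half */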
static int blk_free(struct XenDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
    struct ioreq *ioreq;

    while (!QLIST_EMPTY(&blkdev->freelist)) {
        ioreq = QLIST_FIRST(&blkdev->freelist);
        QLIST_REMOVE(ioreq, list);
        qemu_iovec_destroy(&ioreq->v);
        g_free(ioreq);
    }

    g_free(blkdev->params);
    g_free(blkdev->mode);
    g_free(blkdev->type);
    g_free(blkdev->dev);
    g_free(blkdev->devtype);
    qemu_bh_delete(blkdev->bh);
    return 0;
}

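/* event channel notification from the frontend: defer work to the bottom half */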
static void blk_event(struct XenDevice *xendev)
{
    struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);

    qemu_bh_schedule(blkdev->bh);
}

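/* device operation callbacks for the xen paravirt block backend */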
struct XenDevOps xen_blkdev_ops = {
    .size       = sizeof(struct XenBlkDev),
    .flags      = DEVOPS_FLAG_NEED_GNTDEV,
    .alloc      = blk_alloc,
    .init       = blk_init,
    .initialise = blk_connect,
    .disconnect = blk_disconnect,
    .event      = blk_event,
    .free       = blk_free,
};