- /*
- * Virtio Block Device
- *
- * Copyright IBM, Corp. 2007
- *
- * Authors:
- * Anthony Liguori <aliguori@us.ibm.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
- */
- #include "qemu-common.h"
- #include "qemu-error.h"
- #include "trace.h"
- #include "hw/block-common.h"
- #include "blockdev.h"
- #include "virtio-blk.h"
- #include "scsi-defs.h"
- #ifdef __linux__
- # include <scsi/sg.h>
- #endif
- typedef struct VirtIOBlock
- {
- VirtIODevice vdev;
- BlockDriverState *bs;
- VirtQueue *vq;
- void *rq;
- QEMUBH *bh;
- BlockConf *conf;
- VirtIOBlkConf *blk;
- unsigned short sector_mask;
- DeviceState *qdev;
- } VirtIOBlock;
- static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev)
- {
- return (VirtIOBlock *)vdev;
- }
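- /*
- * Per-request state: the popped VirtQueueElement, pointers into its
- * header segments, the data iovec, an accounting cookie and a link used
- * to chain requests that are parked on the VirtIOBlock rq list after an
- * I/O error stop.
- */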
- typedef struct VirtIOBlockReq
- {
- VirtIOBlock *dev;
- VirtQueueElement elem;
- struct virtio_blk_inhdr *in;
- struct virtio_blk_outhdr *out;
- struct virtio_scsi_inhdr *scsi;
- QEMUIOVector qiov;
- struct VirtIOBlockReq *next;
- BlockAcctCookie acct;
- } VirtIOBlockReq;
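- /*
- * Store the status byte in the guest-visible inhdr, return the element
- * to the virtqueue and notify the guest of the completion.
- */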
- static void virtio_blk_req_complete(VirtIOBlockReq *req, int status)
- {
- VirtIOBlock *s = req->dev;
- trace_virtio_blk_req_complete(req, status);
- stb_p(&req->in->status, status);
- virtqueue_push(s->vq, &req->elem, req->qiov.size + sizeof(*req->in));
- virtio_notify(&s->vdev, s->vq);
- }
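- /*
- * Apply the configured on-error policy: either ignore the error, stop
- * the VM and park the request on s->rq for a later retry, or complete
- * the request with VIRTIO_BLK_S_IOERR. Returns 0 if the caller should
- * complete the request normally, 1 if it has been handled here.
- */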
- static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
- int is_read)
- {
- BlockErrorAction action = bdrv_get_on_error(req->dev->bs, is_read);
- VirtIOBlock *s = req->dev;
- if (action == BLOCK_ERR_IGNORE) {
- bdrv_emit_qmp_error_event(s->bs, BDRV_ACTION_IGNORE, is_read);
- return 0;
- }
- if ((error == ENOSPC && action == BLOCK_ERR_STOP_ENOSPC)
- || action == BLOCK_ERR_STOP_ANY) {
- req->next = s->rq;
- s->rq = req;
- bdrv_emit_qmp_error_event(s->bs, BDRV_ACTION_STOP, is_read);
- vm_stop(RUN_STATE_IO_ERROR);
- bdrv_iostatus_set_err(s->bs, error);
- } else {
- virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
- bdrv_acct_done(s->bs, &req->acct);
- g_free(req);
- bdrv_emit_qmp_error_event(s->bs, BDRV_ACTION_REPORT, is_read);
- }
- return 1;
- }
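- /*
- * Completion callback for reads and (multi)writes: on failure defer to
- * the error policy, otherwise report VIRTIO_BLK_S_OK and finish the
- * accounting for this request.
- */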
- static void virtio_blk_rw_complete(void *opaque, int ret)
- {
- VirtIOBlockReq *req = opaque;
- trace_virtio_blk_rw_complete(req, ret);
- if (ret) {
- int is_read = !(ldl_p(&req->out->type) & VIRTIO_BLK_T_OUT);
- if (virtio_blk_handle_rw_error(req, -ret, is_read))
- return;
- }
- virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
- bdrv_acct_done(req->dev->bs, &req->acct);
- g_free(req);
- }
- static void virtio_blk_flush_complete(void *opaque, int ret)
- {
- VirtIOBlockReq *req = opaque;
- if (ret) {
- if (virtio_blk_handle_rw_error(req, -ret, 0)) {
- return;
- }
- }
- virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
- bdrv_acct_done(req->dev->bs, &req->acct);
- g_free(req);
- }
- static VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s)
- {
- VirtIOBlockReq *req = g_malloc(sizeof(*req));
- req->dev = s;
- req->qiov.size = 0;
- req->next = NULL;
- return req;
- }
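- /*
- * Pop the next element from the virtqueue into a new request; returns
- * NULL once the queue is empty.
- */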
- static VirtIOBlockReq *virtio_blk_get_request(VirtIOBlock *s)
- {
- VirtIOBlockReq *req = virtio_blk_alloc_request(s);
- if (req != NULL) {
- if (!virtqueue_pop(s->vq, &req->elem)) {
- g_free(req);
- return NULL;
- }
- }
- return req;
- }
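- /*
- * VIRTIO_BLK_T_SCSI_CMD pass-through: translate the guest's segments
- * into an sg_io_hdr and issue the SG_IO ioctl on the host block device.
- * Only supported on Linux hosts.
- */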
- static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
- {
- #ifdef __linux__
- int ret;
- int i;
- #endif
- int status = VIRTIO_BLK_S_OK;
- /*
- * We require at least one output segment each for the virtio_blk_outhdr
- * and the SCSI command block.
- *
- * In the input segments we additionally require at least the
- * virtio_blk_inhdr, the virtio_scsi_inhdr and the sense buffer pointer.
- */
- if (req->elem.out_num < 2 || req->elem.in_num < 3) {
- virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
- g_free(req);
- return;
- }
- /*
- * The scsi inhdr is placed in the second-to-last input segment, just
- * before the regular inhdr.
- */
- req->scsi = (void *)req->elem.in_sg[req->elem.in_num - 2].iov_base;
- if (!req->dev->blk->scsi) {
- status = VIRTIO_BLK_S_UNSUPP;
- goto fail;
- }
- /*
- * No support for bidirectional commands yet.
- */
- if (req->elem.out_num > 2 && req->elem.in_num > 3) {
- status = VIRTIO_BLK_S_UNSUPP;
- goto fail;
- }
- #ifdef __linux__
- struct sg_io_hdr hdr;
- memset(&hdr, 0, sizeof(struct sg_io_hdr));
- hdr.interface_id = 'S';
- hdr.cmd_len = req->elem.out_sg[1].iov_len;
- hdr.cmdp = req->elem.out_sg[1].iov_base;
- hdr.dxfer_len = 0;
- if (req->elem.out_num > 2) {
- /*
- * If there are more than the minimally required 2 output segments
- * there is a write payload starting from the third iovec.
- */
- hdr.dxfer_direction = SG_DXFER_TO_DEV;
- hdr.iovec_count = req->elem.out_num - 2;
- for (i = 0; i < hdr.iovec_count; i++)
- hdr.dxfer_len += req->elem.out_sg[i + 2].iov_len;
- hdr.dxferp = req->elem.out_sg + 2;
- } else if (req->elem.in_num > 3) {
- /*
- * If there are more than 3 input segments, the guest actually wants to
- * read data.
- */
- hdr.dxfer_direction = SG_DXFER_FROM_DEV;
- hdr.iovec_count = req->elem.in_num - 3;
- for (i = 0; i < hdr.iovec_count; i++)
- hdr.dxfer_len += req->elem.in_sg[i].iov_len;
- hdr.dxferp = req->elem.in_sg;
- } else {
- /*
- * Some SCSI commands don't actually transfer any data.
- */
- hdr.dxfer_direction = SG_DXFER_NONE;
- }
- hdr.sbp = req->elem.in_sg[req->elem.in_num - 3].iov_base;
- hdr.mx_sb_len = req->elem.in_sg[req->elem.in_num - 3].iov_len;
- ret = bdrv_ioctl(req->dev->bs, SG_IO, &hdr);
- if (ret) {
- status = VIRTIO_BLK_S_UNSUPP;
- goto fail;
- }
- /*
- * From SCSI-Generic-HOWTO: "Some lower level drivers (e.g. ide-scsi)
- * clear the masked_status field [hence status gets cleared too, see
- * block/scsi_ioctl.c] even when a CHECK_CONDITION or COMMAND_TERMINATED
- * status has occurred. However they do set DRIVER_SENSE in driver_status
- * field. Also a (sb_len_wr > 0) indicates there is a sense buffer."
- */
- if (hdr.status == 0 && hdr.sb_len_wr > 0) {
- hdr.status = CHECK_CONDITION;
- }
- stl_p(&req->scsi->errors,
- hdr.status | (hdr.msg_status << 8) |
- (hdr.host_status << 16) | (hdr.driver_status << 24));
- stl_p(&req->scsi->residual, hdr.resid);
- stl_p(&req->scsi->sense_len, hdr.sb_len_wr);
- stl_p(&req->scsi->data_len, hdr.dxfer_len);
- virtio_blk_req_complete(req, status);
- g_free(req);
- return;
- #else
- abort();
- #endif
- fail:
- /* Just put anything nonzero so that the ioctl fails in the guest. */
- stl_p(&req->scsi->errors, 255);
- virtio_blk_req_complete(req, status);
- g_free(req);
- }
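- /*
- * Write requests are collected into a MultiReqBuffer (up to 32 entries)
- * and submitted in one go via bdrv_aio_multiwrite(), which lets the
- * block layer merge adjacent requests where possible.
- */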
- typedef struct MultiReqBuffer {
- BlockRequest blkreq[32];
- unsigned int num_writes;
- } MultiReqBuffer;
- static void virtio_submit_multiwrite(BlockDriverState *bs, MultiReqBuffer *mrb)
- {
- int i, ret;
- if (!mrb->num_writes) {
- return;
- }
- ret = bdrv_aio_multiwrite(bs, mrb->blkreq, mrb->num_writes);
- if (ret != 0) {
- for (i = 0; i < mrb->num_writes; i++) {
- if (mrb->blkreq[i].error) {
- virtio_blk_rw_complete(mrb->blkreq[i].opaque, -EIO);
- }
- }
- }
- mrb->num_writes = 0;
- }
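- /*
- * VIRTIO_BLK_T_FLUSH: submit any batched writes first so they are
- * ordered before the flush, then issue an asynchronous flush.
- */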
- static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
- {
- bdrv_acct_start(req->dev->bs, &req->acct, 0, BDRV_ACCT_FLUSH);
- /*
- * Make sure all outstanding writes are posted to the backing device.
- */
- virtio_submit_multiwrite(req->dev->bs, mrb);
- bdrv_aio_flush(req->dev->bs, virtio_blk_flush_complete, req);
- }
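- /*
- * Check that the write is aligned to the logical block size, flush the
- * batch if it is already full and append the request to the
- * MultiReqBuffer; submission happens in virtio_submit_multiwrite().
- */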
- static void virtio_blk_handle_write(VirtIOBlockReq *req, MultiReqBuffer *mrb)
- {
- BlockRequest *blkreq;
- uint64_t sector;
- sector = ldq_p(&req->out->sector);
- bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_WRITE);
- trace_virtio_blk_handle_write(req, sector, req->qiov.size / 512);
- if (sector & req->dev->sector_mask) {
- virtio_blk_rw_complete(req, -EIO);
- return;
- }
- if (req->qiov.size % req->dev->conf->logical_block_size) {
- virtio_blk_rw_complete(req, -EIO);
- return;
- }
- if (mrb->num_writes == 32) {
- virtio_submit_multiwrite(req->dev->bs, mrb);
- }
- blkreq = &mrb->blkreq[mrb->num_writes];
- blkreq->sector = sector;
- blkreq->nb_sectors = req->qiov.size / BDRV_SECTOR_SIZE;
- blkreq->qiov = &req->qiov;
- blkreq->cb = virtio_blk_rw_complete;
- blkreq->opaque = req;
- blkreq->error = 0;
- mrb->num_writes++;
- }
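- /*
- * Reads are not batched: after the same alignment checks as writes they
- * are submitted directly with bdrv_aio_readv().
- */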
- static void virtio_blk_handle_read(VirtIOBlockReq *req)
- {
- uint64_t sector;
- sector = ldq_p(&req->out->sector);
- bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_READ);
- trace_virtio_blk_handle_read(req, sector, req->qiov.size / 512);
- if (sector & req->dev->sector_mask) {
- virtio_blk_rw_complete(req, -EIO);
- return;
- }
- if (req->qiov.size % req->dev->conf->logical_block_size) {
- virtio_blk_rw_complete(req, -EIO);
- return;
- }
- bdrv_aio_readv(req->dev->bs, sector, &req->qiov,
- req->qiov.size / BDRV_SECTOR_SIZE,
- virtio_blk_rw_complete, req);
- }
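- /*
- * Parse one request: the outhdr is in the first output segment and the
- * inhdr in the last input segment, with the data payload in between.
- * Dispatch on the request type (flush, SCSI pass-through, serial number
- * query, write or read).
- */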
- static void virtio_blk_handle_request(VirtIOBlockReq *req,
- MultiReqBuffer *mrb)
- {
- uint32_t type;
- if (req->elem.out_num < 1 || req->elem.in_num < 1) {
- error_report("virtio-blk missing headers");
- exit(1);
- }
- if (req->elem.out_sg[0].iov_len < sizeof(*req->out) ||
- req->elem.in_sg[req->elem.in_num - 1].iov_len < sizeof(*req->in)) {
- error_report("virtio-blk header not in correct element");
- exit(1);
- }
- req->out = (void *)req->elem.out_sg[0].iov_base;
- req->in = (void *)req->elem.in_sg[req->elem.in_num - 1].iov_base;
- type = ldl_p(&req->out->type);
- if (type & VIRTIO_BLK_T_FLUSH) {
- virtio_blk_handle_flush(req, mrb);
- } else if (type & VIRTIO_BLK_T_SCSI_CMD) {
- virtio_blk_handle_scsi(req);
- } else if (type & VIRTIO_BLK_T_GET_ID) {
- VirtIOBlock *s = req->dev;
- /*
- * NB: per existing s/n string convention the string is
- * terminated by '\0' only when shorter than buffer.
- */
- strncpy(req->elem.in_sg[0].iov_base,
- s->blk->serial ? s->blk->serial : "",
- MIN(req->elem.in_sg[0].iov_len, VIRTIO_BLK_ID_BYTES));
- virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
- g_free(req);
- } else if (type & VIRTIO_BLK_T_OUT) {
- qemu_iovec_init_external(&req->qiov, &req->elem.out_sg[1],
- req->elem.out_num - 1);
- virtio_blk_handle_write(req, mrb);
- } else {
- qemu_iovec_init_external(&req->qiov, &req->elem.in_sg[0],
- req->elem.in_num - 1);
- virtio_blk_handle_read(req);
- }
- }
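- /*
- * Virtqueue notify handler: drain every pending request from the queue
- * and submit whatever writes were batched along the way.
- */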
- static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
- {
- VirtIOBlock *s = to_virtio_blk(vdev);
- VirtIOBlockReq *req;
- MultiReqBuffer mrb = {
- .num_writes = 0,
- };
- while ((req = virtio_blk_get_request(s))) {
- virtio_blk_handle_request(req, &mrb);
- }
- virtio_submit_multiwrite(s->bs, &mrb);
- /*
- * FIXME: Want to check for completions before returning to guest mode,
- * so cached reads and writes are reported as quickly as possible. But
- * that should be done in the generic block layer.
- */
- }
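- /*
- * Requests that failed while the error policy was "stop" are parked on
- * s->rq. This bottom half, scheduled from virtio_blk_dma_restart_cb()
- * when the VM resumes, re-issues them.
- */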
- static void virtio_blk_dma_restart_bh(void *opaque)
- {
- VirtIOBlock *s = opaque;
- VirtIOBlockReq *req = s->rq;
- MultiReqBuffer mrb = {
- .num_writes = 0,
- };
- qemu_bh_delete(s->bh);
- s->bh = NULL;
- s->rq = NULL;
- while (req) {
- virtio_blk_handle_request(req, &mrb);
- req = req->next;
- }
- virtio_submit_multiwrite(s->bs, &mrb);
- }
- static void virtio_blk_dma_restart_cb(void *opaque, int running,
- RunState state)
- {
- VirtIOBlock *s = opaque;
- if (!running)
- return;
- if (!s->bh) {
- s->bh = qemu_bh_new(virtio_blk_dma_restart_bh, s);
- qemu_bh_schedule(s->bh);
- }
- }
- static void virtio_blk_reset(VirtIODevice *vdev)
- {
- /*
- * This should cancel pending requests, but that can't be done nicely
- * until there are per-device request lists.
- */
- bdrv_drain_all();
- }
- /* Coalesce internal state, copy to PCI I/O region 0. */
- static void virtio_blk_update_config(VirtIODevice *vdev, uint8_t *config)
- {
- VirtIOBlock *s = to_virtio_blk(vdev);
- struct virtio_blk_config blkcfg;
- uint64_t capacity;
- int blk_size = s->conf->logical_block_size;
- bdrv_get_geometry(s->bs, &capacity);
- memset(&blkcfg, 0, sizeof(blkcfg));
- stq_raw(&blkcfg.capacity, capacity);
- stl_raw(&blkcfg.seg_max, 128 - 2);
- stw_raw(&blkcfg.cylinders, s->conf->cyls);
- stl_raw(&blkcfg.blk_size, blk_size);
- stw_raw(&blkcfg.min_io_size, s->conf->min_io_size / blk_size);
- stw_raw(&blkcfg.opt_io_size, s->conf->opt_io_size / blk_size);
- blkcfg.heads = s->conf->heads;
- /*
- * We must ensure that the block device capacity is a multiple of
- * the logical block size. If it is not, use sector_mask to adapt the
- * geometry so that the guest sees a consistent picture.
- * For devices whose capacity already matches the given geometry we
- * don't touch the sectors value, since some devices (like s390 dasd)
- * need a specific value. In that case the capacity is already
- * cyls*heads*secs*blk_size and the sectors value is not the block size
- * divided by 512 - instead it is the number of blk_size blocks
- * per track (cylinder).
- */
- if (bdrv_getlength(s->bs) / s->conf->heads / s->conf->secs % blk_size) {
- blkcfg.sectors = s->conf->secs & ~s->sector_mask;
- } else {
- blkcfg.sectors = s->conf->secs;
- }
- blkcfg.size_max = 0;
- blkcfg.physical_block_exp = get_physical_block_exp(s->conf);
- blkcfg.alignment_offset = 0;
- blkcfg.wce = bdrv_enable_write_cache(s->bs);
- memcpy(config, &blkcfg, sizeof(struct virtio_blk_config));
- }
- static void virtio_blk_set_config(VirtIODevice *vdev, const uint8_t *config)
- {
- VirtIOBlock *s = to_virtio_blk(vdev);
- struct virtio_blk_config blkcfg;
- memcpy(&blkcfg, config, sizeof(blkcfg));
- bdrv_set_enable_write_cache(s->bs, blkcfg.wce != 0);
- }
- static uint32_t virtio_blk_get_features(VirtIODevice *vdev, uint32_t features)
- {
- VirtIOBlock *s = to_virtio_blk(vdev);
- features |= (1 << VIRTIO_BLK_F_SEG_MAX);
- features |= (1 << VIRTIO_BLK_F_GEOMETRY);
- features |= (1 << VIRTIO_BLK_F_TOPOLOGY);
- features |= (1 << VIRTIO_BLK_F_BLK_SIZE);
- features |= (1 << VIRTIO_BLK_F_SCSI);
- if (bdrv_enable_write_cache(s->bs))
- features |= (1 << VIRTIO_BLK_F_WCE);
- if (bdrv_is_read_only(s->bs))
- features |= 1 << VIRTIO_BLK_F_RO;
- return features;
- }
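- /*
- * Once the guest has completed feature negotiation (DRIVER_OK), honour
- * its choice of VIRTIO_BLK_F_WCE by switching the backend write cache.
- */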
- static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t status)
- {
- VirtIOBlock *s = to_virtio_blk(vdev);
- uint32_t features;
- if (!(status & VIRTIO_CONFIG_S_DRIVER_OK)) {
- return;
- }
- features = vdev->guest_features;
- bdrv_set_enable_write_cache(s->bs, !!(features & (1 << VIRTIO_BLK_F_WCE)));
- }
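- /*
- * Migration: in addition to the common virtio state, save the chain of
- * requests parked on s->rq so the destination can retry them.
- */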
- static void virtio_blk_save(QEMUFile *f, void *opaque)
- {
- VirtIOBlock *s = opaque;
- VirtIOBlockReq *req = s->rq;
- virtio_save(&s->vdev, f);
-
- while (req) {
- qemu_put_sbyte(f, 1);
- qemu_put_buffer(f, (unsigned char*)&req->elem, sizeof(req->elem));
- req = req->next;
- }
- qemu_put_sbyte(f, 0);
- }
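- /*
- * Restore the parked requests and re-map their scatter-gather lists
- * into host memory.
- */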
- static int virtio_blk_load(QEMUFile *f, void *opaque, int version_id)
- {
- VirtIOBlock *s = opaque;
- int ret;
- if (version_id != 2)
- return -EINVAL;
- ret = virtio_load(&s->vdev, f);
- if (ret) {
- return ret;
- }
- while (qemu_get_sbyte(f)) {
- VirtIOBlockReq *req = virtio_blk_alloc_request(s);
- qemu_get_buffer(f, (unsigned char*)&req->elem, sizeof(req->elem));
- req->next = s->rq;
- s->rq = req;
- virtqueue_map_sg(req->elem.in_sg, req->elem.in_addr,
- req->elem.in_num, 1);
- virtqueue_map_sg(req->elem.out_sg, req->elem.out_addr,
- req->elem.out_num, 0);
- }
- return 0;
- }
- static void virtio_blk_resize(void *opaque)
- {
- VirtIOBlock *s = opaque;
- virtio_notify_config(&s->vdev);
- }
- static const BlockDevOps virtio_block_ops = {
- .resize_cb = virtio_blk_resize,
- };
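- /*
- * Create the virtio-blk device: validate the drive and geometry, set up
- * the config/feature callbacks, add a single 128-entry virtqueue and
- * register the migration, DMA-restart and resize handlers.
- */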
- VirtIODevice *virtio_blk_init(DeviceState *dev, VirtIOBlkConf *blk)
- {
- VirtIOBlock *s;
- static int virtio_blk_id;
- if (!blk->conf.bs) {
- error_report("drive property not set");
- return NULL;
- }
- if (!bdrv_is_inserted(blk->conf.bs)) {
- error_report("Device needs media, but drive is empty");
- return NULL;
- }
- blkconf_serial(&blk->conf, &blk->serial);
- if (blkconf_geometry(&blk->conf, NULL, 65535, 255, 255) < 0) {
- return NULL;
- }
- s = (VirtIOBlock *)virtio_common_init("virtio-blk", VIRTIO_ID_BLOCK,
- sizeof(struct virtio_blk_config),
- sizeof(VirtIOBlock));
- s->vdev.get_config = virtio_blk_update_config;
- s->vdev.set_config = virtio_blk_set_config;
- s->vdev.get_features = virtio_blk_get_features;
- s->vdev.set_status = virtio_blk_set_status;
- s->vdev.reset = virtio_blk_reset;
- s->bs = blk->conf.bs;
- s->conf = &blk->conf;
- s->blk = blk;
- s->rq = NULL;
- s->sector_mask = (s->conf->logical_block_size / BDRV_SECTOR_SIZE) - 1;
- s->vq = virtio_add_queue(&s->vdev, 128, virtio_blk_handle_output);
- qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, s);
- s->qdev = dev;
- register_savevm(dev, "virtio-blk", virtio_blk_id++, 2,
- virtio_blk_save, virtio_blk_load, s);
- bdrv_set_dev_ops(s->bs, &virtio_block_ops, s);
- bdrv_set_buffer_alignment(s->bs, s->conf->logical_block_size);
- bdrv_iostatus_enable(s->bs);
- add_boot_device_path(s->conf->bootindex, dev, "/disk@0,0");
- return &s->vdev;
- }
- void virtio_blk_exit(VirtIODevice *vdev)
- {
- VirtIOBlock *s = to_virtio_blk(vdev);
- unregister_savevm(s->qdev, "virtio-blk", s);
- blockdev_mark_auto_del(s->bs);
- virtio_cleanup(vdev);
- }