
Merge remote-tracking branch 'kwolf/for-anthony' into staging

* kwolf/for-anthony: (46 commits)
  qed: remove incoming live migration blocker
  qed: honor BDRV_O_INCOMING for incoming live migration
  migration: clear BDRV_O_INCOMING flags on end of incoming live migration
  qed: add bdrv_invalidate_cache to be called after incoming live migration
  blockdev: open images with BDRV_O_INCOMING on incoming live migration
  block: add a function to clear incoming live migration flags
  block: Add new BDRV_O_INCOMING flag to notice incoming live migration
  block stream: close unused files and update ->backing_hd
  qemu-iotests: Fix call syntax for qemu-io
  qemu-iotests: Fix call syntax for qemu-img
  qemu-iotests: Test unknown qcow2 header extensions
  qemu-iotests: qcow2.py
  sheepdog: fix send req helpers
  sheepdog: implement SD_OP_FLUSH_VDI operation
  block: bdrv_append() fixes
  qed: track dirty flag status
  qemu-img: add dirty flag status
  qed: image fragmentation statistics
  qemu-img: add image fragmentation statistics
  block: document job API
  ...
Anthony Liguori committed 13 years ago
commit bb5d8dd757

44 changed files with 1081 additions and 494 deletions
  1. block.c (+77 -11)
  2. block.h (+15 -6)
  3. block/blkdebug.c (+0 -7)
  4. block/cow.c (+0 -6)
  5. block/qcow.c (+0 -6)
  6. block/qcow2-cluster.c (+8 -10)
  7. block/qcow2.c (+0 -6)
  8. block/qed-check.c (+9 -0)
  9. block/qed.c (+14 -19)
  10. block/qed.h (+0 -2)
  11. block/raw.c (+0 -6)
  12. block/sheepdog.c (+130 -14)
  13. block/stream.c (+38 -3)
  14. block/vdi.c (+108 -321)
  15. block/vmdk.c (+2 -2)
  16. block/vpc.c (+3 -6)
  17. block_int.h (+114 -21)
  18. blockdev.c (+10 -2)
  19. dma-helpers.c (+12 -9)
  20. dma.h (+6 -6)
  21. hw/ide/core.c (+47 -13)
  22. hw/ide/internal.h (+6 -1)
  23. hw/ide/macio.c (+2 -1)
  24. hw/ide/qdev.c (+5 -2)
  25. hw/lsi53c895a.c (+0 -1)
  26. hw/qdev-properties.c (+46 -0)
  27. hw/qdev.h (+3 -0)
  28. linux-aio.c (+0 -1)
  29. migration.c (+1 -0)
  30. qemu-aio.h (+21 -0)
  31. qemu-img.c (+11 -1)
  32. qemu-io.c (+9 -1)
  33. qerror.c (+6 -1)
  34. qerror.h (+4 -0)
  35. scripts/tracetool (+4 -0)
  36. tests/qemu-iotests/009 (+2 -2)
  37. tests/qemu-iotests/010 (+3 -3)
  38. tests/qemu-iotests/011 (+1 -1)
  39. tests/qemu-iotests/031 (+72 -0)
  40. tests/qemu-iotests/031.out (+76 -0)
  41. tests/qemu-iotests/common.rc (+7 -2)
  42. tests/qemu-iotests/group (+1 -0)
  43. tests/qemu-iotests/qcow2.py (+207 -0)
  44. trace-events (+1 -1)

+ 77 - 11
block.c

@@ -813,6 +813,9 @@ unlink_and_fail:
 void bdrv_close(BlockDriverState *bs)
 {
     if (bs->drv) {
+        if (bs->job) {
+            block_job_cancel_sync(bs->job);
+        }
         if (bs == bs_snapshots) {
             bs_snapshots = NULL;
         }
@@ -889,14 +892,16 @@ void bdrv_make_anon(BlockDriverState *bs)
  * This will modify the BlockDriverState fields, and swap contents
  * between bs_new and bs_top. Both bs_new and bs_top are modified.
  *
+ * bs_new is required to be anonymous.
+ *
  * This function does not create any image files.
  */
 void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
 {
     BlockDriverState tmp;
 
-    /* the new bs must not be in bdrv_states */
-    bdrv_make_anon(bs_new);
+    /* bs_new must be anonymous */
+    assert(bs_new->device_name[0] == '\0');
 
     tmp = *bs_new;
 
@@ -941,11 +946,18 @@ void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
      * swapping bs_new and bs_top contents. */
     tmp.backing_hd = bs_new;
     pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename);
+    bdrv_get_format(bs_top, tmp.backing_format, sizeof(tmp.backing_format));
 
     /* swap contents of the fixed new bs and the current top */
     *bs_new = *bs_top;
     *bs_top = tmp;
 
+    /* device_name[] was carried over from the old bs_top.  bs_new
+     * shouldn't be in bdrv_states, so we need to make device_name[]
+     * reflect the anonymity of bs_new
+     */
+    bs_new->device_name[0] = '\0';
+
     /* clear the copied fields in the new backing file */
     bdrv_detach_dev(bs_new, bs_new->dev);
 
@@ -966,6 +978,8 @@ void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top)
 void bdrv_delete(BlockDriverState *bs)
 {
     assert(!bs->dev);
+    assert(!bs->job);
+    assert(!bs->in_use);
 
     /* remove from list, if necessary */
     bdrv_make_anon(bs);
@@ -1463,6 +1477,17 @@ static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
 
     qemu_iovec_init_external(&qiov, &iov, 1);
 
+    /**
+     * In sync call context, when the vcpu is blocked, this throttling timer
+     * will not fire; so the I/O throttling function has to be disabled here
+     * if it has been enabled.
+     */
+    if (bs->io_limits_enabled) {
+        fprintf(stderr, "Disabling I/O throttling on '%s' due "
+                        "to synchronous I/O.\n", bdrv_get_device_name(bs));
+        bdrv_io_limits_disable(bs);
+    }
+
     if (qemu_in_coroutine()) {
         /* Fast-path if already in coroutine context */
         bdrv_rw_co_entry(&rwco);
@@ -1969,10 +1994,19 @@ static int guess_disk_lchs(BlockDriverState *bs,
     struct partition *p;
     uint32_t nr_sects;
     uint64_t nb_sectors;
+    bool enabled;
 
     bdrv_get_geometry(bs, &nb_sectors);
 
+    /**
+     * The function will be invoked during startup not only in sync I/O mode,
+     * but also in async I/O mode. So the I/O throttling function has to
+     * be disabled temporarily here, not permanently.
+     */
+    enabled = bs->io_limits_enabled;
+    bs->io_limits_enabled = false;
     ret = bdrv_read(bs, 0, buf, 1);
+    bs->io_limits_enabled = enabled;
     if (ret < 0)
         return -1;
     /* test msdos magic */
@@ -2331,9 +2365,7 @@ void bdrv_flush_all(void)
     BlockDriverState *bs;
 
     QTAILQ_FOREACH(bs, &bdrv_states, list) {
-        if (!bdrv_is_read_only(bs) && bdrv_is_inserted(bs)) {
-            bdrv_flush(bs);
-        }
+        bdrv_flush(bs);
     }
 }
 
@@ -3520,7 +3552,7 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 {
     int ret;
 
-    if (!bs->drv) {
+    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) {
         return 0;
     }
 
@@ -3538,7 +3570,7 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
     }
 
     if (bs->drv->bdrv_co_flush_to_disk) {
-        return bs->drv->bdrv_co_flush_to_disk(bs);
+        ret = bs->drv->bdrv_co_flush_to_disk(bs);
     } else if (bs->drv->bdrv_aio_flush) {
         BlockDriverAIOCB *acb;
         CoroutineIOCompletion co = {
@@ -3547,10 +3579,10 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 
         acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
         if (acb == NULL) {
-            return -EIO;
+            ret = -EIO;
         } else {
             qemu_coroutine_yield();
-            return co.ret;
+            ret = co.ret;
         }
     } else {
         /*
@@ -3564,8 +3596,16 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
          *
          * Let's hope the user knows what he's doing.
          */
-        return 0;
+        ret = 0;
+    }
+    if (ret < 0) {
+        return ret;
     }
+
+    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
+     * in the case of cache=unsafe, so there are no useless flushes.
+     */
+    return bdrv_co_flush(bs->file);
 }
 
 void bdrv_invalidate_cache(BlockDriverState *bs)
@@ -3584,6 +3624,15 @@ void bdrv_invalidate_cache_all(void)
     }
 }
 
+void bdrv_clear_incoming_migration_all(void)
+{
+    BlockDriverState *bs;
+
+    QTAILQ_FOREACH(bs, &bdrv_states, list) {
+        bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING);
+    }
+}
+
 int bdrv_flush(BlockDriverState *bs)
 {
     Coroutine *co;
@@ -4054,10 +4103,16 @@ void block_job_complete(BlockJob *job, int ret)
 
 int block_job_set_speed(BlockJob *job, int64_t value)
 {
+    int rc;
+
     if (!job->job_type->set_speed) {
         return -ENOTSUP;
     }
-    return job->job_type->set_speed(job, value);
+    rc = job->job_type->set_speed(job, value);
+    if (rc == 0) {
+        job->speed = value;
+    }
+    return rc;
 }
 
 void block_job_cancel(BlockJob *job)
@@ -4069,3 +4124,14 @@ bool block_job_is_cancelled(BlockJob *job)
 {
     return job->cancelled;
 }
+
+void block_job_cancel_sync(BlockJob *job)
+{
+    BlockDriverState *bs = job->bs;
+
+    assert(bs->job == job);
+    block_job_cancel(job);
+    while (bs->job != NULL && bs->job->busy) {
+        qemu_aio_wait();
+    }
+}
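
The restructured bdrv_co_flush() above is what makes the trivial per-format flush-to-disk wrappers deleted later in this merge (cow, qcow, qcow2, qed, raw, vdi, vpc) unnecessary: block.c itself now flushes the protocol layer after the format layer. An illustrative view of the resulting call chain for the usual format-on-protocol case (a sketch of the control flow, not literal code):

    /* bdrv_co_flush(bs)                        format BDS, e.g. qcow2       */
    /*   -> bs->drv->bdrv_co_flush_to_os()      flush internal metadata, if  */
    /*                                          the driver provides it       */
    /*   -> bs->drv->bdrv_co_flush_to_disk()    or bdrv_aio_flush(), if any  */
    /*   -> bdrv_co_flush(bs->file)             protocol BDS; still honours  */
    /*                                          BDRV_O_NO_FLUSH for          */
    /*                                          cache=unsafe                 */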

+ 15 - 6
block.h

@@ -15,8 +15,15 @@ typedef struct BlockDriverInfo {
     int cluster_size;
     /* offset at which the VM state can be saved (0 if not possible) */
     int64_t vm_state_offset;
+    bool is_dirty;
 } BlockDriverInfo;
 
+typedef struct BlockFragInfo {
+    uint64_t allocated_clusters;
+    uint64_t total_clusters;
+    uint64_t fragmented_clusters;
+} BlockFragInfo;
+
 typedef struct QEMUSnapshotInfo {
     char id_str[128]; /* unique snapshot id */
     /* the following fields are informative. They are not needed for
@@ -71,6 +78,7 @@ typedef struct BlockDevOps {
 #define BDRV_O_NO_BACKING  0x0100 /* don't open the backing file */
 #define BDRV_O_NO_FLUSH    0x0200 /* disable flushing on this disk */
 #define BDRV_O_COPY_ON_READ 0x0400 /* copy read backing sectors into image */
+#define BDRV_O_INCOMING    0x0800  /* consistency hint for incoming migration */
 
 #define BDRV_O_CACHE_MASK  (BDRV_O_NOCACHE | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH)
 
@@ -175,13 +183,12 @@ typedef struct BdrvCheckResult {
     int corruptions;
     int leaks;
     int check_errors;
+    BlockFragInfo bfi;
 } BdrvCheckResult;
 
 int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res);
 
 /* async block I/O */
-typedef struct BlockDriverAIOCB BlockDriverAIOCB;
-typedef void BlockDriverCompletionFunc(void *opaque, int ret);
 typedef void BlockDriverDirtyHandler(BlockDriverState *bs, int64_t sector,
                                      int sector_num);
 BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
@@ -222,6 +229,8 @@ BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
 void bdrv_invalidate_cache(BlockDriverState *bs);
 void bdrv_invalidate_cache_all(void);
 
+void bdrv_clear_incoming_migration_all(void);
+
 /* Ensure contents are flushed to disk.  */
 int bdrv_flush(BlockDriverState *bs);
 int coroutine_fn bdrv_co_flush(BlockDriverState *bs);
@@ -437,10 +446,10 @@ static inline unsigned int get_physical_block_exp(BlockConf *conf)
 
 #define DEFINE_BLOCK_PROPERTIES(_state, _conf)                          \
     DEFINE_PROP_DRIVE("drive", _state, _conf.bs),                       \
-    DEFINE_PROP_UINT16("logical_block_size", _state,                    \
-                       _conf.logical_block_size, 512),                  \
-    DEFINE_PROP_UINT16("physical_block_size", _state,                   \
-                       _conf.physical_block_size, 512),                 \
+    DEFINE_PROP_BLOCKSIZE("logical_block_size", _state,                 \
+                          _conf.logical_block_size, 512),               \
+    DEFINE_PROP_BLOCKSIZE("physical_block_size", _state,                \
+                          _conf.physical_block_size, 512),              \
     DEFINE_PROP_UINT16("min_io_size", _state, _conf.min_io_size, 0),  \
     DEFINE_PROP_UINT32("opt_io_size", _state, _conf.opt_io_size, 0),    \
     DEFINE_PROP_INT32("bootindex", _state, _conf.bootindex, -1),        \
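
Together with the BDRV_O_INCOMING flag and the bdrv_clear_incoming_migration_all() prototype added above, the destination side of a live migration gets a small protocol. The blockdev.c and migration.c hunks that use it are part of this merge but not reproduced in this excerpt; the sequence is roughly the following (a hedged sketch with made-up variable names, not the literal hunks):

    /* blockdev.c: the image is opened while the source may still be
     * writing to it, so pass the consistency hint and let formats such
     * as QED skip repair and metadata updates (see block/qed.c below). */
    if (incoming_migration) {
        bdrv_flags |= BDRV_O_INCOMING;
    }
    bdrv_open(bs, file, bdrv_flags, drv);

    /* migration.c: once the incoming stream has been fully loaded and
     * the source has stopped, drop the hint and reload format metadata. */
    bdrv_clear_incoming_migration_all();
    bdrv_invalidate_cache_all();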

+ 0 - 7
block/blkdebug.c

@@ -397,12 +397,6 @@ static void blkdebug_close(BlockDriverState *bs)
     }
 }
 
-static BlockDriverAIOCB *blkdebug_aio_flush(BlockDriverState *bs,
-    BlockDriverCompletionFunc *cb, void *opaque)
-{
-    return bdrv_aio_flush(bs->file, cb, opaque);
-}
-
 static void process_rule(BlockDriverState *bs, struct BlkdebugRule *rule,
     BlkdebugVars *old_vars)
 {
@@ -452,7 +446,6 @@ static BlockDriver bdrv_blkdebug = {
 
     .bdrv_aio_readv     = blkdebug_aio_readv,
     .bdrv_aio_writev    = blkdebug_aio_writev,
-    .bdrv_aio_flush     = blkdebug_aio_flush,
 
     .bdrv_debug_event   = blkdebug_debug_event,
 };

+ 0 - 6
block/cow.c

@@ -318,11 +318,6 @@ exit:
     return ret;
 }
 
-static coroutine_fn int cow_co_flush(BlockDriverState *bs)
-{
-    return bdrv_co_flush(bs->file);
-}
-
 static QEMUOptionParameter cow_create_options[] = {
     {
         .name = BLOCK_OPT_SIZE,
@@ -348,7 +343,6 @@ static BlockDriver bdrv_cow = {
 
     .bdrv_read              = cow_co_read,
     .bdrv_write             = cow_co_write,
-    .bdrv_co_flush_to_disk  = cow_co_flush,
     .bdrv_co_is_allocated   = cow_co_is_allocated,
 
     .create_options = cow_create_options,

+ 0 - 6
block/qcow.c

@@ -835,11 +835,6 @@ fail:
     return ret;
 }
 
-static coroutine_fn int qcow_co_flush(BlockDriverState *bs)
-{
-    return bdrv_co_flush(bs->file);
-}
-
 static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 {
     BDRVQcowState *s = bs->opaque;
@@ -877,7 +872,6 @@ static BlockDriver bdrv_qcow = {
 
     .bdrv_co_readv          = qcow_co_readv,
     .bdrv_co_writev         = qcow_co_writev,
-    .bdrv_co_flush_to_disk  = qcow_co_flush,
     .bdrv_co_is_allocated   = qcow_co_is_allocated,
 
     .bdrv_set_key           = qcow_set_key,

+ 8 - 10
block/qcow2-cluster.c

@@ -466,7 +466,6 @@ out:
  */
 static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
                              uint64_t **new_l2_table,
-                             uint64_t *new_l2_offset,
                              int *new_l2_index)
 {
     BDRVQcowState *s = bs->opaque;
@@ -514,7 +513,6 @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
     l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
 
     *new_l2_table = l2_table;
-    *new_l2_offset = l2_offset;
     *new_l2_index = l2_index;
 
     return 0;
@@ -539,11 +537,11 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
 {
     BDRVQcowState *s = bs->opaque;
     int l2_index, ret;
-    uint64_t l2_offset, *l2_table;
+    uint64_t *l2_table;
     int64_t cluster_offset;
     int nb_csectors;
 
-    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
+    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
     if (ret < 0) {
         return 0;
     }
@@ -588,7 +586,7 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
 {
     BDRVQcowState *s = bs->opaque;
     int i, j = 0, l2_index, ret;
-    uint64_t *old_cluster, start_sect, l2_offset, *l2_table;
+    uint64_t *old_cluster, start_sect, *l2_table;
     uint64_t cluster_offset = m->alloc_offset;
     bool cow = false;
 
@@ -633,7 +631,7 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
     }
 
     qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
-    ret = get_cluster_table(bs, m->offset, &l2_table, &l2_offset, &l2_index);
+    ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index);
     if (ret < 0) {
         goto err;
     }
@@ -817,7 +815,7 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
 {
     BDRVQcowState *s = bs->opaque;
     int l2_index, ret, sectors;
-    uint64_t l2_offset, *l2_table;
+    uint64_t *l2_table;
     unsigned int nb_clusters, keep_clusters;
     uint64_t cluster_offset;
 
@@ -825,7 +823,7 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
                                       n_start, n_end);
 
     /* Find L2 entry for the first involved cluster */
-    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
+    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
     if (ret < 0) {
         return ret;
     }
@@ -1000,12 +998,12 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset,
     unsigned int nb_clusters)
 {
     BDRVQcowState *s = bs->opaque;
-    uint64_t l2_offset, *l2_table;
+    uint64_t *l2_table;
     int l2_index;
     int ret;
     int i;
 
-    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
+    ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
     if (ret < 0) {
         return ret;
     }

+ 0 - 6
block/qcow2.c

@@ -1253,11 +1253,6 @@ static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
     return 0;
 }
 
-static coroutine_fn int qcow2_co_flush_to_disk(BlockDriverState *bs)
-{
-    return bdrv_co_flush(bs->file);
-}
-
 static int64_t qcow2_vm_state_offset(BDRVQcowState *s)
 {
 	return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits);
@@ -1377,7 +1372,6 @@ static BlockDriver bdrv_qcow2 = {
     .bdrv_co_readv          = qcow2_co_readv,
     .bdrv_co_writev         = qcow2_co_writev,
     .bdrv_co_flush_to_os    = qcow2_co_flush_to_os,
-    .bdrv_co_flush_to_disk  = qcow2_co_flush_to_disk,
 
     .bdrv_co_discard        = qcow2_co_discard,
     .bdrv_truncate          = qcow2_truncate,

+ 9 - 0
block/qed-check.c

@@ -68,6 +68,7 @@ static unsigned int qed_check_l2_table(QEDCheck *check, QEDTable *table)
 {
     BDRVQEDState *s = check->s;
     unsigned int i, num_invalid = 0;
+    uint64_t last_offset = 0;
 
     for (i = 0; i < s->table_nelems; i++) {
         uint64_t offset = table->offsets[i];
@@ -76,6 +77,11 @@ static unsigned int qed_check_l2_table(QEDCheck *check, QEDTable *table)
             qed_offset_is_zero_cluster(offset)) {
             continue;
         }
+        check->result->bfi.allocated_clusters++;
+        if (last_offset && (last_offset + s->header.cluster_size != offset)) {
+            check->result->bfi.fragmented_clusters++;
+        }
+        last_offset = offset;
 
         /* Detect invalid cluster offset */
         if (!qed_check_cluster_offset(s, offset)) {
@@ -200,6 +206,9 @@ int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix)
     check.used_clusters = g_malloc0(((check.nclusters + 31) / 32) *
                                        sizeof(check.used_clusters[0]));
 
+    check.result->bfi.total_clusters =
+        (s->header.image_size + s->header.cluster_size - 1) /
+            s->header.cluster_size;
     ret = qed_check_l1_table(&check, s->l1_table);
     if (ret == 0) {
         /* Only check for leaks if entire image was scanned successfully */
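
The new counters are easiest to read with concrete numbers (illustrative values, not taken from the commit):

    /* Assume cluster_size == 64 KB and an L2 table whose allocated data
     * clusters sit at file offsets 0x10000, 0x20000 and 0x80000:
     *
     *   allocated_clusters  = 3
     *   fragmented_clusters = 1    (0x10000 + 64 KB == 0x20000 is contiguous;
     *                               0x20000 + 64 KB != 0x80000 is not)
     *   total_clusters      = ceil(image_size / cluster_size)
     */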

+ 14 - 19
block/qed.c

@@ -450,7 +450,7 @@ static int bdrv_qed_open(BlockDriverState *bs, int flags)
      * feature is no longer valid.
      */
     if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 &&
-        !bdrv_is_read_only(bs->file)) {
+        !bdrv_is_read_only(bs->file) && !(flags & BDRV_O_INCOMING)) {
         s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK;
 
         ret = qed_write_header_sync(s);
@@ -477,7 +477,8 @@ static int bdrv_qed_open(BlockDriverState *bs, int flags)
          * potentially inconsistent images to be opened read-only.  This can
          * aid data recovery from an otherwise inconsistent image.
          */
-        if (!bdrv_is_read_only(bs->file)) {
+        if (!bdrv_is_read_only(bs->file) &&
+            !(flags & BDRV_O_INCOMING)) {
             BdrvCheckResult result = {0};
 
             ret = qed_check(s, &result, true);
@@ -497,12 +498,6 @@ static int bdrv_qed_open(BlockDriverState *bs, int flags)
     s->need_check_timer = qemu_new_timer_ns(vm_clock,
                                             qed_need_check_timer_cb, s);
 
-    error_set(&s->migration_blocker,
-              QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED,
-              "qed", bs->device_name, "live migration");
-    migrate_add_blocker(s->migration_blocker);
-
-
 out:
     if (ret) {
         qed_free_l2_cache(&s->l2_cache);
@@ -515,9 +510,6 @@ static void bdrv_qed_close(BlockDriverState *bs)
 {
     BDRVQEDState *s = bs->opaque;
 
-    migrate_del_blocker(s->migration_blocker);
-    error_free(s->migration_blocker);
-
     qed_cancel_need_check_timer(s);
     qemu_free_timer(s->need_check_timer);
 
@@ -1350,13 +1342,6 @@ static BlockDriverAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs,
                          opaque, QED_AIOCB_WRITE);
 }
 
-static BlockDriverAIOCB *bdrv_qed_aio_flush(BlockDriverState *bs,
-                                            BlockDriverCompletionFunc *cb,
-                                            void *opaque)
-{
-    return bdrv_aio_flush(bs->file, cb, opaque);
-}
-
 typedef struct {
     Coroutine *co;
     int ret;
@@ -1441,6 +1426,7 @@ static int bdrv_qed_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
 
     memset(bdi, 0, sizeof(*bdi));
     bdi->cluster_size = s->header.cluster_size;
+    bdi->is_dirty = s->header.features & QED_F_NEED_CHECK;
     return 0;
 }
 
@@ -1516,6 +1502,15 @@ static int bdrv_qed_change_backing_file(BlockDriverState *bs,
     return ret;
 }
 
+static void bdrv_qed_invalidate_cache(BlockDriverState *bs)
+{
+    BDRVQEDState *s = bs->opaque;
+
+    bdrv_qed_close(bs);
+    memset(s, 0, sizeof(BDRVQEDState));
+    bdrv_qed_open(bs, bs->open_flags);
+}
+
 static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result)
 {
     BDRVQEDState *s = bs->opaque;
@@ -1562,12 +1557,12 @@ static BlockDriver bdrv_qed = {
     .bdrv_make_empty          = bdrv_qed_make_empty,
     .bdrv_aio_readv           = bdrv_qed_aio_readv,
     .bdrv_aio_writev          = bdrv_qed_aio_writev,
-    .bdrv_aio_flush           = bdrv_qed_aio_flush,
     .bdrv_co_write_zeroes     = bdrv_qed_co_write_zeroes,
     .bdrv_truncate            = bdrv_qed_truncate,
     .bdrv_getlength           = bdrv_qed_getlength,
     .bdrv_get_info            = bdrv_qed_get_info,
     .bdrv_change_backing_file = bdrv_qed_change_backing_file,
+    .bdrv_invalidate_cache    = bdrv_qed_invalidate_cache,
     .bdrv_check               = bdrv_qed_check,
 };
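
bdrv_qed_get_info() now exports the "image needs a consistency check" state through the BlockDriverInfo.is_dirty field introduced in block.h above; the consumer is the qemu-img.c hunk, which this excerpt omits. A hedged sketch of how such a caller can read the flag (the helper name and message are made up):

    static void report_dirty(BlockDriverState *bs)
    {
        BlockDriverInfo bdi;

        if (bdrv_get_info(bs, &bdi) >= 0 && bdi.is_dirty) {
            printf("image was not cleanly shut down, "
                   "running 'qemu-img check' is recommended\n");
        }
    }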
 

+ 0 - 2
block/qed.h

@@ -169,8 +169,6 @@ typedef struct {
 
     /* Periodic flush and clear need check flag */
     QEMUTimer *need_check_timer;
-
-    Error *migration_blocker;
 } BDRVQEDState;
 
 enum {

+ 0 - 6
block/raw.c

@@ -25,11 +25,6 @@ static void raw_close(BlockDriverState *bs)
 {
 }
 
-static int coroutine_fn raw_co_flush(BlockDriverState *bs)
-{
-    return bdrv_co_flush(bs->file);
-}
-
 static int64_t raw_getlength(BlockDriverState *bs)
 {
     return bdrv_getlength(bs->file);
@@ -113,7 +108,6 @@ static BlockDriver bdrv_raw = {
 
     .bdrv_co_readv          = raw_co_readv,
     .bdrv_co_writev         = raw_co_writev,
-    .bdrv_co_flush_to_disk  = raw_co_flush,
     .bdrv_co_discard        = raw_co_discard,
 
     .bdrv_probe         = raw_probe,

+ 130 - 14
block/sheepdog.c

@@ -32,9 +32,11 @@
 #define SD_OP_RELEASE_VDI    0x13
 #define SD_OP_GET_VDI_INFO   0x14
 #define SD_OP_READ_VDIS      0x15
+#define SD_OP_FLUSH_VDI      0x16
 
 #define SD_FLAG_CMD_WRITE    0x01
 #define SD_FLAG_CMD_COW      0x02
+#define SD_FLAG_CMD_CACHE    0x04
 
 #define SD_RES_SUCCESS       0x00 /* Success */
 #define SD_RES_UNKNOWN       0x01 /* Unknown error */
@@ -293,10 +295,12 @@ typedef struct BDRVSheepdogState {
 
     char name[SD_MAX_VDI_LEN];
     int is_snapshot;
+    uint8_t cache_enabled;
 
     char *addr;
     char *port;
     int fd;
+    int flush_fd;
 
     CoMutex lock;
     Coroutine *co_send;
@@ -506,6 +510,7 @@ static int send_req(int sockfd, SheepdogReq *hdr, void *data,
     ret = qemu_send_full(sockfd, hdr, sizeof(*hdr), 0);
     if (ret < sizeof(*hdr)) {
         error_report("failed to send a req, %s", strerror(errno));
+        return ret;
     }
 
     ret = qemu_send_full(sockfd, data, *wlen, 0);
@@ -516,6 +521,24 @@ static int send_req(int sockfd, SheepdogReq *hdr, void *data,
     return ret;
 }
 
+static int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
+                       unsigned int *wlen)
+{
+    int ret;
+
+    ret = qemu_co_send(sockfd, hdr, sizeof(*hdr));
+    if (ret < sizeof(*hdr)) {
+        error_report("failed to send a req, %s", strerror(errno));
+        return ret;
+    }
+
+    ret = qemu_co_send(sockfd, data, *wlen);
+    if (ret < *wlen) {
+        error_report("failed to send a req, %s", strerror(errno));
+    }
+
+    return ret;
+}
 static int do_req(int sockfd, SheepdogReq *hdr, void *data,
                   unsigned int *wlen, unsigned int *rlen)
 {
@@ -550,6 +573,40 @@ out:
     return ret;
 }
 
+static int do_co_req(int sockfd, SheepdogReq *hdr, void *data,
+                     unsigned int *wlen, unsigned int *rlen)
+{
+    int ret;
+
+    socket_set_block(sockfd);
+    ret = send_co_req(sockfd, hdr, data, wlen);
+    if (ret < 0) {
+        goto out;
+    }
+
+    ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr));
+    if (ret < sizeof(*hdr)) {
+        error_report("failed to get a rsp, %s", strerror(errno));
+        goto out;
+    }
+
+    if (*rlen > hdr->data_length) {
+        *rlen = hdr->data_length;
+    }
+
+    if (*rlen) {
+        ret = qemu_co_recv(sockfd, data, *rlen);
+        if (ret < *rlen) {
+            error_report("failed to get the data, %s", strerror(errno));
+            goto out;
+        }
+    }
+    ret = 0;
+out:
+    socket_set_nonblock(sockfd);
+    return ret;
+}
+
 static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
                            struct iovec *iov, int niov, int create,
                            enum AIOCBState aiocb_type);
@@ -900,6 +957,10 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
         hdr.flags = SD_FLAG_CMD_WRITE | flags;
     }
 
+    if (s->cache_enabled) {
+        hdr.flags |= SD_FLAG_CMD_CACHE;
+    }
+
     hdr.oid = oid;
     hdr.cow_oid = old_oid;
     hdr.copies = s->inode.nr_copies;
@@ -942,7 +1003,7 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
 
 static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
                              unsigned int datalen, uint64_t offset,
-                             int write, int create)
+                             int write, int create, uint8_t cache)
 {
     SheepdogObjReq hdr;
     SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
@@ -965,6 +1026,11 @@ static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
         rlen = datalen;
         hdr.opcode = SD_OP_READ_OBJ;
     }
+
+    if (cache) {
+        hdr.flags |= SD_FLAG_CMD_CACHE;
+    }
+
     hdr.oid = oid;
     hdr.data_length = datalen;
     hdr.offset = offset;
@@ -986,15 +1052,18 @@ static int read_write_object(int fd, char *buf, uint64_t oid, int copies,
 }
 
 static int read_object(int fd, char *buf, uint64_t oid, int copies,
-                       unsigned int datalen, uint64_t offset)
+                       unsigned int datalen, uint64_t offset, uint8_t cache)
 {
-    return read_write_object(fd, buf, oid, copies, datalen, offset, 0, 0);
+    return read_write_object(fd, buf, oid, copies, datalen, offset, 0, 0,
+                             cache);
 }
 
 static int write_object(int fd, char *buf, uint64_t oid, int copies,
-                        unsigned int datalen, uint64_t offset, int create)
+                        unsigned int datalen, uint64_t offset, int create,
+                        uint8_t cache)
 {
-    return read_write_object(fd, buf, oid, copies, datalen, offset, 1, create);
+    return read_write_object(fd, buf, oid, copies, datalen, offset, 1, create,
+                             cache);
 }
 
 static int sd_open(BlockDriverState *bs, const char *filename, int flags)
@@ -1026,6 +1095,15 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
         goto out;
     }
 
+    if (flags & BDRV_O_CACHE_WB) {
+        s->cache_enabled = 1;
+        s->flush_fd = connect_to_sdog(s->addr, s->port);
+        if (s->flush_fd < 0) {
+            error_report("failed to connect");
+            goto out;
+        }
+    }
+
     if (snapid) {
         dprintf("%" PRIx32 " snapshot inode was open.\n", vid);
         s->is_snapshot = 1;
@@ -1038,7 +1116,8 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
     }
 
     buf = g_malloc(SD_INODE_SIZE);
-    ret = read_object(fd, buf, vid_to_vdi_oid(vid), 0, SD_INODE_SIZE, 0);
+    ret = read_object(fd, buf, vid_to_vdi_oid(vid), 0, SD_INODE_SIZE, 0,
+                      s->cache_enabled);
 
     closesocket(fd);
 
@@ -1272,6 +1351,9 @@ static void sd_close(BlockDriverState *bs)
 
     qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL, NULL);
     closesocket(s->fd);
+    if (s->cache_enabled) {
+        closesocket(s->flush_fd);
+    }
     g_free(s->addr);
 }
 
@@ -1305,7 +1387,7 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset)
     datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
     s->inode.vdi_size = offset;
     ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
-                       s->inode.nr_copies, datalen, 0, 0);
+                       s->inode.nr_copies, datalen, 0, 0, s->cache_enabled);
     close(fd);
 
     if (ret < 0) {
@@ -1387,7 +1469,7 @@ static int sd_create_branch(BDRVSheepdogState *s)
     }
 
     ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
-                      SD_INODE_SIZE, 0);
+                      SD_INODE_SIZE, 0, s->cache_enabled);
 
     closesocket(fd);
 
@@ -1575,6 +1657,36 @@ static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
     return acb->ret;
 }
 
+static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs)
+{
+    BDRVSheepdogState *s = bs->opaque;
+    SheepdogObjReq hdr = { 0 };
+    SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr;
+    SheepdogInode *inode = &s->inode;
+    int ret;
+    unsigned int wlen = 0, rlen = 0;
+
+    if (!s->cache_enabled) {
+        return 0;
+    }
+
+    hdr.opcode = SD_OP_FLUSH_VDI;
+    hdr.oid = vid_to_vdi_oid(inode->vdi_id);
+
+    ret = do_co_req(s->flush_fd, (SheepdogReq *)&hdr, NULL, &wlen, &rlen);
+    if (ret) {
+        error_report("failed to send a request to the sheep");
+        return ret;
+    }
+
+    if (rsp->result != SD_RES_SUCCESS) {
+        error_report("%s", sd_strerror(rsp->result));
+        return -EIO;
+    }
+
+    return 0;
+}
+
 static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
 {
     BDRVSheepdogState *s = bs->opaque;
@@ -1610,7 +1722,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
     }
 
     ret = write_object(fd, (char *)&s->inode, vid_to_vdi_oid(s->inode.vdi_id),
-                       s->inode.nr_copies, datalen, 0, 0);
+                       s->inode.nr_copies, datalen, 0, 0, s->cache_enabled);
     if (ret < 0) {
         error_report("failed to write snapshot's inode.");
         ret = -EIO;
@@ -1629,7 +1741,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
     inode = (SheepdogInode *)g_malloc(datalen);
 
     ret = read_object(fd, (char *)inode, vid_to_vdi_oid(new_vid),
-                      s->inode.nr_copies, datalen, 0);
+                      s->inode.nr_copies, datalen, 0, s->cache_enabled);
 
     if (ret < 0) {
         error_report("failed to read new inode info. %s", strerror(errno));
@@ -1684,7 +1796,7 @@ static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
 
     buf = g_malloc(SD_INODE_SIZE);
     ret = read_object(fd, buf, vid_to_vdi_oid(vid), s->inode.nr_copies,
-                      SD_INODE_SIZE, 0);
+                      SD_INODE_SIZE, 0, s->cache_enabled);
 
     closesocket(fd);
 
@@ -1779,7 +1891,8 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
 
         /* we don't need to read entire object */
         ret = read_object(fd, (char *)&inode, vid_to_vdi_oid(vid),
-                          0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0);
+                          0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0,
+                          s->cache_enabled);
 
         if (ret) {
             continue;
@@ -1835,10 +1948,12 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
         create = (offset == 0);
         if (load) {
             ret = read_object(fd, (char *)data, vmstate_oid,
-                              s->inode.nr_copies, data_len, offset);
+                              s->inode.nr_copies, data_len, offset,
+                              s->cache_enabled);
         } else {
             ret = write_object(fd, (char *)data, vmstate_oid,
-                               s->inode.nr_copies, data_len, offset, create);
+                               s->inode.nr_copies, data_len, offset, create,
+                               s->cache_enabled);
         }
 
         if (ret < 0) {
@@ -1904,6 +2019,7 @@ BlockDriver bdrv_sheepdog = {
 
     .bdrv_co_readv  = sd_co_readv,
     .bdrv_co_writev = sd_co_writev,
+    .bdrv_co_flush_to_disk  = sd_co_flush_to_disk,
 
     .bdrv_snapshot_create   = sd_snapshot_create,
     .bdrv_snapshot_goto     = sd_snapshot_goto,

+ 38 - 3
block/stream.c

@@ -76,6 +76,39 @@ static int coroutine_fn stream_populate(BlockDriverState *bs,
     return bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, &qiov);
 }
 
+static void close_unused_images(BlockDriverState *top, BlockDriverState *base,
+                                const char *base_id)
+{
+    BlockDriverState *intermediate;
+    intermediate = top->backing_hd;
+
+    while (intermediate) {
+        BlockDriverState *unused;
+
+        /* reached base */
+        if (intermediate == base) {
+            break;
+        }
+
+        unused = intermediate;
+        intermediate = intermediate->backing_hd;
+        unused->backing_hd = NULL;
+        bdrv_delete(unused);
+    }
+    top->backing_hd = base;
+
+    pstrcpy(top->backing_file, sizeof(top->backing_file), "");
+    pstrcpy(top->backing_format, sizeof(top->backing_format), "");
+    if (base_id) {
+        pstrcpy(top->backing_file, sizeof(top->backing_file), base_id);
+        if (base->drv) {
+            pstrcpy(top->backing_format, sizeof(top->backing_format),
+                    base->drv->format_name);
+        }
+    }
+
+}
+
 /*
  * Given an image chain: [BASE] -> [INTER1] -> [INTER2] -> [TOP]
  *
@@ -175,7 +208,7 @@ retry:
             break;
         }
 
-
+        s->common.busy = true;
         if (base) {
             ret = is_allocated_base(bs, base, sector_num,
                                     STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n);
@@ -189,6 +222,7 @@ retry:
             if (s->common.speed) {
                 uint64_t delay_ns = ratelimit_calculate_delay(&s->limit, n);
                 if (delay_ns > 0) {
+                    s->common.busy = false;
                     co_sleep_ns(rt_clock, delay_ns);
 
                     /* Recheck cancellation and that sectors are unallocated */
@@ -208,6 +242,7 @@ retry:
         /* Note that even when no rate limit is applied we need to yield
          * with no pending I/O here so that qemu_aio_flush() returns.
          */
+        s->common.busy = false;
         co_sleep_ns(rt_clock, 0);
     }
 
@@ -215,12 +250,13 @@ retry:
         bdrv_disable_copy_on_read(bs);
     }
 
-    if (sector_num == end && ret == 0) {
+    if (!block_job_is_cancelled(&s->common) && sector_num == end && ret == 0) {
         const char *base_id = NULL;
         if (base) {
             base_id = s->backing_file_id;
         }
         ret = bdrv_change_backing_file(bs, base_id, NULL);
+        close_unused_images(bs, base, base_id);
     }
 
     qemu_vfree(buf);
@@ -234,7 +270,6 @@ static int stream_set_speed(BlockJob *job, int64_t value)
     if (value < 0) {
         return -EINVAL;
     }
-    job->speed = value;
     ratelimit_set_speed(&s->limit, value / BDRV_SECTOR_SIZE);
     return 0;
 }
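
The busy toggles added above implement the contract that block_job_cancel_sync() (added in block.c earlier in this merge) relies on: busy is true whenever the job has I/O in flight that qemu_aio_wait() can complete, and false only around timer sleeps that qemu_aio_wait() cannot drive. A hedged sketch of the rule from a hypothetical job coroutine's point of view (not the literal stream loop):

    static void coroutine_fn example_job_run(BlockJob *job)
    {
        while (!block_job_is_cancelled(job)) {
            job->busy = true;
            /* ... issue I/O and wait for it to complete; the
             * qemu_aio_wait() loop in block_job_cancel_sync() can
             * make progress while we are in this state ... */

            job->busy = false;            /* quiescent: a timer sleep is   */
            co_sleep_ns(rt_clock, 0);     /* invisible to qemu_aio_wait(), */
                                          /* so busy must not stay set     */
        }
        block_job_complete(job, 0);
    }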

+ 108 - 321
block/vdi.c

@@ -143,29 +143,6 @@ void uuid_unparse(const uuid_t uu, char *out)
 }
 #endif
 
-typedef struct {
-    BlockDriverAIOCB common;
-    int64_t sector_num;
-    QEMUIOVector *qiov;
-    uint8_t *buf;
-    /* Total number of sectors. */
-    int nb_sectors;
-    /* Number of sectors for current AIO. */
-    int n_sectors;
-    /* New allocated block map entry. */
-    uint32_t bmap_first;
-    uint32_t bmap_last;
-    /* Buffer for new allocated block. */
-    void *block_buffer;
-    void *orig_buf;
-    bool is_write;
-    int header_modified;
-    BlockDriverAIOCB *hd_aiocb;
-    struct iovec hd_iov;
-    QEMUIOVector hd_qiov;
-    QEMUBH *bh;
-} VdiAIOCB;
-
 typedef struct {
     char text[0x40];
     uint32_t signature;
@@ -489,332 +466,150 @@ static int coroutine_fn vdi_co_is_allocated(BlockDriverState *bs,
     return VDI_IS_ALLOCATED(bmap_entry);
 }
 
-static void vdi_aio_cancel(BlockDriverAIOCB *blockacb)
-{
-    /* TODO: This code is untested. How can I get it executed? */
-    VdiAIOCB *acb = container_of(blockacb, VdiAIOCB, common);
-    logout("\n");
-    if (acb->hd_aiocb) {
-        bdrv_aio_cancel(acb->hd_aiocb);
-    }
-    qemu_aio_release(acb);
-}
-
-static AIOPool vdi_aio_pool = {
-    .aiocb_size = sizeof(VdiAIOCB),
-    .cancel = vdi_aio_cancel,
-};
-
-static VdiAIOCB *vdi_aio_setup(BlockDriverState *bs, int64_t sector_num,
-        QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque, int is_write)
-{
-    VdiAIOCB *acb;
-
-    logout("%p, %" PRId64 ", %p, %d, %p, %p, %d\n",
-           bs, sector_num, qiov, nb_sectors, cb, opaque, is_write);
-
-    acb = qemu_aio_get(&vdi_aio_pool, bs, cb, opaque);
-    acb->hd_aiocb = NULL;
-    acb->sector_num = sector_num;
-    acb->qiov = qiov;
-    acb->is_write = is_write;
-
-    if (qiov->niov > 1) {
-        acb->buf = qemu_blockalign(bs, qiov->size);
-        acb->orig_buf = acb->buf;
-        if (is_write) {
-            qemu_iovec_to_buffer(qiov, acb->buf);
-        }
-    } else {
-        acb->buf = (uint8_t *)qiov->iov->iov_base;
-    }
-    acb->nb_sectors = nb_sectors;
-    acb->n_sectors = 0;
-    acb->bmap_first = VDI_UNALLOCATED;
-    acb->bmap_last = VDI_UNALLOCATED;
-    acb->block_buffer = NULL;
-    acb->header_modified = 0;
-    return acb;
-}
-
-static int vdi_schedule_bh(QEMUBHFunc *cb, VdiAIOCB *acb)
-{
-    logout("\n");
-
-    if (acb->bh) {
-        return -EIO;
-    }
-
-    acb->bh = qemu_bh_new(cb, acb);
-    if (!acb->bh) {
-        return -EIO;
-    }
-
-    qemu_bh_schedule(acb->bh);
-
-    return 0;
-}
-
-static void vdi_aio_read_cb(void *opaque, int ret);
-static void vdi_aio_write_cb(void *opaque, int ret);
-
-static void vdi_aio_rw_bh(void *opaque)
+static int vdi_co_read(BlockDriverState *bs,
+        int64_t sector_num, uint8_t *buf, int nb_sectors)
 {
-    VdiAIOCB *acb = opaque;
-    logout("\n");
-    qemu_bh_delete(acb->bh);
-    acb->bh = NULL;
-
-    if (acb->is_write) {
-        vdi_aio_write_cb(opaque, 0);
-    } else {
-        vdi_aio_read_cb(opaque, 0);
-    }
-}
-
-static void vdi_aio_read_cb(void *opaque, int ret)
-{
-    VdiAIOCB *acb = opaque;
-    BlockDriverState *bs = acb->common.bs;
     BDRVVdiState *s = bs->opaque;
     uint32_t bmap_entry;
     uint32_t block_index;
     uint32_t sector_in_block;
     uint32_t n_sectors;
+    int ret = 0;
 
-    logout("%u sectors read\n", acb->n_sectors);
-
-    acb->hd_aiocb = NULL;
-
-    if (ret < 0) {
-        goto done;
-    }
-
-    acb->nb_sectors -= acb->n_sectors;
-
-    if (acb->nb_sectors == 0) {
-        /* request completed */
-        ret = 0;
-        goto done;
-    }
-
-    acb->sector_num += acb->n_sectors;
-    acb->buf += acb->n_sectors * SECTOR_SIZE;
-
-    block_index = acb->sector_num / s->block_sectors;
-    sector_in_block = acb->sector_num % s->block_sectors;
-    n_sectors = s->block_sectors - sector_in_block;
-    if (n_sectors > acb->nb_sectors) {
-        n_sectors = acb->nb_sectors;
-    }
-
-    logout("will read %u sectors starting at sector %" PRIu64 "\n",
-           n_sectors, acb->sector_num);
+    logout("\n");
 
-    /* prepare next AIO request */
-    acb->n_sectors = n_sectors;
-    bmap_entry = le32_to_cpu(s->bmap[block_index]);
-    if (!VDI_IS_ALLOCATED(bmap_entry)) {
-        /* Block not allocated, return zeros, no need to wait. */
-        memset(acb->buf, 0, n_sectors * SECTOR_SIZE);
-        ret = vdi_schedule_bh(vdi_aio_rw_bh, acb);
-        if (ret < 0) {
-            goto done;
+    while (ret >= 0 && nb_sectors > 0) {
+        block_index = sector_num / s->block_sectors;
+        sector_in_block = sector_num % s->block_sectors;
+        n_sectors = s->block_sectors - sector_in_block;
+        if (n_sectors > nb_sectors) {
+            n_sectors = nb_sectors;
         }
-    } else {
-        uint64_t offset = s->header.offset_data / SECTOR_SIZE +
-                          (uint64_t)bmap_entry * s->block_sectors +
-                          sector_in_block;
-        acb->hd_iov.iov_base = (void *)acb->buf;
-        acb->hd_iov.iov_len = n_sectors * SECTOR_SIZE;
-        qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
-        acb->hd_aiocb = bdrv_aio_readv(bs->file, offset, &acb->hd_qiov,
-                                       n_sectors, vdi_aio_read_cb, acb);
-    }
-    return;
-done:
-    if (acb->qiov->niov > 1) {
-        qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size);
-        qemu_vfree(acb->orig_buf);
-    }
-    acb->common.cb(acb->common.opaque, ret);
-    qemu_aio_release(acb);
-}
 
-static BlockDriverAIOCB *vdi_aio_readv(BlockDriverState *bs,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
-{
-    VdiAIOCB *acb;
-    int ret;
+        logout("will read %u sectors starting at sector %" PRIu64 "\n",
+               n_sectors, sector_num);
 
-    logout("\n");
-    acb = vdi_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
-    ret = vdi_schedule_bh(vdi_aio_rw_bh, acb);
-    if (ret < 0) {
-        if (acb->qiov->niov > 1) {
-            qemu_vfree(acb->orig_buf);
+        /* prepare next AIO request */
+        bmap_entry = le32_to_cpu(s->bmap[block_index]);
+        if (!VDI_IS_ALLOCATED(bmap_entry)) {
+            /* Block not allocated, return zeros, no need to wait. */
+            memset(buf, 0, n_sectors * SECTOR_SIZE);
+            ret = 0;
+        } else {
+            uint64_t offset = s->header.offset_data / SECTOR_SIZE +
+                              (uint64_t)bmap_entry * s->block_sectors +
+                              sector_in_block;
+            ret = bdrv_read(bs->file, offset, buf, n_sectors);
         }
-        qemu_aio_release(acb);
-        return NULL;
+        logout("%u sectors read\n", n_sectors);
+
+        nb_sectors -= n_sectors;
+        sector_num += n_sectors;
+        buf += n_sectors * SECTOR_SIZE;
     }
 
-    return &acb->common;
+    return ret;
 }
 
-static void vdi_aio_write_cb(void *opaque, int ret)
+static int vdi_co_write(BlockDriverState *bs,
+        int64_t sector_num, const uint8_t *buf, int nb_sectors)
 {
-    VdiAIOCB *acb = opaque;
-    BlockDriverState *bs = acb->common.bs;
     BDRVVdiState *s = bs->opaque;
     uint32_t bmap_entry;
     uint32_t block_index;
     uint32_t sector_in_block;
     uint32_t n_sectors;
+    uint32_t bmap_first = VDI_UNALLOCATED;
+    uint32_t bmap_last = VDI_UNALLOCATED;
+    uint8_t *block = NULL;
+    int ret = 0;
 
-    acb->hd_aiocb = NULL;
+    logout("\n");
 
-    if (ret < 0) {
-        goto done;
-    }
+    while (ret >= 0 && nb_sectors > 0) {
+        block_index = sector_num / s->block_sectors;
+        sector_in_block = sector_num % s->block_sectors;
+        n_sectors = s->block_sectors - sector_in_block;
+        if (n_sectors > nb_sectors) {
+            n_sectors = nb_sectors;
+        }
+
+        logout("will write %u sectors starting at sector %" PRIu64 "\n",
+               n_sectors, sector_num);
 
-    acb->nb_sectors -= acb->n_sectors;
-    acb->sector_num += acb->n_sectors;
-    acb->buf += acb->n_sectors * SECTOR_SIZE;
-
-    if (acb->nb_sectors == 0) {
-        logout("finished data write\n");
-        acb->n_sectors = 0;
-        if (acb->header_modified) {
-            VdiHeader *header = acb->block_buffer;
-            logout("now writing modified header\n");
-            assert(VDI_IS_ALLOCATED(acb->bmap_first));
-            *header = s->header;
-            vdi_header_to_le(header);
-            acb->header_modified = 0;
-            acb->hd_iov.iov_base = acb->block_buffer;
-            acb->hd_iov.iov_len = SECTOR_SIZE;
-            qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
-            acb->hd_aiocb = bdrv_aio_writev(bs->file, 0, &acb->hd_qiov, 1,
-                                            vdi_aio_write_cb, acb);
-            return;
-        } else if (VDI_IS_ALLOCATED(acb->bmap_first)) {
-            /* One or more new blocks were allocated. */
+        /* prepare next AIO request */
+        bmap_entry = le32_to_cpu(s->bmap[block_index]);
+        if (!VDI_IS_ALLOCATED(bmap_entry)) {
+            /* Allocate new block and write to it. */
             uint64_t offset;
-            uint32_t bmap_first;
-            uint32_t bmap_last;
-            g_free(acb->block_buffer);
-            acb->block_buffer = NULL;
-            bmap_first = acb->bmap_first;
-            bmap_last = acb->bmap_last;
-            logout("now writing modified block map entry %u...%u\n",
-                   bmap_first, bmap_last);
-            /* Write modified sectors from block map. */
-            bmap_first /= (SECTOR_SIZE / sizeof(uint32_t));
-            bmap_last /= (SECTOR_SIZE / sizeof(uint32_t));
-            n_sectors = bmap_last - bmap_first + 1;
-            offset = s->bmap_sector + bmap_first;
-            acb->bmap_first = VDI_UNALLOCATED;
-            acb->hd_iov.iov_base = (void *)((uint8_t *)&s->bmap[0] +
-                                            bmap_first * SECTOR_SIZE);
-            acb->hd_iov.iov_len = n_sectors * SECTOR_SIZE;
-            qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
-            logout("will write %u block map sectors starting from entry %u\n",
-                   n_sectors, bmap_first);
-            acb->hd_aiocb = bdrv_aio_writev(bs->file, offset, &acb->hd_qiov,
-                                            n_sectors, vdi_aio_write_cb, acb);
-            return;
+            bmap_entry = s->header.blocks_allocated;
+            s->bmap[block_index] = cpu_to_le32(bmap_entry);
+            s->header.blocks_allocated++;
+            offset = s->header.offset_data / SECTOR_SIZE +
+                     (uint64_t)bmap_entry * s->block_sectors;
+            if (block == NULL) {
+                block = g_malloc(s->block_size);
+                bmap_first = block_index;
+            }
+            bmap_last = block_index;
+            /* Copy data to be written to new block and zero unused parts. */
+            memset(block, 0, sector_in_block * SECTOR_SIZE);
+            memcpy(block + sector_in_block * SECTOR_SIZE,
+                   buf, n_sectors * SECTOR_SIZE);
+            memset(block + (sector_in_block + n_sectors) * SECTOR_SIZE, 0,
+                   (s->block_sectors - n_sectors - sector_in_block) * SECTOR_SIZE);
+            ret = bdrv_write(bs->file, offset, block, s->block_sectors);
+        } else {
+            uint64_t offset = s->header.offset_data / SECTOR_SIZE +
+                              (uint64_t)bmap_entry * s->block_sectors +
+                              sector_in_block;
+            ret = bdrv_write(bs->file, offset, buf, n_sectors);
         }
-        ret = 0;
-        goto done;
-    }
 
-    logout("%u sectors written\n", acb->n_sectors);
+        nb_sectors -= n_sectors;
+        sector_num += n_sectors;
+        buf += n_sectors * SECTOR_SIZE;
 
-    block_index = acb->sector_num / s->block_sectors;
-    sector_in_block = acb->sector_num % s->block_sectors;
-    n_sectors = s->block_sectors - sector_in_block;
-    if (n_sectors > acb->nb_sectors) {
-        n_sectors = acb->nb_sectors;
+        logout("%u sectors written\n", n_sectors);
     }
 
-    logout("will write %u sectors starting at sector %" PRIu64 "\n",
-           n_sectors, acb->sector_num);
-
-    /* prepare next AIO request */
-    acb->n_sectors = n_sectors;
-    bmap_entry = le32_to_cpu(s->bmap[block_index]);
-    if (!VDI_IS_ALLOCATED(bmap_entry)) {
-        /* Allocate new block and write to it. */
-        uint64_t offset;
-        uint8_t *block;
-        bmap_entry = s->header.blocks_allocated;
-        s->bmap[block_index] = cpu_to_le32(bmap_entry);
-        s->header.blocks_allocated++;
-        offset = s->header.offset_data / SECTOR_SIZE +
-                 (uint64_t)bmap_entry * s->block_sectors;
-        block = acb->block_buffer;
-        if (block == NULL) {
-            block = g_malloc(s->block_size);
-            acb->block_buffer = block;
-            acb->bmap_first = block_index;
-            assert(!acb->header_modified);
-            acb->header_modified = 1;
-        }
-        acb->bmap_last = block_index;
-        /* Copy data to be written to new block and zero unused parts. */
-        memset(block, 0, sector_in_block * SECTOR_SIZE);
-        memcpy(block + sector_in_block * SECTOR_SIZE,
-               acb->buf, n_sectors * SECTOR_SIZE);
-        memset(block + (sector_in_block + n_sectors) * SECTOR_SIZE, 0,
-               (s->block_sectors - n_sectors - sector_in_block) * SECTOR_SIZE);
-        acb->hd_iov.iov_base = (void *)block;
-        acb->hd_iov.iov_len = s->block_size;
-        qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
-        acb->hd_aiocb = bdrv_aio_writev(bs->file, offset,
-                                        &acb->hd_qiov, s->block_sectors,
-                                        vdi_aio_write_cb, acb);
-    } else {
-        uint64_t offset = s->header.offset_data / SECTOR_SIZE +
-                          (uint64_t)bmap_entry * s->block_sectors +
-                          sector_in_block;
-        acb->hd_iov.iov_base = (void *)acb->buf;
-        acb->hd_iov.iov_len = n_sectors * SECTOR_SIZE;
-        qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
-        acb->hd_aiocb = bdrv_aio_writev(bs->file, offset, &acb->hd_qiov,
-                                        n_sectors, vdi_aio_write_cb, acb);
+    logout("finished data write\n");
+    if (ret < 0) {
+        return ret;
     }
 
-    return;
-
-done:
-    if (acb->qiov->niov > 1) {
-        qemu_vfree(acb->orig_buf);
-    }
-    acb->common.cb(acb->common.opaque, ret);
-    qemu_aio_release(acb);
-}
+    if (block) {
+        /* One or more new blocks were allocated. */
+        VdiHeader *header = (VdiHeader *) block;
+        uint8_t *base;
+        uint64_t offset;
 
-static BlockDriverAIOCB *vdi_aio_writev(BlockDriverState *bs,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
-{
-    VdiAIOCB *acb;
-    int ret;
+        logout("now writing modified header\n");
+        assert(VDI_IS_ALLOCATED(bmap_first));
+        *header = s->header;
+        vdi_header_to_le(header);
+        ret = bdrv_write(bs->file, 0, block, 1);
+        g_free(block);
+        block = NULL;
 
-    logout("\n");
-    acb = vdi_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
-    ret = vdi_schedule_bh(vdi_aio_rw_bh, acb);
-    if (ret < 0) {
-        if (acb->qiov->niov > 1) {
-            qemu_vfree(acb->orig_buf);
+        if (ret < 0) {
+            return ret;
         }
-        qemu_aio_release(acb);
-        return NULL;
+
+        logout("now writing modified block map entry %u...%u\n",
+               bmap_first, bmap_last);
+        /* Write modified sectors from block map. */
+        bmap_first /= (SECTOR_SIZE / sizeof(uint32_t));
+        bmap_last /= (SECTOR_SIZE / sizeof(uint32_t));
+        n_sectors = bmap_last - bmap_first + 1;
+        offset = s->bmap_sector + bmap_first;
+        base = ((uint8_t *)&s->bmap[0]) + bmap_first * SECTOR_SIZE;
+        logout("will write %u block map sectors starting from entry %u\n",
+               n_sectors, bmap_first);
+        ret = bdrv_write(bs->file, offset, base, n_sectors);
     }
 
-    return &acb->common;
+    return ret;
 }
 
 static int vdi_create(const char *filename, QEMUOptionParameter *options)
@@ -930,13 +725,6 @@ static void vdi_close(BlockDriverState *bs)
     error_free(s->migration_blocker);
 }
 
-static coroutine_fn int vdi_co_flush(BlockDriverState *bs)
-{
-    logout("\n");
-    return bdrv_co_flush(bs->file);
-}
-
-
 static QEMUOptionParameter vdi_create_options[] = {
     {
         .name = BLOCK_OPT_SIZE,
@@ -969,13 +757,12 @@ static BlockDriver bdrv_vdi = {
     .bdrv_open = vdi_open,
     .bdrv_close = vdi_close,
     .bdrv_create = vdi_create,
-    .bdrv_co_flush_to_disk = vdi_co_flush,
     .bdrv_co_is_allocated = vdi_co_is_allocated,
     .bdrv_make_empty = vdi_make_empty,
 
-    .bdrv_aio_readv = vdi_aio_readv,
+    .bdrv_read = vdi_co_read,
 #if defined(CONFIG_VDI_WRITE)
-    .bdrv_aio_writev = vdi_aio_writev,
+    .bdrv_write = vdi_co_write,
 #endif
 
     .bdrv_get_info = vdi_get_info,

+ 2 - 2
block/vmdk.c

@@ -1525,10 +1525,10 @@ static void vmdk_close(BlockDriverState *bs)
 
 static coroutine_fn int vmdk_co_flush(BlockDriverState *bs)
 {
-    int i, ret, err;
     BDRVVmdkState *s = bs->opaque;
+    int i, err;
+    int ret = 0;
 
-    ret = bdrv_co_flush(bs->file);
     for (i = 0; i < s->num_extents; i++) {
         err = bdrv_co_flush(s->extents[i].file);
         if (err < 0) {

+ 3 - 6
block/vpc.c

@@ -189,6 +189,9 @@ static int vpc_open(BlockDriverState *bs, int flags)
         fprintf(stderr, "block-vpc: The header checksum of '%s' is "
             "incorrect.\n", bs->filename);
 
+    /* Write 'checksum' back to footer, or else will leave it with zero. */
+    footer->checksum = be32_to_cpu(checksum);
+
     // The visible size of a image in Virtual PC depends on the geometry
     // rather than on the size stored in the footer (the size in the footer
     // is too large usually)
@@ -507,11 +510,6 @@ static coroutine_fn int vpc_co_write(BlockDriverState *bs, int64_t sector_num,
     return ret;
 }
 
-static coroutine_fn int vpc_co_flush(BlockDriverState *bs)
-{
-    return bdrv_co_flush(bs->file);
-}
-
 /*
  * Calculates the number of cylinders, heads and sectors per cylinder
  * based on a given number of sectors. This is the algorithm described
@@ -789,7 +787,6 @@ static BlockDriver bdrv_vpc = {
 
     .bdrv_read              = vpc_co_read,
     .bdrv_write             = vpc_co_write,
-    .bdrv_co_flush_to_disk  = vpc_co_flush,
 
     .create_options = vpc_create_options,
 };

+ 114 - 21
block_int.h

@@ -53,12 +53,6 @@
 
 typedef struct BdrvTrackedRequest BdrvTrackedRequest;
 
-typedef struct AIOPool {
-    void (*cancel)(BlockDriverAIOCB *acb);
-    int aiocb_size;
-    BlockDriverAIOCB *free_aiocb;
-} AIOPool;
-
 typedef struct BlockIOLimit {
     int64_t bps[3];
     int64_t iops[3];
@@ -69,8 +63,13 @@ typedef struct BlockIOBaseValue {
     uint64_t ios[2];
 } BlockIOBaseValue;
 
-typedef void BlockJobCancelFunc(void *opaque);
 typedef struct BlockJob BlockJob;
+
+/**
+ * BlockJobType:
+ *
+ * A class type for block job objects.
+ */
 typedef struct BlockJobType {
     /** Derived BlockJob struct size */
     size_t instance_size;
@@ -83,19 +82,48 @@ typedef struct BlockJobType {
 } BlockJobType;
 
 /**
- * Long-running operation on a BlockDriverState
+ * BlockJob:
+ *
+ * Long-running operation on a BlockDriverState.
  */
 struct BlockJob {
+    /** The job type, including the job vtable.  */
     const BlockJobType *job_type;
+
+    /** The block device on which the job is operating.  */
     BlockDriverState *bs;
+
+    /**
+     * Set to true if the job should cancel itself.  The flag must
+     * always be tested just before toggling the busy flag from false
+     * to true.  After a job has detected that the cancelled flag is
+     * true, it should not anymore issue any I/O operation to the
+     * block device.
+     */
     bool cancelled;
 
-    /* These fields are published by the query-block-jobs QMP API */
+    /**
+     * Set to false by the job while it is in a quiescent state, where
+     * no I/O is pending and cancellation can be processed without
+     * issuing new I/O.  The busy flag must be set to false when the
+     * job goes to sleep on any condition that is not detected by
+     * #qemu_aio_wait, such as a timer.
+     */
+    bool busy;
+
+    /** Offset that is published by the query-block-jobs QMP API */
     int64_t offset;
+
+    /** Length that is published by the query-block-jobs QMP API */
     int64_t len;
+
+    /** Speed that was set with @block_job_set_speed.  */
     int64_t speed;
 
+    /** The completion function that will be called when the job completes.  */
     BlockDriverCompletionFunc *cb;
+
+    /** The opaque value that is passed to the completion function.  */
     void *opaque;
 };
 
@@ -302,20 +330,8 @@ struct BlockDriverState {
     BlockJob *job;
 };
 
-struct BlockDriverAIOCB {
-    AIOPool *pool;
-    BlockDriverState *bs;
-    BlockDriverCompletionFunc *cb;
-    void *opaque;
-    BlockDriverAIOCB *next;
-};
-
 void get_tmp_filename(char *filename, int size);
 
-void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
-                   BlockDriverCompletionFunc *cb, void *opaque);
-void qemu_aio_release(void *p);
-
 void bdrv_set_io_limits(BlockDriverState *bs,
                         BlockIOLimit *io_limits);
 
@@ -323,13 +339,90 @@ void bdrv_set_io_limits(BlockDriverState *bs,
 int is_windows_drive(const char *filename);
 #endif
 
+/**
+ * block_job_create:
+ * @job_type: The class object for the newly-created job.
+ * @bs: The block device on which the job operates.
+ * @cb: Completion function for the job.
+ * @opaque: Opaque pointer value passed to @cb.
+ *
+ * Create a new long-running block device job and return it.  The job
+ * will call @cb asynchronously when the job completes.  Note that
+ * @bs may have been closed at the time the @cb it is called.  If
+ * this is the case, the job may be reported as either cancelled or
+ * completed.
+ *
+ * This function is not part of the public job interface; it should be
+ * called from a wrapper that is specific to the job type.
+ */
 void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs,
                        BlockDriverCompletionFunc *cb, void *opaque);
+
+/**
+ * block_job_complete:
+ * @job: The job being completed.
+ * @ret: The status code.
+ *
+ * Call the completion function that was registered at creation time, and
+ * free @job.
+ */
 void block_job_complete(BlockJob *job, int ret);
+
+/**
+ * block_job_set_speed:
+ * @job: The job to set the speed for.
+ * @speed: The new value
+ *
+ * Set a rate-limiting parameter for the job; the actual meaning may
+ * vary depending on the job type.
+ */
 int block_job_set_speed(BlockJob *job, int64_t value);
+
+/**
+ * block_job_cancel:
+ * @job: The job to be canceled.
+ *
+ * Asynchronously cancel the specified job.
+ */
 void block_job_cancel(BlockJob *job);
+
+/**
+ * block_job_is_cancelled:
+ * @job: The job being queried.
+ *
+ * Returns whether the job is scheduled for cancellation.
+ */
 bool block_job_is_cancelled(BlockJob *job);
 
+/**
+ * block_job_cancel_sync:
+ * @job: The job to be canceled.
+ *
+ * Asynchronously cancel the job and wait for it to reach a quiescent
+ * state.  Note that the completion callback will still be called
+ * asynchronously, hence it is *not* valid to call #bdrv_delete
+ * immediately after #block_job_cancel_sync.  Users of block jobs
+ * will usually protect the BlockDriverState objects with a reference
+ * count, should this be a concern.
+ */
+void block_job_cancel_sync(BlockJob *job);
+
+/**
+ * stream_start:
+ * @bs: Block device to operate on.
+ * @base: Block device that will become the new base, or %NULL to
+ * flatten the whole backing file chain onto @bs.
+ * @base_id: The file name that will be written to @bs as the new
+ * backing file if the job completes.  Ignored if @base is %NULL.
+ * @cb: Completion function for the job.
+ * @opaque: Opaque pointer value passed to @cb.
+ *
+ * Start a streaming operation on @bs.  Clusters that are unallocated
+ * in @bs, but allocated in any image between @base and @bs (both
+ * exclusive) will be written to @bs.  At the end of a successful
+ * streaming job, the backing file of @bs will be changed to
+ * @base_id in the written image and to @base in the live BlockDriverState.
+ */
 int stream_start(BlockDriverState *bs, BlockDriverState *base,
                  const char *base_id, BlockDriverCompletionFunc *cb,
                  void *opaque);

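The comments above define the block job contract in the abstract. As a concrete illustration, here is a minimal sketch of a hypothetical job type that drives this interface; block/stream.c is the real user in this series, and every name prefixed with hypothetical_ (plus HypotheticalJob and its fields) is illustrative, not part of the patch:

#include "block_int.h"   /* BlockJob, BlockJobType and the block_job_* API above */

typedef struct HypotheticalJob {
    BlockJob common;                /* must come first: block_job_create() returns it */
    int64_t position;               /* illustrative job-specific state */
} HypotheticalJob;

static const BlockJobType hypothetical_job_type = {
    .instance_size = sizeof(HypotheticalJob),
    /* remaining BlockJobType members omitted in this sketch */
};

/* Type-specific wrapper, in the spirit of stream_start() above. */
static HypotheticalJob *hypothetical_job_start(BlockDriverState *bs,
                                               BlockDriverCompletionFunc *cb,
                                               void *opaque)
{
    return block_job_create(&hypothetical_job_type, bs, cb, opaque);
}

/* Main loop of the job, honoring the cancelled flag described above.
 * Assumes the wrapper has also initialised common.len. */
static void hypothetical_job_run(HypotheticalJob *s)
{
    int ret = 0;

    while (s->common.offset < s->common.len) {
        if (block_job_is_cancelled(&s->common)) {
            break;                  /* stop issuing new I/O once cancelled */
        }
        /* ... do one unit of work on s->common.bs here, setting ret ... */
        s->common.offset++;         /* progress reported by query-block-jobs */
        if (ret < 0) {
            break;
        }
    }

    block_job_complete(&s->common, ret);  /* invokes @cb and frees the job */
}

The wrapper pattern matches the note on block_job_create() above: callers never call block_job_create() directly, they go through a job-type-specific entry point such as stream_start().
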
+ 10 - 2
blockdev.c

@@ -64,6 +64,9 @@ void blockdev_mark_auto_del(BlockDriverState *bs)
 {
     DriveInfo *dinfo = drive_get_by_blockdev(bs);
 
+    if (bs->job) {
+        block_job_cancel(bs->job);
+    }
     if (dinfo) {
         dinfo->auto_del = 1;
     }
@@ -532,8 +535,9 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
     dinfo->unit = unit_id;
     dinfo->opts = opts;
     dinfo->refcount = 1;
-    if (serial)
-        strncpy(dinfo->serial, serial, sizeof(dinfo->serial) - 1);
+    if (serial) {
+        pstrcpy(dinfo->serial, sizeof(dinfo->serial), serial);
+    }
     QTAILQ_INSERT_TAIL(&drives, dinfo, next);
 
     bdrv_set_on_error(dinfo->bdrv, on_read_error, on_write_error);
@@ -591,6 +595,10 @@ DriveInfo *drive_init(QemuOpts *opts, int default_to_scsi)
         bdrv_flags |= BDRV_O_COPY_ON_READ;
     }
 
+    if (runstate_check(RUN_STATE_INMIGRATE)) {
+        bdrv_flags |= BDRV_O_INCOMING;
+    }
+
     if (media == MEDIA_CDROM) {
         /* CDROM is fine for any interface, don't check.  */
         ro = 1;

+ 12 - 9
dma-helpers.c

@@ -8,7 +8,6 @@
  */
 
 #include "dma.h"
-#include "block_int.h"
 #include "trace.h"
 
 void qemu_sglist_init(QEMUSGList *qsg, int alloc_hint)
@@ -42,7 +41,7 @@ typedef struct {
     BlockDriverAIOCB *acb;
     QEMUSGList *sg;
     uint64_t sector_num;
-    bool to_dev;
+    DMADirection dir;
     bool in_cancel;
     int sg_cur_index;
     dma_addr_t sg_cur_byte;
@@ -76,7 +75,8 @@ static void dma_bdrv_unmap(DMAAIOCB *dbs)
 
     for (i = 0; i < dbs->iov.niov; ++i) {
         cpu_physical_memory_unmap(dbs->iov.iov[i].iov_base,
-                                  dbs->iov.iov[i].iov_len, !dbs->to_dev,
+                                  dbs->iov.iov[i].iov_len,
+                                  dbs->dir != DMA_DIRECTION_TO_DEVICE,
                                   dbs->iov.iov[i].iov_len);
     }
     qemu_iovec_reset(&dbs->iov);
@@ -123,7 +123,8 @@ static void dma_bdrv_cb(void *opaque, int ret)
     while (dbs->sg_cur_index < dbs->sg->nsg) {
         cur_addr = dbs->sg->sg[dbs->sg_cur_index].base + dbs->sg_cur_byte;
         cur_len = dbs->sg->sg[dbs->sg_cur_index].len - dbs->sg_cur_byte;
-        mem = cpu_physical_memory_map(cur_addr, &cur_len, !dbs->to_dev);
+        mem = cpu_physical_memory_map(cur_addr, &cur_len,
+                                      dbs->dir != DMA_DIRECTION_TO_DEVICE);
         if (!mem)
             break;
         qemu_iovec_add(&dbs->iov, mem, cur_len);
@@ -170,11 +171,11 @@ static AIOPool dma_aio_pool = {
 BlockDriverAIOCB *dma_bdrv_io(
     BlockDriverState *bs, QEMUSGList *sg, uint64_t sector_num,
     DMAIOFunc *io_func, BlockDriverCompletionFunc *cb,
-    void *opaque, bool to_dev)
+    void *opaque, DMADirection dir)
 {
     DMAAIOCB *dbs = qemu_aio_get(&dma_aio_pool, bs, cb, opaque);
 
-    trace_dma_bdrv_io(dbs, bs, sector_num, to_dev);
+    trace_dma_bdrv_io(dbs, bs, sector_num, (dir == DMA_DIRECTION_TO_DEVICE));
 
     dbs->acb = NULL;
     dbs->bs = bs;
@@ -182,7 +183,7 @@ BlockDriverAIOCB *dma_bdrv_io(
     dbs->sector_num = sector_num;
     dbs->sg_cur_index = 0;
     dbs->sg_cur_byte = 0;
-    dbs->to_dev = to_dev;
+    dbs->dir = dir;
     dbs->io_func = io_func;
     dbs->bh = NULL;
     qemu_iovec_init(&dbs->iov, sg->nsg);
@@ -195,14 +196,16 @@ BlockDriverAIOCB *dma_bdrv_read(BlockDriverState *bs,
                                 QEMUSGList *sg, uint64_t sector,
                                 void (*cb)(void *opaque, int ret), void *opaque)
 {
-    return dma_bdrv_io(bs, sg, sector, bdrv_aio_readv, cb, opaque, false);
+    return dma_bdrv_io(bs, sg, sector, bdrv_aio_readv, cb, opaque,
+                       DMA_DIRECTION_FROM_DEVICE);
 }
 
 BlockDriverAIOCB *dma_bdrv_write(BlockDriverState *bs,
                                  QEMUSGList *sg, uint64_t sector,
                                  void (*cb)(void *opaque, int ret), void *opaque)
 {
-    return dma_bdrv_io(bs, sg, sector, bdrv_aio_writev, cb, opaque, true);
+    return dma_bdrv_io(bs, sg, sector, bdrv_aio_writev, cb, opaque,
+                       DMA_DIRECTION_TO_DEVICE);
 }
 
 

+ 6 - 6
dma.h

@@ -16,6 +16,11 @@
 
 typedef struct ScatterGatherEntry ScatterGatherEntry;
 
+typedef enum {
+    DMA_DIRECTION_TO_DEVICE = 0,
+    DMA_DIRECTION_FROM_DEVICE = 1,
+} DMADirection;
+
 struct QEMUSGList {
     ScatterGatherEntry *sg;
     int nsg;
@@ -28,11 +33,6 @@ typedef target_phys_addr_t dma_addr_t;
 
 #define DMA_ADDR_FMT TARGET_FMT_plx
 
-typedef enum {
-    DMA_DIRECTION_TO_DEVICE = 0,
-    DMA_DIRECTION_FROM_DEVICE = 1,
-} DMADirection;
-
 struct ScatterGatherEntry {
     dma_addr_t base;
     dma_addr_t len;
@@ -50,7 +50,7 @@ typedef BlockDriverAIOCB *DMAIOFunc(BlockDriverState *bs, int64_t sector_num,
 BlockDriverAIOCB *dma_bdrv_io(BlockDriverState *bs,
                               QEMUSGList *sg, uint64_t sector_num,
                               DMAIOFunc *io_func, BlockDriverCompletionFunc *cb,
-                              void *opaque, bool to_dev);
+                              void *opaque, DMADirection dir);
 BlockDriverAIOCB *dma_bdrv_read(BlockDriverState *bs,
                                 QEMUSGList *sg, uint64_t sector,
                                 BlockDriverCompletionFunc *cb, void *opaque);

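The conversion at call sites is mechanical: the former to_dev boolean maps one-for-one onto the new enum, with true becoming DMA_DIRECTION_TO_DEVICE and false becoming DMA_DIRECTION_FROM_DEVICE, exactly as the dma_bdrv_read()/dma_bdrv_write() hunks above show. Sketched at a hypothetical call site:

/* before this series: a disk write, i.e. data flows from memory to the device */
dma_bdrv_io(bs, sg, sector_num, bdrv_aio_writev, cb, opaque, true);

/* after this series: the same request with the direction spelled out */
dma_bdrv_io(bs, sg, sector_num, bdrv_aio_writev, cb, opaque,
            DMA_DIRECTION_TO_DEVICE);
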
+ 47 - 13
hw/ide/core.c

@@ -31,7 +31,6 @@
 #include "sysemu.h"
 #include "dma.h"
 #include "blockdev.h"
-#include "block_int.h"
 
 #include <hw/ide/internal.h>
 
@@ -101,7 +100,7 @@ static void ide_identify(IDEState *s)
     put_le16(p + 21, 512); /* cache size in sectors */
     put_le16(p + 22, 4); /* ecc bytes */
     padstr((char *)(p + 23), s->version, 8); /* firmware version */
-    padstr((char *)(p + 27), "QEMU HARDDISK", 40); /* model */
+    padstr((char *)(p + 27), s->drive_model_str, 40); /* model */
 #if MAX_MULT_SECTORS > 1
     put_le16(p + 47, 0x8000 | MAX_MULT_SECTORS);
 #endif
@@ -143,17 +142,25 @@ static void ide_identify(IDEState *s)
     put_le16(p + 82, (1 << 14) | (1 << 5) | 1);
     /* 13=flush_cache_ext,12=flush_cache,10=lba48 */
     put_le16(p + 83, (1 << 14) | (1 << 13) | (1 <<12) | (1 << 10));
-    /* 14=set to 1, 1=SMART self test, 0=SMART error logging */
-    put_le16(p + 84, (1 << 14) | 0);
+    /* 14=set to 1, 8=has WWN, 1=SMART self test, 0=SMART error logging */
+    if (s->wwn) {
+        put_le16(p + 84, (1 << 14) | (1 << 8) | 0);
+    } else {
+        put_le16(p + 84, (1 << 14) | 0);
+    }
     /* 14 = NOP supported, 5=WCACHE enabled, 0=SMART feature set enabled */
     if (bdrv_enable_write_cache(s->bs))
          put_le16(p + 85, (1 << 14) | (1 << 5) | 1);
     else
          put_le16(p + 85, (1 << 14) | 1);
     /* 13=flush_cache_ext,12=flush_cache,10=lba48 */
-    put_le16(p + 86, (1 << 14) | (1 << 13) | (1 <<12) | (1 << 10));
-    /* 14=set to 1, 1=smart self test, 0=smart error logging */
-    put_le16(p + 87, (1 << 14) | 0);
+    put_le16(p + 86, (1 << 13) | (1 <<12) | (1 << 10));
+    /* 14=set to 1, 8=has WWN, 1=SMART self test, 0=SMART error logging */
+    if (s->wwn) {
+        put_le16(p + 87, (1 << 14) | (1 << 8) | 0);
+    } else {
+        put_le16(p + 87, (1 << 14) | 0);
+    }
     put_le16(p + 88, 0x3f | (1 << 13)); /* udma5 set and supported */
     put_le16(p + 93, 1 | (1 << 14) | 0x2000);
     put_le16(p + 100, s->nb_sectors);
@@ -163,6 +170,13 @@ static void ide_identify(IDEState *s)
 
     if (dev && dev->conf.physical_block_size)
         put_le16(p + 106, 0x6000 | get_physical_block_exp(&dev->conf));
+    if (s->wwn) {
+        /* LE 16-bit words 111-108 contain 64-bit World Wide Name */
+        put_le16(p + 108, s->wwn >> 48);
+        put_le16(p + 109, s->wwn >> 32);
+        put_le16(p + 110, s->wwn >> 16);
+        put_le16(p + 111, s->wwn);
+    }
     if (dev && dev->conf.discard_granularity) {
         put_le16(p + 169, 1); /* TRIM support */
     }
@@ -189,7 +203,7 @@ static void ide_atapi_identify(IDEState *s)
     put_le16(p + 21, 512); /* cache size in sectors */
     put_le16(p + 22, 4); /* ecc bytes */
     padstr((char *)(p + 23), s->version, 8); /* firmware version */
-    padstr((char *)(p + 27), "QEMU DVD-ROM", 40); /* model */
+    padstr((char *)(p + 27), s->drive_model_str, 40); /* model */
     put_le16(p + 48, 1); /* dword I/O (XXX: should not be set on CDROM) */
 #ifdef USE_DMA_CDROM
     put_le16(p + 49, 1 << 9 | 1 << 8); /* DMA and LBA supported */
@@ -246,7 +260,7 @@ static void ide_cfata_identify(IDEState *s)
     padstr((char *)(p + 10), s->drive_serial_str, 20); /* serial number */
     put_le16(p + 22, 0x0004);			/* ECC bytes */
     padstr((char *) (p + 23), s->version, 8);	/* Firmware Revision */
-    padstr((char *) (p + 27), "QEMU MICRODRIVE", 40);/* Model number */
+    padstr((char *) (p + 27), s->drive_model_str, 40);/* Model number */
 #if MAX_MULT_SECTORS > 1
     put_le16(p + 47, 0x8000 | MAX_MULT_SECTORS);
 #else
@@ -604,7 +618,8 @@ void ide_dma_cb(void *opaque, int ret)
         break;
     case IDE_DMA_TRIM:
         s->bus->dma->aiocb = dma_bdrv_io(s->bs, &s->sg, sector_num,
-                                         ide_issue_trim, ide_dma_cb, s, true);
+                                         ide_issue_trim, ide_dma_cb, s,
+                                         DMA_DIRECTION_TO_DEVICE);
         break;
     }
     return;
@@ -1834,7 +1849,8 @@ static const BlockDevOps ide_cd_block_ops = {
 };
 
 int ide_init_drive(IDEState *s, BlockDriverState *bs, IDEDriveKind kind,
-                   const char *version, const char *serial)
+                   const char *version, const char *serial, const char *model,
+                   uint64_t wwn)
 {
     int cylinders, heads, secs;
     uint64_t nb_sectors;
@@ -1860,6 +1876,7 @@ int ide_init_drive(IDEState *s, BlockDriverState *bs, IDEDriveKind kind,
     s->heads = heads;
     s->sectors = secs;
     s->nb_sectors = nb_sectors;
+    s->wwn = wwn;
     /* The SMART values should be preserved across power cycles
        but they aren't.  */
     s->smart_enabled = 1;
@@ -1880,11 +1897,27 @@ int ide_init_drive(IDEState *s, BlockDriverState *bs, IDEDriveKind kind,
         }
     }
     if (serial) {
-        strncpy(s->drive_serial_str, serial, sizeof(s->drive_serial_str));
+        pstrcpy(s->drive_serial_str, sizeof(s->drive_serial_str), serial);
     } else {
         snprintf(s->drive_serial_str, sizeof(s->drive_serial_str),
                  "QM%05d", s->drive_serial);
     }
+    if (model) {
+        pstrcpy(s->drive_model_str, sizeof(s->drive_model_str), model);
+    } else {
+        switch (kind) {
+        case IDE_CD:
+            strcpy(s->drive_model_str, "QEMU DVD-ROM");
+            break;
+        case IDE_CFATA:
+            strcpy(s->drive_model_str, "QEMU MICRODRIVE");
+            break;
+        default:
+            strcpy(s->drive_model_str, "QEMU HARDDISK");
+            break;
+        }
+    }
+
     if (version) {
         pstrcpy(s->version, sizeof(s->version), version);
     } else {
@@ -1977,7 +2010,8 @@ void ide_init2_with_non_qdev_drives(IDEBus *bus, DriveInfo *hd0,
         if (dinfo) {
             if (ide_init_drive(&bus->ifs[i], dinfo->bdrv,
                                dinfo->media_cd ? IDE_CD : IDE_HD, NULL,
-                               *dinfo->serial ? dinfo->serial : NULL) < 0) {
+                               *dinfo->serial ? dinfo->serial : NULL,
+                               NULL, 0) < 0) {
                 error_report("Can't set up IDE drive %s", dinfo->id);
                 exit(1);
             }

+ 6 - 1
hw/ide/internal.h

@@ -348,6 +348,8 @@ struct IDEState {
     uint8_t identify_data[512];
     int drive_serial;
     char drive_serial_str[21];
+    char drive_model_str[41];
+    uint64_t wwn;
     /* ide regs */
     uint8_t feature;
     uint8_t error;
@@ -468,6 +470,8 @@ struct IDEDevice {
     BlockConf conf;
     char *version;
     char *serial;
+    char *model;
+    uint64_t wwn;
 };
 
 #define BM_STATUS_DMAING 0x01
@@ -534,7 +538,8 @@ void ide_data_writel(void *opaque, uint32_t addr, uint32_t val);
 uint32_t ide_data_readl(void *opaque, uint32_t addr);
 
 int ide_init_drive(IDEState *s, BlockDriverState *bs, IDEDriveKind kind,
-                   const char *version, const char *serial);
+                   const char *version, const char *serial, const char *model,
+                   uint64_t wwn);
 void ide_init2(IDEBus *bus, qemu_irq irq);
 void ide_init2_with_non_qdev_drives(IDEBus *bus, DriveInfo *hd0,
                                     DriveInfo *hd1, qemu_irq irq);

+ 2 - 1
hw/ide/macio.c

@@ -149,7 +149,8 @@ static void pmac_ide_transfer_cb(void *opaque, int ret)
         break;
     case IDE_DMA_TRIM:
         m->aiocb = dma_bdrv_io(s->bs, &s->sg, sector_num,
-                               ide_issue_trim, pmac_ide_transfer_cb, s, true);
+                               ide_issue_trim, pmac_ide_transfer_cb, s,
+                               DMA_DIRECTION_TO_DEVICE);
         break;
     }
     return;

+ 5 - 2
hw/ide/qdev.c

@@ -136,7 +136,8 @@ static int ide_dev_initfn(IDEDevice *dev, IDEDriveKind kind)
         }
     }
 
-    if (ide_init_drive(s, dev->conf.bs, kind, dev->version, serial) < 0) {
+    if (ide_init_drive(s, dev->conf.bs, kind,
+                       dev->version, serial, dev->model, dev->wwn) < 0) {
         return -1;
     }
 
@@ -173,7 +174,9 @@ static int ide_drive_initfn(IDEDevice *dev)
 #define DEFINE_IDE_DEV_PROPERTIES()                     \
     DEFINE_BLOCK_PROPERTIES(IDEDrive, dev.conf),        \
     DEFINE_PROP_STRING("ver",  IDEDrive, dev.version),  \
-    DEFINE_PROP_STRING("serial",  IDEDrive, dev.serial)
+    DEFINE_PROP_HEX64("wwn",  IDEDrive, dev.wwn, 0),    \
+    DEFINE_PROP_STRING("serial",  IDEDrive, dev.serial),\
+    DEFINE_PROP_STRING("model", IDEDrive, dev.model)
 
 static Property ide_hd_properties[] = {
     DEFINE_IDE_DEV_PROPERTIES(),

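With these properties registered, an IDE disk's IDENTIFY model string and World Wide Name become configurable from the command line, e.g. -device ide-hd,drive=disk0,model=SAMPLE,wwn=0x5000c50015ea71ad (illustrative values); when they are left unset, ide_init_drive() above falls back to the "QEMU HARDDISK"/"QEMU DVD-ROM"/"QEMU MICRODRIVE" model defaults and a zero WWN.
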
+ 0 - 1
hw/lsi53c895a.c

@@ -15,7 +15,6 @@
 #include "hw.h"
 #include "pci.h"
 #include "scsi.h"
-#include "block_int.h"
 #include "dma.h"
 
 //#define DEBUG_LSI

+ 46 - 0
hw/qdev-properties.c

@@ -877,6 +877,52 @@ PropertyInfo qdev_prop_pci_devfn = {
     .max   = 0xFFFFFFFFULL,
 };
 
+/* --- blocksize --- */
+
+static void set_blocksize(Object *obj, Visitor *v, void *opaque,
+                          const char *name, Error **errp)
+{
+    DeviceState *dev = DEVICE(obj);
+    Property *prop = opaque;
+    int16_t *ptr = qdev_get_prop_ptr(dev, prop);
+    Error *local_err = NULL;
+    int64_t value;
+
+    if (dev->state != DEV_STATE_CREATED) {
+        error_set(errp, QERR_PERMISSION_DENIED);
+        return;
+    }
+
+    visit_type_int(v, &value, name, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+    if (value < prop->info->min || value > prop->info->max) {
+        error_set(errp, QERR_PROPERTY_VALUE_OUT_OF_RANGE,
+                  dev->id?:"", name, value, prop->info->min,
+                  prop->info->max);
+        return;
+    }
+
+    /* We rely on power-of-2 blocksizes for bitmasks */
+    if ((value & (value - 1)) != 0) {
+        error_set(errp, QERR_PROPERTY_VALUE_NOT_POWER_OF_2,
+                  dev->id?:"", name, value);
+        return;
+    }
+
+    *ptr = value;
+}
+
+PropertyInfo qdev_prop_blocksize = {
+    .name  = "blocksize",
+    .get   = get_int16,
+    .set   = set_blocksize,
+    .min   = 512,
+    .max   = 65024,
+};
+
 /* --- public helpers --- */
 
 static Property *qdev_prop_walk(Property *props, const char *name)

+ 3 - 0
hw/qdev.h

@@ -223,6 +223,7 @@ extern PropertyInfo qdev_prop_drive;
 extern PropertyInfo qdev_prop_netdev;
 extern PropertyInfo qdev_prop_vlan;
 extern PropertyInfo qdev_prop_pci_devfn;
+extern PropertyInfo qdev_prop_blocksize;
 
 #define DEFINE_PROP(_name, _state, _field, _prop, _type) { \
         .name      = (_name),                                    \
@@ -284,6 +285,8 @@ extern PropertyInfo qdev_prop_pci_devfn;
 #define DEFINE_PROP_LOSTTICKPOLICY(_n, _s, _f, _d) \
     DEFINE_PROP_DEFAULT(_n, _s, _f, _d, qdev_prop_losttickpolicy, \
                         LostTickPolicy)
+#define DEFINE_PROP_BLOCKSIZE(_n, _s, _f, _d) \
+    DEFINE_PROP_DEFAULT(_n, _s, _f, _d, qdev_prop_blocksize, uint16_t)
 
 #define DEFINE_PROP_END_OF_LIST()               \
     {}

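None of the files touched here converts a device to the new property type yet; a hypothetical user would declare it like any other qdev property with a default value, for example:

#include "qdev.h"   /* DeviceState, Property, DEFINE_PROP_* */

/* Sketch only: the device struct and field name are illustrative. */
typedef struct HypotheticalBlockDev {
    DeviceState qdev;
    uint16_t logical_block_size;
} HypotheticalBlockDev;

static Property hypothetical_blockdev_properties[] = {
    DEFINE_PROP_BLOCKSIZE("logical_block_size", HypotheticalBlockDev,
                          logical_block_size, 512),
    DEFINE_PROP_END_OF_LIST(),
};

Attempts to set a value outside 512..65024, or one that is not a power of two, are rejected with QERR_PROPERTY_VALUE_OUT_OF_RANGE and the new QERR_PROPERTY_VALUE_NOT_POWER_OF_2 error added further down.
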
+ 0 - 1
linux-aio.c

@@ -9,7 +9,6 @@
  */
 #include "qemu-common.h"
 #include "qemu-aio.h"
-#include "block_int.h"
 #include "block/raw-posix-aio.h"
 
 #include <sys/eventfd.h>

+ 1 - 0
migration.c

@@ -91,6 +91,7 @@ void process_incoming_migration(QEMUFile *f)
     qemu_announce_self();
     DPRINTF("successfully loaded vm state\n");
 
+    bdrv_clear_incoming_migration_all();
     /* Make sure all file formats flush their mutable metadata */
     bdrv_invalidate_cache_all();
 

+ 21 - 0
qemu-aio.h

@@ -17,6 +17,27 @@
 #include "qemu-common.h"
 #include "qemu-char.h"
 
+typedef struct BlockDriverAIOCB BlockDriverAIOCB;
+typedef void BlockDriverCompletionFunc(void *opaque, int ret);
+
+typedef struct AIOPool {
+    void (*cancel)(BlockDriverAIOCB *acb);
+    int aiocb_size;
+    BlockDriverAIOCB *free_aiocb;
+} AIOPool;
+
+struct BlockDriverAIOCB {
+    AIOPool *pool;
+    BlockDriverState *bs;
+    BlockDriverCompletionFunc *cb;
+    void *opaque;
+    BlockDriverAIOCB *next;
+};
+
+void *qemu_aio_get(AIOPool *pool, BlockDriverState *bs,
+                   BlockDriverCompletionFunc *cb, void *opaque);
+void qemu_aio_release(void *p);
+
 /* Returns 1 if there are still outstanding AIO requests; 0 otherwise */
 typedef int (AioFlushHandler)(void *opaque);
 

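Since AIOPool and BlockDriverAIOCB now live in a public header, a quick sketch of the usual driver-side pattern they support may help; everything named hypothetical_* below is illustrative, only the declarations above are real:

#include "qemu-aio.h"

typedef struct HypotheticalAIOCB {
    BlockDriverAIOCB common;        /* must be the first member */
    int request_state;              /* illustrative driver-specific field */
} HypotheticalAIOCB;

static void hypothetical_aio_cancel(BlockDriverAIOCB *blockacb)
{
    HypotheticalAIOCB *acb = (HypotheticalAIOCB *)blockacb;
    /* ... stop or wait for the in-flight request ... */
    qemu_aio_release(acb);
}

static AIOPool hypothetical_aio_pool = {
    .aiocb_size = sizeof(HypotheticalAIOCB),
    .cancel     = hypothetical_aio_cancel,
};

static HypotheticalAIOCB *hypothetical_aio_start(BlockDriverState *bs,
                                                 BlockDriverCompletionFunc *cb,
                                                 void *opaque)
{
    HypotheticalAIOCB *acb = qemu_aio_get(&hypothetical_aio_pool, bs, cb, opaque);

    acb->request_state = 0;
    /* ... submit the request; on completion the driver calls
     * acb->common.cb(acb->common.opaque, ret) and qemu_aio_release(acb) ... */
    return acb;
}

The common member comes first so that the object qemu_aio_get() carves out of the pool can be passed around as a plain BlockDriverAIOCB.
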
+ 11 - 1
qemu-img.c

@@ -428,6 +428,13 @@ static int img_check(int argc, char **argv)
         }
     }
 
+    if (result.bfi.total_clusters != 0 && result.bfi.allocated_clusters != 0) {
+        printf("%" PRId64 "/%" PRId64 "= %0.2f%% allocated, %0.2f%% fragmented\n",
+        result.bfi.allocated_clusters, result.bfi.total_clusters,
+        result.bfi.allocated_clusters * 100.0 / result.bfi.total_clusters,
+        result.bfi.fragmented_clusters * 100.0 / result.bfi.allocated_clusters);
+    }
+
     bdrv_delete(bs);
 
     if (ret < 0 || result.check_errors) {
@@ -716,7 +723,7 @@ static int img_convert(int argc, char **argv)
         ret = -1;
         goto out;
     }
-        
+
     qemu_progress_init(progress, 2.0);
     qemu_progress_print(0, 100);
 
@@ -1125,6 +1132,9 @@ static int img_info(int argc, char **argv)
         if (bdi.cluster_size != 0) {
             printf("cluster_size: %d\n", bdi.cluster_size);
         }
+        if (bdi.is_dirty) {
+            printf("cleanly shut down: no\n");
+        }
     }
     bdrv_get_backing_filename(bs, backing_filename, sizeof(backing_filename));
     if (backing_filename[0] != '\0') {

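To make the new statistics concrete: for an image with, say, 16384 total clusters of which 4096 are allocated and 512 are fragmented (illustrative numbers), qemu-img check now additionally reports 25.00% allocated and 12.50% fragmented, i.e. allocated/total and fragmented/allocated respectively.
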
+ 9 - 1
qemu-io.c

@@ -17,6 +17,7 @@
 #include "qemu-common.h"
 #include "block_int.h"
 #include "cmd.h"
+#include "trace/control.h"
 
 #define VERSION	"0.0.1"
 
@@ -1783,6 +1784,7 @@ static void usage(const char *name)
 "  -g, --growable       allow file to grow (only applies to protocols)\n"
 "  -m, --misalign       misalign allocations for O_DIRECT\n"
 "  -k, --native-aio     use kernel AIO implementation (on Linux only)\n"
+"  -T, --trace FILE     enable trace events listed in the given file\n"
 "  -h, --help           display this help and exit\n"
 "  -V, --version        output version information and exit\n"
 "\n",
@@ -1794,7 +1796,7 @@ int main(int argc, char **argv)
 {
     int readonly = 0;
     int growable = 0;
-    const char *sopt = "hVc:rsnmgk";
+    const char *sopt = "hVc:rsnmgkT:";
     const struct option lopt[] = {
         { "help", 0, NULL, 'h' },
         { "version", 0, NULL, 'V' },
@@ -1806,6 +1808,7 @@ int main(int argc, char **argv)
         { "misalign", 0, NULL, 'm' },
         { "growable", 0, NULL, 'g' },
         { "native-aio", 0, NULL, 'k' },
+        { "trace", 1, NULL, 'T' },
         { NULL, 0, NULL, 0 }
     };
     int c;
@@ -1837,6 +1840,11 @@ int main(int argc, char **argv)
         case 'k':
             flags |= BDRV_O_NATIVE_AIO;
             break;
+        case 'T':
+            if (!trace_backend_init(optarg, NULL)) {
+                exit(1); /* error message will have been printed */
+            }
+            break;
         case 'V':
             printf("%s version %s\n", progname, VERSION);
             exit(0);

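As the usage text above states, the -T argument names a file listing the trace events to enable, so an invocation might look like qemu-io -T my-events -c 'read 0 4k' test.img (illustrative); if trace_backend_init() fails, it prints its own diagnostic and qemu-io exits with status 1.
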
+ 6 - 1
qerror.c

@@ -240,10 +240,15 @@ static const QErrorStringTable qerror_table[] = {
         .error_fmt = QERR_PROPERTY_VALUE_NOT_FOUND,
         .desc      = "Property '%(device).%(property)' can't find value '%(value)'",
     },
+    {
+        .error_fmt = QERR_PROPERTY_VALUE_NOT_POWER_OF_2,
+        .desc      = "Property '%(device).%(property)' doesn't take "
+                     "value '%(value)', it's not a power of 2",
+    },
     {
         .error_fmt = QERR_PROPERTY_VALUE_OUT_OF_RANGE,
         .desc      = "Property '%(device).%(property)' doesn't take "
-                     "value %(value) (minimum: %(min), maximum: %(max)'",
+                     "value %(value) (minimum: %(min), maximum: %(max))",
     },
     {
         .error_fmt = QERR_QGA_COMMAND_FAILED,

+ 4 - 0
qerror.h

@@ -202,6 +202,10 @@ QError *qobject_to_qerror(const QObject *obj);
 #define QERR_PROPERTY_VALUE_NOT_FOUND \
     "{ 'class': 'PropertyValueNotFound', 'data': { 'device': %s, 'property': %s, 'value': %s } }"
 
+#define QERR_PROPERTY_VALUE_NOT_POWER_OF_2 \
+    "{ 'class': 'PropertyValueNotPowerOf2', 'data': { " \
+    "'device': %s, 'property': %s, 'value': %"PRId64" } }"
+
 #define QERR_PROPERTY_VALUE_OUT_OF_RANGE \
     "{ 'class': 'PropertyValueOutOfRange', 'data': { 'device': %s, 'property': %s, 'value': %"PRId64", 'min': %"PRId64", 'max': %"PRId64" } }"
 

+ 4 - 0
scripts/tracetool

@@ -81,6 +81,10 @@ get_args()
     args=${1#*\(}
     args=${args%%\)*}
     echo "$args"
+
+    if (echo "$args" | grep "[ *]next\($\|[, ]\)" > /dev/null 2>&1); then
+        echo -e "\n#error 'next' is a bad argument name (clash with systemtap keyword)\n "
+    fi
 }
 
 # Get the argument name list of a trace event

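The trace-events hunk at the bottom of this page is the companion change: qemu_co_queue_next(void *next) would trip this new #error because 'next' clashes with a systemtap keyword, so its argument is renamed to nxt there.
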
+ 2 - 2
tests/qemu-iotests/009

@@ -53,10 +53,10 @@ _make_test_img $size
 echo
 echo "creating pattern"
 $QEMU_IO \
-	-c "write 2048k 4k -P 65" \
+	-c "write -P 65 2048k 4k" \
 	-c "write 4k 4k" \
 	-c "write 9M 4k" \
-	-c "read 2044k 8k -P 65 -s 4k -l 4k" \
+	-c "read -P 65 -s 4k -l 4k 2044k 8k" \
 $TEST_IMG | _filter_qemu_io
 
 echo

+ 3 - 3
tests/qemu-iotests/010

@@ -53,11 +53,11 @@ _make_test_img $size
 echo
 echo "creating pattern"
 $QEMU_IO \
-	-c "write 2048k 4k -P 165" \
+	-c "write -P 165 2048k 4k" \
 	-c "write 64k 4k" \
 	-c "write 9M 4k" \
-	-c "write 2044k 4k -P 165" \
-	-c "write 8M 4k -P 99" \
+	-c "write -P 165 2044k 4k" \
+	-c "write -P 99 8M 4k" \
 	-c "read -P 165 2044k 8k" \
 $TEST_IMG | _filter_qemu_io
 

+ 1 - 1
tests/qemu-iotests/011

@@ -60,7 +60,7 @@ for i in `seq 1 10`; do
     # Note that we filter away the actual offset.  That's because qemu
     # may re-order the two aio requests.  We only want to make sure the
     # filesystem isn't corrupted afterwards anyway.
-    $QEMU_IO $TEST_IMG -c "aio_write $off1 1M" -c "aio_write $off2 1M" | \
+    $QEMU_IO -c "aio_write $off1 1M" -c "aio_write $off2 1M" $TEST_IMG | \
     	_filter_qemu_io | \
 	sed -e 's/bytes at offset [0-9]*/bytes at offset XXX/g'
 done

+ 72 - 0
tests/qemu-iotests/031

@@ -0,0 +1,72 @@
+#!/bin/bash
+#
+# Test that all qcow2 header extensions survive a header rewrite
+#
+# Copyright (C) 2011 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=kwolf@redhat.com
+
+seq=`basename $0`
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1	# failure is the default!
+
+_cleanup()
+{
+	_cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+. ./common.pattern
+
+# This tests qcow2-specific low-level functionality
+_supported_fmt qcow2
+_supported_proto generic
+_supported_os Linux
+
+CLUSTER_SIZE=65536
+echo
+echo === Create image with unknown header extension ===
+echo
+_make_test_img 64M
+./qcow2.py $TEST_IMG add-header-ext 0x12345678 "This is a test header extension"
+./qcow2.py $TEST_IMG dump-header
+_check_test_img
+
+echo
+echo === Rewrite header with no backing file ===
+echo
+$QEMU_IMG rebase -u -b "" $TEST_IMG
+./qcow2.py $TEST_IMG dump-header
+_check_test_img
+
+echo
+echo === Add a backing file and format ===
+echo
+$QEMU_IMG rebase -u -b "/some/backing/file/path" -F host_device $TEST_IMG
+./qcow2.py $TEST_IMG dump-header
+
+# success, all done
+echo "*** done"
+rm -f $seq.full
+status=0

+ 76 - 0
tests/qemu-iotests/031.out

@@ -0,0 +1,76 @@
+QA output created by 031
+
+=== Create image with unknown header extension ===
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 cluster_size=65536 
+magic                     0x514649fb
+version                   2
+backing_file_offset       0x0
+backing_file_size         0x0
+cluster_bits              16
+size                      67108864
+crypt_method              0
+l1_size                   1
+l1_table_offset           0x30000
+refcount_table_offset     0x10000
+refcount_table_clusters   1
+nb_snapshots              0
+snapshot_offset           0x0
+
+Header extension:
+magic                     0x12345678
+length                    31
+data                      'This is a test header extension'
+
+No errors were found on the image.
+
+=== Rewrite header with no backing file ===
+
+magic                     0x514649fb
+version                   2
+backing_file_offset       0x0
+backing_file_size         0x0
+cluster_bits              16
+size                      67108864
+crypt_method              0
+l1_size                   1
+l1_table_offset           0x30000
+refcount_table_offset     0x10000
+refcount_table_clusters   1
+nb_snapshots              0
+snapshot_offset           0x0
+
+Header extension:
+magic                     0x12345678
+length                    31
+data                      'This is a test header extension'
+
+No errors were found on the image.
+
+=== Add a backing file and format ===
+
+magic                     0x514649fb
+version                   2
+backing_file_offset       0x90
+backing_file_size         0x17
+cluster_bits              16
+size                      67108864
+crypt_method              0
+l1_size                   1
+l1_table_offset           0x30000
+refcount_table_offset     0x10000
+refcount_table_clusters   1
+nb_snapshots              0
+snapshot_offset           0x0
+
+Header extension:
+magic                     0xe2792aca
+length                    11
+data                      'host_device'
+
+Header extension:
+magic                     0x12345678
+length                    31
+data                      'This is a test header extension'
+
+*** done

+ 7 - 2
tests/qemu-iotests/common.rc

@@ -57,16 +57,21 @@ _make_test_img()
 {
     # extra qemu-img options can be added by tests
     # at least one argument (the image size) needs to be added
-    local extra_img_options=$*
+    local extra_img_options=""
     local cluster_size_filter="s# cluster_size=[0-9]\\+##g"
+    local image_size=$*
 
+    if [ "$1" = "-b" ]; then
+        extra_img_options="$1 $2"
+        image_size=$3
+    fi
     if [ \( "$IMGFMT" = "qcow2" -o "$IMGFMT" = "qed" \) -a -n "$CLUSTER_SIZE" ]; then
         extra_img_options="-o cluster_size=$CLUSTER_SIZE $extra_img_options"
         cluster_size_filter=""
     fi
 
     # XXX(hch): have global image options?
-    $QEMU_IMG create -f $IMGFMT $TEST_IMG $extra_img_options | \
+    $QEMU_IMG create -f $IMGFMT $extra_img_options $TEST_IMG $image_size | \
     	sed -e "s#$IMGPROTO:$TEST_DIR#TEST_DIR#g" | \
     	sed -e "s#$TEST_DIR#TEST_DIR#g" | \
     	sed -e "s#$IMGFMT#IMGFMT#g" | \

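In practice a test can now request a backing file directly, e.g. _make_test_img -b "$TEST_IMG.base" 6G (illustrative); the -b pair is forwarded to qemu-img create as extra options and the third argument is treated as the image size, whereas previously every argument was folded into extra_img_options.
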
+ 1 - 0
tests/qemu-iotests/group

@@ -37,3 +37,4 @@
 028 rw backing auto
 029 rw auto quick
 030 rw auto
+031 rw auto quick

+ 207 - 0
tests/qemu-iotests/qcow2.py

@@ -0,0 +1,207 @@
+#!/usr/bin/env python
+
+import sys
+import struct
+import string
+
+class QcowHeaderExtension:
+
+    def __init__(self, magic, length, data):
+        self.magic  = magic
+        self.length = length
+        self.data   = data
+
+    @classmethod
+    def create(cls, magic, data):
+        return QcowHeaderExtension(magic, len(data), data)
+
+class QcowHeader:
+
+    uint32_t = 'I'
+    uint64_t = 'Q'
+
+    fields = [
+        # Version 2 header fields
+        [ uint32_t, '%#x',  'magic' ],
+        [ uint32_t, '%d',   'version' ],
+        [ uint64_t, '%#x',  'backing_file_offset' ],
+        [ uint32_t, '%#x',  'backing_file_size' ],
+        [ uint32_t, '%d',   'cluster_bits' ],
+        [ uint64_t, '%d',   'size' ],
+        [ uint32_t, '%d',   'crypt_method' ],
+        [ uint32_t, '%d',   'l1_size' ],
+        [ uint64_t, '%#x',  'l1_table_offset' ],
+        [ uint64_t, '%#x',  'refcount_table_offset' ],
+        [ uint32_t, '%d',   'refcount_table_clusters' ],
+        [ uint32_t, '%d',   'nb_snapshots' ],
+        [ uint64_t, '%#x',  'snapshot_offset' ],
+    ];
+
+    fmt = '>' + ''.join(field[0] for field in fields)
+
+    def __init__(self, fd):
+
+        buf_size = struct.calcsize(QcowHeader.fmt)
+
+        fd.seek(0)
+        buf = fd.read(buf_size)
+
+        header = struct.unpack(QcowHeader.fmt, buf)
+        self.__dict__ = dict((field[2], header[i])
+            for i, field in enumerate(QcowHeader.fields))
+
+        self.cluster_size = 1 << self.cluster_bits
+
+        fd.seek(self.get_header_length())
+        self.load_extensions(fd)
+
+        if self.backing_file_offset:
+            fd.seek(self.backing_file_offset)
+            self.backing_file = fd.read(self.backing_file_size)
+        else:
+            self.backing_file = None
+
+    def get_header_length(self):
+        if self.version == 2:
+            return 72
+        else:
+            raise Exception("version != 2 not supported")
+
+    def load_extensions(self, fd):
+        self.extensions = []
+
+        if self.backing_file_offset != 0:
+            end = min(self.cluster_size, self.backing_file_offset)
+        else:
+            end = self.cluster_size
+
+        while fd.tell() < end:
+            (magic, length) = struct.unpack('>II', fd.read(8))
+            if magic == 0:
+                break
+            else:
+                padded = (length + 7) & ~7
+                data = fd.read(padded)
+                self.extensions.append(QcowHeaderExtension(magic, length, data))
+
+    def update_extensions(self, fd):
+
+        fd.seek(self.get_header_length())
+        extensions = self.extensions
+        extensions.append(QcowHeaderExtension(0, 0, ""))
+        for ex in extensions:
+            buf = struct.pack('>II', ex.magic, ex.length)
+            fd.write(buf)
+            fd.write(ex.data)
+
+        if self.backing_file != None:
+            self.backing_file_offset = fd.tell()
+            fd.write(self.backing_file)
+
+        if fd.tell() > self.cluster_size:
+            raise Exception("I think I just broke the image...")
+
+
+    def update(self, fd):
+        header_bytes = self.get_header_length()
+
+        self.update_extensions(fd)
+
+        fd.seek(0)
+        header = tuple(self.__dict__[f] for t, p, f in QcowHeader.fields)
+        buf = struct.pack(QcowHeader.fmt, *header)
+        buf = buf[0:header_bytes]
+        fd.write(buf)
+
+    def dump(self):
+        for f in QcowHeader.fields:
+            print "%-25s" % f[2], f[1] % self.__dict__[f[2]]
+        print ""
+
+    def dump_extensions(self):
+        for ex in self.extensions:
+
+            data = ex.data[:ex.length]
+            if all(c in string.printable for c in data):
+                data = "'%s'" % data
+            else:
+                data = "<binary>"
+
+            print "Header extension:"
+            print "%-25s %#x" % ("magic", ex.magic)
+            print "%-25s %d" % ("length", ex.length)
+            print "%-25s %s" % ("data", data)
+            print ""
+
+
+def cmd_dump_header(fd):
+    h = QcowHeader(fd)
+    h.dump()
+    h.dump_extensions()
+
+def cmd_add_header_ext(fd, magic, data):
+    try:
+        magic = int(magic, 0)
+    except:
+        print "'%s' is not a valid magic number" % magic
+        sys.exit(1)
+
+    h = QcowHeader(fd)
+    h.extensions.append(QcowHeaderExtension.create(magic, data))
+    h.update(fd)
+
+def cmd_del_header_ext(fd, magic):
+    try:
+        magic = int(magic, 0)
+    except:
+        print "'%s' is not a valid magic number" % magic
+        sys.exit(1)
+
+    h = QcowHeader(fd)
+    found = False
+
+    for ex in h.extensions:
+        if ex.magic == magic:
+            found = True
+            h.extensions.remove(ex)
+
+    if not found:
+        print "No such header extension"
+        return
+
+    h.update(fd)
+
+cmds = [
+    [ 'dump-header',    cmd_dump_header,    0, 'Dump image header and header extensions' ],
+    [ 'add-header-ext', cmd_add_header_ext, 2, 'Add a header extension' ],
+    [ 'del-header-ext', cmd_del_header_ext, 1, 'Delete a header extension' ],
+]
+
+def main(filename, cmd, args):
+    fd = open(filename, "r+b")
+    try:
+        for name, handler, num_args, desc in cmds:
+            if name != cmd:
+                continue
+            elif len(args) != num_args:
+                usage()
+                return
+            else:
+                handler(fd, *args)
+                return
+        print "Unknown command '%s'" % cmd
+    finally:
+        fd.close()
+
+def usage():
+    print "Usage: %s <file> <cmd> [<arg>, ...]" % sys.argv[0]
+    print ""
+    print "Supported commands:"
+    for name, handler, num_args, desc in cmds:
+        print "    %-20s - %s" % (name, desc)
+
+if len(sys.argv) < 3:
+    usage()
+    sys.exit(1)
+
+main(sys.argv[1], sys.argv[2], sys.argv[3:])

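As a worked example of the extension format this script parses: load_extensions() reads an 8-byte big-endian (magic, length) pair and then length bytes of data rounded up to the next multiple of 8, so 32 bytes for the 31-byte test string used in test 031, while dump_extensions() prints only the first length bytes; that is why 031.out shows length 31 and the exact string.
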
+ 1 - 1
trace-events

@@ -562,7 +562,7 @@ qemu_coroutine_terminate(void *co) "self %p"
 
 # qemu-coroutine-lock.c
 qemu_co_queue_next_bh(void) ""
-qemu_co_queue_next(void *next) "next %p"
+qemu_co_queue_next(void *nxt) "next %p"
 qemu_co_mutex_lock_entry(void *mutex, void *self) "mutex %p self %p"
 qemu_co_mutex_lock_return(void *mutex, void *self) "mutex %p self %p"
 qemu_co_mutex_unlock_entry(void *mutex, void *self) "mutex %p self %p"