Bläddra i källkod

Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging

Block patches for 2.3

# gpg: Signature made Fri 06 Feb 2015 17:14:10 GMT using RSA key ID C88F2FD6
# gpg: Good signature from "Kevin Wolf <kwolf@redhat.com>"

* remotes/kevin/tags/for-upstream: (47 commits)
  block/raw-posix.c: Fix raw_getlength() on Mac OS X block devices
  block: Eliminate silly QERR_ macros used for encryption keys
  block: New bdrv_add_key(), convert monitor to use it
  blockdev: Eliminate silly QERR_BLOCK_JOB_NOT_ACTIVE macro
  blockdev: Give find_block_job() an Error ** parameter
  qcow2: Rewrite qcow2_alloc_bytes()
  block: Give always priority to unused entries in the qcow2 L2 cache
  nbd: fix max_discard/max_transfer_length
  block: introduce BDRV_REQUEST_MAX_SECTORS
  nbd: Improve error messages
  iotests: Fix 104 for NBD
  iotests: Fix 100 for nbd
  iotests: Fix 083
  block: fix off-by-one error in qcow and qcow2
  qemu-iotests: add 116 invalid QED input file tests
  qed: check for header size overflow
  block/dmg: improve zeroes handling
  block/dmg: support bzip2 block entry types
  block/dmg: factor out block type check
  block/dmg: use SectorNumber from BLKX header
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Peter Maydell 10 år sedan
förälder
incheckning
3d815ac82b
55 ändrade filer med 1602 tillägg och 462 borttagningar
  1. 43 22
      block.c
  2. 2 0
      block/Makefile.objs
  3. 7 0
      block/accounting.c
  4. 5 0
      block/block-backend.c
  5. 385 117
      block/dmg.c
  6. 2 2
      block/nbd-client.c
  7. 1 1
      block/nbd-client.h
  8. 10 1
      block/nbd.c
  9. 5 0
      block/qapi.c
  10. 1 1
      block/qcow.c
  11. 3 1
      block/qcow2-cache.c
  12. 39 39
      block/qcow2-refcount.c
  13. 1 1
      block/qcow2.c
  14. 5 0
      block/qed.c
  15. 0 1
      block/qed.h
  16. 108 32
      block/raw-posix.c
  17. 125 0
      block/write-threshold.c
  18. 11 33
      blockdev.c
  19. 50 0
      configure
  20. 5 1
      hmp.c
  21. 4 4
      hw/block/dataplane/virtio-blk.c
  22. 205 94
      hw/block/virtio-blk.c
  23. 17 0
      hw/ide/atapi.c
  24. 1 0
      hw/ide/core.c
  25. 2 0
      hw/ide/internal.h
  26. 11 0
      hw/ide/pci.c
  27. 3 0
      include/block/accounting.h
  28. 4 0
      include/block/block.h
  29. 4 0
      include/block/block_int.h
  30. 1 1
      include/block/nbd.h
  31. 64 0
      include/block/write-threshold.h
  32. 12 6
      include/hw/virtio/virtio-blk.h
  33. 0 9
      include/qapi/qmp/qerror.h
  34. 1 0
      include/sysemu/block-backend.h
  35. 11 5
      monitor.c
  36. 25 17
      nbd.c
  37. 58 2
      qapi/block-core.json
  38. 1 1
      qemu-img.c
  39. 6 1
      qemu-nbd.c
  40. 50 4
      qmp-commands.hx
  41. 4 4
      qmp.c
  42. 3 0
      tests/Makefile
  43. 5 0
      tests/qemu-iotests/067.out
  44. 2 1
      tests/qemu-iotests/083
  45. 28 53
      tests/qemu-iotests/083.out
  46. 12 0
      tests/qemu-iotests/100
  47. 3 6
      tests/qemu-iotests/104
  48. 96 0
      tests/qemu-iotests/116
  49. 37 0
      tests/qemu-iotests/116.out
  50. 1 0
      tests/qemu-iotests/common.filter
  51. 1 1
      tests/qemu-iotests/common.rc
  52. 1 0
      tests/qemu-iotests/group
  53. 1 1
      tests/qemu-iotests/iotests.py
  54. 119 0
      tests/test-write-threshold.c
  55. 1 0
      trace-events

+ 43 - 22
block.c

@@ -2647,7 +2647,7 @@ static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
 {
     int64_t len;
 
-    if (size > INT_MAX) {
+    if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
         return -EIO;
     }
 
@@ -2671,7 +2671,7 @@ static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
 static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                               int nb_sectors)
 {
-    if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
+    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
         return -EIO;
     }
 
@@ -2758,7 +2758,7 @@ static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf,
         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
     };
 
-    if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
+    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
         return -EINVAL;
     }
 
@@ -2826,13 +2826,10 @@ int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
     }
 
     for (;;) {
-        nb_sectors = target_sectors - sector_num;
+        nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
         if (nb_sectors <= 0) {
             return 0;
         }
-        if (nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) {
-            nb_sectors = INT_MAX / BDRV_SECTOR_SIZE;
-        }
         ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n);
         if (ret < 0) {
             error_report("error getting block status at sector %" PRId64 ": %s",
@@ -3167,7 +3164,7 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
     BdrvRequestFlags flags)
 {
-    if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) {
+    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
         return -EINVAL;
     }
 
@@ -3192,10 +3189,7 @@ int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
                             BDRV_REQ_COPY_ON_READ);
 }
 
-/* if no limit is specified in the BlockLimits use a default
- * of 32768 512-byte sectors (16 MiB) per request.
- */
-#define MAX_WRITE_ZEROES_DEFAULT 32768
+#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768
 
 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
@@ -3205,8 +3199,8 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
     struct iovec iov = {0};
     int ret = 0;
 
-    int max_write_zeroes = bs->bl.max_write_zeroes ?
-                           bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
+    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes,
+                                        BDRV_REQUEST_MAX_SECTORS);
 
     while (nb_sectors > 0 && !ret) {
         int num = nb_sectors;
@@ -3242,7 +3236,7 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
         if (ret == -ENOTSUP) {
             /* Fall back to bounce buffer if write zeroes is unsupported */
             int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length,
-                                            MAX_WRITE_ZEROES_DEFAULT);
+                                            MAX_WRITE_ZEROES_BOUNCE_BUFFER);
             num = MIN(num, max_xfer_len);
             iov.iov_len = num * BDRV_SECTOR_SIZE;
             if (iov.iov_base == NULL) {
@@ -3461,7 +3455,7 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
     BdrvRequestFlags flags)
 {
-    if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) {
+    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
         return -EINVAL;
     }
 
@@ -3719,6 +3713,36 @@ int bdrv_set_key(BlockDriverState *bs, const char *key)
     return ret;
 }
 
+/*
+ * Provide an encryption key for @bs.
+ * If @key is non-null:
+ *     If @bs is not encrypted, fail.
+ *     Else if the key is invalid, fail.
+ *     Else set @bs's key to @key, replacing the existing key, if any.
+ * If @key is null:
+ *     If @bs is encrypted and still lacks a key, fail.
+ *     Else do nothing.
+ * On failure, store an error object through @errp if non-null.
+ */
+void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp)
+{
+    if (key) {
+        if (!bdrv_is_encrypted(bs)) {
+            error_setg(errp, "Device '%s' is not encrypted",
+                      bdrv_get_device_name(bs));
+        } else if (bdrv_set_key(bs, key) < 0) {
+            error_set(errp, QERR_INVALID_PASSWORD);
+        }
+    } else {
+        if (bdrv_key_required(bs)) {
+            error_set(errp, ERROR_CLASS_DEVICE_ENCRYPTED,
+                      "'%s' (%s) is encrypted",
+                      bdrv_get_device_name(bs),
+                      bdrv_get_encrypted_filename(bs));
+        }
+    }
+}
+
 const char *bdrv_get_format_name(BlockDriverState *bs)
 {
     return bs->drv ? bs->drv->format_name : NULL;
@@ -4562,6 +4586,8 @@ static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
         }
     }
 
+    block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);
+
     return outidx + 1;
 }
 
@@ -5097,11 +5123,6 @@ static void coroutine_fn bdrv_discard_co_entry(void *opaque)
     rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors);
 }
 
-/* if no limit is specified in the BlockLimits use a default
- * of 32768 512-byte sectors (16 MiB) per request.
- */
-#define MAX_DISCARD_DEFAULT 32768
-
 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                  int nb_sectors)
 {
@@ -5126,7 +5147,7 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
         return 0;
     }
 
-    max_discard = bs->bl.max_discard ?  bs->bl.max_discard : MAX_DISCARD_DEFAULT;
+    max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
     while (nb_sectors > 0) {
         int ret;
         int num = nb_sectors;

+ 2 - 0
block/Makefile.objs

@@ -20,6 +20,7 @@ block-obj-$(CONFIG_GLUSTERFS) += gluster.o
 block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o
 block-obj-$(CONFIG_LIBSSH2) += ssh.o
 block-obj-y += accounting.o
+block-obj-y += write-threshold.o
 
 common-obj-y += stream.o
 common-obj-y += commit.o
@@ -36,5 +37,6 @@ gluster.o-libs     := $(GLUSTERFS_LIBS)
 ssh.o-cflags       := $(LIBSSH2_CFLAGS)
 ssh.o-libs         := $(LIBSSH2_LIBS)
 archipelago.o-libs := $(ARCHIPELAGO_LIBS)
+dmg.o-libs         := $(BZIP2_LIBS)
 qcow.o-libs        := -lz
 linux-aio.o-libs   := -laio

+ 7 - 0
block/accounting.c

@@ -54,3 +54,10 @@ void block_acct_highest_sector(BlockAcctStats *stats, int64_t sector_num,
         stats->wr_highest_sector = sector_num + nb_sectors - 1;
     }
 }
+
+void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type,
+                      int num_requests)
+{
+    assert(type < BLOCK_MAX_IOTYPE);
+    stats->merged[type] += num_requests;
+}

+ 5 - 0
block/block-backend.c

@@ -580,6 +580,11 @@ int blk_get_flags(BlockBackend *blk)
     return bdrv_get_flags(blk->bs);
 }
 
+int blk_get_max_transfer_length(BlockBackend *blk)
+{
+    return blk->bs->bl.max_transfer_length;
+}
+
 void blk_set_guest_block_size(BlockBackend *blk, int align)
 {
     bdrv_set_guest_block_size(blk->bs, align);

+ 385 - 117
block/dmg.c

@@ -26,6 +26,10 @@
 #include "qemu/bswap.h"
 #include "qemu/module.h"
 #include <zlib.h>
+#ifdef CONFIG_BZIP2
+#include <bzlib.h>
+#endif
+#include <glib.h>
 
 enum {
     /* Limit chunk sizes to prevent unreasonable amounts of memory being used
@@ -55,6 +59,9 @@ typedef struct BDRVDMGState {
     uint8_t *compressed_chunk;
     uint8_t *uncompressed_chunk;
     z_stream zstream;
+#ifdef CONFIG_BZIP2
+    bz_stream bzstream;
+#endif
 } BDRVDMGState;
 
 static int dmg_probe(const uint8_t *buf, int buf_size, const char *filename)
@@ -100,6 +107,16 @@ static int read_uint32(BlockDriverState *bs, int64_t offset, uint32_t *result)
     return 0;
 }
 
+static inline uint64_t buff_read_uint64(const uint8_t *buffer, int64_t offset)
+{
+    return be64_to_cpu(*(uint64_t *)&buffer[offset]);
+}
+
+static inline uint32_t buff_read_uint32(const uint8_t *buffer, int64_t offset)
+{
+    return be32_to_cpu(*(uint32_t *)&buffer[offset]);
+}
+
 /* Increase max chunk sizes, if necessary.  This function is used to calculate
  * the buffer sizes needed for compressed/uncompressed chunk I/O.
  */
@@ -112,6 +129,7 @@ static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk,
 
     switch (s->types[chunk]) {
     case 0x80000005: /* zlib compressed */
+    case 0x80000006: /* bzip2 compressed */
         compressed_size = s->lengths[chunk];
         uncompressed_sectors = s->sectorcounts[chunk];
         break;
@@ -119,7 +137,9 @@ static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk,
         uncompressed_sectors = (s->lengths[chunk] + 511) / 512;
         break;
     case 2: /* zero */
-        uncompressed_sectors = s->sectorcounts[chunk];
+        /* As the all-zeroes block may be large, it is treated specially: the
+         * sector is not copied from a large buffer; a simple memset is used
+         * instead. Therefore uncompressed_sectors does not need to be set. */
         break;
     }
 
@@ -131,163 +151,372 @@ static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk,
     }
 }
 
+static int64_t dmg_find_koly_offset(BlockDriverState *file_bs, Error **errp)
+{
+    int64_t length;
+    int64_t offset = 0;
+    uint8_t buffer[515];
+    int i, ret;
+
+    /* bdrv_getlength returns a multiple of block size (512), rounded up. Since
+     * dmg images can have odd sizes, try to look for the "koly" magic which
+     * marks the beginning of the UDIF trailer (512 bytes). This magic can be
+     * found in the last 511 bytes of the second-last sector or the first 4
+     * bytes of the last sector (search space: 515 bytes). */
+    length = bdrv_getlength(file_bs);
+    if (length < 0) {
+        error_setg_errno(errp, -length,
+            "Failed to get file size while reading UDIF trailer");
+        return length;
+    } else if (length < 512) {
+        error_setg(errp, "dmg file must be at least 512 bytes long");
+        return -EINVAL;
+    }
+    if (length > 511 + 512) {
+        offset = length - 511 - 512;
+    }
+    length = length < 515 ? length : 515;
+    ret = bdrv_pread(file_bs, offset, buffer, length);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "Failed while reading UDIF trailer");
+        return ret;
+    }
+    for (i = 0; i < length - 3; i++) {
+        if (buffer[i] == 'k' && buffer[i+1] == 'o' &&
+            buffer[i+2] == 'l' && buffer[i+3] == 'y') {
+            return offset + i;
+        }
+    }
+    error_setg(errp, "Could not locate UDIF trailer in dmg file");
+    return -EINVAL;
+}
+
+/* used when building the sector table */
+typedef struct DmgHeaderState {
+    /* used internally by dmg_read_mish_block to remember offsets of blocks
+     * across calls */
+    uint64_t data_fork_offset;
+    /* exported for dmg_open */
+    uint32_t max_compressed_size;
+    uint32_t max_sectors_per_chunk;
+} DmgHeaderState;
+
+static bool dmg_is_known_block_type(uint32_t entry_type)
+{
+    switch (entry_type) {
+    case 0x00000001:    /* uncompressed */
+    case 0x00000002:    /* zeroes */
+    case 0x80000005:    /* zlib */
+#ifdef CONFIG_BZIP2
+    case 0x80000006:    /* bzip2 */
+#endif
+        return true;
+    default:
+        return false;
+    }
+}
+
+static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds,
+                               uint8_t *buffer, uint32_t count)
+{
+    uint32_t type, i;
+    int ret;
+    size_t new_size;
+    uint32_t chunk_count;
+    int64_t offset = 0;
+    uint64_t data_offset;
+    uint64_t in_offset = ds->data_fork_offset;
+    uint64_t out_offset;
+
+    type = buff_read_uint32(buffer, offset);
+    /* skip data that is not a valid MISH block (invalid magic or too small) */
+    if (type != 0x6d697368 || count < 244) {
+        /* assume success for now */
+        return 0;
+    }
+
+    /* chunk offsets are relative to this sector number */
+    out_offset = buff_read_uint64(buffer, offset + 8);
+
+    /* location in data fork for (compressed) blob (in bytes) */
+    data_offset = buff_read_uint64(buffer, offset + 0x18);
+    in_offset += data_offset;
+
+    /* move to begin of chunk entries */
+    offset += 204;
+
+    chunk_count = (count - 204) / 40;
+    new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count);
+    s->types = g_realloc(s->types, new_size / 2);
+    s->offsets = g_realloc(s->offsets, new_size);
+    s->lengths = g_realloc(s->lengths, new_size);
+    s->sectors = g_realloc(s->sectors, new_size);
+    s->sectorcounts = g_realloc(s->sectorcounts, new_size);
+
+    for (i = s->n_chunks; i < s->n_chunks + chunk_count; i++) {
+        s->types[i] = buff_read_uint32(buffer, offset);
+        if (!dmg_is_known_block_type(s->types[i])) {
+            chunk_count--;
+            i--;
+            offset += 40;
+            continue;
+        }
+
+        /* sector number */
+        s->sectors[i] = buff_read_uint64(buffer, offset + 8);
+        s->sectors[i] += out_offset;
+
+        /* sector count */
+        s->sectorcounts[i] = buff_read_uint64(buffer, offset + 0x10);
+
+        /* all-zeroes sector (type 2) does not need to be "uncompressed" and can
+         * therefore be unbounded. */
+        if (s->types[i] != 2 && s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) {
+            error_report("sector count %" PRIu64 " for chunk %" PRIu32
+                         " is larger than max (%u)",
+                         s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX);
+            ret = -EINVAL;
+            goto fail;
+        }
+
+        /* offset in (compressed) data fork */
+        s->offsets[i] = buff_read_uint64(buffer, offset + 0x18);
+        s->offsets[i] += in_offset;
+
+        /* length in (compressed) data fork */
+        s->lengths[i] = buff_read_uint64(buffer, offset + 0x20);
+
+        if (s->lengths[i] > DMG_LENGTHS_MAX) {
+            error_report("length %" PRIu64 " for chunk %" PRIu32
+                         " is larger than max (%u)",
+                         s->lengths[i], i, DMG_LENGTHS_MAX);
+            ret = -EINVAL;
+            goto fail;
+        }
+
+        update_max_chunk_size(s, i, &ds->max_compressed_size,
+                              &ds->max_sectors_per_chunk);
+        offset += 40;
+    }
+    s->n_chunks += chunk_count;
+    return 0;
+
+fail:
+    return ret;
+}
+
+static int dmg_read_resource_fork(BlockDriverState *bs, DmgHeaderState *ds,
+                                  uint64_t info_begin, uint64_t info_length)
+{
+    BDRVDMGState *s = bs->opaque;
+    int ret;
+    uint32_t count, rsrc_data_offset;
+    uint8_t *buffer = NULL;
+    uint64_t info_end;
+    uint64_t offset;
+
+    /* read offset from begin of resource fork (info_begin) to resource data */
+    ret = read_uint32(bs, info_begin, &rsrc_data_offset);
+    if (ret < 0) {
+        goto fail;
+    } else if (rsrc_data_offset > info_length) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    /* read length of resource data */
+    ret = read_uint32(bs, info_begin + 8, &count);
+    if (ret < 0) {
+        goto fail;
+    } else if (count == 0 || rsrc_data_offset + count > info_length) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    /* begin of resource data (consisting of one or more resources) */
+    offset = info_begin + rsrc_data_offset;
+
+    /* end of resource data (there is possibly a following resource map
+     * which will be ignored). */
+    info_end = offset + count;
+
+    /* read offsets (mish blocks) from one or more resources in resource data */
+    while (offset < info_end) {
+        /* size of following resource */
+        ret = read_uint32(bs, offset, &count);
+        if (ret < 0) {
+            goto fail;
+        } else if (count == 0 || count > info_end - offset) {
+            ret = -EINVAL;
+            goto fail;
+        }
+        offset += 4;
+
+        buffer = g_realloc(buffer, count);
+        ret = bdrv_pread(bs->file, offset, buffer, count);
+        if (ret < 0) {
+            goto fail;
+        }
+
+        ret = dmg_read_mish_block(s, ds, buffer, count);
+        if (ret < 0) {
+            goto fail;
+        }
+        /* advance offset by size of resource */
+        offset += count;
+    }
+    ret = 0;
+
+fail:
+    g_free(buffer);
+    return ret;
+}
+
+static int dmg_read_plist_xml(BlockDriverState *bs, DmgHeaderState *ds,
+                              uint64_t info_begin, uint64_t info_length)
+{
+    BDRVDMGState *s = bs->opaque;
+    int ret;
+    uint8_t *buffer = NULL;
+    char *data_begin, *data_end;
+
+    /* Have at least some length to avoid NULL for g_malloc. Attempt to set a
+     * safe upper cap on the data length. A test sample had an XML length of
+     * about 1 MiB. */
+    if (info_length == 0 || info_length > 16 * 1024 * 1024) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    buffer = g_malloc(info_length + 1);
+    buffer[info_length] = '\0';
+    ret = bdrv_pread(bs->file, info_begin, buffer, info_length);
+    if (ret != info_length) {
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    /* look for <data>...</data>. The data is 284 (0x11c) bytes after base64
+     * decode. The actual data element has 431 (0x1af) bytes which includes tabs
+     * and line feeds. */
+    data_end = (char *)buffer;
+    while ((data_begin = strstr(data_end, "<data>")) != NULL) {
+        guchar *mish;
+        gsize out_len = 0;
+
+        data_begin += 6;
+        data_end = strstr(data_begin, "</data>");
+        /* malformed XML? */
+        if (data_end == NULL) {
+            ret = -EINVAL;
+            goto fail;
+        }
+        *data_end++ = '\0';
+        mish = g_base64_decode(data_begin, &out_len);
+        ret = dmg_read_mish_block(s, ds, mish, (uint32_t)out_len);
+        g_free(mish);
+        if (ret < 0) {
+            goto fail;
+        }
+    }
+    ret = 0;
+
+fail:
+    g_free(buffer);
+    return ret;
+}
+
 static int dmg_open(BlockDriverState *bs, QDict *options, int flags,
                     Error **errp)
 {
     BDRVDMGState *s = bs->opaque;
-    uint64_t info_begin, info_end, last_in_offset, last_out_offset;
-    uint32_t count, tmp;
-    uint32_t max_compressed_size = 1, max_sectors_per_chunk = 1, i;
+    DmgHeaderState ds;
+    uint64_t rsrc_fork_offset, rsrc_fork_length;
+    uint64_t plist_xml_offset, plist_xml_length;
     int64_t offset;
     int ret;
 
     bs->read_only = 1;
     s->n_chunks = 0;
     s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL;
+    /* used by dmg_read_mish_block to keep track of the current I/O position */
+    ds.data_fork_offset = 0;
+    ds.max_compressed_size = 1;
+    ds.max_sectors_per_chunk = 1;
 
-    /* read offset of info blocks */
-    offset = bdrv_getlength(bs->file);
+    /* locate the UDIF trailer */
+    offset = dmg_find_koly_offset(bs->file, errp);
     if (offset < 0) {
         ret = offset;
         goto fail;
     }
-    offset -= 0x1d8;
 
-    ret = read_uint64(bs, offset, &info_begin);
+    /* offset of data fork (DataForkOffset) */
+    ret = read_uint64(bs, offset + 0x18, &ds.data_fork_offset);
     if (ret < 0) {
         goto fail;
-    } else if (info_begin == 0) {
+    } else if (ds.data_fork_offset > offset) {
         ret = -EINVAL;
         goto fail;
     }
 
-    ret = read_uint32(bs, info_begin, &tmp);
+    /* offset of resource fork (RsrcForkOffset) */
+    ret = read_uint64(bs, offset + 0x28, &rsrc_fork_offset);
     if (ret < 0) {
         goto fail;
-    } else if (tmp != 0x100) {
+    }
+    ret = read_uint64(bs, offset + 0x30, &rsrc_fork_length);
+    if (ret < 0) {
+        goto fail;
+    }
+    if (rsrc_fork_offset >= offset ||
+        rsrc_fork_length > offset - rsrc_fork_offset) {
         ret = -EINVAL;
         goto fail;
     }
-
-    ret = read_uint32(bs, info_begin + 4, &count);
+    /* offset of property list (XMLOffset) */
+    ret = read_uint64(bs, offset + 0xd8, &plist_xml_offset);
     if (ret < 0) {
         goto fail;
-    } else if (count == 0) {
+    }
+    ret = read_uint64(bs, offset + 0xe0, &plist_xml_length);
+    if (ret < 0) {
+        goto fail;
+    }
+    if (plist_xml_offset >= offset ||
+        plist_xml_length > offset - plist_xml_offset) {
         ret = -EINVAL;
         goto fail;
     }
-    info_end = info_begin + count;
-
-    offset = info_begin + 0x100;
-
-    /* read offsets */
-    last_in_offset = last_out_offset = 0;
-    while (offset < info_end) {
-        uint32_t type;
-
-        ret = read_uint32(bs, offset, &count);
+    ret = read_uint64(bs, offset + 0x1ec, (uint64_t *)&bs->total_sectors);
+    if (ret < 0) {
+        goto fail;
+    }
+    if (bs->total_sectors < 0) {
+        ret = -EINVAL;
+        goto fail;
+    }
+    if (rsrc_fork_length != 0) {
+        ret = dmg_read_resource_fork(bs, &ds,
+                                     rsrc_fork_offset, rsrc_fork_length);
         if (ret < 0) {
             goto fail;
-        } else if (count == 0) {
-            ret = -EINVAL;
-            goto fail;
         }
-        offset += 4;
-
-        ret = read_uint32(bs, offset, &type);
+    } else if (plist_xml_length != 0) {
+        ret = dmg_read_plist_xml(bs, &ds, plist_xml_offset, plist_xml_length);
         if (ret < 0) {
             goto fail;
         }
-
-        if (type == 0x6d697368 && count >= 244) {
-            size_t new_size;
-            uint32_t chunk_count;
-
-            offset += 4;
-            offset += 200;
-
-            chunk_count = (count - 204) / 40;
-            new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count);
-            s->types = g_realloc(s->types, new_size / 2);
-            s->offsets = g_realloc(s->offsets, new_size);
-            s->lengths = g_realloc(s->lengths, new_size);
-            s->sectors = g_realloc(s->sectors, new_size);
-            s->sectorcounts = g_realloc(s->sectorcounts, new_size);
-
-            for (i = s->n_chunks; i < s->n_chunks + chunk_count; i++) {
-                ret = read_uint32(bs, offset, &s->types[i]);
-                if (ret < 0) {
-                    goto fail;
-                }
-                offset += 4;
-                if (s->types[i] != 0x80000005 && s->types[i] != 1 &&
-                    s->types[i] != 2) {
-                    if (s->types[i] == 0xffffffff && i > 0) {
-                        last_in_offset = s->offsets[i - 1] + s->lengths[i - 1];
-                        last_out_offset = s->sectors[i - 1] +
-                                          s->sectorcounts[i - 1];
-                    }
-                    chunk_count--;
-                    i--;
-                    offset += 36;
-                    continue;
-                }
-                offset += 4;
-
-                ret = read_uint64(bs, offset, &s->sectors[i]);
-                if (ret < 0) {
-                    goto fail;
-                }
-                s->sectors[i] += last_out_offset;
-                offset += 8;
-
-                ret = read_uint64(bs, offset, &s->sectorcounts[i]);
-                if (ret < 0) {
-                    goto fail;
-                }
-                offset += 8;
-
-                if (s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) {
-                    error_report("sector count %" PRIu64 " for chunk %" PRIu32
-                                 " is larger than max (%u)",
-                                 s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX);
-                    ret = -EINVAL;
-                    goto fail;
-                }
-
-                ret = read_uint64(bs, offset, &s->offsets[i]);
-                if (ret < 0) {
-                    goto fail;
-                }
-                s->offsets[i] += last_in_offset;
-                offset += 8;
-
-                ret = read_uint64(bs, offset, &s->lengths[i]);
-                if (ret < 0) {
-                    goto fail;
-                }
-                offset += 8;
-
-                if (s->lengths[i] > DMG_LENGTHS_MAX) {
-                    error_report("length %" PRIu64 " for chunk %" PRIu32
-                                 " is larger than max (%u)",
-                                 s->lengths[i], i, DMG_LENGTHS_MAX);
-                    ret = -EINVAL;
-                    goto fail;
-                }
-
-                update_max_chunk_size(s, i, &max_compressed_size,
-                                      &max_sectors_per_chunk);
-            }
-            s->n_chunks += chunk_count;
-        }
+    } else {
+        ret = -EINVAL;
+        goto fail;
     }
 
     /* initialize zlib engine */
     s->compressed_chunk = qemu_try_blockalign(bs->file,
-                                              max_compressed_size + 1);
+                                              ds.max_compressed_size + 1);
     s->uncompressed_chunk = qemu_try_blockalign(bs->file,
-                                                512 * max_sectors_per_chunk);
+                                                512 * ds.max_sectors_per_chunk);
     if (s->compressed_chunk == NULL || s->uncompressed_chunk == NULL) {
         ret = -ENOMEM;
         goto fail;
@@ -349,13 +578,16 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num)
     if (!is_sector_in_chunk(s, s->current_chunk, sector_num)) {
         int ret;
         uint32_t chunk = search_chunk(s, sector_num);
+#ifdef CONFIG_BZIP2
+        uint64_t total_out;
+#endif
 
         if (chunk >= s->n_chunks) {
             return -1;
         }
 
         s->current_chunk = s->n_chunks;
-        switch (s->types[chunk]) {
+        switch (s->types[chunk]) { /* block entry type */
         case 0x80000005: { /* zlib compressed */
             /* we need to buffer, because only the chunk as whole can be
              * inflated. */
@@ -379,6 +611,34 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num)
                 return -1;
             }
             break; }
+#ifdef CONFIG_BZIP2
+        case 0x80000006: /* bzip2 compressed */
+            /* we need to buffer, because only the chunk as whole can be
+             * inflated. */
+            ret = bdrv_pread(bs->file, s->offsets[chunk],
+                             s->compressed_chunk, s->lengths[chunk]);
+            if (ret != s->lengths[chunk]) {
+                return -1;
+            }
+
+            ret = BZ2_bzDecompressInit(&s->bzstream, 0, 0);
+            if (ret != BZ_OK) {
+                return -1;
+            }
+            s->bzstream.next_in = (char *)s->compressed_chunk;
+            s->bzstream.avail_in = (unsigned int) s->lengths[chunk];
+            s->bzstream.next_out = (char *)s->uncompressed_chunk;
+            s->bzstream.avail_out = (unsigned int) 512 * s->sectorcounts[chunk];
+            ret = BZ2_bzDecompress(&s->bzstream);
+            total_out = ((uint64_t)s->bzstream.total_out_hi32 << 32) +
+                        s->bzstream.total_out_lo32;
+            BZ2_bzDecompressEnd(&s->bzstream);
+            if (ret != BZ_STREAM_END ||
+                total_out != 512 * s->sectorcounts[chunk]) {
+                return -1;
+            }
+            break;
+#endif /* CONFIG_BZIP2 */
         case 1: /* copy */
             ret = bdrv_pread(bs->file, s->offsets[chunk],
                              s->uncompressed_chunk, s->lengths[chunk]);
@@ -387,7 +647,8 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num)
             }
             break;
         case 2: /* zero */
-            memset(s->uncompressed_chunk, 0, 512 * s->sectorcounts[chunk]);
+            /* see dmg_read, it is treated specially. No buffer needs to be
+             * pre-filled, the zeroes can be set directly. */
             break;
         }
         s->current_chunk = chunk;
@@ -406,6 +667,13 @@ static int dmg_read(BlockDriverState *bs, int64_t sector_num,
         if (dmg_read_chunk(bs, sector_num + i) != 0) {
             return -1;
         }
+        /* Special case: current chunk is all zeroes. Do not perform a memcpy as
+         * s->uncompressed_chunk may be too small to cover the large all-zeroes
+         * section. dmg_read_chunk is called to find s->current_chunk */
+        if (s->types[s->current_chunk] == 2) { /* all zeroes block entry */
+            memset(buf + i * 512, 0, 512);
+            continue;
+        }
         sector_offset_in_chunk = sector_num + i - s->sectors[s->current_chunk];
         memcpy(buf + i * 512,
                s->uncompressed_chunk + sector_offset_in_chunk * 512, 512);

+ 2 - 2
block/nbd-client.c

@@ -373,7 +373,7 @@ void nbd_client_session_close(NbdClientSession *client)
 }
 
 int nbd_client_session_init(NbdClientSession *client, BlockDriverState *bs,
-    int sock, const char *export)
+                            int sock, const char *export, Error **errp)
 {
     int ret;
 
@@ -382,7 +382,7 @@ int nbd_client_session_init(NbdClientSession *client, BlockDriverState *bs,
     qemu_set_block(sock);
     ret = nbd_receive_negotiate(sock, export,
                                 &client->nbdflags, &client->size,
-                                &client->blocksize);
+                                &client->blocksize, errp);
     if (ret < 0) {
         logout("Failed to negotiate with the NBD server\n");
         closesocket(sock);

+ 1 - 1
block/nbd-client.h

@@ -36,7 +36,7 @@ typedef struct NbdClientSession {
 } NbdClientSession;
 
 int nbd_client_session_init(NbdClientSession *client, BlockDriverState *bs,
-                            int sock, const char *export_name);
+                            int sock, const char *export_name, Error **errp);
 void nbd_client_session_close(NbdClientSession *client);
 
 int nbd_client_session_co_discard(NbdClientSession *client, int64_t sector_num,

+ 10 - 1
block/nbd.c

@@ -271,7 +271,7 @@ static int nbd_open(BlockDriverState *bs, QDict *options, int flags,
     }
 
     /* NBD handshake */
-    result = nbd_client_session_init(&s->client, bs, sock, export);
+    result = nbd_client_session_init(&s->client, bs, sock, export, errp);
     g_free(export);
     return result;
 }
@@ -301,6 +301,12 @@ static int nbd_co_flush(BlockDriverState *bs)
     return nbd_client_session_co_flush(&s->client);
 }
 
+static void nbd_refresh_limits(BlockDriverState *bs, Error **errp)
+{
+    bs->bl.max_discard = UINT32_MAX >> BDRV_SECTOR_BITS;
+    bs->bl.max_transfer_length = UINT32_MAX >> BDRV_SECTOR_BITS;
+}
+
 static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num,
                           int nb_sectors)
 {
@@ -396,6 +402,7 @@ static BlockDriver bdrv_nbd = {
     .bdrv_close                 = nbd_close,
     .bdrv_co_flush_to_os        = nbd_co_flush,
     .bdrv_co_discard            = nbd_co_discard,
+    .bdrv_refresh_limits        = nbd_refresh_limits,
     .bdrv_getlength             = nbd_getlength,
     .bdrv_detach_aio_context    = nbd_detach_aio_context,
     .bdrv_attach_aio_context    = nbd_attach_aio_context,
@@ -413,6 +420,7 @@ static BlockDriver bdrv_nbd_tcp = {
     .bdrv_close                 = nbd_close,
     .bdrv_co_flush_to_os        = nbd_co_flush,
     .bdrv_co_discard            = nbd_co_discard,
+    .bdrv_refresh_limits        = nbd_refresh_limits,
     .bdrv_getlength             = nbd_getlength,
     .bdrv_detach_aio_context    = nbd_detach_aio_context,
     .bdrv_attach_aio_context    = nbd_attach_aio_context,
@@ -430,6 +438,7 @@ static BlockDriver bdrv_nbd_unix = {
     .bdrv_close                 = nbd_close,
     .bdrv_co_flush_to_os        = nbd_co_flush,
     .bdrv_co_discard            = nbd_co_discard,
+    .bdrv_refresh_limits        = nbd_refresh_limits,
     .bdrv_getlength             = nbd_getlength,
     .bdrv_detach_aio_context    = nbd_detach_aio_context,
     .bdrv_attach_aio_context    = nbd_attach_aio_context,

+ 5 - 0
block/qapi.c

@@ -24,6 +24,7 @@
 
 #include "block/qapi.h"
 #include "block/block_int.h"
+#include "block/write-threshold.h"
 #include "qmp-commands.h"
 #include "qapi-visit.h"
 #include "qapi/qmp-output-visitor.h"
@@ -89,6 +90,8 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs)
         info->iops_size = cfg.op_size;
     }
 
+    info->write_threshold = bdrv_write_threshold_get(bs);
+
     return info;
 }
 
@@ -335,6 +338,8 @@ static BlockStats *bdrv_query_stats(const BlockDriverState *bs,
     s->stats->wr_bytes = bs->stats.nr_bytes[BLOCK_ACCT_WRITE];
     s->stats->rd_operations = bs->stats.nr_ops[BLOCK_ACCT_READ];
     s->stats->wr_operations = bs->stats.nr_ops[BLOCK_ACCT_WRITE];
+    s->stats->rd_merged = bs->stats.merged[BLOCK_ACCT_READ];
+    s->stats->wr_merged = bs->stats.merged[BLOCK_ACCT_WRITE];
     s->stats->wr_highest_offset =
         bs->stats.wr_highest_sector * BDRV_SECTOR_SIZE;
     s->stats->flush_operations = bs->stats.nr_ops[BLOCK_ACCT_FLUSH];

+ 1 - 1
block/qcow.c

@@ -215,7 +215,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags,
     /* read the backing file name */
     if (header.backing_file_offset != 0) {
         len = header.backing_file_size;
-        if (len > 1023 || len > sizeof(bs->backing_file)) {
+        if (len > 1023 || len >= sizeof(bs->backing_file)) {
             error_setg(errp, "Backing file name too long");
             ret = -EINVAL;
             goto fail;

+ 3 - 1
block/qcow2-cache.c

@@ -253,7 +253,9 @@ static int qcow2_cache_find_entry_to_replace(Qcow2Cache *c)
 
         /* Give newer hits priority */
         /* TODO Check how to optimize the replacement strategy */
-        c->entries[i].cache_hits /= 2;
+        if (c->entries[i].cache_hits > 1) {
+            c->entries[i].cache_hits /= 2;
+        }
     }
 
     if (min_index == -1) {

+ 39 - 39
block/qcow2-refcount.c

@@ -759,54 +759,54 @@ int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
 int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
 {
     BDRVQcowState *s = bs->opaque;
-    int64_t offset, cluster_offset;
-    int free_in_cluster;
+    int64_t offset;
+    size_t free_in_cluster;
+    int ret;
 
     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
     assert(size > 0 && size <= s->cluster_size);
-    if (s->free_byte_offset == 0) {
-        offset = qcow2_alloc_clusters(bs, s->cluster_size);
-        if (offset < 0) {
-            return offset;
+    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));
+
+    offset = s->free_byte_offset;
+
+    if (offset) {
+        int refcount = qcow2_get_refcount(bs, offset >> s->cluster_bits);
+        if (refcount < 0) {
+            return refcount;
         }
-        s->free_byte_offset = offset;
-    }
- redo:
-    free_in_cluster = s->cluster_size -
-        offset_into_cluster(s, s->free_byte_offset);
-    if (size <= free_in_cluster) {
-        /* enough space in current cluster */
-        offset = s->free_byte_offset;
-        s->free_byte_offset += size;
-        free_in_cluster -= size;
-        if (free_in_cluster == 0)
-            s->free_byte_offset = 0;
-        if (offset_into_cluster(s, offset) != 0)
-            qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
-                                          QCOW2_DISCARD_NEVER);
-    } else {
-        offset = qcow2_alloc_clusters(bs, s->cluster_size);
-        if (offset < 0) {
-            return offset;
+
+        if (refcount == 0xffff) {
+            offset = 0;
         }
-        cluster_offset = start_of_cluster(s, s->free_byte_offset);
-        if ((cluster_offset + s->cluster_size) == offset) {
-            /* we are lucky: contiguous data */
-            offset = s->free_byte_offset;
-            qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1,
-                                          QCOW2_DISCARD_NEVER);
-            s->free_byte_offset += size;
-        } else {
-            s->free_byte_offset = offset;
-            goto redo;
+    }
+
+    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
+    if (!offset || free_in_cluster < size) {
+        int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size);
+        if (new_cluster < 0) {
+            return new_cluster;
+        }
+
+        if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
+            offset = new_cluster;
         }
     }
 
-    /* The cluster refcount was incremented, either by qcow2_alloc_clusters()
-     * or explicitly by qcow2_update_cluster_refcount().  Refcount blocks must
-     * be flushed before the caller's L2 table updates.
-     */
+    assert(offset);
+    ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER);
+    if (ret < 0) {
+        return ret;
+    }
+
+    /* The cluster refcount was incremented; refcount blocks must be flushed
+     * before the caller's L2 table updates. */
     qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);
+
+    s->free_byte_offset = offset + size;
+    if (!offset_into_cluster(s, s->free_byte_offset)) {
+        s->free_byte_offset = 0;
+    }
+
     return offset;
 }
 

+ 1 - 1
block/qcow2.c

@@ -869,7 +869,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
     if (header.backing_file_offset != 0) {
         len = header.backing_file_size;
         if (len > MIN(1023, s->cluster_size - header.backing_file_offset) ||
-            len > sizeof(bs->backing_file)) {
+            len >= sizeof(bs->backing_file)) {
             error_setg(errp, "Backing file name too long");
             ret = -EINVAL;
             goto fail;

+ 5 - 0
block/qed.c

@@ -440,6 +440,11 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
     s->l2_mask = s->table_nelems - 1;
     s->l1_shift = s->l2_shift + ffs(s->table_nelems) - 1;
 
+    /* Header size calculation must not overflow uint32_t */
+    if (s->header.header_size > UINT32_MAX / s->header.cluster_size) {
+        return -EINVAL;
+    }
+
     if ((s->header.features & QED_F_BACKING_FILE)) {
         if ((uint64_t)s->header.backing_filename_offset +
             s->header.backing_filename_size >

+ 0 - 1
block/qed.h

@@ -133,7 +133,6 @@ typedef struct QEDAIOCB {
     int bh_ret;                     /* final return status for completion bh */
     QSIMPLEQ_ENTRY(QEDAIOCB) next;  /* next request */
     int flags;                      /* QED_AIOCB_* bits ORed together */
-    bool *finished;                 /* signal for cancel completion */
     uint64_t end_pos;               /* request end on block device, in bytes */
 
     /* User scatter-gather list */

+ 108 - 32
block/raw-posix.c

@@ -60,7 +60,7 @@
 #define FS_NOCOW_FL                     0x00800000 /* Do not cow file */
 #endif
 #endif
-#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
+#if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE)
 #include <linux/falloc.h>
 #endif
 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
@@ -147,6 +147,7 @@ typedef struct BDRVRawState {
     bool has_discard:1;
     bool has_write_zeroes:1;
     bool discard_zeroes:1;
+    bool has_fallocate;
     bool needs_alignment;
 } BDRVRawState;
 
@@ -452,6 +453,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
     }
     if (S_ISREG(st.st_mode)) {
         s->discard_zeroes = true;
+        s->has_fallocate = true;
     }
     if (S_ISBLK(st.st_mode)) {
 #ifdef BLKDISCARDZEROES
@@ -893,40 +895,108 @@ static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes)
 }
 #endif
 
-static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
+static int translate_err(int err)
 {
-    int ret = -EOPNOTSUPP;
+    if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP ||
+        err == -ENOTTY) {
+        err = -ENOTSUP;
+    }
+    return err;
+}
+
+#ifdef CONFIG_FALLOCATE
+static int do_fallocate(int fd, int mode, off_t offset, off_t len)
+{
+    do {
+        if (fallocate(fd, mode, offset, len) == 0) {
+            return 0;
+        }
+    } while (errno == EINTR);
+    return translate_err(-errno);
+}
+#endif
+
+static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb)
+{
+    int ret = -ENOTSUP;
     BDRVRawState *s = aiocb->bs->opaque;
 
-    if (s->has_write_zeroes == 0) {
+    if (!s->has_write_zeroes) {
         return -ENOTSUP;
     }
 
-    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
 #ifdef BLKZEROOUT
-        do {
-            uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
-            if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
-                return 0;
-            }
-        } while (errno == EINTR);
+    do {
+        uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes };
+        if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) {
+            return 0;
+        }
+    } while (errno == EINTR);
 
-        ret = -errno;
+    ret = translate_err(-errno);
 #endif
-    } else {
+
+    if (ret == -ENOTSUP) {
+        s->has_write_zeroes = false;
+    }
+    return ret;
+}
+
+static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
+{
+    BDRVRawState *s = aiocb->bs->opaque;
+
+    if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
+        return handle_aiocb_write_zeroes_block(aiocb);
+    }
+
 #ifdef CONFIG_XFS
-        if (s->is_xfs) {
-            return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes);
+    if (s->is_xfs) {
+        return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes);
+    }
+#endif
+
+#ifdef CONFIG_FALLOCATE_ZERO_RANGE
+    if (s->has_write_zeroes) {
+        int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE,
+                               aiocb->aio_offset, aiocb->aio_nbytes);
+        if (ret == 0 || ret != -ENOTSUP) {
+            return ret;
         }
+        s->has_write_zeroes = false;
+    }
 #endif
+
+#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
+    if (s->has_discard && s->has_fallocate) {
+        int ret = do_fallocate(s->fd,
+                               FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+                               aiocb->aio_offset, aiocb->aio_nbytes);
+        if (ret == 0) {
+            ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
+            if (ret == 0 || ret != -ENOTSUP) {
+                return ret;
+            }
+            s->has_fallocate = false;
+        } else if (ret != -ENOTSUP) {
+            return ret;
+        } else {
+            s->has_discard = false;
+        }
     }
+#endif
 
-    if (ret == -ENODEV || ret == -ENOSYS || ret == -EOPNOTSUPP ||
-        ret == -ENOTTY) {
-        s->has_write_zeroes = false;
-        ret = -ENOTSUP;
+#ifdef CONFIG_FALLOCATE
+    if (s->has_fallocate && aiocb->aio_offset >= bdrv_getlength(aiocb->bs)) {
+        int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes);
+        if (ret == 0 || ret != -ENOTSUP) {
+            return ret;
+        }
+        s->has_fallocate = false;
     }
-    return ret;
+#endif
+
+    return -ENOTSUP;
 }
 
 static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb)
@@ -957,21 +1027,14 @@ static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb)
 #endif
 
 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
-        do {
-            if (fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
-                          aiocb->aio_offset, aiocb->aio_nbytes) == 0) {
-                return 0;
-            }
-        } while (errno == EINTR);
-
-        ret = -errno;
+        ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+                           aiocb->aio_offset, aiocb->aio_nbytes);
 #endif
     }
 
-    if (ret == -ENODEV || ret == -ENOSYS || ret == -EOPNOTSUPP ||
-        ret == -ENOTTY) {
+    ret = translate_err(ret);
+    if (ret == -ENOTSUP) {
         s->has_discard = false;
-        ret = -ENOTSUP;
     }
     return ret;
 }
@@ -1312,7 +1375,20 @@ again:
         if (size == 0)
 #endif
 #if defined(__APPLE__) && defined(__MACH__)
-        size = LLONG_MAX;
+        {
+            uint64_t sectors = 0;
+            uint32_t sector_size = 0;
+
+            if (ioctl(fd, DKIOCGETBLOCKCOUNT, &sectors) == 0
+               && ioctl(fd, DKIOCGETBLOCKSIZE, &sector_size) == 0) {
+                size = sectors * sector_size;
+            } else {
+                size = lseek(fd, 0LL, SEEK_END);
+                if (size < 0) {
+                    return -errno;
+                }
+            }
+        }
 #else
         size = lseek(fd, 0LL, SEEK_END);
         if (size < 0) {

+ 125 - 0
block/write-threshold.c

@@ -0,0 +1,125 @@
+/*
+ * QEMU System Emulator block write threshold notification
+ *
+ * Copyright Red Hat, Inc. 2014
+ *
+ * Authors:
+ *  Francesco Romani <fromani@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+
+#include "block/block_int.h"
+#include "block/coroutine.h"
+#include "block/write-threshold.h"
+#include "qemu/notify.h"
+#include "qapi-event.h"
+#include "qmp-commands.h"
+
+
+uint64_t bdrv_write_threshold_get(const BlockDriverState *bs)
+{
+    return bs->write_threshold_offset;
+}
+
+bool bdrv_write_threshold_is_set(const BlockDriverState *bs)
+{
+    return bs->write_threshold_offset > 0;
+}
+
+static void write_threshold_disable(BlockDriverState *bs)
+{
+    if (bdrv_write_threshold_is_set(bs)) {
+        notifier_with_return_remove(&bs->write_threshold_notifier);
+        bs->write_threshold_offset = 0;
+    }
+}
+
+uint64_t bdrv_write_threshold_exceeded(const BlockDriverState *bs,
+                                       const BdrvTrackedRequest *req)
+{
+    if (bdrv_write_threshold_is_set(bs)) {
+        if (req->offset > bs->write_threshold_offset) {
+            return (req->offset - bs->write_threshold_offset) + req->bytes;
+        }
+        if ((req->offset + req->bytes) > bs->write_threshold_offset) {
+            return (req->offset + req->bytes) - bs->write_threshold_offset;
+        }
+    }
+    return 0;
+}
+
+static int coroutine_fn before_write_notify(NotifierWithReturn *notifier,
+                                            void *opaque)
+{
+    BdrvTrackedRequest *req = opaque;
+    BlockDriverState *bs = req->bs;
+    uint64_t amount = 0;
+
+    amount = bdrv_write_threshold_exceeded(bs, req);
+    if (amount > 0) {
+        qapi_event_send_block_write_threshold(
+            bs->node_name,
+            amount,
+            bs->write_threshold_offset,
+            &error_abort);
+
+        /* autodisable to avoid flooding the monitor */
+        write_threshold_disable(bs);
+    }
+
+    return 0; /* should always let other notifiers run */
+}
+
+static void write_threshold_register_notifier(BlockDriverState *bs)
+{
+    bs->write_threshold_notifier.notify = before_write_notify;
+    notifier_with_return_list_add(&bs->before_write_notifiers,
+                                  &bs->write_threshold_notifier);
+}
+
+static void write_threshold_update(BlockDriverState *bs,
+                                   int64_t threshold_bytes)
+{
+    bs->write_threshold_offset = threshold_bytes;
+}
+
+void bdrv_write_threshold_set(BlockDriverState *bs, uint64_t threshold_bytes)
+{
+    if (bdrv_write_threshold_is_set(bs)) {
+        if (threshold_bytes > 0) {
+            write_threshold_update(bs, threshold_bytes);
+        } else {
+            write_threshold_disable(bs);
+        }
+    } else {
+        if (threshold_bytes > 0) {
+            /* avoid multiple registration */
+            write_threshold_register_notifier(bs);
+            write_threshold_update(bs, threshold_bytes);
+        }
+        /* discard bogus disable request */
+    }
+}
+
+void qmp_block_set_write_threshold(const char *node_name,
+                                   uint64_t threshold_bytes,
+                                   Error **errp)
+{
+    BlockDriverState *bs;
+    AioContext *aio_context;
+
+    bs = bdrv_find_node(node_name);
+    if (!bs) {
+        error_set(errp, QERR_DEVICE_NOT_FOUND, node_name);
+        return;
+    }
+
+    aio_context = bdrv_get_aio_context(bs);
+    aio_context_acquire(aio_context);
+
+    bdrv_write_threshold_set(bs, threshold_bytes);
+
+    aio_context_release(aio_context);
+}

+ 11 - 33
blockdev.c

@@ -1793,7 +1793,6 @@ void qmp_block_passwd(bool has_device, const char *device,
     Error *local_err = NULL;
     BlockDriverState *bs;
     AioContext *aio_context;
-    int err;
 
     bs = bdrv_lookup_bs(has_device ? device : NULL,
                         has_node_name ? node_name : NULL,
@@ -1806,16 +1805,8 @@ void qmp_block_passwd(bool has_device, const char *device,
     aio_context = bdrv_get_aio_context(bs);
     aio_context_acquire(aio_context);
 
-    err = bdrv_set_key(bs, password);
-    if (err == -EINVAL) {
-        error_set(errp, QERR_DEVICE_NOT_ENCRYPTED, bdrv_get_device_name(bs));
-        goto out;
-    } else if (err < 0) {
-        error_set(errp, QERR_INVALID_PASSWORD);
-        goto out;
-    }
+    bdrv_add_key(bs, password, errp);
 
-out:
     aio_context_release(aio_context);
 }
 
@@ -1833,18 +1824,7 @@ static void qmp_bdrv_open_encrypted(BlockDriverState *bs, const char *filename,
         return;
     }
 
-    if (bdrv_key_required(bs)) {
-        if (password) {
-            if (bdrv_set_key(bs, password) < 0) {
-                error_set(errp, QERR_INVALID_PASSWORD);
-            }
-        } else {
-            error_set(errp, QERR_DEVICE_ENCRYPTED, bdrv_get_device_name(bs),
-                      bdrv_get_encrypted_filename(bs));
-        }
-    } else if (password) {
-        error_set(errp, QERR_DEVICE_NOT_ENCRYPTED, bdrv_get_device_name(bs));
-    }
+    bdrv_add_key(bs, password, errp);
 }
 
 void qmp_change_blockdev(const char *device, const char *filename,
@@ -2653,7 +2633,8 @@ out:
 }
 
 /* Get the block job for a given device name and acquire its AioContext */
-static BlockJob *find_block_job(const char *device, AioContext **aio_context)
+static BlockJob *find_block_job(const char *device, AioContext **aio_context,
+                                Error **errp)
 {
     BlockDriverState *bs;
 
@@ -2673,6 +2654,8 @@ static BlockJob *find_block_job(const char *device, AioContext **aio_context)
     return bs->job;
 
 notfound:
+    error_set(errp, ERROR_CLASS_DEVICE_NOT_ACTIVE,
+              "No active block job on device '%s'", device);
     *aio_context = NULL;
     return NULL;
 }
@@ -2680,10 +2663,9 @@ notfound:
 void qmp_block_job_set_speed(const char *device, int64_t speed, Error **errp)
 {
     AioContext *aio_context;
-    BlockJob *job = find_block_job(device, &aio_context);
+    BlockJob *job = find_block_job(device, &aio_context, errp);
 
     if (!job) {
-        error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device);
         return;
     }
 
@@ -2695,10 +2677,9 @@ void qmp_block_job_cancel(const char *device,
                           bool has_force, bool force, Error **errp)
 {
     AioContext *aio_context;
-    BlockJob *job = find_block_job(device, &aio_context);
+    BlockJob *job = find_block_job(device, &aio_context, errp);
 
     if (!job) {
-        error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device);
         return;
     }
 
@@ -2721,10 +2702,9 @@ out:
 void qmp_block_job_pause(const char *device, Error **errp)
 {
     AioContext *aio_context;
-    BlockJob *job = find_block_job(device, &aio_context);
+    BlockJob *job = find_block_job(device, &aio_context, errp);
 
     if (!job) {
-        error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device);
         return;
     }
 
@@ -2736,10 +2716,9 @@ void qmp_block_job_pause(const char *device, Error **errp)
 void qmp_block_job_resume(const char *device, Error **errp)
 {
     AioContext *aio_context;
-    BlockJob *job = find_block_job(device, &aio_context);
+    BlockJob *job = find_block_job(device, &aio_context, errp);
 
     if (!job) {
-        error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device);
         return;
     }
 
@@ -2751,10 +2730,9 @@ void qmp_block_job_resume(const char *device, Error **errp)
 void qmp_block_job_complete(const char *device, Error **errp)
 {
     AioContext *aio_context;
-    BlockJob *job = find_block_job(device, &aio_context);
+    BlockJob *job = find_block_job(device, &aio_context, errp);
 
     if (!job) {
-        error_set(errp, QERR_BLOCK_JOB_NOT_ACTIVE, device);
         return;
     }
 

+ 50 - 0
configure

@@ -313,6 +313,7 @@ glx=""
 zlib="yes"
 lzo=""
 snappy=""
+bzip2=""
 guest_agent=""
 guest_agent_with_vss="no"
 vss_win32_sdk=""
@@ -1060,6 +1061,10 @@ for opt do
   ;;
   --enable-snappy) snappy="yes"
   ;;
+  --disable-bzip2) bzip2="no"
+  ;;
+  --enable-bzip2) bzip2="yes"
+  ;;
   --enable-guest-agent) guest_agent="yes"
   ;;
   --disable-guest-agent) guest_agent="no"
@@ -1374,6 +1379,8 @@ Advanced options (experts only):
   --enable-usb-redir       enable usb network redirection support
   --enable-lzo             enable the support of lzo compression library
   --enable-snappy          enable the support of snappy compression library
+  --enable-bzip2           enable the support of bzip2 compression library (for
+                           reading bzip2-compressed dmg images)
   --disable-guest-agent    disable building of the QEMU Guest Agent
   --enable-guest-agent     enable building of the QEMU Guest Agent
   --with-vss-sdk=SDK-path  enable Windows VSS support in QEMU Guest Agent
@@ -1819,6 +1826,24 @@ EOF
     fi
 fi
 
+##########################################
+# bzip2 check
+
+if test "$bzip2" != "no" ; then
+    cat > $TMPC << EOF
+#include <bzlib.h>
+int main(void) { BZ2_bzlibVersion(); return 0; }
+EOF
+    if compile_prog "" "-lbz2" ; then
+        bzip2="yes"
+    else
+        if test "$bzip2" = "yes"; then
+            feature_not_found "libbzip2" "Install libbzip2 devel"
+        fi
+        bzip2="no"
+    fi
+fi
+
 ##########################################
 # libseccomp check
 
@@ -3335,6 +3360,22 @@ if compile_prog "" "" ; then
   fallocate_punch_hole=yes
 fi
 
+# check that fallocate supports range zeroing inside the file
+fallocate_zero_range=no
+cat > $TMPC << EOF
+#include <fcntl.h>
+#include <linux/falloc.h>
+
+int main(void)
+{
+    fallocate(0, FALLOC_FL_ZERO_RANGE, 0, 0);
+    return 0;
+}
+EOF
+if compile_prog "" "" ; then
+  fallocate_zero_range=yes
+fi
+
 # check for posix_fallocate
 posix_fallocate=no
 cat > $TMPC << EOF
@@ -4369,6 +4410,7 @@ echo "vhdx              $vhdx"
 echo "Quorum            $quorum"
 echo "lzo support       $lzo"
 echo "snappy support    $snappy"
+echo "bzip2 support     $bzip2"
 echo "NUMA host support $numa"
 
 if test "$sdl_too_old" = "yes"; then
@@ -4567,6 +4609,9 @@ fi
 if test "$fallocate_punch_hole" = "yes" ; then
   echo "CONFIG_FALLOCATE_PUNCH_HOLE=y" >> $config_host_mak
 fi
+if test "$fallocate_zero_range" = "yes" ; then
+  echo "CONFIG_FALLOCATE_ZERO_RANGE=y" >> $config_host_mak
+fi
 if test "$posix_fallocate" = "yes" ; then
   echo "CONFIG_POSIX_FALLOCATE=y" >> $config_host_mak
 fi
@@ -4724,6 +4769,11 @@ if test "$snappy" = "yes" ; then
   echo "CONFIG_SNAPPY=y" >> $config_host_mak
 fi
 
+if test "$bzip2" = "yes" ; then
+  echo "CONFIG_BZIP2=y" >> $config_host_mak
+  echo "BZIP2_LIBS=-lbz2" >> $config_host_mak
+fi
+
 if test "$libiscsi" = "yes" ; then
   echo "CONFIG_LIBISCSI=m" >> $config_host_mak
   echo "LIBISCSI_CFLAGS=$libiscsi_cflags" >> $config_host_mak

+ 5 - 1
hmp.c

@@ -474,6 +474,8 @@ void hmp_info_blockstats(Monitor *mon, const QDict *qdict)
                        " wr_total_time_ns=%" PRId64
                        " rd_total_time_ns=%" PRId64
                        " flush_total_time_ns=%" PRId64
+                       " rd_merged=%" PRId64
+                       " wr_merged=%" PRId64
                        "\n",
                        stats->value->stats->rd_bytes,
                        stats->value->stats->wr_bytes,
@@ -482,7 +484,9 @@ void hmp_info_blockstats(Monitor *mon, const QDict *qdict)
                        stats->value->stats->flush_operations,
                        stats->value->stats->wr_total_time_ns,
                        stats->value->stats->rd_total_time_ns,
-                       stats->value->stats->flush_total_time_ns);
+                       stats->value->stats->flush_total_time_ns,
+                       stats->value->stats->rd_merged,
+                       stats->value->stats->wr_merged);
     }
 
     qapi_free_BlockStatsList(stats_list);

+ 4 - 4
hw/block/dataplane/virtio-blk.c

@@ -96,9 +96,7 @@ static void handle_notify(EventNotifier *e)
     event_notifier_test_and_clear(&s->host_notifier);
     blk_io_plug(s->conf->conf.blk);
     for (;;) {
-        MultiReqBuffer mrb = {
-            .num_writes = 0,
-        };
+        MultiReqBuffer mrb = {};
         int ret;
 
         /* Disable guest->host notifies to avoid unnecessary vmexits */
@@ -120,7 +118,9 @@ static void handle_notify(EventNotifier *e)
             virtio_blk_handle_request(req, &mrb);
         }
 
-        virtio_submit_multiwrite(s->conf->conf.blk, &mrb);
+        if (mrb.num_reqs) {
+            virtio_blk_submit_multireq(s->conf->conf.blk, &mrb);
+        }
 
         if (likely(ret == -EAGAIN)) { /* vring emptied */
             /* Re-enable guest->host notifies and stop processing the vring.

+ 205 - 94
hw/block/virtio-blk.c

@@ -34,6 +34,7 @@ VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s)
     req->dev = s;
     req->qiov.size = 0;
     req->next = NULL;
+    req->mr_next = NULL;
     return req;
 }
 
@@ -84,20 +85,32 @@ static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
 
 static void virtio_blk_rw_complete(void *opaque, int ret)
 {
-    VirtIOBlockReq *req = opaque;
+    VirtIOBlockReq *next = opaque;
 
-    trace_virtio_blk_rw_complete(req, ret);
+    while (next) {
+        VirtIOBlockReq *req = next;
+        next = req->mr_next;
+        trace_virtio_blk_rw_complete(req, ret);
 
-    if (ret) {
-        int p = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
-        bool is_read = !(p & VIRTIO_BLK_T_OUT);
-        if (virtio_blk_handle_rw_error(req, -ret, is_read))
-            return;
-    }
+        if (req->qiov.nalloc != -1) {
+            /* If nalloc is != -1 req->qiov is a local copy of the original
+             * external iovec. It was allocated in submit_merged_requests
+             * to be able to merge requests. */
+            qemu_iovec_destroy(&req->qiov);
+        }
 
-    virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
-    block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
-    virtio_blk_free_request(req);
+        if (ret) {
+            int p = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
+            bool is_read = !(p & VIRTIO_BLK_T_OUT);
+            if (virtio_blk_handle_rw_error(req, -ret, is_read)) {
+                continue;
+            }
+        }
+
+        virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
+        block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
+        virtio_blk_free_request(req);
+    }
 }
 
 static void virtio_blk_flush_complete(void *opaque, int ret)
@@ -291,24 +304,127 @@ static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
     }
 }
 
-void virtio_submit_multiwrite(BlockBackend *blk, MultiReqBuffer *mrb)
+static inline void submit_requests(BlockBackend *blk, MultiReqBuffer *mrb,
+                                   int start, int num_reqs, int niov)
 {
-    int i, ret;
+    QEMUIOVector *qiov = &mrb->reqs[start]->qiov;
+    int64_t sector_num = mrb->reqs[start]->sector_num;
+    int nb_sectors = mrb->reqs[start]->qiov.size / BDRV_SECTOR_SIZE;
+    bool is_write = mrb->is_write;
+
+    if (num_reqs > 1) {
+        int i;
+        struct iovec *tmp_iov = qiov->iov;
+        int tmp_niov = qiov->niov;
+
+        /* mrb->reqs[start]->qiov was initialized from external so we can't
+         * modify it here. We need to initialize it locally and then add the
+         * external iovecs. */
+        qemu_iovec_init(qiov, niov);
+
+        for (i = 0; i < tmp_niov; i++) {
+            qemu_iovec_add(qiov, tmp_iov[i].iov_base, tmp_iov[i].iov_len);
+        }
 
-    if (!mrb->num_writes) {
+        for (i = start + 1; i < start + num_reqs; i++) {
+            qemu_iovec_concat(qiov, &mrb->reqs[i]->qiov, 0,
+                              mrb->reqs[i]->qiov.size);
+            mrb->reqs[i - 1]->mr_next = mrb->reqs[i];
+            nb_sectors += mrb->reqs[i]->qiov.size / BDRV_SECTOR_SIZE;
+        }
+        assert(nb_sectors == qiov->size / BDRV_SECTOR_SIZE);
+
+        trace_virtio_blk_submit_multireq(mrb, start, num_reqs, sector_num,
+                                         nb_sectors, is_write);
+        block_acct_merge_done(blk_get_stats(blk),
+                              is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ,
+                              num_reqs - 1);
+    }
+
+    if (is_write) {
+        blk_aio_writev(blk, sector_num, qiov, nb_sectors,
+                       virtio_blk_rw_complete, mrb->reqs[start]);
+    } else {
+        blk_aio_readv(blk, sector_num, qiov, nb_sectors,
+                      virtio_blk_rw_complete, mrb->reqs[start]);
+    }
+}
+
+static int multireq_compare(const void *a, const void *b)
+{
+    const VirtIOBlockReq *req1 = *(VirtIOBlockReq **)a,
+                         *req2 = *(VirtIOBlockReq **)b;
+
+    /*
+     * Note that we can't simply subtract sector_num1 from sector_num2
+     * here as that could overflow the return value.
+     */
+    if (req1->sector_num > req2->sector_num) {
+        return 1;
+    } else if (req1->sector_num < req2->sector_num) {
+        return -1;
+    } else {
+        return 0;
+    }
+}
+
+void virtio_blk_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb)
+{
+    int i = 0, start = 0, num_reqs = 0, niov = 0, nb_sectors = 0;
+    int max_xfer_len = 0;
+    int64_t sector_num = 0;
+
+    if (mrb->num_reqs == 1) {
+        submit_requests(blk, mrb, 0, 1, -1);
+        mrb->num_reqs = 0;
         return;
     }
 
-    ret = blk_aio_multiwrite(blk, mrb->blkreq, mrb->num_writes);
-    if (ret != 0) {
-        for (i = 0; i < mrb->num_writes; i++) {
-            if (mrb->blkreq[i].error) {
-                virtio_blk_rw_complete(mrb->blkreq[i].opaque, -EIO);
+    max_xfer_len = blk_get_max_transfer_length(mrb->reqs[0]->dev->blk);
+    max_xfer_len = MIN_NON_ZERO(max_xfer_len, BDRV_REQUEST_MAX_SECTORS);
+
+    qsort(mrb->reqs, mrb->num_reqs, sizeof(*mrb->reqs),
+          &multireq_compare);
+
+    for (i = 0; i < mrb->num_reqs; i++) {
+        VirtIOBlockReq *req = mrb->reqs[i];
+        if (num_reqs > 0) {
+            bool merge = true;
+
+            /* merge would exceed maximum number of IOVs */
+            if (niov + req->qiov.niov > IOV_MAX) {
+                merge = false;
+            }
+
+            /* merge would exceed maximum transfer length of backend device */
+            if (req->qiov.size / BDRV_SECTOR_SIZE + nb_sectors > max_xfer_len) {
+                merge = false;
+            }
+
+            /* requests are not sequential */
+            if (sector_num + nb_sectors != req->sector_num) {
+                merge = false;
+            }
+
+            if (!merge) {
+                submit_requests(blk, mrb, start, num_reqs, niov);
+                num_reqs = 0;
             }
         }
+
+        if (num_reqs == 0) {
+            sector_num = req->sector_num;
+            nb_sectors = niov = 0;
+            start = i;
+        }
+
+        nb_sectors += req->qiov.size / BDRV_SECTOR_SIZE;
+        niov += req->qiov.niov;
+        num_reqs++;
     }
 
-    mrb->num_writes = 0;
+    submit_requests(blk, mrb, start, num_reqs, niov);
+    mrb->num_reqs = 0;
 }
 
 static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
@@ -319,7 +435,9 @@ static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
     /*
      * Make sure all outstanding writes are posted to the backing device.
      */
-    virtio_submit_multiwrite(req->dev->blk, mrb);
+    if (mrb->is_write && mrb->num_reqs > 0) {
+        virtio_blk_submit_multireq(req->dev->blk, mrb);
+    }
     blk_aio_flush(req->dev->blk, virtio_blk_flush_complete, req);
 }
 
@@ -329,6 +447,9 @@ static bool virtio_blk_sect_range_ok(VirtIOBlock *dev,
     uint64_t nb_sectors = size >> BDRV_SECTOR_BITS;
     uint64_t total_sectors;
 
+    if (nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
+        return false;
+    }
     if (sector & dev->sector_mask) {
         return false;
     }
@@ -342,60 +463,6 @@ static bool virtio_blk_sect_range_ok(VirtIOBlock *dev,
     return true;
 }
 
-static void virtio_blk_handle_write(VirtIOBlockReq *req, MultiReqBuffer *mrb)
-{
-    BlockRequest *blkreq;
-    uint64_t sector;
-
-    sector = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &req->out.sector);
-
-    trace_virtio_blk_handle_write(req, sector, req->qiov.size / 512);
-
-    if (!virtio_blk_sect_range_ok(req->dev, sector, req->qiov.size)) {
-        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
-        virtio_blk_free_request(req);
-        return;
-    }
-
-    block_acct_start(blk_get_stats(req->dev->blk), &req->acct, req->qiov.size,
-                     BLOCK_ACCT_WRITE);
-
-    if (mrb->num_writes == 32) {
-        virtio_submit_multiwrite(req->dev->blk, mrb);
-    }
-
-    blkreq = &mrb->blkreq[mrb->num_writes];
-    blkreq->sector = sector;
-    blkreq->nb_sectors = req->qiov.size / BDRV_SECTOR_SIZE;
-    blkreq->qiov = &req->qiov;
-    blkreq->cb = virtio_blk_rw_complete;
-    blkreq->opaque = req;
-    blkreq->error = 0;
-
-    mrb->num_writes++;
-}
-
-static void virtio_blk_handle_read(VirtIOBlockReq *req)
-{
-    uint64_t sector;
-
-    sector = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &req->out.sector);
-
-    trace_virtio_blk_handle_read(req, sector, req->qiov.size / 512);
-
-    if (!virtio_blk_sect_range_ok(req->dev, sector, req->qiov.size)) {
-        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
-        virtio_blk_free_request(req);
-        return;
-    }
-
-    block_acct_start(blk_get_stats(req->dev->blk), &req->acct, req->qiov.size,
-                     BLOCK_ACCT_READ);
-    blk_aio_readv(req->dev->blk, sector, &req->qiov,
-                  req->qiov.size / BDRV_SECTOR_SIZE,
-                  virtio_blk_rw_complete, req);
-}
-
 void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
 {
     uint32_t type;
@@ -430,11 +497,58 @@ void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
 
     type = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
 
-    if (type & VIRTIO_BLK_T_FLUSH) {
+    /* VIRTIO_BLK_T_OUT defines the command direction. VIRTIO_BLK_T_BARRIER
+     * is an optional flag. Although a guest should not send this flag if
+     * not negotiated we ignored it in the past. So keep ignoring it. */
+    switch (type & ~(VIRTIO_BLK_T_OUT | VIRTIO_BLK_T_BARRIER)) {
+    case VIRTIO_BLK_T_IN:
+    {
+        bool is_write = type & VIRTIO_BLK_T_OUT;
+        req->sector_num = virtio_ldq_p(VIRTIO_DEVICE(req->dev),
+                                       &req->out.sector);
+
+        if (is_write) {
+            qemu_iovec_init_external(&req->qiov, iov, out_num);
+            trace_virtio_blk_handle_write(req, req->sector_num,
+                                          req->qiov.size / BDRV_SECTOR_SIZE);
+        } else {
+            qemu_iovec_init_external(&req->qiov, in_iov, in_num);
+            trace_virtio_blk_handle_read(req, req->sector_num,
+                                         req->qiov.size / BDRV_SECTOR_SIZE);
+        }
+
+        if (!virtio_blk_sect_range_ok(req->dev, req->sector_num,
+                                      req->qiov.size)) {
+            virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
+            virtio_blk_free_request(req);
+            return;
+        }
+
+        block_acct_start(blk_get_stats(req->dev->blk),
+                         &req->acct, req->qiov.size,
+                         is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
+
+        /* merge would exceed maximum number of requests or IO direction
+         * changes */
+        if (mrb->num_reqs > 0 && (mrb->num_reqs == VIRTIO_BLK_MAX_MERGE_REQS ||
+                                  is_write != mrb->is_write ||
+                                  !req->dev->conf.request_merging)) {
+            virtio_blk_submit_multireq(req->dev->blk, mrb);
+        }
+
+        assert(mrb->num_reqs < VIRTIO_BLK_MAX_MERGE_REQS);
+        mrb->reqs[mrb->num_reqs++] = req;
+        mrb->is_write = is_write;
+        break;
+    }
+    case VIRTIO_BLK_T_FLUSH:
         virtio_blk_handle_flush(req, mrb);
-    } else if (type & VIRTIO_BLK_T_SCSI_CMD) {
+        break;
+    case VIRTIO_BLK_T_SCSI_CMD:
         virtio_blk_handle_scsi(req);
-    } else if (type & VIRTIO_BLK_T_GET_ID) {
+        break;
+    case VIRTIO_BLK_T_GET_ID:
+    {
         VirtIOBlock *s = req->dev;
 
         /*
@@ -448,14 +562,9 @@ void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
         iov_from_buf(in_iov, in_num, 0, serial, size);
         virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
         virtio_blk_free_request(req);
-    } else if (type & VIRTIO_BLK_T_OUT) {
-        qemu_iovec_init_external(&req->qiov, iov, out_num);
-        virtio_blk_handle_write(req, mrb);
-    } else if (type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_BARRIER) {
-        /* VIRTIO_BLK_T_IN is 0, so we can't just & it. */
-        qemu_iovec_init_external(&req->qiov, in_iov, in_num);
-        virtio_blk_handle_read(req);
-    } else {
+        break;
+    }
+    default:
         virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
         virtio_blk_free_request(req);
     }
@@ -465,9 +574,7 @@ static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
 {
     VirtIOBlock *s = VIRTIO_BLK(vdev);
     VirtIOBlockReq *req;
-    MultiReqBuffer mrb = {
-        .num_writes = 0,
-    };
+    MultiReqBuffer mrb = {};
 
     /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start
      * dataplane here instead of waiting for .set_status().
@@ -481,7 +588,9 @@ static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
         virtio_blk_handle_request(req, &mrb);
     }
 
-    virtio_submit_multiwrite(s->blk, &mrb);
+    if (mrb.num_reqs) {
+        virtio_blk_submit_multireq(s->blk, &mrb);
+    }
 
     /*
      * FIXME: Want to check for completions before returning to guest mode,
@@ -494,9 +603,7 @@ static void virtio_blk_dma_restart_bh(void *opaque)
 {
     VirtIOBlock *s = opaque;
     VirtIOBlockReq *req = s->rq;
-    MultiReqBuffer mrb = {
-        .num_writes = 0,
-    };
+    MultiReqBuffer mrb = {};
 
     qemu_bh_delete(s->bh);
     s->bh = NULL;
@@ -509,7 +616,9 @@ static void virtio_blk_dma_restart_bh(void *opaque)
         req = next;
     }
 
-    virtio_submit_multiwrite(s->blk, &mrb);
+    if (mrb.num_reqs) {
+        virtio_blk_submit_multireq(s->blk, &mrb);
+    }
 }
 
 static void virtio_blk_dma_restart_cb(void *opaque, int running,
@@ -842,6 +951,8 @@ static Property virtio_blk_properties[] = {
 #ifdef __linux__
     DEFINE_PROP_BIT("scsi", VirtIOBlock, conf.scsi, 0, true),
 #endif
+    DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0,
+                    true),
     DEFINE_PROP_BIT("x-data-plane", VirtIOBlock, conf.data_plane, 0, false),
     DEFINE_PROP_END_OF_LIST(),
 };

+ 17 - 0
hw/ide/atapi.c

@@ -394,6 +394,23 @@ static void ide_atapi_cmd_read(IDEState *s, int lba, int nb_sectors,
     }
 }
 
+
+/* Called by *_restart_bh when the transfer function points
+ * to ide_atapi_cmd
+ */
+void ide_atapi_dma_restart(IDEState *s)
+{
+    /*
+     * I'm not sure we have enough stored to restart the command
+     * safely, so give the guest an error it should recover from.
+     * I'm assuming most guests will try to recover from something
+     * listed as a medium error on a CD; it seems to work on Linux.
+     * This would be more of a problem if we did any other type of
+     * DMA operation.
+     */
+    ide_atapi_cmd_error(s, MEDIUM_ERROR, ASC_NO_SEEK_COMPLETE);
+}
+
 static inline uint8_t ide_atapi_set_profile(uint8_t *buf, uint8_t *index,
                                             uint16_t profile)
 {

+ 1 - 0
hw/ide/core.c

@@ -2417,6 +2417,7 @@ static int ide_drive_pio_post_load(void *opaque, int version_id)
     s->end_transfer_func = transfer_end_table[s->end_transfer_fn_idx];
     s->data_ptr = s->io_buffer + s->cur_io_buffer_offset;
     s->data_end = s->data_ptr + s->cur_io_buffer_len;
+    s->atapi_dma = s->feature & 1; /* as per cmd_packet */
 
     return 0;
 }

+ 2 - 0
hw/ide/internal.h

@@ -289,6 +289,7 @@ typedef struct IDEDMAOps IDEDMAOps;
 #define ATAPI_INT_REASON_TAG            0xf8
 
 /* same constants as bochs */
+#define ASC_NO_SEEK_COMPLETE                 0x02
 #define ASC_ILLEGAL_OPCODE                   0x20
 #define ASC_LOGICAL_BLOCK_OOR                0x21
 #define ASC_INV_FIELD_IN_CMD_PACKET          0x24
@@ -530,6 +531,7 @@ void ide_dma_error(IDEState *s);
 
 void ide_atapi_cmd_ok(IDEState *s);
 void ide_atapi_cmd_error(IDEState *s, int sense_key, int asc);
+void ide_atapi_dma_restart(IDEState *s);
 void ide_atapi_io_error(IDEState *s, int ret);
 
 void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val);

+ 11 - 0
hw/ide/pci.c

@@ -235,6 +235,17 @@ static void bmdma_restart_bh(void *opaque)
         }
     } else if (error_status & IDE_RETRY_FLUSH) {
         ide_flush_cache(bmdma_active_if(bm));
+    } else {
+        IDEState *s = bmdma_active_if(bm);
+
+        /*
+         * We've not got any bits to tell us about ATAPI - but
+         * we do have the end_transfer_func that tells us what
+         * we're trying to do.
+         */
+        if (s->end_transfer_func == ide_atapi_cmd) {
+            ide_atapi_dma_restart(s);
+        }
     }
 }
 

+ 3 - 0
include/block/accounting.h

@@ -39,6 +39,7 @@ typedef struct BlockAcctStats {
     uint64_t nr_bytes[BLOCK_MAX_IOTYPE];
     uint64_t nr_ops[BLOCK_MAX_IOTYPE];
     uint64_t total_time_ns[BLOCK_MAX_IOTYPE];
+    uint64_t merged[BLOCK_MAX_IOTYPE];
     uint64_t wr_highest_sector;
 } BlockAcctStats;
 
@@ -53,5 +54,7 @@ void block_acct_start(BlockAcctStats *stats, BlockAcctCookie *cookie,
 void block_acct_done(BlockAcctStats *stats, BlockAcctCookie *cookie);
 void block_acct_highest_sector(BlockAcctStats *stats, int64_t sector_num,
                                unsigned int nb_sectors);
+void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type,
+                           int num_requests);
 
 #endif

+ 4 - 0
include/block/block.h

@@ -83,6 +83,9 @@ typedef enum {
 #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
 #define BDRV_SECTOR_MASK   ~(BDRV_SECTOR_SIZE - 1)
 
+#define BDRV_REQUEST_MAX_SECTORS MIN(SIZE_MAX >> BDRV_SECTOR_BITS, \
+                                     INT_MAX >> BDRV_SECTOR_BITS)
+
 /*
  * Allocation status flags
  * BDRV_BLOCK_DATA: data is read from bs->file or another file
@@ -378,6 +381,7 @@ BlockDriverState *bdrv_next(BlockDriverState *bs);
 int bdrv_is_encrypted(BlockDriverState *bs);
 int bdrv_key_required(BlockDriverState *bs);
 int bdrv_set_key(BlockDriverState *bs, const char *key);
+void bdrv_add_key(BlockDriverState *bs, const char *key, Error **errp);
 int bdrv_query_missing_keys(void);
 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
                          void *opaque);

+ 4 - 0
include/block/block_int.h

@@ -412,6 +412,10 @@ struct BlockDriverState {
 
     /* The error object in use for blocking operations on backing_hd */
     Error *backing_blocker;
+
+    /* threshold limit for writes, in bytes. "High water mark". */
+    uint64_t write_threshold_offset;
+    NotifierWithReturn write_threshold_notifier;
 };
 
 

+ 1 - 1
include/block/nbd.h

@@ -75,7 +75,7 @@ enum {
 
 ssize_t nbd_wr_sync(int fd, void *buffer, size_t size, bool do_read);
 int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
-                          off_t *size, size_t *blocksize);
+                          off_t *size, size_t *blocksize, Error **errp);
 int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize);
 ssize_t nbd_send_request(int csock, struct nbd_request *request);
 ssize_t nbd_receive_reply(int csock, struct nbd_reply *reply);

+ 64 - 0
include/block/write-threshold.h

@@ -0,0 +1,64 @@
+/*
+ * QEMU System Emulator block write threshold notification
+ *
+ * Copyright Red Hat, Inc. 2014
+ *
+ * Authors:
+ *  Francesco Romani <fromani@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+#ifndef BLOCK_WRITE_THRESHOLD_H
+#define BLOCK_WRITE_THRESHOLD_H
+
+#include <stdint.h>
+
+#include "qemu/typedefs.h"
+#include "qemu-common.h"
+
+/*
+ * bdrv_write_threshold_set:
+ *
+ * Set the write threshold for block devices, in bytes.
+ * Notify when a write exceeds the threshold, meaning the device
+ * is becoming full, so it can be transparently resized.
+ * To be used with thin-provisioned block devices.
+ *
+ * Use threshold_bytes == 0 to disable.
+ */
+void bdrv_write_threshold_set(BlockDriverState *bs, uint64_t threshold_bytes);
+
+/*
+ * bdrv_write_threshold_get
+ *
+ * Get the configured write threshold, in bytes.
+ * Zero means no threshold configured.
+ */
+uint64_t bdrv_write_threshold_get(const BlockDriverState *bs);
+
+/*
+ * bdrv_write_threshold_is_set
+ *
+ * Tell if a write threshold is set for a given BDS.
+ */
+bool bdrv_write_threshold_is_set(const BlockDriverState *bs);
+
+/*
+ * bdrv_write_threshold_exceeded
+ *
+ * Return the extent of a write request that exceeded the threshold,
+ * or zero if the request is below the threshold.
+ * Return zero also if the threshold was not set.
+ *
+ * NOTE: here we assume the following holds for each request this code
+ * deals with:
+ *
+ * assert((req->offset + req->bytes) <= UINT64_MAX)
+ *
+ * Please note there is *not* an actual C assert().
+ */
+uint64_t bdrv_write_threshold_exceeded(const BlockDriverState *bs,
+                                       const BdrvTrackedRequest *req);
+
+#endif

+ 12 - 6
include/hw/virtio/virtio-blk.h

@@ -113,6 +113,7 @@ struct VirtIOBlkConf
     uint32_t scsi;
     uint32_t config_wce;
     uint32_t data_plane;
+    uint32_t request_merging;
 };
 
 struct VirtIOBlockDataPlane;
@@ -134,27 +135,32 @@ typedef struct VirtIOBlock {
     struct VirtIOBlockDataPlane *dataplane;
 } VirtIOBlock;
 
-typedef struct MultiReqBuffer {
-    BlockRequest        blkreq[32];
-    unsigned int        num_writes;
-} MultiReqBuffer;
-
 typedef struct VirtIOBlockReq {
+    int64_t sector_num;
     VirtIOBlock *dev;
     VirtQueueElement elem;
     struct virtio_blk_inhdr *in;
     struct virtio_blk_outhdr out;
     QEMUIOVector qiov;
     struct VirtIOBlockReq *next;
+    struct VirtIOBlockReq *mr_next;
     BlockAcctCookie acct;
 } VirtIOBlockReq;
 
+#define VIRTIO_BLK_MAX_MERGE_REQS 32
+
+typedef struct MultiReqBuffer {
+    VirtIOBlockReq *reqs[VIRTIO_BLK_MAX_MERGE_REQS];
+    unsigned int num_reqs;
+    bool is_write;
+} MultiReqBuffer;
+
 VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s);
 
 void virtio_blk_free_request(VirtIOBlockReq *req);
 
 void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb);
 
-void virtio_submit_multiwrite(BlockBackend *blk, MultiReqBuffer *mrb);
+void virtio_blk_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb);
 
 #endif

+ 0 - 9
include/qapi/qmp/qerror.h

@@ -37,9 +37,6 @@ void qerror_report_err(Error *err);
 #define QERR_BASE_NOT_FOUND \
     ERROR_CLASS_GENERIC_ERROR, "Base '%s' not found"
 
-#define QERR_BLOCK_JOB_NOT_ACTIVE \
-    ERROR_CLASS_DEVICE_NOT_ACTIVE, "No active block job on device '%s'"
-
 #define QERR_BLOCK_JOB_NOT_READY \
     ERROR_CLASS_GENERIC_ERROR, "The active block job for device '%s' cannot be completed"
 
@@ -52,9 +49,6 @@ void qerror_report_err(Error *err);
 #define QERR_BUS_NOT_FOUND \
     ERROR_CLASS_GENERIC_ERROR, "Bus '%s' not found"
 
-#define QERR_DEVICE_ENCRYPTED \
-    ERROR_CLASS_DEVICE_ENCRYPTED, "'%s' (%s) is encrypted"
-
 #define QERR_DEVICE_HAS_NO_MEDIUM \
     ERROR_CLASS_GENERIC_ERROR, "Device '%s' has no medium"
 
@@ -70,9 +64,6 @@ void qerror_report_err(Error *err);
 #define QERR_DEVICE_NO_HOTPLUG \
     ERROR_CLASS_GENERIC_ERROR, "Device '%s' does not support hotplugging"
 
-#define QERR_DEVICE_NOT_ENCRYPTED \
-    ERROR_CLASS_GENERIC_ERROR, "Device '%s' is not encrypted"
-
 #define QERR_DEVICE_NOT_FOUND \
     ERROR_CLASS_DEVICE_NOT_FOUND, "Device '%s' not found"
 

+ 1 - 0
include/sysemu/block-backend.h

@@ -127,6 +127,7 @@ int blk_is_inserted(BlockBackend *blk);
 void blk_lock_medium(BlockBackend *blk, bool locked);
 void blk_eject(BlockBackend *blk, bool eject_flag);
 int blk_get_flags(BlockBackend *blk);
+int blk_get_max_transfer_length(BlockBackend *blk);
 void blk_set_guest_block_size(BlockBackend *blk, int align);
 void *blk_blockalign(BlockBackend *blk, size_t size);
 bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp);

+ 11 - 5
monitor.c

@@ -5368,9 +5368,12 @@ static void bdrv_password_cb(void *opaque, const char *password,
     Monitor *mon = opaque;
     BlockDriverState *bs = readline_opaque;
     int ret = 0;
+    Error *local_err = NULL;
 
-    if (bdrv_set_key(bs, password) != 0) {
-        monitor_printf(mon, "invalid password\n");
+    bdrv_add_key(bs, password, &local_err);
+    if (local_err) {
+        monitor_printf(mon, "%s\n", error_get_pretty(local_err));
+        error_free(local_err);
         ret = -EPERM;
     }
     if (mon->password_completion_cb)
@@ -5388,17 +5391,20 @@ int monitor_read_bdrv_key_start(Monitor *mon, BlockDriverState *bs,
                                 BlockCompletionFunc *completion_cb,
                                 void *opaque)
 {
+    Error *local_err = NULL;
     int err;
 
-    if (!bdrv_key_required(bs)) {
+    bdrv_add_key(bs, NULL, &local_err);
+    if (!local_err) {
         if (completion_cb)
             completion_cb(opaque, 0);
         return 0;
     }
 
+    /* Need a key for @bs */
+
     if (monitor_ctrl_mode(mon)) {
-        qerror_report(QERR_DEVICE_ENCRYPTED, bdrv_get_device_name(bs),
-                      bdrv_get_encrypted_filename(bs));
+        qerror_report_err(local_err);
         return -1;
     }
 

+ 25 - 17
nbd.c

@@ -494,7 +494,7 @@ fail:
 }
 
 int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
-                          off_t *size, size_t *blocksize)
+                          off_t *size, size_t *blocksize, Error **errp)
 {
     char buf[256];
     uint64_t magic, s;
@@ -506,13 +506,13 @@ int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
     rc = -EINVAL;
 
     if (read_sync(csock, buf, 8) != 8) {
-        LOG("read failed");
+        error_setg(errp, "Failed to read data");
         goto fail;
     }
 
     buf[8] = '\0';
     if (strlen(buf) == 0) {
-        LOG("server connection closed");
+        error_setg(errp, "Server connection closed unexpectedly");
         goto fail;
     }
 
@@ -527,12 +527,12 @@ int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
           qemu_isprint(buf[7]) ? buf[7] : '.');
 
     if (memcmp(buf, "NBDMAGIC", 8) != 0) {
-        LOG("Invalid magic received");
+        error_setg(errp, "Invalid magic received");
         goto fail;
     }
 
     if (read_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
-        LOG("read failed");
+        error_setg(errp, "Failed to read magic");
         goto fail;
     }
     magic = be64_to_cpu(magic);
@@ -545,52 +545,60 @@ int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
 
         TRACE("Checking magic (opts_magic)");
         if (magic != NBD_OPTS_MAGIC) {
-            LOG("Bad magic received");
+            if (magic == NBD_CLIENT_MAGIC) {
+                error_setg(errp, "Server does not support export names");
+            } else {
+                error_setg(errp, "Bad magic received");
+            }
             goto fail;
         }
         if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
-            LOG("flags read failed");
+            error_setg(errp, "Failed to read server flags");
             goto fail;
         }
         *flags = be16_to_cpu(tmp) << 16;
         /* reserved for future use */
         if (write_sync(csock, &reserved, sizeof(reserved)) !=
             sizeof(reserved)) {
-            LOG("write failed (reserved)");
+            error_setg(errp, "Failed to read reserved field");
             goto fail;
         }
         /* write the export name */
         magic = cpu_to_be64(magic);
         if (write_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
-            LOG("write failed (magic)");
+            error_setg(errp, "Failed to send export name magic");
             goto fail;
         }
         opt = cpu_to_be32(NBD_OPT_EXPORT_NAME);
         if (write_sync(csock, &opt, sizeof(opt)) != sizeof(opt)) {
-            LOG("write failed (opt)");
+            error_setg(errp, "Failed to send export name option number");
             goto fail;
         }
         namesize = cpu_to_be32(strlen(name));
         if (write_sync(csock, &namesize, sizeof(namesize)) !=
             sizeof(namesize)) {
-            LOG("write failed (namesize)");
+            error_setg(errp, "Failed to send export name length");
             goto fail;
         }
         if (write_sync(csock, (char*)name, strlen(name)) != strlen(name)) {
-            LOG("write failed (name)");
+            error_setg(errp, "Failed to send export name");
             goto fail;
         }
     } else {
         TRACE("Checking magic (cli_magic)");
 
         if (magic != NBD_CLIENT_MAGIC) {
-            LOG("Bad magic received");
+            if (magic == NBD_OPTS_MAGIC) {
+                error_setg(errp, "Server requires an export name");
+            } else {
+                error_setg(errp, "Bad magic received");
+            }
             goto fail;
         }
     }
 
     if (read_sync(csock, &s, sizeof(s)) != sizeof(s)) {
-        LOG("read failed");
+        error_setg(errp, "Failed to read export length");
         goto fail;
     }
     *size = be64_to_cpu(s);
@@ -599,19 +607,19 @@ int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
 
     if (!name) {
         if (read_sync(csock, flags, sizeof(*flags)) != sizeof(*flags)) {
-            LOG("read failed (flags)");
+            error_setg(errp, "Failed to read export flags");
             goto fail;
         }
         *flags = be32_to_cpup(flags);
     } else {
         if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
-            LOG("read failed (tmp)");
+            error_setg(errp, "Failed to read export flags");
             goto fail;
         }
         *flags |= be32_to_cpu(tmp);
     }
     if (read_sync(csock, &buf, 124) != 124) {
-        LOG("read failed (buf)");
+        error_setg(errp, "Failed to read reserved block");
         goto fail;
     }
     rc = 0;

+ 58 - 2
qapi/block-core.json

@@ -257,6 +257,9 @@
 #
 # @cache: the cache mode used for the block device (since: 2.3)
 #
+# @write_threshold: configured write threshold for the device.
+#                   0 if disabled. (Since 2.3)
+#
 # Since: 0.14.0
 #
 ##
@@ -271,7 +274,8 @@
             '*bps_max': 'int', '*bps_rd_max': 'int',
             '*bps_wr_max': 'int', '*iops_max': 'int',
             '*iops_rd_max': 'int', '*iops_wr_max': 'int',
-            '*iops_size': 'int', 'cache': 'BlockdevCacheInfo' } }
+            '*iops_size': 'int', 'cache': 'BlockdevCacheInfo',
+            'write_threshold': 'int' } }
 
 ##
 # @BlockDeviceIoStatus:
@@ -407,13 +411,20 @@
 #                     growable sparse files (like qcow2) that are used on top
 #                     of a physical device.
 #
+# @rd_merged: Number of read requests that have been merged into another
+#             request (Since 2.3).
+#
+# @wr_merged: Number of write requests that have been merged into another
+#             request (Since 2.3).
+#
 # Since: 0.14.0
 ##
 { 'type': 'BlockDeviceStats',
   'data': {'rd_bytes': 'int', 'wr_bytes': 'int', 'rd_operations': 'int',
            'wr_operations': 'int', 'flush_operations': 'int',
            'flush_total_time_ns': 'int', 'wr_total_time_ns': 'int',
-           'rd_total_time_ns': 'int', 'wr_highest_offset': 'int' } }
+           'rd_total_time_ns': 'int', 'wr_highest_offset': 'int',
+           'rd_merged': 'int', 'wr_merged': 'int' } }
 
 ##
 # @BlockStats:
@@ -1910,3 +1921,48 @@
 ##
 { 'enum': 'PreallocMode',
   'data': [ 'off', 'metadata', 'falloc', 'full' ] }
+
+##
+# @BLOCK_WRITE_THRESHOLD
+#
+# Emitted when writes on a block device reach or exceed the
+# configured write threshold. For thin-provisioned devices, this
+# means the device should be extended to avoid pausing for
+# disk exhaustion.
+# The event is one shot. Once triggered, it needs to be
+# re-registered with another block-set-threshold command.
+#
+# @node-name: graph node name on which the threshold was exceeded.
+#
+# @amount-exceeded: amount of data which exceeded the threshold, in bytes.
+#
+# @write-threshold: last configured threshold, in bytes.
+#
+# Since: 2.3
+##
+{ 'event': 'BLOCK_WRITE_THRESHOLD',
+  'data': { 'node-name': 'str',
+            'amount-exceeded': 'uint64',
+            'write-threshold': 'uint64' } }
+
+##
+# @block-set-write-threshold
+#
+# Change the write threshold for a block drive. An event will be delivered
+# if a write to this block drive crosses the configured threshold.
+# This is useful to transparently resize thin-provisioned drives without
+# the guest OS noticing.
+#
+# @node-name: graph node name on which the threshold must be set.
+#
+# @write-threshold: configured threshold for the block device, bytes.
+#                   Use 0 to disable the threshold.
+#
+# Returns: Nothing on success
+#          If @node-name is not found on the block device graph,
+#          DeviceNotFound
+#
+# Since: 2.3
+##
+{ 'command': 'block-set-write-threshold',
+  'data': { 'node-name': 'str', 'write-threshold': 'uint64' } }

+ 1 - 1
qemu-img.c

@@ -35,7 +35,7 @@
 #include "block/qapi.h"
 #include <getopt.h>
 
-#define QEMU_IMG_VERSION "qemu-img version " QEMU_VERSION \
+#define QEMU_IMG_VERSION "qemu-img version " QEMU_VERSION QEMU_PKGVERSION \
                           ", Copyright (c) 2004-2008 Fabrice Bellard\n"
 
 typedef struct img_cmd_t {

+ 6 - 1
qemu-nbd.c

@@ -284,6 +284,7 @@ static void *nbd_client_thread(void *arg)
     int fd, sock;
     int ret;
     pthread_t show_parts_thread;
+    Error *local_error = NULL;
 
     sock = unix_socket_outgoing(sockpath);
     if (sock < 0) {
@@ -291,8 +292,12 @@ static void *nbd_client_thread(void *arg)
     }
 
     ret = nbd_receive_negotiate(sock, NULL, &nbdflags,
-                                &size, &blocksize);
+                                &size, &blocksize, &local_error);
     if (ret < 0) {
+        if (local_error) {
+            fprintf(stderr, "%s\n", error_get_pretty(local_error));
+            error_free(local_error);
+        }
         goto out_socket;
     }
 

+ 50 - 4
qmp-commands.hx

@@ -2146,6 +2146,8 @@ Each json-object contain the following:
          - "iops_size": I/O size when limiting by iops (json-int)
          - "detect_zeroes": detect and optimize zero writing (json-string)
              - Possible values: "off", "on", "unmap"
+         - "write_threshold": write offset threshold in bytes, an event will be
+                              emitted if crossed. Zero if disabled (json-int)
          - "image": the detail of the image, it is a json-object containing
             the following:
              - "filename": image file name (json-string)
@@ -2223,6 +2225,7 @@ Example:
                "iops_wr_max": 0,
                "iops_size": 0,
                "detect_zeroes": "on",
+               "write_threshold": 0,
                "image":{
                   "filename":"disks/test.qcow2",
                   "format":"qcow2",
@@ -2303,6 +2306,10 @@ Each json-object contain the following:
     - "flush_total_time_ns": total time spend on cache flushes in nano-seconds (json-int)
     - "wr_highest_offset": Highest offset of a sector written since the
                            BlockDriverState has been opened (json-int)
+    - "rd_merged": number of read requests that have been merged into
+                   another request (json-int)
+    - "wr_merged": number of write requests that have been merged into
+                   another request (json-int)
 - "parent": Contains recursively the statistics of the underlying
             protocol (e.g. the host file for a qcow2 image). If there is
             no underlying protocol, this field is omitted
@@ -2326,6 +2333,8 @@ Example:
                   "rd_total_times_ns":3465673657
                   "flush_total_times_ns":49653
                   "flush_operations":61,
+                  "rd_merged":0,
+                  "wr_merged":0
                }
             },
             "stats":{
@@ -2337,7 +2346,9 @@ Example:
                "flush_operations":51,
                "wr_total_times_ns":313253456
                "rd_total_times_ns":3465673657
-               "flush_total_times_ns":49653
+               "flush_total_times_ns":49653,
+               "rd_merged":0,
+               "wr_merged":0
             }
          },
          {
@@ -2351,7 +2362,9 @@ Example:
                "flush_operations":0,
                "wr_total_times_ns":0
                "rd_total_times_ns":0
-               "flush_total_times_ns":0
+               "flush_total_times_ns":0,
+               "rd_merged":0,
+               "wr_merged":0
             }
          },
          {
@@ -2365,7 +2378,9 @@ Example:
                "flush_operations":0,
                "wr_total_times_ns":0
                "rd_total_times_ns":0
-               "flush_total_times_ns":0
+               "flush_total_times_ns":0,
+               "rd_merged":0,
+               "wr_merged":0
             }
          },
          {
@@ -2379,7 +2394,9 @@ Example:
                "flush_operations":0,
                "wr_total_times_ns":0
                "rd_total_times_ns":0
-               "flush_total_times_ns":0
+               "flush_total_times_ns":0,
+               "rd_merged":0,
+               "wr_merged":0
             }
          }
       ]
@@ -3671,6 +3688,7 @@ Example:
                    "iops_rd_max": 0,
                    "iops_wr_max": 0,
                    "iops_size": 0,
+                   "write_threshold": 0,
                    "image":{
                       "filename":"disks/test.qcow2",
                       "format":"qcow2",
@@ -3906,4 +3924,32 @@ Move mouse pointer to absolute coordinates (20000, 400).
                { "type": "abs", "data" : { "axis": "Y", "value" : 400 } } ] } }
 <- { "return": {} }
 
+EQMP
+
+    {
+        .name       = "block-set-write-threshold",
+        .args_type  = "node-name:s,write-threshold:l",
+        .mhandler.cmd_new = qmp_marshal_input_block_set_write_threshold,
+    },
+
+SQMP
+block-set-write-threshold
+------------
+
+Change the write threshold for a block drive. The threshold is an offset,
+thus must be non-negative. Default is no write threshold.
+Setting the threshold to zero disables it.
+
+Arguments:
+
+- "node-name": the node name in the block driver state graph (json-string)
+- "write-threshold": the write threshold in bytes (json-int)
+
+Example:
+
+-> { "execute": "block-set-write-threshold",
+  "arguments": { "node-name": "mydev",
+                 "write-threshold": 17179869184 } }
+<- { "return": {} }
+
 EQMP

+ 4 - 4
qmp.c

@@ -154,6 +154,7 @@ SpiceInfo *qmp_query_spice(Error **errp)
 
 void qmp_cont(Error **errp)
 {
+    Error *local_err = NULL;
     BlockDriverState *bs;
 
     if (runstate_needs_reset()) {
@@ -167,10 +168,9 @@ void qmp_cont(Error **errp)
         bdrv_iostatus_reset(bs);
     }
     for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) {
-        if (bdrv_key_required(bs)) {
-            error_set(errp, QERR_DEVICE_ENCRYPTED,
-                      bdrv_get_device_name(bs),
-                      bdrv_get_encrypted_filename(bs));
+        bdrv_add_key(bs, NULL, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
             return;
         }
     }

+ 3 - 0
tests/Makefile

@@ -68,6 +68,8 @@ check-unit-y += tests/check-qom-interface$(EXESUF)
 gcov-files-check-qom-interface-y = qom/object.c
 check-unit-y += tests/test-qemu-opts$(EXESUF)
 gcov-files-test-qemu-opts-y = qom/test-qemu-opts.c
+check-unit-y += tests/test-write-threshold$(EXESUF)
+gcov-files-test-write-threshold-y = block/write-threshold.c
 
 check-block-$(CONFIG_POSIX) += tests/qemu-iotests-quick.sh
 
@@ -360,6 +362,7 @@ tests/usb-hcd-xhci-test$(EXESUF): tests/usb-hcd-xhci-test.o $(libqos-usb-obj-y)
 tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o qemu-char.o qemu-timer.o $(qtest-obj-y)
 tests/qemu-iotests/socket_scm_helper$(EXESUF): tests/qemu-iotests/socket_scm_helper.o
 tests/test-qemu-opts$(EXESUF): tests/test-qemu-opts.o libqemuutil.a libqemustub.a
+tests/test-write-threshold$(EXESUF): tests/test-write-threshold.o $(block-obj-y) libqemuutil.a libqemustub.a
 
 ifeq ($(CONFIG_POSIX),y)
 LIBS += -lutil

+ 5 - 0
tests/qemu-iotests/067.out

@@ -43,6 +43,7 @@ Testing: -drive file=TEST_DIR/t.qcow2,format=qcow2,if=none,id=disk -device virti
                 "drv": "qcow2",
                 "iops": 0,
                 "bps_wr": 0,
+                "write_threshold": 0,
                 "encrypted": false,
                 "bps": 0,
                 "bps_rd": 0,
@@ -218,6 +219,7 @@ Testing: -drive file=TEST_DIR/t.qcow2,format=qcow2,if=none,id=disk
                 "drv": "qcow2",
                 "iops": 0,
                 "bps_wr": 0,
+                "write_threshold": 0,
                 "encrypted": false,
                 "bps": 0,
                 "bps_rd": 0,
@@ -423,6 +425,7 @@ Testing:
                 "drv": "qcow2",
                 "iops": 0,
                 "bps_wr": 0,
+                "write_threshold": 0,
                 "encrypted": false,
                 "bps": 0,
                 "bps_rd": 0,
@@ -607,6 +610,7 @@ Testing:
                 "drv": "qcow2",
                 "iops": 0,
                 "bps_wr": 0,
+                "write_threshold": 0,
                 "encrypted": false,
                 "bps": 0,
                 "bps_rd": 0,
@@ -717,6 +721,7 @@ Testing:
                 "drv": "qcow2",
                 "iops": 0,
                 "bps_wr": 0,
+                "write_threshold": 0,
                 "encrypted": false,
                 "bps": 0,
                 "bps_rd": 0,

+ 2 - 1
tests/qemu-iotests/083

@@ -56,7 +56,8 @@ filter_nbd() {
 	#
 	# Filter out the TCP port number since this changes between runs.
 	sed -e 's#^.*nbd\.c:.*##g' \
-	    -e 's#nbd:127\.0\.0\.1:[^:]*:#nbd:127\.0\.0\.1:PORT:#g'
+	    -e 's#nbd:127\.0\.0\.1:[^:]*:#nbd:127\.0\.0\.1:PORT:#g' \
+            -e 's#\(exportname=foo\|PORT\): Failed to .*$#\1#'
 }
 
 check_disconnect() {

+ 28 - 53
tests/qemu-iotests/083.out

@@ -1,163 +1,138 @@
 QA output created by 083
 === Check disconnect before neg1 ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo
 no file open, try 'help open'
 
 === Check disconnect after neg1 ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo
 no file open, try 'help open'
 
 === Check disconnect 8 neg1 ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo
 no file open, try 'help open'
 
 === Check disconnect 16 neg1 ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo
 no file open, try 'help open'
 
 === Check disconnect before export ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo
 no file open, try 'help open'
 
 === Check disconnect after export ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo
 no file open, try 'help open'
 
 === Check disconnect 4 export ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo
 no file open, try 'help open'
 
 === Check disconnect 12 export ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo
 no file open, try 'help open'
 
 === Check disconnect 16 export ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo
 no file open, try 'help open'
 
 === Check disconnect before neg2 ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo
 no file open, try 'help open'
 
 === Check disconnect after neg2 ===
 
 
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not read image for determining its format: Input/output error
-no file open, try 'help open'
+read failed: Input/output error
 
 === Check disconnect 8 neg2 ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo
 no file open, try 'help open'
 
 === Check disconnect 10 neg2 ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo
 no file open, try 'help open'
 
 === Check disconnect before request ===
 
 
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not read image for determining its format: Input/output error
-no file open, try 'help open'
+read failed: Input/output error
 
 === Check disconnect after request ===
 
 
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not read image for determining its format: Input/output error
-no file open, try 'help open'
+read failed: Input/output error
 
 === Check disconnect before reply ===
 
 
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not read image for determining its format: Input/output error
-no file open, try 'help open'
+read failed: Input/output error
 
 === Check disconnect after reply ===
 
 
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not read image for determining its format: Input/output error
-no file open, try 'help open'
+read failed: Input/output error
 
 === Check disconnect 4 reply ===
 
 
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not read image for determining its format: Input/output error
-no file open, try 'help open'
+read failed: Input/output error
 
 === Check disconnect 8 reply ===
 
 
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not read image for determining its format: Input/output error
-no file open, try 'help open'
+read failed: Input/output error
 
 === Check disconnect before data ===
 
 
-qemu-io: can't open device nbd:127.0.0.1:PORT:exportname=foo: Could not read image for determining its format: Input/output error
-no file open, try 'help open'
+read failed: Input/output error
 
 === Check disconnect after data ===
 
 
-read failed: Input/output error
+read 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 
 === Check disconnect before neg-classic ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT
 no file open, try 'help open'
 
 === Check disconnect 8 neg-classic ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT
 no file open, try 'help open'
 
 === Check disconnect 16 neg-classic ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT
 no file open, try 'help open'
 
 === Check disconnect 24 neg-classic ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT
 no file open, try 'help open'
 
 === Check disconnect 28 neg-classic ===
 
-
-qemu-io: can't open device nbd:127.0.0.1:PORT: Could not open image: Invalid argument
+qemu-io: can't open device nbd:127.0.0.1:PORT
 no file open, try 'help open'
 
 === Check disconnect after neg-classic ===
 
 
-qemu-io: can't open device nbd:127.0.0.1:PORT: Could not read image for determining its format: Input/output error
-no file open, try 'help open'
+read failed: Input/output error
 
 *** done

+ 12 - 0
tests/qemu-iotests/100

@@ -55,6 +55,8 @@ echo "== verify pattern =="
 $QEMU_IO -c "read -P 0xcd 0 4k" "$TEST_IMG" | _filter_qemu_io
 $QEMU_IO -c "read -P 0 4k 4k" "$TEST_IMG" | _filter_qemu_io
 
+_cleanup_test_img
+
 echo
 echo "== Sequential requests =="
 _make_test_img $size
@@ -66,6 +68,8 @@ $QEMU_IO -c "read -P 0xcd 0 4k" "$TEST_IMG" | _filter_qemu_io
 $QEMU_IO -c "read -P 0xce 4k 4k" "$TEST_IMG" | _filter_qemu_io
 $QEMU_IO -c "read -P 0 8k 4k" "$TEST_IMG" | _filter_qemu_io
 
+_cleanup_test_img
+
 echo
 echo "== Superset overlapping requests =="
 _make_test_img $size
@@ -79,6 +83,8 @@ $QEMU_IO -c "read -P 0xcd 0 1k" "$TEST_IMG" | _filter_qemu_io
 $QEMU_IO -c "read -P 0xcd 3k 1k" "$TEST_IMG" | _filter_qemu_io
 $QEMU_IO -c "read -P 0 4k 4k" "$TEST_IMG" | _filter_qemu_io
 
+_cleanup_test_img
+
 echo
 echo "== Subset overlapping requests =="
 _make_test_img $size
@@ -92,6 +98,8 @@ $QEMU_IO -c "read -P 0xce 0 1k" "$TEST_IMG" | _filter_qemu_io
 $QEMU_IO -c "read -P 0xce 3k 1k" "$TEST_IMG" | _filter_qemu_io
 $QEMU_IO -c "read -P 0 4k 4k" "$TEST_IMG" | _filter_qemu_io
 
+_cleanup_test_img
+
 echo
 echo "== Head overlapping requests =="
 _make_test_img $size
@@ -104,6 +112,8 @@ echo "== verify pattern =="
 $QEMU_IO -c "read -P 0xce 2k 2k" "$TEST_IMG" | _filter_qemu_io
 $QEMU_IO -c "read -P 0 4k 4k" "$TEST_IMG" | _filter_qemu_io
 
+_cleanup_test_img
+
 echo
 echo "== Tail overlapping requests =="
 _make_test_img $size
@@ -116,6 +126,8 @@ echo "== verify pattern =="
 $QEMU_IO -c "read -P 0xce 0k 2k" "$TEST_IMG" | _filter_qemu_io
 $QEMU_IO -c "read -P 0 4k 4k" "$TEST_IMG" | _filter_qemu_io
 
+_cleanup_test_img
+
 echo
 echo "== Disjoint requests =="
 _make_test_img $size

+ 3 - 6
tests/qemu-iotests/104

@@ -28,11 +28,7 @@ here=`pwd`
 tmp=/tmp/$$
 status=1	# failure is the default!
 
-_cleanup()
-{
-	_cleanup_test_img
-}
-trap "_cleanup; exit \$status" 0 1 2 3 15
+trap "exit \$status" 0 1 2 3 15
 
 # get standard environment, filters and checks
 . ./common.rc
@@ -47,8 +43,9 @@ echo
 image_sizes="1024 1234"
 
 for s in $image_sizes; do
-    _make_test_img $s | _filter_img_create
+    _make_test_img $s
     _img_info | _filter_img_info
+    _cleanup_test_img
 done
 
 # success, all done

+ 96 - 0
tests/qemu-iotests/116

@@ -0,0 +1,96 @@
+#!/bin/bash
+#
+# Test error code paths for invalid QED images
+#
+# The aim of this test is to exercise the error paths in qed_open() to ensure
+# there are no crashes with invalid input files.
+#
+# Copyright (C) 2015 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=stefanha@redhat.com
+
+seq=`basename $0`
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1	# failure is the default!
+
+_cleanup()
+{
+	_cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+
+_supported_fmt qed
+_supported_proto generic
+_supported_os Linux
+
+
+size=128M
+
+echo
+echo "== truncated header cluster =="
+_make_test_img $size
+truncate -s 512 "$TEST_IMG"
+$QEMU_IO -f "$IMGFMT" -c "read 0 $size" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+
+echo
+echo "== invalid header magic =="
+_make_test_img $size
+poke_file "$TEST_IMG" "0" "QEDX"
+$QEMU_IO -f "$IMGFMT" -c "read 0 $size" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+
+echo
+echo "== invalid cluster size =="
+_make_test_img $size
+poke_file "$TEST_IMG" "4" "\xff\xff\xff\xff"
+$QEMU_IO -f "$IMGFMT" -c "read 0 $size" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+
+echo
+echo "== invalid table size =="
+_make_test_img $size
+poke_file "$TEST_IMG" "8" "\xff\xff\xff\xff"
+$QEMU_IO -f "$IMGFMT" -c "read 0 $size" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+
+echo
+echo "== invalid header size =="
+_make_test_img $size
+poke_file "$TEST_IMG" "12" "\xff\xff\xff\xff"
+$QEMU_IO -f "$IMGFMT" -c "read 0 $size" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+
+echo
+echo "== invalid L1 table offset =="
+_make_test_img $size
+poke_file "$TEST_IMG" "40" "\xff\xff\xff\xff\xff\xff\xff\xff"
+$QEMU_IO -f "$IMGFMT" -c "read 0 $size" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+
+echo
+echo "== invalid image size =="
+_make_test_img $size
+poke_file "$TEST_IMG" "48" "\xff\xff\xff\xff\xff\xff\xff\xff"
+$QEMU_IO -f "$IMGFMT" -c "read 0 $size" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+
+# success, all done
+echo "*** done"
+rm -f $seq.full
+status=0

+ 37 - 0
tests/qemu-iotests/116.out

@@ -0,0 +1,37 @@
+QA output created by 116
+
+== truncated header cluster ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
+qemu-io: can't open device TEST_DIR/t.qed: Could not open 'TEST_DIR/t.qed': Invalid argument
+no file open, try 'help open'
+
+== invalid header magic ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
+qemu-io: can't open device TEST_DIR/t.qed: Image not in QED format
+no file open, try 'help open'
+
+== invalid cluster size ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
+qemu-io: can't open device TEST_DIR/t.qed: Could not open 'TEST_DIR/t.qed': Invalid argument
+no file open, try 'help open'
+
+== invalid table size ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
+qemu-io: can't open device TEST_DIR/t.qed: Could not open 'TEST_DIR/t.qed': Invalid argument
+no file open, try 'help open'
+
+== invalid header size ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
+qemu-io: can't open device TEST_DIR/t.qed: Could not open 'TEST_DIR/t.qed': Invalid argument
+no file open, try 'help open'
+
+== invalid L1 table offset ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
+qemu-io: can't open device TEST_DIR/t.qed: Could not open 'TEST_DIR/t.qed': Invalid argument
+no file open, try 'help open'
+
+== invalid image size ==
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
+qemu-io: can't open device TEST_DIR/t.qed: Could not open 'TEST_DIR/t.qed': Invalid argument
+no file open, try 'help open'
+*** done

+ 1 - 0
tests/qemu-iotests/common.filter

@@ -200,6 +200,7 @@ _filter_img_info()
     sed -e "s#$IMGPROTO:$TEST_DIR#TEST_DIR#g" \
         -e "s#$TEST_DIR#TEST_DIR#g" \
         -e "s#$IMGFMT#IMGFMT#g" \
+        -e 's#nbd://127.0.0.1:10810$#TEST_DIR/t.IMGFMT#g' \
         -e "/encrypted: yes/d" \
         -e "/cluster_size: [0-9]\\+/d" \
         -e "/table_size: [0-9]\\+/d" \

+ 1 - 1
tests/qemu-iotests/common.rc

@@ -153,7 +153,7 @@ _make_test_img()
 
     # Start an NBD server on the image file, which is what we'll be talking to
     if [ $IMGPROTO = "nbd" ]; then
-        eval "$QEMU_NBD -v -t -b 127.0.0.1 -p 10810  $TEST_IMG_FILE &"
+        eval "$QEMU_NBD -v -t -b 127.0.0.1 -p 10810 -f $IMGFMT  $TEST_IMG_FILE &"
         QEMU_NBD_PID=$!
         sleep 1 # FIXME: qemu-nbd needs to be listening before we continue
     fi

+ 1 - 0
tests/qemu-iotests/group

@@ -116,3 +116,4 @@
 111 rw auto quick
 113 rw auto quick
 114 rw auto quick
+116 rw auto quick

+ 1 - 1
tests/qemu-iotests/iotests.py

@@ -288,7 +288,7 @@ def main(supported_fmts=[], supported_oses=['linux']):
     if supported_fmts and (imgfmt not in supported_fmts):
         notrun('not suitable for this image format: %s' % imgfmt)
 
-    if sys.platform not in supported_oses:
+    if True not in [sys.platform.startswith(x) for x in supported_oses]:
         notrun('not suitable for this OS: %s' % sys.platform)
 
     # We need to filter out the time taken from the output so that qemu-iotest

+ 119 - 0
tests/test-write-threshold.c

@@ -0,0 +1,119 @@
+/*
+ * Test block device write threshold
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include <glib.h>
+#include <stdint.h>
+#include "block/block_int.h"
+#include "block/write-threshold.h"
+
+
+static void test_threshold_not_set_on_init(void)
+{
+    uint64_t res;
+    BlockDriverState bs;
+    memset(&bs, 0, sizeof(bs));
+
+    g_assert(!bdrv_write_threshold_is_set(&bs));
+
+    res = bdrv_write_threshold_get(&bs);
+    g_assert_cmpint(res, ==, 0);
+}
+
+static void test_threshold_set_get(void)
+{
+    uint64_t threshold = 4 * 1024 * 1024;
+    uint64_t res;
+    BlockDriverState bs;
+    memset(&bs, 0, sizeof(bs));
+
+    bdrv_write_threshold_set(&bs, threshold);
+
+    g_assert(bdrv_write_threshold_is_set(&bs));
+
+    res = bdrv_write_threshold_get(&bs);
+    g_assert_cmpint(res, ==, threshold);
+}
+
+static void test_threshold_multi_set_get(void)
+{
+    uint64_t threshold1 = 4 * 1024 * 1024;
+    uint64_t threshold2 = 15 * 1024 * 1024;
+    uint64_t res;
+    BlockDriverState bs;
+    memset(&bs, 0, sizeof(bs));
+
+    bdrv_write_threshold_set(&bs, threshold1);
+    bdrv_write_threshold_set(&bs, threshold2);
+    res = bdrv_write_threshold_get(&bs);
+    g_assert_cmpint(res, ==, threshold2);
+}
+
+static void test_threshold_not_trigger(void)
+{
+    uint64_t amount = 0;
+    uint64_t threshold = 4 * 1024 * 1024;
+    BlockDriverState bs;
+    BdrvTrackedRequest req;
+
+    memset(&bs, 0, sizeof(bs));
+    memset(&req, 0, sizeof(req));
+    req.offset = 1024;
+    req.bytes = 1024;
+
+    bdrv_write_threshold_set(&bs, threshold);
+    amount = bdrv_write_threshold_exceeded(&bs, &req);
+    g_assert_cmpuint(amount, ==, 0);
+}
+
+
+static void test_threshold_trigger(void)
+{
+    uint64_t amount = 0;
+    uint64_t threshold = 4 * 1024 * 1024;
+    BlockDriverState bs;
+    BdrvTrackedRequest req;
+
+    memset(&bs, 0, sizeof(bs));
+    memset(&req, 0, sizeof(req));
+    req.offset = (4 * 1024 * 1024) - 1024;
+    req.bytes = 2 * 1024;
+
+    bdrv_write_threshold_set(&bs, threshold);
+    amount = bdrv_write_threshold_exceeded(&bs, &req);
+    g_assert_cmpuint(amount, >=, 1024);
+}
+
+typedef struct TestStruct {
+    const char *name;
+    void (*func)(void);
+} TestStruct;
+
+
+int main(int argc, char **argv)
+{
+    size_t i;
+    TestStruct tests[] = {
+        { "/write-threshold/not-set-on-init",
+          test_threshold_not_set_on_init },
+        { "/write-threshold/set-get",
+          test_threshold_set_get },
+        { "/write-threshold/multi-set-get",
+          test_threshold_multi_set_get },
+        { "/write-threshold/not-trigger",
+          test_threshold_not_trigger },
+        { "/write-threshold/trigger",
+          test_threshold_trigger },
+        { NULL, NULL }
+    };
+
+    g_test_init(&argc, &argv, NULL);
+    for (i = 0; tests[i].name != NULL; i++) {
+        g_test_add_func(tests[i].name, tests[i].func);
+    }
+    return g_test_run();
+}

+ 1 - 0
trace-events

@@ -116,6 +116,7 @@ virtio_blk_req_complete(void *req, int status) "req %p status %d"
 virtio_blk_rw_complete(void *req, int ret) "req %p ret %d"
 virtio_blk_handle_write(void *req, uint64_t sector, size_t nsectors) "req %p sector %"PRIu64" nsectors %zu"
 virtio_blk_handle_read(void *req, uint64_t sector, size_t nsectors) "req %p sector %"PRIu64" nsectors %zu"
+virtio_blk_submit_multireq(void *mrb, int start, int num_reqs, uint64_t sector, size_t nsectors, bool is_write) "mrb %p start %d num_reqs %d sector %"PRIu64" nsectors %zu is_write %d"
 
 # hw/block/dataplane/virtio-blk.c
 virtio_blk_data_plane_start(void *s) "dataplane %p"