19 files changed, 1273 insertions, 135 deletions
diff --git a/block/Makefile.objs b/block/Makefile.objs
index b5754d39b..554f429d0 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -3,9 +3,12 @@ block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-c
 block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-obj-y += qed-check.o
 block-obj-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
-block-obj-y += stream.o
 block-obj-$(CONFIG_WIN32) += raw-win32.o
 block-obj-$(CONFIG_POSIX) += raw-posix.o
 block-obj-$(CONFIG_LIBISCSI) += iscsi.o
 block-obj-$(CONFIG_CURL) += curl.o
 block-obj-$(CONFIG_RBD) += rbd.o
+block-obj-$(CONFIG_GLUSTERFS) += gluster.o
+
+common-obj-y += stream.o
+common-obj-y += commit.o
diff --git a/block/blkdebug.c b/block/blkdebug.c
index 59dcea065..1206d5256 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -28,6 +28,7 @@
 
 typedef struct BDRVBlkdebugState {
     int state;
+    int new_state;
     QLIST_HEAD(, BlkdebugRule) rules[BLKDBG_EVENT_MAX];
     QSIMPLEQ_HEAD(, BlkdebugRule) active_rules;
 } BDRVBlkdebugState;
@@ -403,12 +404,12 @@ static void blkdebug_close(BlockDriverState *bs)
 }
 
 static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule,
-    int old_state, bool injected)
+    bool injected)
 {
     BDRVBlkdebugState *s = bs->opaque;
 
     /* Only process rules for the current state */
-    if (rule->state && rule->state != old_state) {
+    if (rule->state && rule->state != s->state) {
         return injected;
     }
 
@@ -423,7 +424,7 @@ static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule,
         break;
 
     case ACTION_SET_STATE:
-        s->state = rule->options.set_state.new_state;
+        s->new_state = rule->options.set_state.new_state;
         break;
     }
     return injected;
@@ -433,15 +434,16 @@ static void blkdebug_debug_event(BlockDriverState *bs, BlkDebugEvent event)
 {
     BDRVBlkdebugState *s = bs->opaque;
     struct BlkdebugRule *rule;
-    int old_state = s->state;
     bool injected;
 
     assert((int)event >= 0 && event < BLKDBG_EVENT_MAX);
 
     injected = false;
+    s->new_state = s->state;
     QLIST_FOREACH(rule, &s->rules[event], next) {
-        injected = process_rule(bs, rule, old_state, injected);
+        injected = process_rule(bs, rule, injected);
     }
+    s->state = s->new_state;
 }
 
 static int64_t blkdebug_getlength(BlockDriverState *bs)
diff --git a/block/commit.c b/block/commit.c
new file mode 100644
index 000000000..733c91403
--- /dev/null
+++ b/block/commit.c
@@ -0,0 +1,268 @@
+/*
+ * Live block commit
+ *
+ * Copyright Red Hat, Inc. 2012
+ *
+ * Authors:
+ *  Jeff Cody   <jcody@redhat.com>
+ *  Based on stream.c by Stefan Hajnoczi
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "trace.h"
+#include "block_int.h"
+#include "blockjob.h"
+#include "qemu/ratelimit.h"
+
+enum {
+    /*
+     * Size of data buffer for populating the image file.  This should be large
+     * enough to process multiple clusters in a single call, so that populating
+     * contiguous regions of the image is efficient.
+     */
+    COMMIT_BUFFER_SIZE = 512 * 1024, /* in bytes */
+};
+
+#define SLICE_TIME 100000000ULL /* ns */
+
+typedef struct CommitBlockJob {
+    BlockJob common;
+    RateLimit limit;
+    BlockDriverState *active;
+    BlockDriverState *top;
+    BlockDriverState *base;
+    BlockdevOnError on_error;
+    int base_flags;
+    int orig_overlay_flags;
+} CommitBlockJob;
+
+static int coroutine_fn commit_populate(BlockDriverState *bs,
+                                        BlockDriverState *base,
+                                        int64_t sector_num, int nb_sectors,
+                                        void *buf)
+{
+    int ret = 0;
+
+    ret = bdrv_read(bs, sector_num, buf, nb_sectors);
+    if (ret) {
+        return ret;
+    }
+
+    ret = bdrv_write(base, sector_num, buf, nb_sectors);
+    if (ret) {
+        return ret;
+    }
+
+    return 0;
+}
+
+static void coroutine_fn commit_run(void *opaque)
+{
+    CommitBlockJob *s = opaque;
+    BlockDriverState *active = s->active;
+    BlockDriverState *top = s->top;
+    BlockDriverState *base = s->base;
+    BlockDriverState *overlay_bs = NULL;
+    int64_t sector_num, end;
+    int ret = 0;
+    int n = 0;
+    void *buf;
+    int bytes_written = 0;
+    int64_t base_len;
+
+    ret = s->common.len = bdrv_getlength(top);
+
+
+    if (s->common.len < 0) {
+        goto exit_restore_reopen;
+    }
+
+    ret = base_len = bdrv_getlength(base);
+    if (base_len < 0) {
+        goto exit_restore_reopen;
+    }
+
+    if (base_len < s->common.len) {
+        ret = bdrv_truncate(base, s->common.len);
+        if (ret) {
+            goto exit_restore_reopen;
+        }
+    }
+
+    overlay_bs = bdrv_find_overlay(active, top);
+
+    end = s->common.len >> BDRV_SECTOR_BITS;
+    buf = qemu_blockalign(top, COMMIT_BUFFER_SIZE);
+
+    for (sector_num = 0; sector_num < end; sector_num += n) {
+        uint64_t delay_ns = 0;
+        bool copy;
+
+wait:
+        /* Note that even when no rate limit is applied we need to yield
+         * with no pending I/O here so that qemu_aio_flush() returns.
+         */
+        block_job_sleep_ns(&s->common, rt_clock, delay_ns);
+        if (block_job_is_cancelled(&s->common)) {
+            break;
+        }
+        /* Copy if allocated above the base */
+        ret = bdrv_co_is_allocated_above(top, base, sector_num,
+                                         COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE,
+                                         &n);
+        copy = (ret == 1);
+        trace_commit_one_iteration(s, sector_num, n, ret);
+        if (copy) {
+            if (s->common.speed) {
+                delay_ns = ratelimit_calculate_delay(&s->limit, n);
+                if (delay_ns > 0) {
+                    goto wait;
+                }
+            }
+            ret = commit_populate(top, base, sector_num, n, buf);
+            bytes_written += n * BDRV_SECTOR_SIZE;
+        }
+        if (ret < 0) {
+            if (s->on_error == BLOCKDEV_ON_ERROR_STOP ||
+                s->on_error == BLOCKDEV_ON_ERROR_REPORT||
+                (s->on_error == BLOCKDEV_ON_ERROR_ENOSPC && ret == -ENOSPC)) {
+                goto exit_free_buf;
+            } else {
+                n = 0;
+                continue;
+            }
+        }
+        /* Publish progress */
+        s->common.offset += n * BDRV_SECTOR_SIZE;
+    }
+
+    ret = 0;
+
+    if (!block_job_is_cancelled(&s->common) && sector_num == end) {
+        /* success */
+        ret = bdrv_drop_intermediate(active, top, base);
+    }
+
+exit_free_buf:
+    qemu_vfree(buf);
+
+exit_restore_reopen:
+    /* restore base open flags here if appropriate (e.g., change the base back
+     * to r/o). These reopens do not need to be atomic, since we won't abort
+     * even on failure here */
+    if (s->base_flags != bdrv_get_flags(base)) {
+        bdrv_reopen(base, s->base_flags, NULL);
+    }
+    if (s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) {
+        bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL);
+    }
+
+    block_job_complete(&s->common, ret);
+}
+
+static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp)
+{
+    CommitBlockJob *s = container_of(job, CommitBlockJob, common);
+
+    if (speed < 0) {
+        error_set(errp, QERR_INVALID_PARAMETER, "speed");
+        return;
+    }
+    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
+}
+
+static BlockJobType commit_job_type = {
+    .instance_size = sizeof(CommitBlockJob),
+    .job_type      = "commit",
+    .set_speed     = commit_set_speed,
+};
+
+void commit_start(BlockDriverState *bs, BlockDriverState *base,
+                  BlockDriverState *top, int64_t speed,
+                  BlockdevOnError on_error, BlockDriverCompletionFunc *cb,
+                  void *opaque, Error **errp)
+{
+    CommitBlockJob *s;
+    BlockReopenQueue *reopen_queue = NULL;
+    int orig_overlay_flags;
+    int orig_base_flags;
+    BlockDriverState *overlay_bs;
+    Error *local_err = NULL;
+
+    if ((on_error == BLOCKDEV_ON_ERROR_STOP ||
+         on_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
+        !bdrv_iostatus_is_enabled(bs)) {
+        error_set(errp, QERR_INVALID_PARAMETER_COMBINATION);
+        return;
+    }
+
+    /* Once we support top == active layer, remove this check */
+    if (top == bs) {
+        error_setg(errp,
+                   "Top image as the active layer is currently unsupported");
+        return;
+    }
+
+    if (top == base) {
+        error_setg(errp, "Invalid files for merge: top and base are the same");
+        return;
+    }
+
+    /* top and base may be valid, but let's make sure that base is reachable
+     * from top */
+    if (bdrv_find_backing_image(top, base->filename) != base) {
+        error_setg(errp,
+                   "Base (%s) is not reachable from top (%s)",
+                   base->filename, top->filename);
+        return;
+    }
+
+    overlay_bs = bdrv_find_overlay(bs, top);
+
+    if (overlay_bs == NULL) {
+        error_setg(errp, "Could not find overlay image for %s:", top->filename);
+        return;
+    }
+
+    orig_base_flags    = bdrv_get_flags(base);
+    orig_overlay_flags = bdrv_get_flags(overlay_bs);
+
+    /* convert base & overlay_bs to r/w, if necessary */
+    if (!(orig_base_flags & BDRV_O_RDWR)) {
+        reopen_queue = bdrv_reopen_queue(reopen_queue, base,
+                                         orig_base_flags | BDRV_O_RDWR);
+    }
+    if (!(orig_overlay_flags & BDRV_O_RDWR)) {
+        reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs,
+                                         orig_overlay_flags | BDRV_O_RDWR);
+    }
+    if (reopen_queue) {
+        bdrv_reopen_multiple(reopen_queue, &local_err);
+        if (local_err != NULL) {
+            error_propagate(errp, local_err);
+            return;
+        }
+    }
+
+
+    s = block_job_create(&commit_job_type, bs, speed, cb, opaque, errp);
+    if (!s) {
+        return;
+    }
+
+    s->base   = base;
+    s->top    = top;
+    s->active = bs;
+
+    s->base_flags          = orig_base_flags;
+    s->orig_overlay_flags  = orig_overlay_flags;
+
+    s->on_error = on_error;
+    s->common.co = qemu_coroutine_create(commit_run);
+
+    trace_commit_start(bs, base, top, s, s->common.co, opaque);
+    qemu_coroutine_enter(s->common.co, s);
+}
diff --git a/block/curl.c b/block/curl.c
index e7c3634d3..c1074cd2e 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -542,8 +542,7 @@ static void curl_close(BlockDriverState *bs)
     }
     if (s->multi)
         curl_multi_cleanup(s->multi);
-    if (s->url)
-        free(s->url);
+    g_free(s->url);
 }
 
 static int64_t curl_getlength(BlockDriverState *bs)
diff --git a/block/gluster.c b/block/gluster.c
new file mode 100644
index 000000000..3588d7377
--- /dev/null
+++ b/block/gluster.c
@@ -0,0 +1,624 @@
+/*
+ * GlusterFS backend for QEMU
+ *
+ * Copyright (C) 2012 Bharata B Rao <bharata@linux.vnet.ibm.com>
+ *
+ * Pipe handling mechanism in AIO implementation is derived from
+ * block/rbd.c. Hence,
+ *
+ * Copyright (C) 2010-2011 Christian Brunner <chb@muc.de>,
+ *                         Josh Durgin <josh.durgin@dreamhost.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+#include <glusterfs/api/glfs.h>
+#include "block_int.h"
+#include "qemu_socket.h"
+#include "uri.h"
+
+typedef struct GlusterAIOCB {
+    BlockDriverAIOCB common;
+    int64_t size;
+    int ret;
+    bool *finished;
+    QEMUBH *bh;
+} GlusterAIOCB;
+
+typedef struct BDRVGlusterState {
+    struct glfs *glfs;
+    int fds[2];
+    struct glfs_fd *fd;
+    int qemu_aio_count;
+    int event_reader_pos;
+    GlusterAIOCB *event_acb;
+} BDRVGlusterState;
+
+#define GLUSTER_FD_READ  0
+#define GLUSTER_FD_WRITE 1
+
+typedef struct GlusterConf {
+    char *server;
+    int port;
+    char *volname;
+    char *image;
+    char *transport;
+} GlusterConf;
+
+static void qemu_gluster_gconf_free(GlusterConf *gconf)
+{
+    g_free(gconf->server);
+    g_free(gconf->volname);
+    g_free(gconf->image);
+    g_free(gconf->transport);
+    g_free(gconf);
+}
+
+static int parse_volume_options(GlusterConf *gconf, char *path)
+{
+    char *p, *q;
+
+    if (!path) {
+        return -EINVAL;
+    }
+
+    /* volume */
+    p = q = path + strspn(path, "/");
+    p += strcspn(p, "/");
+    if (*p == '\0') {
+        return -EINVAL;
+    }
+    gconf->volname = g_strndup(q, p - q);
+
+    /* image */
+    p += strspn(p, "/");
+    if (*p == '\0') {
+        return -EINVAL;
+    }
+    gconf->image = g_strdup(p);
+    return 0;
+}
+
+/*
+ * file=gluster[+transport]://[server[:port]]/volname/image[?socket=...]
+ *
+ * 'gluster' is the protocol.
+ *
+ * 'transport' specifies the transport type used to connect to gluster
+ * management daemon (glusterd). Valid transport types are
+ * tcp, unix and rdma. If a transport type isn't specified, then tcp
+ * type is assumed.
+ *
+ * 'server' specifies the server where the volume file specification for
+ * the given volume resides. This can be either hostname, ipv4 address
+ * or ipv6 address. ipv6 address needs to be within square brackets [ ].
+ * If transport type is 'unix', then 'server' field should not be specifed.
+ * The 'socket' field needs to be populated with the path to unix domain
+ * socket.
+ *
+ * 'port' is the port number on which glusterd is listening. This is optional
+ * and if not specified, QEMU will send 0 which will make gluster to use the
+ * default port. If the transport type is unix, then 'port' should not be
+ * specified.
+ *
+ * 'volname' is the name of the gluster volume which contains the VM image.
+ *
+ * 'image' is the path to the actual VM image that resides on gluster volume.
+ *
+ * Examples:
+ *
+ * file=gluster://1.2.3.4/testvol/a.img
+ * file=gluster+tcp://1.2.3.4/testvol/a.img
+ * file=gluster+tcp://1.2.3.4:24007/testvol/dir/a.img
+ * file=gluster+tcp://[1:2:3:4:5:6:7:8]/testvol/dir/a.img
+ * file=gluster+tcp://[1:2:3:4:5:6:7:8]:24007/testvol/dir/a.img
+ * file=gluster+tcp://server.domain.com:24007/testvol/dir/a.img
+ * file=gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket
+ * file=gluster+rdma://1.2.3.4:24007/testvol/a.img
+ */
+static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename)
+{
+    URI *uri;
+    QueryParams *qp = NULL;
+    bool is_unix = false;
+    int ret = 0;
+
+    uri = uri_parse(filename);
+    if (!uri) {
+        return -EINVAL;
+    }
+
+    /* transport */
+    if (!strcmp(uri->scheme, "gluster")) {
+        gconf->transport = g_strdup("tcp");
+    } else if (!strcmp(uri->scheme, "gluster+tcp")) {
+        gconf->transport = g_strdup("tcp");
+    } else if (!strcmp(uri->scheme, "gluster+unix")) {
+        gconf->transport = g_strdup("unix");
+        is_unix = true;
+    } else if (!strcmp(uri->scheme, "gluster+rdma")) {
+        gconf->transport = g_strdup("rdma");
+    } else {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    ret = parse_volume_options(gconf, uri->path);
+    if (ret < 0) {
+        goto out;
+    }
+
+    qp = query_params_parse(uri->query);
+    if (qp->n > 1 || (is_unix && !qp->n) || (!is_unix && qp->n)) {
+        ret = -EINVAL;
+        goto out;
+    }
+
+    if (is_unix) {
+        if (uri->server || uri->port) {
+            ret = -EINVAL;
+            goto out;
+        }
+        if (strcmp(qp->p[0].name, "socket")) {
+            ret = -EINVAL;
+            goto out;
+        }
+        gconf->server = g_strdup(qp->p[0].value);
+    } else {
+        gconf->server = g_strdup(uri->server);
+        gconf->port = uri->port;
+    }
+
+out:
+    if (qp) {
+        query_params_free(qp);
+    }
+    uri_free(uri);
+    return ret;
+}
+
+static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename)
+{
+    struct glfs *glfs = NULL;
+    int ret;
+    int old_errno;
+
+    ret = qemu_gluster_parseuri(gconf, filename);
+    if (ret < 0) {
+        error_report("Usage: file=gluster[+transport]://[server[:port]]/"
+            "volname/image[?socket=...]");
+        errno = -ret;
+        goto out;
+    }
+
+    glfs = glfs_new(gconf->volname);
+    if (!glfs) {
+        goto out;
+    }
+
+    ret = glfs_set_volfile_server(glfs, gconf->transport, gconf->server,
+            gconf->port);
+    if (ret < 0) {
+        goto out;
+    }
+
+    /*
+     * TODO: Use GF_LOG_ERROR instead of hard code value of 4 here when
+     * GlusterFS makes GF_LOG_* macros available to libgfapi users.
+     */
+    ret = glfs_set_logging(glfs, "-", 4);
+    if (ret < 0) {
+        goto out;
+    }
+
+    ret = glfs_init(glfs);
+    if (ret) {
+        error_report("Gluster connection failed for server=%s port=%d "
+             "volume=%s image=%s transport=%s\n", gconf->server, gconf->port,
+             gconf->volname, gconf->image, gconf->transport);
+        goto out;
+    }
+    return glfs;
+
+out:
+    if (glfs) {
+        old_errno = errno;
+        glfs_fini(glfs);
+        errno = old_errno;
+    }
+    return NULL;
+}
+
+static void qemu_gluster_complete_aio(GlusterAIOCB *acb, BDRVGlusterState *s)
+{
+    int ret;
+    bool *finished = acb->finished;
+    BlockDriverCompletionFunc *cb = acb->common.cb;
+    void *opaque = acb->common.opaque;
+
+    if (!acb->ret || acb->ret == acb->size) {
+        ret = 0; /* Success */
+    } else if (acb->ret < 0) {
+        ret = acb->ret; /* Read/Write failed */
+    } else {
+        ret = -EIO; /* Partial read/write - fail it */
+    }
+
+    s->qemu_aio_count--;
+    qemu_aio_release(acb);
+    cb(opaque, ret);
+    if (finished) {
+        *finished = true;
+    }
+}
+
+static void qemu_gluster_aio_event_reader(void *opaque)
+{
+    BDRVGlusterState *s = opaque;
+    ssize_t ret;
+
+    do {
+        char *p = (char *)&s->event_acb;
+
+        ret = read(s->fds[GLUSTER_FD_READ], p + s->event_reader_pos,
+                   sizeof(s->event_acb) - s->event_reader_pos);
+        if (ret > 0) {
+            s->event_reader_pos += ret;
+            if (s->event_reader_pos == sizeof(s->event_acb)) {
+                s->event_reader_pos = 0;
+                qemu_gluster_complete_aio(s->event_acb, s);
+            }
+        }
+    } while (ret < 0 && errno == EINTR);
+}
+
+static int qemu_gluster_aio_flush_cb(void *opaque)
+{
+    BDRVGlusterState *s = opaque;
+
+    return (s->qemu_aio_count > 0);
+}
+
+static int qemu_gluster_open(BlockDriverState *bs, const char *filename,
+    int bdrv_flags)
+{
+    BDRVGlusterState *s = bs->opaque;
+    int open_flags = O_BINARY;
+    int ret = 0;
+    GlusterConf *gconf = g_malloc0(sizeof(GlusterConf));
+
+    s->glfs = qemu_gluster_init(gconf, filename);
+    if (!s->glfs) {
+        ret = -errno;
+        goto out;
+    }
+
+    if (bdrv_flags & BDRV_O_RDWR) {
+        open_flags |= O_RDWR;
+    } else {
+        open_flags |= O_RDONLY;
+    }
+
+    if ((bdrv_flags & BDRV_O_NOCACHE)) {
+        open_flags |= O_DIRECT;
+    }
+
+    s->fd = glfs_open(s->glfs, gconf->image, open_flags);
+    if (!s->fd) {
+        ret = -errno;
+        goto out;
+    }
+
+    ret = qemu_pipe(s->fds);
+    if (ret < 0) {
+        ret = -errno;
+        goto out;
+    }
+    fcntl(s->fds[GLUSTER_FD_READ], F_SETFL, O_NONBLOCK);
+    qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ],
+        qemu_gluster_aio_event_reader, NULL, qemu_gluster_aio_flush_cb, s);
+
+out:
+    qemu_gluster_gconf_free(gconf);
+    if (!ret) {
+        return ret;
+    }
+    if (s->fd) {
+        glfs_close(s->fd);
+    }
+    if (s->glfs) {
+        glfs_fini(s->glfs);
+    }
+    return ret;
+}
+
+static int qemu_gluster_create(const char *filename,
+        QEMUOptionParameter *options)
+{
+    struct glfs *glfs;
+    struct glfs_fd *fd;
+    int ret = 0;
+    int64_t total_size = 0;
+    GlusterConf *gconf = g_malloc0(sizeof(GlusterConf));
+
+    glfs = qemu_gluster_init(gconf, filename);
+    if (!glfs) {
+        ret = -errno;
+        goto out;
+    }
+
+    while (options && options->name) {
+        if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+            total_size = options->value.n / BDRV_SECTOR_SIZE;
+        }
+        options++;
+    }
+
+    fd = glfs_creat(glfs, gconf->image,
+        O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR);
+    if (!fd) {
+        ret = -errno;
+    } else {
+        if (glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) {
+            ret = -errno;
+        }
+        if (glfs_close(fd) != 0) {
+            ret = -errno;
+        }
+    }
+out:
+    qemu_gluster_gconf_free(gconf);
+    if (glfs) {
+        glfs_fini(glfs);
+    }
+    return ret;
+}
+
+static void qemu_gluster_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    GlusterAIOCB *acb = (GlusterAIOCB *)blockacb;
+    bool finished = false;
+
+    acb->finished = &finished;
+    while (!finished) {
+        qemu_aio_wait();
+    }
+}
+
+static AIOPool gluster_aio_pool = {
+    .aiocb_size = sizeof(GlusterAIOCB),
+    .cancel = qemu_gluster_aio_cancel,
+};
+
+static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
+{
+    GlusterAIOCB *acb = (GlusterAIOCB *)arg;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVGlusterState *s = bs->opaque;
+    int retval;
+
+    acb->ret = ret;
+    retval = qemu_write_full(s->fds[GLUSTER_FD_WRITE], &acb, sizeof(acb));
+    if (retval != sizeof(acb)) {
+        /*
+         * Gluster AIO callback thread failed to notify the waiting
+         * QEMU thread about IO completion.
+         *
+         * Complete this IO request and make the disk inaccessible for
+         * subsequent reads and writes.
+         */
+        error_report("Gluster failed to notify QEMU about IO completion");
+
+        qemu_mutex_lock_iothread(); /* We are in gluster thread context */
+        acb->common.cb(acb->common.opaque, -EIO);
+        qemu_aio_release(acb);
+        s->qemu_aio_count--;
+        close(s->fds[GLUSTER_FD_READ]);
+        close(s->fds[GLUSTER_FD_WRITE]);
+        qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ], NULL, NULL, NULL,
+            NULL);
+        bs->drv = NULL; /* Make the disk inaccessible */
+        qemu_mutex_unlock_iothread();
+    }
+}
+
+static BlockDriverAIOCB *qemu_gluster_aio_rw(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int write)
+{
+    int ret;
+    GlusterAIOCB *acb;
+    BDRVGlusterState *s = bs->opaque;
+    size_t size;
+    off_t offset;
+
+    offset = sector_num * BDRV_SECTOR_SIZE;
+    size = nb_sectors * BDRV_SECTOR_SIZE;
+    s->qemu_aio_count++;
+
+    acb = qemu_aio_get(&gluster_aio_pool, bs, cb, opaque);
+    acb->size = size;
+    acb->ret = 0;
+    acb->finished = NULL;
+
+    if (write) {
+        ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0,
+            &gluster_finish_aiocb, acb);
+    } else {
+        ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0,
+            &gluster_finish_aiocb, acb);
+    }
+
+    if (ret < 0) {
+        goto out;
+    }
+    return &acb->common;
+
+out:
+    s->qemu_aio_count--;
+    qemu_aio_release(acb);
+    return NULL;
+}
+
+static BlockDriverAIOCB *qemu_gluster_aio_readv(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
+}
+
+static BlockDriverAIOCB *qemu_gluster_aio_writev(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
+}
+
+static BlockDriverAIOCB *qemu_gluster_aio_flush(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    int ret;
+    GlusterAIOCB *acb;
+    BDRVGlusterState *s = bs->opaque;
+
+    acb = qemu_aio_get(&gluster_aio_pool, bs, cb, opaque);
+    acb->size = 0;
+    acb->ret = 0;
+    acb->finished = NULL;
+    s->qemu_aio_count++;
+
+    ret = glfs_fsync_async(s->fd, &gluster_finish_aiocb, acb);
+    if (ret < 0) {
+        goto out;
+    }
+    return &acb->common;
+
+out:
+    s->qemu_aio_count--;
+    qemu_aio_release(acb);
+    return NULL;
+}
+
+static int64_t qemu_gluster_getlength(BlockDriverState *bs)
+{
+    BDRVGlusterState *s = bs->opaque;
+    int64_t ret;
+
+    ret = glfs_lseek(s->fd, 0, SEEK_END);
+    if (ret < 0) {
+        return -errno;
+    } else {
+        return ret;
+    }
+}
+
+static int64_t qemu_gluster_allocated_file_size(BlockDriverState *bs)
+{
+    BDRVGlusterState *s = bs->opaque;
+    struct stat st;
+    int ret;
+
+    ret = glfs_fstat(s->fd, &st);
+    if (ret < 0) {
+        return -errno;
+    } else {
+        return st.st_blocks * 512;
+    }
+}
+
+static void qemu_gluster_close(BlockDriverState *bs)
+{
+    BDRVGlusterState *s = bs->opaque;
+
+    close(s->fds[GLUSTER_FD_READ]);
+    close(s->fds[GLUSTER_FD_WRITE]);
+    qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ], NULL, NULL, NULL, NULL);
+
+    if (s->fd) {
+        glfs_close(s->fd);
+        s->fd = NULL;
+    }
+    glfs_fini(s->glfs);
+}
+
+static QEMUOptionParameter qemu_gluster_create_options[] = {
+    {
+        .name = BLOCK_OPT_SIZE,
+        .type = OPT_SIZE,
+        .help = "Virtual disk size"
+    },
+    { NULL }
+};
+
+static BlockDriver bdrv_gluster = {
+    .format_name                  = "gluster",
+    .protocol_name                = "gluster",
+    .instance_size                = sizeof(BDRVGlusterState),
+    .bdrv_file_open               = qemu_gluster_open,
+    .bdrv_close                   = qemu_gluster_close,
+    .bdrv_create                  = qemu_gluster_create,
+    .bdrv_getlength               = qemu_gluster_getlength,
+    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
+    .bdrv_aio_readv               = qemu_gluster_aio_readv,
+    .bdrv_aio_writev              = qemu_gluster_aio_writev,
+    .bdrv_aio_flush               = qemu_gluster_aio_flush,
+    .create_options               = qemu_gluster_create_options,
+};
+
+static BlockDriver bdrv_gluster_tcp = {
+    .format_name                  = "gluster",
+    .protocol_name                = "gluster+tcp",
+    .instance_size                = sizeof(BDRVGlusterState),
+    .bdrv_file_open               = qemu_gluster_open,
+    .bdrv_close                   = qemu_gluster_close,
+    .bdrv_create                  = qemu_gluster_create,
+    .bdrv_getlength               = qemu_gluster_getlength,
+    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
+    .bdrv_aio_readv               = qemu_gluster_aio_readv,
+    .bdrv_aio_writev              = qemu_gluster_aio_writev,
+    .bdrv_aio_flush               = qemu_gluster_aio_flush,
+    .create_options               = qemu_gluster_create_options,
+};
+
+static BlockDriver bdrv_gluster_unix = {
+    .format_name                  = "gluster",
+    .protocol_name                = "gluster+unix",
+    .instance_size                = sizeof(BDRVGlusterState),
+    .bdrv_file_open               = qemu_gluster_open,
+    .bdrv_close                   = qemu_gluster_close,
+    .bdrv_create                  = qemu_gluster_create,
+    .bdrv_getlength               = qemu_gluster_getlength,
+    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
+    .bdrv_aio_readv               = qemu_gluster_aio_readv,
+    .bdrv_aio_writev              = qemu_gluster_aio_writev,
+    .bdrv_aio_flush               = qemu_gluster_aio_flush,
+    .create_options               = qemu_gluster_create_options,
+};
+
+static BlockDriver bdrv_gluster_rdma = {
+    .format_name                  = "gluster",
+    .protocol_name                = "gluster+rdma",
+    .instance_size                = sizeof(BDRVGlusterState),
+    .bdrv_file_open               = qemu_gluster_open,
+    .bdrv_close                   = qemu_gluster_close,
+    .bdrv_create                  = qemu_gluster_create,
+    .bdrv_getlength               = qemu_gluster_getlength,
+    .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size,
+    .bdrv_aio_readv               = qemu_gluster_aio_readv,
+    .bdrv_aio_writev              = qemu_gluster_aio_writev,
+    .bdrv_aio_flush               = qemu_gluster_aio_flush,
+    .create_options               = qemu_gluster_create_options,
+};
+
+static void bdrv_gluster_init(void)
+{
+    bdrv_register(&bdrv_gluster_rdma);
+    bdrv_register(&bdrv_gluster_unix);
+    bdrv_register(&bdrv_gluster_tcp);
+    bdrv_register(&bdrv_gluster);
+}
+
+block_init(bdrv_gluster_init);
diff --git a/block/iscsi.c b/block/iscsi.c
index 0b96165ec..d0b1a10ee 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -167,12 +167,6 @@ iscsi_set_events(IscsiLun *iscsilun)
 
     }
 
-    /* If we just added an event, the callback might be delayed
-     * unless we call qemu_notify_event().
-     */
-    if (ev & ~iscsilun->events) {
-        qemu_notify_event();
-    }
     iscsilun->events = ev;
 }
 
@@ -268,10 +262,6 @@ iscsi_aio_writev(BlockDriverState *bs, int64_t sector_num,
     acb->task->xfer_dir = SCSI_XFER_WRITE;
     acb->task->cdb_size = 16;
     acb->task->cdb[0] = 0x8a;
-    if (!(bs->open_flags & BDRV_O_CACHE_WB)) {
-        /* set FUA on writes when cache mode is write through */
-        acb->task->cdb[1] |= 0x04;
-    }
     lba = sector_qemu2lun(sector_num, iscsilun);
     *(uint32_t *)&acb->task->cdb[2]  = htonl(lba >> 32);
     *(uint32_t *)&acb->task->cdb[6]  = htonl(lba & 0xffffffff);
@@ -628,9 +618,17 @@ static BlockDriverAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
     return &acb->common;
 }
 
+
+static void ioctl_cb(void *opaque, int status)
+{
+    int *p_status = opaque;
+    *p_status = status;
+}
+
 static int iscsi_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
 {
     IscsiLun *iscsilun = bs->opaque;
+    int status;
 
     switch (req) {
     case SG_GET_VERSION_NUM:
@@ -639,6 +637,15 @@ static int iscsi_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
     case SG_GET_SCSI_ID:
         ((struct sg_scsi_id *)buf)->scsi_type = iscsilun->type;
         break;
+    case SG_IO:
+        status = -EINPROGRESS;
+        iscsi_aio_ioctl(bs, req, buf, ioctl_cb, &status);
+
+        while (status == -EINPROGRESS) {
+            qemu_aio_wait();
+        }
+
+        return 0;
     default:
         return -1;
     }
diff --git a/block/qcow.c b/block/qcow.c
index 7b5ab87d2..b239c82ae 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -197,6 +197,15 @@ static int qcow_open(BlockDriverState *bs, int flags)
     return ret;
 }
 
+
+/* We have nothing to do for QCOW reopen, stubs just return
+ * success */
+static int qcow_reopen_prepare(BDRVReopenState *state,
+                               BlockReopenQueue *queue, Error **errp)
+{
+    return 0;
+}
+
 static int qcow_set_key(BlockDriverState *bs, const char *key)
 {
     BDRVQcowState *s = bs->opaque;
@@ -868,6 +877,7 @@ static BlockDriver bdrv_qcow = {
     .bdrv_probe		= qcow_probe,
     .bdrv_open		= qcow_open,
     .bdrv_close		= qcow_close,
+    .bdrv_reopen_prepare = qcow_reopen_prepare,
     .bdrv_create	= qcow_create,
 
     .bdrv_co_readv          = qcow_co_readv,
diff --git a/block/qcow2.c b/block/qcow2.c
index 8f183f146..c1ff31f48 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -52,6 +52,7 @@ typedef struct {
     uint32_t magic;
     uint32_t len;
 } QCowExtension;
+
 #define  QCOW2_EXT_MAGIC_END 0
 #define  QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
 #define  QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857
@@ -558,6 +559,14 @@ static int qcow2_set_key(BlockDriverState *bs, const char *key)
     return 0;
 }
 
+/* We have nothing to do for QCOW2 reopen, stubs just return
+ * success */
+static int qcow2_reopen_prepare(BDRVReopenState *state,
+                                BlockReopenQueue *queue, Error **errp)
+{
+    return 0;
+}
+
 static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, int *pnum)
 {
@@ -1087,6 +1096,7 @@ int qcow2_update_header(BlockDriverState *bs)
             goto fail;
         }
 
+        /* Using strncpy is ok here, since buf is not NUL-terminated. */
         strncpy(buf, bs->backing_file, buflen);
 
         header->backing_file_offset = cpu_to_be64(buf - ((char*) header));
@@ -1679,6 +1689,7 @@ static BlockDriver bdrv_qcow2 = {
     .bdrv_probe         = qcow2_probe,
     .bdrv_open          = qcow2_open,
     .bdrv_close         = qcow2_close,
+    .bdrv_reopen_prepare  = qcow2_reopen_prepare,
     .bdrv_create        = qcow2_create,
     .bdrv_co_is_allocated = qcow2_co_is_allocated,
     .bdrv_set_key       = qcow2_set_key,
diff --git a/block/qed-table.c b/block/qed-table.c
index ce07b0554..de845ec3d 100644
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -103,7 +103,6 @@ static void qed_write_table_cb(void *opaque, int ret)
 out:
     qemu_vfree(write_table_cb->table);
     gencb_complete(&write_table_cb->gencb, ret);
-    return;
 }
 
 /**
diff --git a/block/qed.c b/block/qed.c
index 21cb23987..6c182ca91 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -505,6 +505,14 @@ out:
     return ret;
 }
 
+/* We have nothing to do for QED reopen, stubs just return
+ * success */
+static int bdrv_qed_reopen_prepare(BDRVReopenState *state,
+                                   BlockReopenQueue *queue, Error **errp)
+{
+    return 0;
+}
+
 static void bdrv_qed_close(BlockDriverState *bs)
 {
     BDRVQEDState *s = bs->opaque;
@@ -1564,6 +1572,7 @@ static BlockDriver bdrv_qed = {
     .bdrv_rebind              = bdrv_qed_rebind,
     .bdrv_open                = bdrv_qed_open,
     .bdrv_close               = bdrv_qed_close,
+    .bdrv_reopen_prepare      = bdrv_qed_reopen_prepare,
     .bdrv_create              = bdrv_qed_create,
     .bdrv_co_is_allocated     = bdrv_qed_co_is_allocated,
     .bdrv_make_empty          = bdrv_qed_make_empty,
diff --git a/block/raw-posix.c b/block/raw-posix.c
index 6be20b192..28d439fa8 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -133,13 +133,19 @@ typedef struct BDRVRawState {
     int use_aio;
     void *aio_ctx;
 #endif
-    uint8_t *aligned_buf;
-    unsigned aligned_buf_size;
 #ifdef CONFIG_XFS
     bool is_xfs : 1;
 #endif
 } BDRVRawState;
 
+typedef struct BDRVRawReopenState {
+    int fd;
+    int open_flags;
+#ifdef CONFIG_LINUX_AIO
+    int use_aio;
+#endif
+} BDRVRawReopenState;
+
 static int fd_open(BlockDriverState *bs);
 static int64_t raw_getlength(BlockDriverState *bs);
 
@@ -185,6 +191,57 @@ static int raw_normalize_devicepath(const char **filename)
 }
 #endif
 
+static void raw_parse_flags(int bdrv_flags, int *open_flags)
+{
+    assert(open_flags != NULL);
+
+    *open_flags |= O_BINARY;
+    *open_flags &= ~O_ACCMODE;
+    if (bdrv_flags & BDRV_O_RDWR) {
+        *open_flags |= O_RDWR;
+    } else {
+        *open_flags |= O_RDONLY;
+    }
+
+    /* Use O_DSYNC for write-through caching, no flags for write-back caching,
+     * and O_DIRECT for no caching. */
+    if ((bdrv_flags & BDRV_O_NOCACHE)) {
+        *open_flags |= O_DIRECT;
+    }
+}
+
+#ifdef CONFIG_LINUX_AIO
+static int raw_set_aio(void **aio_ctx, int *use_aio, int bdrv_flags)
+{
+    int ret = -1;
+    assert(aio_ctx != NULL);
+    assert(use_aio != NULL);
+    /*
+     * Currently Linux do AIO only for files opened with O_DIRECT
+     * specified so check NOCACHE flag too
+     */
+    if ((bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) ==
+                      (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) {
+
+        /* if non-NULL, laio_init() has already been run */
+        if (*aio_ctx == NULL) {
+            *aio_ctx = laio_init();
+            if (!*aio_ctx) {
+                goto error;
+            }
+        }
+        *use_aio = 1;
+    } else {
+        *use_aio = 0;
+    }
+
+    ret = 0;
+
+error:
+    return ret;
+}
+#endif
+
 static int raw_open_common(BlockDriverState *bs, const char *filename,
                            int bdrv_flags, int open_flags)
 {
@@ -196,20 +253,8 @@ static int raw_open_common(BlockDriverState *bs, const char *filename,
         return ret;
     }
 
-    s->open_flags = open_flags | O_BINARY;
-    s->open_flags &= ~O_ACCMODE;
-    if (bdrv_flags & BDRV_O_RDWR) {
-        s->open_flags |= O_RDWR;
-    } else {
-        s->open_flags |= O_RDONLY;
-    }
-
-    /* Use O_DSYNC for write-through caching, no flags for write-back caching,
-     * and O_DIRECT for no caching. */
-    if ((bdrv_flags & BDRV_O_NOCACHE))
-        s->open_flags |= O_DIRECT;
-    if (!(bdrv_flags & BDRV_O_CACHE_WB))
-        s->open_flags |= O_DSYNC;
+    s->open_flags = open_flags;
+    raw_parse_flags(bdrv_flags, &s->open_flags);
 
     s->fd = -1;
     fd = qemu_open(filename, s->open_flags, 0644);
@@ -220,45 +265,17 @@ static int raw_open_common(BlockDriverState *bs, const char *filename,
         return ret;
     }
     s->fd = fd;
-    s->aligned_buf = NULL;
-
-    if ((bdrv_flags & BDRV_O_NOCACHE)) {
-        /*
-         * Allocate a buffer for read/modify/write cycles.  Chose the size
-         * pessimistically as we don't know the block size yet.
-         */
-        s->aligned_buf_size = 32 * MAX_BLOCKSIZE;
-        s->aligned_buf = qemu_memalign(MAX_BLOCKSIZE, s->aligned_buf_size);
-        if (s->aligned_buf == NULL) {
-            goto out_close;
-        }
-    }
 
     /* We're falling back to POSIX AIO in some cases so init always */
     if (paio_init() < 0) {
-        goto out_free_buf;
+        goto out_close;
     }
 
 #ifdef CONFIG_LINUX_AIO
-    /*
-     * Currently Linux do AIO only for files opened with O_DIRECT
-     * specified so check NOCACHE flag too
-     */
-    if ((bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) ==
-                      (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) {
-
-        s->aio_ctx = laio_init();
-        if (!s->aio_ctx) {
-            goto out_free_buf;
-        }
-        s->use_aio = 1;
-    } else
-#endif
-    {
-#ifdef CONFIG_LINUX_AIO
-        s->use_aio = 0;
-#endif
+    if (raw_set_aio(&s->aio_ctx, &s->use_aio, bdrv_flags)) {
+        goto out_close;
     }
+#endif
 
 #ifdef CONFIG_XFS
     if (platform_test_xfs_fd(s->fd)) {
@@ -268,8 +285,6 @@ static int raw_open_common(BlockDriverState *bs, const char *filename,
 
     return 0;
 
-out_free_buf:
-    qemu_vfree(s->aligned_buf);
 out_close:
     qemu_close(fd);
     return -errno;
@@ -283,6 +298,109 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)
     return raw_open_common(bs, filename, flags, 0);
 }
 
+static int raw_reopen_prepare(BDRVReopenState *state,
+                              BlockReopenQueue *queue, Error **errp)
+{
+    BDRVRawState *s;
+    BDRVRawReopenState *raw_s;
+    int ret = 0;
+
+    assert(state != NULL);
+    assert(state->bs != NULL);
+
+    s = state->bs->opaque;
+
+    state->opaque = g_malloc0(sizeof(BDRVRawReopenState));
+    raw_s = state->opaque;
+
+#ifdef CONFIG_LINUX_AIO
+    raw_s->use_aio = s->use_aio;
+
+    /* we can use s->aio_ctx instead of a copy, because the use_aio flag is
+     * valid in the 'false' condition even if aio_ctx is set, and raw_set_aio()
+     * won't override aio_ctx if aio_ctx is non-NULL */
+    if (raw_set_aio(&s->aio_ctx, &raw_s->use_aio, state->flags)) {
+        return -1;
+    }
+#endif
+
+    raw_parse_flags(state->flags, &raw_s->open_flags);
+
+    raw_s->fd = -1;
+
+    int fcntl_flags = O_APPEND | O_ASYNC | O_NONBLOCK;
+#ifdef O_NOATIME
+    fcntl_flags |= O_NOATIME;
+#endif
+
+    if ((raw_s->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) {
+        /* dup the original fd */
+        /* TODO: use qemu fcntl wrapper */
+#ifdef F_DUPFD_CLOEXEC
+        raw_s->fd = fcntl(s->fd, F_DUPFD_CLOEXEC, 0);
+#else
+        raw_s->fd = dup(s->fd);
+        if (raw_s->fd != -1) {
+            qemu_set_cloexec(raw_s->fd);
+        }
+#endif
+        if (raw_s->fd >= 0) {
+            ret = fcntl_setfl(raw_s->fd, raw_s->open_flags);
+            if (ret) {
+                qemu_close(raw_s->fd);
+                raw_s->fd = -1;
+            }
+        }
+    }
+
+    /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */
+    if (raw_s->fd == -1) {
+        assert(!(raw_s->open_flags & O_CREAT));
+        raw_s->fd = qemu_open(state->bs->filename, raw_s->open_flags);
+        if (raw_s->fd == -1) {
+            ret = -1;
+        }
+    }
+    return ret;
+}
+
+
+static void raw_reopen_commit(BDRVReopenState *state)
+{
+    BDRVRawReopenState *raw_s = state->opaque;
+    BDRVRawState *s = state->bs->opaque;
+
+    s->open_flags = raw_s->open_flags;
+
+    qemu_close(s->fd);
+    s->fd = raw_s->fd;
+#ifdef CONFIG_LINUX_AIO
+    s->use_aio = raw_s->use_aio;
+#endif
+
+    g_free(state->opaque);
+    state->opaque = NULL;
+}
+
+
+static void raw_reopen_abort(BDRVReopenState *state)
+{
+    BDRVRawReopenState *raw_s = state->opaque;
+
+     /* nothing to do if NULL, we didn't get far enough */
+    if (raw_s == NULL) {
+        return;
+    }
+
+    if (raw_s->fd >= 0) {
+        qemu_close(raw_s->fd);
+        raw_s->fd = -1;
+    }
+    g_free(state->opaque);
+    state->opaque = NULL;
+}
+
+
 /* XXX: use host sector size if necessary with:
 #ifdef DIOCGSECTORSIZE
         {
@@ -330,7 +448,7 @@ static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs,
      * boundary.  Check if this is the case or tell the low-level
      * driver that it needs to copy the buffer.
      */
-    if (s->aligned_buf) {
+    if ((bs->open_flags & BDRV_O_NOCACHE)) {
         if (!qiov_is_aligned(bs, qiov)) {
             type |= QEMU_AIO_MISALIGNED;
 #ifdef CONFIG_LINUX_AIO
@@ -378,8 +496,6 @@ static void raw_close(BlockDriverState *bs)
     if (s->fd >= 0) {
         qemu_close(s->fd);
         s->fd = -1;
-        if (s->aligned_buf != NULL)
-            qemu_vfree(s->aligned_buf);
     }
 }
 
@@ -735,6 +851,9 @@ static BlockDriver bdrv_file = {
     .instance_size = sizeof(BDRVRawState),
     .bdrv_probe = NULL, /* no probe for protocols */
     .bdrv_file_open = raw_open,
+    .bdrv_reopen_prepare = raw_reopen_prepare,
+    .bdrv_reopen_commit = raw_reopen_commit,
+    .bdrv_reopen_abort = raw_reopen_abort,
     .bdrv_close = raw_close,
     .bdrv_create = raw_create,
     .bdrv_co_discard = raw_co_discard,
diff --git a/block/raw-win32.c b/block/raw-win32.c
index c56bf8337..78c830648 100644
--- a/block/raw-win32.c
+++ b/block/raw-win32.c
@@ -77,6 +77,23 @@ static int set_sparse(int fd)
 				 NULL, 0, NULL, 0, &returned, NULL);
 }
 
+static void raw_parse_flags(int flags, int *access_flags, DWORD *overlapped)
+{
+    assert(access_flags != NULL);
+    assert(overlapped != NULL);
+
+    if (flags & BDRV_O_RDWR) {
+        *access_flags = GENERIC_READ | GENERIC_WRITE;
+    } else {
+        *access_flags = GENERIC_READ;
+    }
+
+    *overlapped = FILE_ATTRIBUTE_NORMAL;
+    if (flags & BDRV_O_NOCACHE) {
+        *overlapped |= FILE_FLAG_NO_BUFFERING;
+    }
+}
+
 static int raw_open(BlockDriverState *bs, const char *filename, int flags)
 {
     BDRVRawState *s = bs->opaque;
@@ -85,17 +102,8 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)
 
     s->type = FTYPE_FILE;
 
-    if (flags & BDRV_O_RDWR) {
-        access_flags = GENERIC_READ | GENERIC_WRITE;
-    } else {
-        access_flags = GENERIC_READ;
-    }
+    raw_parse_flags(flags, &access_flags, &overlapped);
 
-    overlapped = FILE_ATTRIBUTE_NORMAL;
-    if (flags & BDRV_O_NOCACHE)
-        overlapped |= FILE_FLAG_NO_BUFFERING;
-    if (!(flags & BDRV_O_CACHE_WB))
-        overlapped |= FILE_FLAG_WRITE_THROUGH;
     s->hfile = CreateFile(filename, access_flags,
                           FILE_SHARE_READ, NULL,
                           OPEN_EXISTING, overlapped, NULL);
@@ -374,18 +382,10 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
     }
     s->type = find_device_type(bs, filename);
 
-    if (flags & BDRV_O_RDWR) {
-        access_flags = GENERIC_READ | GENERIC_WRITE;
-    } else {
-        access_flags = GENERIC_READ;
-    }
+    raw_parse_flags(flags, &access_flags, &overlapped);
+
     create_flags = OPEN_EXISTING;
 
-    overlapped = FILE_ATTRIBUTE_NORMAL;
-    if (flags & BDRV_O_NOCACHE)
-        overlapped |= FILE_FLAG_NO_BUFFERING;
-    if (!(flags & BDRV_O_CACHE_WB))
-        overlapped |= FILE_FLAG_WRITE_THROUGH;
     s->hfile = CreateFile(filename, access_flags,
                           FILE_SHARE_READ, NULL,
                           create_flags, overlapped, NULL);
diff --git a/block/raw.c b/block/raw.c
index ff34ea41e..253e949b8 100644
--- a/block/raw.c
+++ b/block/raw.c
@@ -9,6 +9,14 @@ static int raw_open(BlockDriverState *bs, int flags)
     return 0;
 }
 
+/* We have nothing to do for raw reopen, stubs just return
+ * success */
+static int raw_reopen_prepare(BDRVReopenState *state,
+                              BlockReopenQueue *queue,  Error **errp)
+{
+    return 0;
+}
+
 static int coroutine_fn raw_co_readv(BlockDriverState *bs, int64_t sector_num,
                                      int nb_sectors, QEMUIOVector *qiov)
 {
@@ -115,6 +123,8 @@ static BlockDriver bdrv_raw = {
     .bdrv_open          = raw_open,
     .bdrv_close         = raw_close,
 
+    .bdrv_reopen_prepare  = raw_reopen_prepare,
+
     .bdrv_co_readv          = raw_co_readv,
     .bdrv_co_writev         = raw_co_writev,
     .bdrv_co_is_allocated   = raw_co_is_allocated,
diff --git a/block/rbd.c b/block/rbd.c
index 5a0f79fc8..015a9db0a 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -487,12 +487,6 @@ static int qemu_rbd_open(BlockDriverState *bs, const char *filename, int flags)
         rados_conf_set(s->cluster, "rbd_cache", "false");
     } else {
         rados_conf_set(s->cluster, "rbd_cache", "true");
-        if (!(flags & BDRV_O_CACHE_WB)) {
-            r = rados_conf_set(s->cluster, "rbd_cache_max_dirty", "0");
-            if (r < 0) {
-                rados_conf_set(s->cluster, "rbd_cache", "false");
-            }
-        }
     }
 
     if (strstr(conf, "conf=") == NULL) {
diff --git a/block/sheepdog.c b/block/sheepdog.c
index df4f44107..f35ff5bbe 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -866,14 +866,14 @@ static int parse_vdiname(BDRVSheepdogState *s, const char *filename,
         s->port = 0;
     }
 
-    strncpy(vdi, p, SD_MAX_VDI_LEN);
+    pstrcpy(vdi, SD_MAX_VDI_LEN, p);
 
     p = strchr(vdi, ':');
     if (p) {
         *p++ = '\0';
         *snapid = strtoul(p, NULL, 10);
         if (*snapid == 0) {
-            strncpy(tag, p, SD_MAX_VDI_TAG_LEN);
+            pstrcpy(tag, SD_MAX_VDI_TAG_LEN, p);
         }
     } else {
         *snapid = CURRENT_VDI_ID; /* search current vdi */
@@ -900,7 +900,10 @@ static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid,
         return fd;
     }
 
-    memset(buf, 0, sizeof(buf));
+    /* This pair of strncpy calls ensures that the buffer is zero-filled,
+     * which is desirable since we'll soon be sending those bytes, and
+     * don't want the send_req to read uninitialized data.
+     */
     strncpy(buf, filename, SD_MAX_VDI_LEN);
     strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
 
@@ -1114,14 +1117,12 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
         goto out;
     }
 
-    if (flags & BDRV_O_CACHE_WB) {
-        s->cache_enabled = 1;
-        s->flush_fd = connect_to_sdog(s->addr, s->port);
-        if (s->flush_fd < 0) {
-            error_report("failed to connect");
-            ret = s->flush_fd;
-            goto out;
-        }
+    s->cache_enabled = 1;
+    s->flush_fd = connect_to_sdog(s->addr, s->port);
+    if (s->flush_fd < 0) {
+        error_report("failed to connect");
+        ret = s->flush_fd;
+        goto out;
     }
 
     if (snapid || tag[0] != '\0') {
@@ -1151,7 +1152,7 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
     s->max_dirty_data_idx = 0;
 
     bs->total_sectors = s->inode.vdi_size / SECTOR_SIZE;
-    strncpy(s->name, vdi, sizeof(s->name));
+    pstrcpy(s->name, sizeof(s->name), vdi);
     qemu_co_mutex_init(&s->lock);
     g_free(buf);
     return 0;
@@ -1179,8 +1180,11 @@ static int do_sd_create(char *filename, int64_t vdi_size,
         return fd;
     }
 
+    /* FIXME: would it be better to fail (e.g., return -EIO) when filename
+     * does not fit in buf?  For now, just truncate and avoid buffer overrun.
+     */
     memset(buf, 0, sizeof(buf));
-    strncpy(buf, filename, SD_MAX_VDI_LEN);
+    pstrcpy(buf, sizeof(buf), filename);
 
     memset(&hdr, 0, sizeof(hdr));
     hdr.opcode = SD_OP_NEW_VDI;
@@ -1754,6 +1758,9 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
 
     s->inode.vm_state_size = sn_info->vm_state_size;
     s->inode.vm_clock_nsec = sn_info->vm_clock_nsec;
+    /* It appears that inode.tag does not require a NUL terminator,
+     * which means this use of strncpy is ok.
+     */
     strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag));
     /* we don't need to update entire object */
     datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
@@ -1813,13 +1820,13 @@ static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
 
     memcpy(old_s, s, sizeof(BDRVSheepdogState));
 
-    memset(vdi, 0, sizeof(vdi));
-    strncpy(vdi, s->name, sizeof(vdi));
+    pstrcpy(vdi, sizeof(vdi), s->name);
 
-    memset(tag, 0, sizeof(tag));
     snapid = strtoul(snapshot_id, NULL, 10);
-    if (!snapid) {
-        strncpy(tag, s->name, sizeof(tag));
+    if (snapid) {
+        tag[0] = 0;
+    } else {
+        pstrcpy(tag, sizeof(tag), s->name);
     }
 
     ret = find_vdi_name(s, vdi, snapid, tag, &vid, 1);
@@ -1948,8 +1955,9 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
 
             snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), "%u",
                      inode.snap_id);
-            strncpy(sn_tab[found].name, inode.tag,
-                    MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)));
+            pstrcpy(sn_tab[found].name,
+                    MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)),
+                    inode.tag);
             found++;
         }
     }
@@ -1986,7 +1994,7 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
         vdi_index = pos / SD_DATA_OBJ_SIZE;
         offset = pos % SD_DATA_OBJ_SIZE;
 
-        data_len = MIN(remaining, SD_DATA_OBJ_SIZE);
+        data_len = MIN(remaining, SD_DATA_OBJ_SIZE - offset);
 
         vmstate_oid = vid_to_vmstate_oid(s->inode.vdi_id, vdi_index);
 
@@ -2007,6 +2015,7 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data,
         }
 
         pos += data_len;
+        data += data_len;
         remaining -= data_len;
     }
     ret = size;
diff --git a/block/stream.c b/block/stream.c
index c4f87dd5b..792665276 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -13,6 +13,7 @@
 
 #include "trace.h"
 #include "block_int.h"
+#include "blockjob.h"
 #include "qemu/ratelimit.h"
 
 enum {
@@ -30,6 +31,7 @@ typedef struct StreamBlockJob {
     BlockJob common;
     RateLimit limit;
     BlockDriverState *base;
+    BlockdevOnError on_error;
     char backing_file_id[1024];
 } StreamBlockJob;
 
@@ -77,6 +79,7 @@ static void coroutine_fn stream_run(void *opaque)
     BlockDriverState *bs = s->common.bs;
     BlockDriverState *base = s->base;
     int64_t sector_num, end;
+    int error = 0;
     int ret = 0;
     int n = 0;
     void *buf;
@@ -141,7 +144,19 @@ wait:
             ret = stream_populate(bs, sector_num, n, buf);
         }
         if (ret < 0) {
-            break;
+            BlockErrorAction action =
+                block_job_error_action(&s->common, s->common.bs, s->on_error,
+                                       true, -ret);
+            if (action == BDRV_ACTION_STOP) {
+                n = 0;
+                continue;
+            }
+            if (error == 0) {
+                error = ret;
+            }
+            if (action == BDRV_ACTION_REPORT) {
+                break;
+            }
         }
         ret = 0;
 
@@ -153,6 +168,9 @@ wait:
         bdrv_disable_copy_on_read(bs);
     }
 
+    /* Do not remove the backing file if an error was there but ignored.  */
+    ret = error;
+
     if (!block_job_is_cancelled(&s->common) && sector_num == end && ret == 0) {
         const char *base_id = NULL, *base_fmt = NULL;
         if (base) {
@@ -188,11 +206,19 @@ static BlockJobType stream_job_type = {
 
 void stream_start(BlockDriverState *bs, BlockDriverState *base,
                   const char *base_id, int64_t speed,
+                  BlockdevOnError on_error,
                   BlockDriverCompletionFunc *cb,
                   void *opaque, Error **errp)
 {
     StreamBlockJob *s;
 
+    if ((on_error == BLOCKDEV_ON_ERROR_STOP ||
+         on_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
+        !bdrv_iostatus_is_enabled(bs)) {
+        error_set(errp, QERR_INVALID_PARAMETER, "on-error");
+        return;
+    }
+
     s = block_job_create(&stream_job_type, bs, speed, cb, opaque, errp);
     if (!s) {
         return;
@@ -203,6 +229,7 @@ void stream_start(BlockDriverState *bs, BlockDriverState *base,
         pstrcpy(s->backing_file_id, sizeof(s->backing_file_id), base_id);
     }
 
+    s->on_error = on_error;
     s->common.co = qemu_coroutine_create(stream_run);
     trace_stream_start(bs, base, s, s->common.co, opaque);
     qemu_coroutine_enter(s->common.co, s);
diff --git a/block/vdi.c b/block/vdi.c
index c4f1529db..f35b12ec9 100644
--- a/block/vdi.c
+++ b/block/vdi.c
@@ -454,6 +454,12 @@ static int vdi_open(BlockDriverState *bs, int flags)
     return -1;
 }
 
+static int vdi_reopen_prepare(BDRVReopenState *state,
+                              BlockReopenQueue *queue, Error **errp)
+{
+    return 0;
+}
+
 static int coroutine_fn vdi_co_is_allocated(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, int *pnum)
 {
@@ -628,7 +634,6 @@ static int vdi_create(const char *filename, QEMUOptionParameter *options)
     VdiHeader header;
     size_t i;
     size_t bmap_size;
-    uint32_t *bmap;
 
     logout("\n");
 
@@ -693,21 +698,21 @@ static int vdi_create(const char *filename, QEMUOptionParameter *options)
         result = -errno;
     }
 
-    bmap = NULL;
     if (bmap_size > 0) {
-        bmap = (uint32_t *)g_malloc0(bmap_size);
-    }
-    for (i = 0; i < blocks; i++) {
-        if (image_type == VDI_TYPE_STATIC) {
-            bmap[i] = i;
-        } else {
-            bmap[i] = VDI_UNALLOCATED;
+        uint32_t *bmap = g_malloc0(bmap_size);
+        for (i = 0; i < blocks; i++) {
+            if (image_type == VDI_TYPE_STATIC) {
+                bmap[i] = i;
+            } else {
+                bmap[i] = VDI_UNALLOCATED;
+            }
         }
+        if (write(fd, bmap, bmap_size) < 0) {
+            result = -errno;
+        }
+        g_free(bmap);
     }
-    if (write(fd, bmap, bmap_size) < 0) {
-        result = -errno;
-    }
-    g_free(bmap);
+
     if (image_type == VDI_TYPE_STATIC) {
         if (ftruncate(fd, sizeof(header) + bmap_size + blocks * block_size)) {
             result = -errno;
@@ -762,6 +767,7 @@ static BlockDriver bdrv_vdi = {
     .bdrv_probe = vdi_probe,
     .bdrv_open = vdi_open,
     .bdrv_close = vdi_close,
+    .bdrv_reopen_prepare = vdi_reopen_prepare,
     .bdrv_create = vdi_create,
     .bdrv_co_is_allocated = vdi_co_is_allocated,
     .bdrv_make_empty = vdi_make_empty,
diff --git a/block/vmdk.c b/block/vmdk.c
index bba4c61a7..1a80e5a24 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -300,6 +300,40 @@ static int vmdk_is_cid_valid(BlockDriverState *bs)
     return 1;
 }
 
+/* Queue extents, if any, for reopen() */
+static int vmdk_reopen_prepare(BDRVReopenState *state,
+                               BlockReopenQueue *queue, Error **errp)
+{
+    BDRVVmdkState *s;
+    int ret = -1;
+    int i;
+    VmdkExtent *e;
+
+    assert(state != NULL);
+    assert(state->bs != NULL);
+
+    if (queue == NULL) {
+        error_set(errp, ERROR_CLASS_GENERIC_ERROR,
+                 "No reopen queue for VMDK extents");
+        goto exit;
+    }
+
+    s = state->bs->opaque;
+
+    assert(s != NULL);
+
+    for (i = 0; i < s->num_extents; i++) {
+        e = &s->extents[i];
+        if (e->file != state->bs->file) {
+            bdrv_reopen_queue(queue, e->file, state->flags);
+        }
+    }
+    ret = 0;
+
+exit:
+    return ret;
+}
+
 static int vmdk_parent_open(BlockDriverState *bs)
 {
     char *p_name;
@@ -1374,8 +1408,7 @@ static int relative_path(char *dest, int dest_size,
         return -1;
     }
     if (path_is_absolute(target)) {
-        dest[dest_size - 1] = '\0';
-        strncpy(dest, target, dest_size - 1);
+        pstrcpy(dest, dest_size, target);
         return 0;
     }
     while (base[i] == target[i]) {
@@ -1646,6 +1679,7 @@ static BlockDriver bdrv_vmdk = {
     .instance_size  = sizeof(BDRVVmdkState),
     .bdrv_probe     = vmdk_probe,
     .bdrv_open      = vmdk_open,
+    .bdrv_reopen_prepare = vmdk_reopen_prepare,
     .bdrv_read      = vmdk_co_read,
     .bdrv_write     = vmdk_co_write,
     .bdrv_close     = vmdk_close,
diff --git a/block/vpc.c b/block/vpc.c
index c0b82c4f5..b6bf52f14 100644
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -265,6 +265,12 @@ static int vpc_open(BlockDriverState *bs, int flags)
     return err;
 }
 
+static int vpc_reopen_prepare(BDRVReopenState *state,
+                              BlockReopenQueue *queue, Error **errp)
+{
+    return 0;
+}
+
 /*
  * Returns the absolute byte offset of the given sector in the image file.
  * If the sector is not allocated, -1 is returned instead.
@@ -783,6 +789,7 @@ static BlockDriver bdrv_vpc = {
     .bdrv_probe     = vpc_probe,
     .bdrv_open      = vpc_open,
     .bdrv_close     = vpc_close,
+    .bdrv_reopen_prepare = vpc_reopen_prepare,
     .bdrv_create    = vpc_create,
 
     .bdrv_read              = vpc_co_read,