From b422f4194dc2589ecf35a48066c9c8043e2c0157 Mon Sep 17 00:00:00 2001
From: Bernhard Kohl <bernhard.kohl@nsn.com>
Date: Tue, 31 Aug 2010 11:22:29 +0200
Subject: scsi-disk: fix the mode data length field returned by the MODE SENSE
 command

The MODE DATA LENGTH field indicates the length in bytes of the following
data that is available to be transferred. The mode data length does not include
the number of bytes in the MODE DATA LENGTH field.

Signed-off-by: Bernhard Kohl <bernhard.kohl@nsn.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
(cherry picked from commit 78e70c30612833fd0017cfa5b519bc23df808927)
---
 hw/scsi-disk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index f43f2d097..57439f456 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -652,7 +652,7 @@ static int scsi_disk_emulate_mode_sense(SCSIRequest *req, uint8_t *outbuf)
     }
 
     buflen = p - outbuf;
-    outbuf[0] = buflen - 4;
+    outbuf[0] = buflen - 1;
     if (buflen > req->cmd.xfer)
         buflen = req->cmd.xfer;
     return buflen;
-- 
cgit v1.2.3


From 5105d99b7f93095b0a04084bcdee545c7aa4036e Mon Sep 17 00:00:00 2001
From: Bernhard Kohl <bernhard.kohl@nsn.com>
Date: Tue, 31 Aug 2010 14:08:23 +0200
Subject: scsi-disk: fix the mode data header returned by the MODE SENSE(10)
 command

The header for the  MODE SENSE(10) command is 8 bytes long.

Signed-off-by: Bernhard Kohl <bernhard.kohl@nsn.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
(cherry picked from commit ce512ee115b20bfc8a562d528a3f14eeff9ddf64)
---
 hw/scsi-disk.c | 38 ++++++++++++++++++++++++++++++++------
 1 file changed, 32 insertions(+), 6 deletions(-)

diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index 57439f456..22489b8d2 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -606,6 +606,7 @@ static int scsi_disk_emulate_mode_sense(SCSIRequest *req, uint8_t *outbuf)
     uint64_t nb_sectors;
     int page, dbd, buflen;
     uint8_t *p;
+    uint8_t dev_specific_param;
 
     dbd = req->cmd.buf[1]  & 0x8;
     page = req->cmd.buf[2] & 0x3f;
@@ -613,16 +614,31 @@ static int scsi_disk_emulate_mode_sense(SCSIRequest *req, uint8_t *outbuf)
     memset(outbuf, 0, req->cmd.xfer);
     p = outbuf;
 
-    p[1] = 0; /* Default media type.  */
-    p[3] = 0; /* Block descriptor length.  */
     if (bdrv_is_read_only(s->bs)) {
-        p[2] = 0x80; /* Readonly.  */
+        dev_specific_param = 0x80; /* Readonly.  */
+    } else {
+        dev_specific_param = 0x00;
+    }
+
+    if (req->cmd.buf[0] == MODE_SENSE) {
+        p[1] = 0; /* Default media type.  */
+        p[2] = dev_specific_param;
+        p[3] = 0; /* Block descriptor length.  */
+        p += 4;
+    } else { /* MODE_SENSE_10 */
+        p[2] = 0; /* Default media type.  */
+        p[3] = dev_specific_param;
+        p[6] = p[7] = 0; /* Block descriptor length.  */
+        p += 8;
     }
-    p += 4;
 
     bdrv_get_geometry(s->bs, &nb_sectors);
     if ((~dbd) & nb_sectors) {
-        outbuf[3] = 8; /* Block descriptor length  */
+        if (req->cmd.buf[0] == MODE_SENSE) {
+            outbuf[3] = 8; /* Block descriptor length  */
+        } else { /* MODE_SENSE_10 */
+            outbuf[7] = 8; /* Block descriptor length  */
+        }
         nb_sectors /= s->cluster_size;
         nb_sectors--;
         if (nb_sectors > 0xffffff)
@@ -652,7 +668,17 @@ static int scsi_disk_emulate_mode_sense(SCSIRequest *req, uint8_t *outbuf)
     }
 
     buflen = p - outbuf;
-    outbuf[0] = buflen - 1;
+    /*
+     * The mode data length field specifies the length in bytes of the
+     * following data that is available to be transferred. The mode data
+     * length does not include itself.
+     */
+    if (req->cmd.buf[0] == MODE_SENSE) {
+        outbuf[0] = buflen - 1;
+    } else { /* MODE_SENSE_10 */
+        outbuf[0] = ((buflen - 2) >> 8) & 0xff;
+        outbuf[1] = (buflen - 2) & 0xff;
+    }
     if (buflen > req->cmd.xfer)
         buflen = req->cmd.xfer;
     return buflen;
-- 
cgit v1.2.3


From 3bc5aa187f76aedec23deaf1ec50b0a53fa78d34 Mon Sep 17 00:00:00 2001
From: Bernhard Kohl <bernhard.kohl@nsn.com>
Date: Tue, 31 Aug 2010 14:08:24 +0200
Subject: scsi-disk: respect the page control (PC) field in the MODE SENSE
 command

The page control (PC) field defines the type of mode parameter values
to be returned in the mode pages:

PC=0 : Current values
PC=1 : Changeable values
PC=2 : Default values
PC=3 : Saved values

The current implementation always returns the same type of parameters.
This is OK for Current and Default values as we don't support changes
to be done by the MODE SELECT command.

For Saved values the following applies (implemented by this patch):
"A PC field value of 3h requests that the target return the saved
values of the mode parameters. Implementation of saved page parameters
is optional. Mode parameters not supported by the target shall be set
to zero. If saved values are not implemented, the command shall be
terminated with CHECK CONDITION status, the sense key set to
ILLEGAL REQUEST and the additional sense code set to
SAVING PARAMETERS NOT SUPPORTED."

For Changeable values the following applies (implemented by this patch):
"A PC field value of 1h requests that the target return a mask denoting
those mode parameters that are changeable. In the mask, the fields of
the mode parameters that are changeable shall be set to all one bits and
the fields of the mode parameters that are non-changeable (i.e. defined
by the target) shall be set to all zero bits."

In newer versions of the SCSI-2 spec the following clause was added.
"If the logical unit does not implement changeable parameters mode pages
and the device server receives a MODE SENSE command with 01b in the PC
field, then the command shall be terminated with CHECK CONDITION status,
with the sense key set to ILLEGAL REQUEST, and the additional sense code
set to INVALID FIELD IN CDB."

This was not yet included in the SCSI-2 Working Drafts from 1986-1993.
I assume that the variant to return CHECK CONDITION for PC=1 is not
widely implemented by real devices. I have a legacy OS which fails,
if MODE_SENSE returns non GOOD for PC=1. So for highest compatibility I
implemented the former variant with this patch.

The last Working Draft X3T9.2 Rev. 10L 7-SEP-93 can be found here:
http://ldkelley.com/SCSI2/SCSI2/SCSI2-08.html#8.2.10

In mode_sense_page() this patch also avoids multiple hard coded
definitions of the same mode page length. Instead I use the varable
p[1]. In fact the returned length of the mode pages 4 and 5 were wrong
(2 bytes less).

Signed-off-by: Bernhard Kohl <bernhard.kohl@nsn.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
(cherry picked from commit 282ab04eb1e6f4faa6c5d2827e3209c4a1eec40e)
---
 hw/scsi-disk.c | 45 +++++++++++++++++++++++++++++++++++----------
 1 file changed, 35 insertions(+), 10 deletions(-)

diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index 22489b8d2..384d19f11 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -485,16 +485,26 @@ static int scsi_disk_emulate_inquiry(SCSIRequest *req, uint8_t *outbuf)
     return buflen;
 }
 
-static int mode_sense_page(SCSIRequest *req, int page, uint8_t *p)
+static int mode_sense_page(SCSIRequest *req, int page, uint8_t *p,
+                           int page_control)
 {
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, req->dev);
     BlockDriverState *bdrv = s->bs;
     int cylinders, heads, secs;
 
+    /*
+     * If Changeable Values are requested, a mask denoting those mode parameters
+     * that are changeable shall be returned. As we currently don't support
+     * parameter changes via MODE_SELECT all bits are returned set to zero.
+     * The buffer was already menset to zero by the caller of this function.
+     */
     switch (page) {
     case 4: /* Rigid disk device geometry page. */
         p[0] = 4;
         p[1] = 0x16;
+        if (page_control == 1) { /* Changeable Values */
+            return p[1] + 2;
+        }
         /* if a geometry hint is available, use it */
         bdrv_get_geometry_hint(bdrv, &cylinders, &heads, &secs);
         p[2] = (cylinders >> 16) & 0xff;
@@ -519,11 +529,14 @@ static int mode_sense_page(SCSIRequest *req, int page, uint8_t *p)
         /* Medium rotation rate [rpm], 5400 rpm */
         p[20] = (5400 >> 8) & 0xff;
         p[21] = 5400 & 0xff;
-        return 0x16;
+        return p[1] + 2;
 
     case 5: /* Flexible disk device geometry page. */
         p[0] = 5;
         p[1] = 0x1e;
+        if (page_control == 1) { /* Changeable Values */
+            return p[1] + 2;
+        }
         /* Transfer rate [kbit/s], 5Mbit/s */
         p[2] = 5000 >> 8;
         p[3] = 5000 & 0xff;
@@ -555,21 +568,27 @@ static int mode_sense_page(SCSIRequest *req, int page, uint8_t *p)
         /* Medium rotation rate [rpm], 5400 rpm */
         p[28] = (5400 >> 8) & 0xff;
         p[29] = 5400 & 0xff;
-        return 0x1e;
+        return p[1] + 2;
 
     case 8: /* Caching page.  */
         p[0] = 8;
         p[1] = 0x12;
+        if (page_control == 1) { /* Changeable Values */
+            return p[1] + 2;
+        }
         if (bdrv_enable_write_cache(s->bs)) {
             p[2] = 4; /* WCE */
         }
-        return 20;
+        return p[1] + 2;
 
     case 0x2a: /* CD Capabilities and Mechanical Status page. */
         if (bdrv_get_type_hint(bdrv) != BDRV_TYPE_CDROM)
             return 0;
         p[0] = 0x2a;
         p[1] = 0x14;
+        if (page_control == 1) { /* Changeable Values */
+            return p[1] + 2;
+        }
         p[2] = 3; // CD-R & CD-RW read
         p[3] = 0; // Writing not supported
         p[4] = 0x7f; /* Audio, composite, digital out,
@@ -593,7 +612,7 @@ static int mode_sense_page(SCSIRequest *req, int page, uint8_t *p)
         p[19] = (16 * 176) & 0xff;
         p[20] = (16 * 176) >> 8; // 16x write speed current
         p[21] = (16 * 176) & 0xff;
-        return 22;
+        return p[1] + 2;
 
     default:
         return 0;
@@ -604,13 +623,15 @@ static int scsi_disk_emulate_mode_sense(SCSIRequest *req, uint8_t *outbuf)
 {
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, req->dev);
     uint64_t nb_sectors;
-    int page, dbd, buflen;
+    int page, dbd, buflen, page_control;
     uint8_t *p;
     uint8_t dev_specific_param;
 
     dbd = req->cmd.buf[1]  & 0x8;
     page = req->cmd.buf[2] & 0x3f;
-    DPRINTF("Mode Sense (page %d, len %zd)\n", page, req->cmd.xfer);
+    page_control = (req->cmd.buf[2] & 0xc0) >> 6;
+    DPRINTF("Mode Sense(%d) (page %d, len %d, page_control %d)\n",
+        (req->cmd.buf[0] == MODE_SENSE) ? 6 : 10, page, len, page_control);
     memset(outbuf, 0, req->cmd.xfer);
     p = outbuf;
 
@@ -654,16 +675,20 @@ static int scsi_disk_emulate_mode_sense(SCSIRequest *req, uint8_t *outbuf)
         p += 8;
     }
 
+    if (page_control == 3) { /* Saved Values */
+        return -1; /* ILLEGAL_REQUEST */
+    }
+
     switch (page) {
     case 0x04:
     case 0x05:
     case 0x08:
     case 0x2a:
-        p += mode_sense_page(req, page, p);
+        p += mode_sense_page(req, page, p, page_control);
         break;
     case 0x3f:
-        p += mode_sense_page(req, 0x08, p);
-        p += mode_sense_page(req, 0x2a, p);
+        p += mode_sense_page(req, 0x08, p, page_control);
+        p += mode_sense_page(req, 0x2a, p, page_control);
         break;
     }
 
-- 
cgit v1.2.3


From 5aa0e6cb569a4eef1be0073eff3c315a3c7bf049 Mon Sep 17 00:00:00 2001
From: Bernhard Kohl <bernhard.kohl@nsn.com>
Date: Tue, 31 Aug 2010 14:08:25 +0200
Subject: scsi-disk: fix the block descriptor returned by the MODE SENSE
 command

The block descriptor contains the number of blocks, not the highest LBA.
Real hard disks return 0 if the number of blocks exceed the maximum 0xFFFFFF.

SCSI-Spec:
http://ldkelley.com/SCSI2/SCSI2/SCSI2-08.html#8.3.3
"The number of blocks field specifies the number of logical blocks on the
medium to which the density code and block length fields apply. A value
of zero indicates that all of the remaining logical blocks of the logical
unit shall have the medium characteristics specified."

Signed-off-by: Bernhard Kohl <bernhard.kohl@nsn.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
(cherry picked from commit 2488b74081650a5312fe1515660b6cb095244c34)
---
 hw/scsi-disk.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index 384d19f11..2a107b1bf 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -661,9 +661,8 @@ static int scsi_disk_emulate_mode_sense(SCSIRequest *req, uint8_t *outbuf)
             outbuf[7] = 8; /* Block descriptor length  */
         }
         nb_sectors /= s->cluster_size;
-        nb_sectors--;
         if (nb_sectors > 0xffffff)
-            nb_sectors = 0xffffff;
+            nb_sectors = 0;
         p[0] = 0; /* media density code */
         p[1] = (nb_sectors >> 16) & 0xff;
         p[2] = (nb_sectors >> 8) & 0xff;
-- 
cgit v1.2.3


From d65741acf42daffa422a2c3e77465df49e550d49 Mon Sep 17 00:00:00 2001
From: Bernhard Kohl <bernhard.kohl@nsn.com>
Date: Tue, 31 Aug 2010 14:08:26 +0200
Subject: scsi-disk: return CHECK CONDITION for unknown page codes in the MODE
 SENSE command

SCSI-Spec:
http://ldkelley.com/SCSI2/SCSI2/SCSI2-08.html#8.2.10
"An initiator may request any one or all of the supported mode pages
from a target. If an initiator issues a MODE SENSE command with a
page code value not implemented by the target, the target shall return
CHECK CONDITION status and shall set the sense key to ILLEGAL REQUEST
and the additional sense code to INVALID FIELD IN CDB."

Signed-off-by: Bernhard Kohl <bernhard.kohl@nsn.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
(cherry picked from commit a9c17b2bf3639662fbdeb736289ebabfda9fa21a)
---
 hw/scsi-disk.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index 2a107b1bf..70c7a0819 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -689,6 +689,8 @@ static int scsi_disk_emulate_mode_sense(SCSIRequest *req, uint8_t *outbuf)
         p += mode_sense_page(req, 0x08, p, page_control);
         p += mode_sense_page(req, 0x2a, p, page_control);
         break;
+    default:
+        return -1; /* ILLEGAL_REQUEST */
     }
 
     buflen = p - outbuf;
-- 
cgit v1.2.3


From e632519ab82a5f9310427e7756dba880c86386d0 Mon Sep 17 00:00:00 2001
From: Bernhard Kohl <bernhard.kohl@nsn.com>
Date: Tue, 31 Aug 2010 14:08:27 +0200
Subject: scsi-disk: fix the check of the DBD bit in the MODE SENSE command

The DBD bit does not work as expected.

SCSI-Spec:
http://ldkelley.com/SCSI2/SCSI2/SCSI2-08.html#8.2.10
"A disable block descriptors (DBD) bit of zero indicates that the target
may return zero or more block descriptors in the returned MODE SENSE
data (see 8.3.3), at the target's discretion. A DBD bit of one
specifies that the target shall not return any block descriptors in the
returned MODE SENSE data."

Signed-off-by: Bernhard Kohl <bernhard.kohl@nsn.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
(cherry picked from commit 333d50fe3d9a1ff0a6a1a44ef42a0d3a2a7f2abe)
---
 hw/scsi-disk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index 70c7a0819..b64dd3146 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -654,7 +654,7 @@ static int scsi_disk_emulate_mode_sense(SCSIRequest *req, uint8_t *outbuf)
     }
 
     bdrv_get_geometry(s->bs, &nb_sectors);
-    if ((~dbd) & nb_sectors) {
+    if (!dbd && nb_sectors) {
         if (req->cmd.buf[0] == MODE_SENSE) {
             outbuf[3] = 8; /* Block descriptor length  */
         } else { /* MODE_SENSE_10 */
-- 
cgit v1.2.3


From 375d40709e55e132f7fe9d83acd4aa6e30d43971 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Thu, 2 Sep 2010 17:48:32 +0200
Subject: raw-posix: Don't use file name for host_cdrom detection on Linux

On Linux, we have code to detect CD-ROMs using an ioctl. We shouldn't lose
anything but false positives by removing the check for a /dev/cd* path.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
(cherry picked from commit 897804d6299af372a43110799cbe1d6804d5e1bc)
---
 block/raw-posix.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/block/raw-posix.c b/block/raw-posix.c
index 72fb8cebd..0fce44909 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -1154,9 +1154,6 @@ static int cdrom_probe_device(const char *filename)
     int fd, ret;
     int prio = 0;
 
-    if (strstart(filename, "/dev/cd", NULL))
-        prio = 50;
-
     fd = open(filename, O_RDONLY | O_NONBLOCK);
     if (fd < 0) {
         goto out;
-- 
cgit v1.2.3


From 78b6890828d08d4306e313e3f550d1b4b61685c8 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 31 Aug 2010 13:44:25 +0200
Subject: qemu-img convert: Use cache=unsafe for output image

If qemu-img crashes during the conversion, the user will throw away the broken
output file anyway and start over. So no need to be too cautious.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
(cherry picked from commit 1bd8e175580a87c7b9e6791faca7626f9bc3ceeb)
---
 qemu-img.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/qemu-img.c b/qemu-img.c
index d2a978b91..4e035e44c 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -783,7 +783,8 @@ static int img_convert(int argc, char **argv)
         goto out;
     }
 
-    out_bs = bdrv_new_open(out_filename, out_fmt, BDRV_O_FLAGS | BDRV_O_RDWR);
+    out_bs = bdrv_new_open(out_filename, out_fmt,
+        BDRV_O_FLAGS | BDRV_O_RDWR | BDRV_O_NO_FLUSH);
     if (!out_bs) {
         ret = -1;
         goto out;
-- 
cgit v1.2.3


From 5a0d460c350c58362651458824618cf6b3de3710 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Tue, 31 Aug 2010 15:08:03 +0200
Subject: block: Fix BDRV_O_CACHE_MASK

BDRV_O_CACHE_MASK should have been extended when cache=unsafe introduced a new
flag BDRV_O_NO_FLUSH. There are currently no users that would change their
behaviour because of this, but let's clean it up before things break.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
(cherry picked from commit ceb25e5c7554255931f7f5647d3ad9df36111f53)
---
 block.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block.h b/block.h
index db131a343..5f6438010 100644
--- a/block.h
+++ b/block.h
@@ -35,7 +35,7 @@ typedef struct QEMUSnapshotInfo {
 #define BDRV_O_NO_BACKING  0x0100 /* don't open the backing file */
 #define BDRV_O_NO_FLUSH    0x0200 /* disable flushing on this disk */
 
-#define BDRV_O_CACHE_MASK  (BDRV_O_NOCACHE | BDRV_O_CACHE_WB)
+#define BDRV_O_CACHE_MASK  (BDRV_O_NOCACHE | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH)
 
 #define BDRV_SECTOR_BITS   9
 #define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
-- 
cgit v1.2.3


From 2c25b8131631c39145fe1f5b7745943e0e0c0406 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 1 Sep 2010 12:40:52 +0200
Subject: qcow2: Remove unnecessary flush after L2 write

When a new cluster was allocated, we only need a flush after the write to the
L2 table if it was a COW and we need to decrease the refcounts of the old
clusters.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
(cherry picked from commit 7ec5e6a4ca43494949465f9f9f3d9e4c7c620503)
---
 block/qcow2-cluster.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 166922f8b..f562b1602 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -655,7 +655,7 @@ static int write_l2_entries(BlockDriverState *bs, uint64_t *l2_table,
     int ret;
 
     BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE);
-    ret = bdrv_pwrite_sync(bs->file, l2_offset + start_offset,
+    ret = bdrv_pwrite(bs->file, l2_offset + start_offset,
         &l2_table[l2_start_index], len);
     if (ret < 0) {
         return ret;
@@ -718,9 +718,17 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
         goto err;
     }
 
-    for (i = 0; i < j; i++)
-        qcow2_free_any_clusters(bs,
-            be64_to_cpu(old_cluster[i]) & ~QCOW_OFLAG_COPIED, 1);
+    /*
+     * If this was a COW, we need to decrease the refcount of the old cluster.
+     * Also flush bs->file to get the right order for L2 and refcount update.
+     */
+    if (j != 0) {
+        bdrv_flush(bs->file);
+        for (i = 0; i < j; i++) {
+            qcow2_free_any_clusters(bs,
+                be64_to_cpu(old_cluster[i]) & ~QCOW_OFLAG_COPIED, 1);
+        }
+    }
 
     ret = 0;
 err:
-- 
cgit v1.2.3


From 1b191088aed81f1b3661b1554f98a038872a9423 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <mail@kevin-wolf.de>
Date: Fri, 10 Sep 2010 12:27:02 +0200
Subject: vvfat: Fix segfault on write to read-only disk

vvfat tries to set the readonly flag in its open function, but nowadays
this is overwritted with the readonly=... command line option. Check in
bdrv_write if the vvfat was opened read-only and return an error in this
case.

Without this check, vvfat tries to access the qcow bs, which is NULL
without enabled write support.

Signed-off-by: Kevin Wolf <mail@kevin-wolf.de>
(cherry picked from commit bfd0049440f53745d31eb93c208f0f3ab6308027)
---
 block/vvfat.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/block/vvfat.c b/block/vvfat.c
index 365332aa2..5898d664b 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -2665,6 +2665,11 @@ static int vvfat_write(BlockDriverState *bs, int64_t sector_num,
 
 DLOG(checkpoint());
 
+    /* Check if we're operating in read-only mode */
+    if (s->qcow == NULL) {
+        return -EACCES;
+    }
+
     vvfat_close_current_file(s);
 
     /*
-- 
cgit v1.2.3


From 345a6d2b54f0c3705a088e626d24e9376ff4f233 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <mail@kevin-wolf.de>
Date: Fri, 10 Sep 2010 12:27:03 +0200
Subject: vvfat: Fix double free for opening the image rw

Allocation and deallocation of bs->opaque is not in the control of a
block driver. Therefore it should not set bs->opaque to a data structure
used by another bs, or closing the image will lead to a double free.

Signed-off-by: Kevin Wolf <mail@kevin-wolf.de>
(cherry picked from commit 0af1e52e93bf5da63b15f1f9596dd4c076da07dc)
---
 block/vvfat.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/block/vvfat.c b/block/vvfat.c
index 5898d664b..07720371c 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -2768,12 +2768,12 @@ static int vvfat_is_allocated(BlockDriverState *bs,
 
 static int write_target_commit(BlockDriverState *bs, int64_t sector_num,
 	const uint8_t* buffer, int nb_sectors) {
-    BDRVVVFATState* s = bs->opaque;
+    BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque);
     return try_commit(s);
 }
 
 static void write_target_close(BlockDriverState *bs) {
-    BDRVVVFATState* s = bs->opaque;
+    BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque);
     bdrv_delete(s->qcow);
     free(s->qcow_filename);
 }
@@ -2816,7 +2816,8 @@ static int enable_write_target(BDRVVVFATState *s)
 
     s->bs->backing_hd = calloc(sizeof(BlockDriverState), 1);
     s->bs->backing_hd->drv = &vvfat_write_target;
-    s->bs->backing_hd->opaque = s;
+    s->bs->backing_hd->opaque = qemu_malloc(sizeof(void*));
+    *(void**)s->bs->backing_hd->opaque = s;
 
     return 0;
 }
-- 
cgit v1.2.3


From a3c4a01fb25a93d2b76d1fde1958ee616c152c5b Mon Sep 17 00:00:00 2001
From: Kevin Wolf <mail@kevin-wolf.de>
Date: Fri, 10 Sep 2010 12:27:04 +0200
Subject: vvfat: Use cache=unsafe

The qcow file used for write support in vvfat is a temporary file,
so we can use cache=unsafe there. Without this, write support is just
too slow to be of any use.

Signed-off-by: Kevin Wolf <mail@kevin-wolf.de>
(cherry picked from commit 35ccd8aed64727dbefa1b274a8000b46318bfea1)
---
 block/vvfat.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/block/vvfat.c b/block/vvfat.c
index 07720371c..53e57bf22 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -2788,6 +2788,7 @@ static int enable_write_target(BDRVVVFATState *s)
 {
     BlockDriver *bdrv_qcow;
     QEMUOptionParameter *options;
+    int ret;
     int size = sector2cluster(s, s->sector_count);
     s->used_clusters = calloc(size, 1);
 
@@ -2803,11 +2804,16 @@ static int enable_write_target(BDRVVVFATState *s)
 
     if (bdrv_create(bdrv_qcow, s->qcow_filename, options) < 0)
 	return -1;
+
     s->qcow = bdrv_new("");
-    if (s->qcow == NULL ||
-        bdrv_open(s->qcow, s->qcow_filename, BDRV_O_RDWR, bdrv_qcow) < 0)
-    {
-	return -1;
+    if (s->qcow == NULL) {
+        return -1;
+    }
+
+    ret = bdrv_open(s->qcow, s->qcow_filename,
+            BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, bdrv_qcow);
+    if (ret < 0) {
+	return ret;
     }
 
 #ifndef _WIN32
-- 
cgit v1.2.3


From 5b500f974ba1701436ec0e4743c802c0651a3f4f Mon Sep 17 00:00:00 2001
From: Cam Macdonell <cam@cs.ualberta.ca>
Date: Mon, 26 Jul 2010 18:10:57 -0600
Subject: Add qemu_ram_alloc_from_ptr function

Provide a function to add an allocated region of memory to the qemu RAM.

This patch is copied from Marcelo's qemu_ram_map() in qemu-kvm and given the
clearer name qemu_ram_alloc_from_ptr().

Signed-off-by: Cam Macdonell <cam@cs.ualberta.ca>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
(cherry picked from commit 84b89d782f03b99770759f1d9d6e4e95a2641c35)
---
 cpu-common.h |  2 ++
 exec.c       | 43 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

diff --git a/cpu-common.h b/cpu-common.h
index 71e7933c5..0426bc8e2 100644
--- a/cpu-common.h
+++ b/cpu-common.h
@@ -40,6 +40,8 @@ static inline void cpu_register_physical_memory(target_phys_addr_t start_addr,
 }
 
 ram_addr_t cpu_get_physical_page_desc(target_phys_addr_t addr);
+ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name,
+                        ram_addr_t size, void *host);
 ram_addr_t qemu_ram_alloc(DeviceState *dev, const char *name, ram_addr_t size);
 void qemu_ram_free(ram_addr_t addr);
 /* This should only be used for ram local to a device.  */
diff --git a/exec.c b/exec.c
index 868cd7fff..8636316b0 100644
--- a/exec.c
+++ b/exec.c
@@ -2808,6 +2808,49 @@ static ram_addr_t last_ram_offset(void)
     return last;
 }
 
+ram_addr_t qemu_ram_alloc_from_ptr(DeviceState *dev, const char *name,
+                        ram_addr_t size, void *host)
+{
+    RAMBlock *new_block, *block;
+
+    size = TARGET_PAGE_ALIGN(size);
+    new_block = qemu_mallocz(sizeof(*new_block));
+
+    if (dev && dev->parent_bus && dev->parent_bus->info->get_dev_path) {
+        char *id = dev->parent_bus->info->get_dev_path(dev);
+        if (id) {
+            snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
+            qemu_free(id);
+        }
+    }
+    pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
+
+    QLIST_FOREACH(block, &ram_list.blocks, next) {
+        if (!strcmp(block->idstr, new_block->idstr)) {
+            fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
+                    new_block->idstr);
+            abort();
+        }
+    }
+
+    new_block->host = host;
+
+    new_block->offset = find_ram_offset(size);
+    new_block->length = size;
+
+    QLIST_INSERT_HEAD(&ram_list.blocks, new_block, next);
+
+    ram_list.phys_dirty = qemu_realloc(ram_list.phys_dirty,
+                                       last_ram_offset() >> TARGET_PAGE_BITS);
+    memset(ram_list.phys_dirty + (new_block->offset >> TARGET_PAGE_BITS),
+           0xff, size >> TARGET_PAGE_BITS);
+
+    if (kvm_enabled())
+        kvm_setup_guest_memory(new_block->host, size);
+
+    return new_block->offset;
+}
+
 ram_addr_t qemu_ram_alloc(DeviceState *dev, const char *name, ram_addr_t size)
 {
     RAMBlock *new_block, *block;
-- 
cgit v1.2.3


From 6f8d14beb2d816ac719a801884bf0984b9564025 Mon Sep 17 00:00:00 2001
From: Cam Macdonell <cam@cs.ualberta.ca>
Date: Mon, 26 Jul 2010 18:10:58 -0600
Subject: Device specification for shared memory PCI device

Signed-off-by: Cam Macdonell <cam@cs.ualberta.ca>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
(cherry picked from commit b6828931ebac027b869e40ec9518a291078dafe5)
---
 docs/specs/ivshmem_device_spec.txt | 96 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 96 insertions(+)
 create mode 100644 docs/specs/ivshmem_device_spec.txt

diff --git a/docs/specs/ivshmem_device_spec.txt b/docs/specs/ivshmem_device_spec.txt
new file mode 100644
index 000000000..23dd2ba89
--- /dev/null
+++ b/docs/specs/ivshmem_device_spec.txt
@@ -0,0 +1,96 @@
+
+Device Specification for Inter-VM shared memory device
+------------------------------------------------------
+
+The Inter-VM shared memory device is designed to share a region of memory to
+userspace in multiple virtual guests.  The memory region does not belong to any
+guest, but is a POSIX memory object on the host.  Optionally, the device may
+support sending interrupts to other guests sharing the same memory region.
+
+
+The Inter-VM PCI device
+-----------------------
+
+*BARs*
+
+The device supports three BARs.  BAR0 is a 1 Kbyte MMIO region to support
+registers.  BAR1 is used for MSI-X when it is enabled in the device.  BAR2 is
+used to map the shared memory object from the host.  The size of BAR2 is
+specified when the guest is started and must be a power of 2 in size.
+
+*Registers*
+
+The device currently supports 4 registers of 32-bits each.  Registers
+are used for synchronization between guests sharing the same memory object when
+interrupts are supported (this requires using the shared memory server).
+
+The server assigns each VM an ID number and sends this ID number to the Qemu
+process when the guest starts.
+
+enum ivshmem_registers {
+    IntrMask = 0,
+    IntrStatus = 4,
+    IVPosition = 8,
+    Doorbell = 12
+};
+
+The first two registers are the interrupt mask and status registers.  Mask and
+status are only used with pin-based interrupts.  They are unused with MSI
+interrupts.
+
+Status Register: The status register is set to 1 when an interrupt occurs.
+
+Mask Register: The mask register is bitwise ANDed with the interrupt status
+and the result will raise an interrupt if it is non-zero.  However, since 1 is
+the only value the status will be set to, it is only the first bit of the mask
+that has any effect.  Therefore interrupts can be masked by setting the first
+bit to 0 and unmasked by setting the first bit to 1.
+
+IVPosition Register: The IVPosition register is read-only and reports the
+guest's ID number.  The guest IDs are non-negative integers.  When using the
+server, since the server is a separate process, the VM ID will only be set when
+the device is ready (shared memory is received from the server and accessible via
+the device).  If the device is not ready, the IVPosition will return -1.
+Applications should ensure that they have a valid VM ID before accessing the
+shared memory.
+
+Doorbell Register:  To interrupt another guest, a guest must write to the
+Doorbell register.  The doorbell register is 32-bits, logically divided into
+two 16-bit fields.  The high 16-bits are the guest ID to interrupt and the low
+16-bits are the interrupt vector to trigger.  The semantics of the value
+written to the doorbell depends on whether the device is using MSI or a regular
+pin-based interrupt.  In short, MSI uses vectors while regular interrupts set the
+status register.
+
+Regular Interrupts
+
+If regular interrupts are used (due to either a guest not supporting MSI or the
+user specifying not to use them on startup) then the value written to the lower
+16-bits of the Doorbell register results is arbitrary and will trigger an
+interrupt in the destination guest.
+
+Message Signalled Interrupts
+
+A ivshmem device may support multiple MSI vectors.  If so, the lower 16-bits
+written to the Doorbell register must be between 0 and the maximum number of
+vectors the guest supports.  The lower 16 bits written to the doorbell is the
+MSI vector that will be raised in the destination guest.  The number of MSI
+vectors is configurable but it is set when the VM is started.
+
+The important thing to remember with MSI is that it is only a signal, no status
+is set (since MSI interrupts are not shared).  All information other than the
+interrupt itself should be communicated via the shared memory region.  Devices
+supporting multiple MSI vectors can use different vectors to indicate different
+events have occurred.  The semantics of interrupt vectors are left to the
+user's discretion.
+
+
+Usage in the Guest
+------------------
+
+The shared memory device is intended to be used with the provided UIO driver.
+Very little configuration is needed.  The guest should map BAR0 to access the
+registers (an array of 32-bit ints allows simple writing) and map BAR2 to
+access the shared memory region itself.  The size of the shared memory region
+is specified when the guest (or shared memory server) is started.  A guest may
+map the whole shared memory region or only part of it.
-- 
cgit v1.2.3


From 089c6725207b66d6f9fe52eb6a7e9befe135da28 Mon Sep 17 00:00:00 2001
From: Cam Macdonell <cam@cs.ualberta.ca>
Date: Mon, 26 Jul 2010 18:10:59 -0600
Subject: Add function to assign ioeventfd to MMIO.

Signed-off-by: Cam Macdonell <cam@cs.ualberta.ca>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
(cherry picked from commit 44f1a3d8765b19ee88ca493e8e13cefb6ee50cbe)
---
 kvm-all.c | 32 ++++++++++++++++++++++++++++++++
 kvm.h     |  1 +
 2 files changed, 33 insertions(+)

diff --git a/kvm-all.c b/kvm-all.c
index 7635f2f89..d9a5dd05a 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -1241,6 +1241,38 @@ int kvm_set_signal_mask(CPUState *env, const sigset_t *sigset)
     return r;
 }
 
+int kvm_set_ioeventfd_mmio_long(int fd, uint32_t addr, uint32_t val, bool assign)
+{
+#ifdef KVM_IOEVENTFD
+    int ret;
+    struct kvm_ioeventfd iofd;
+
+    iofd.datamatch = val;
+    iofd.addr = addr;
+    iofd.len = 4;
+    iofd.flags = KVM_IOEVENTFD_FLAG_DATAMATCH;
+    iofd.fd = fd;
+
+    if (!kvm_enabled()) {
+        return -ENOSYS;
+    }
+
+    if (!assign) {
+        iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
+    }
+
+    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);
+
+    if (ret < 0) {
+        return -errno;
+    }
+
+    return 0;
+#else
+    return -ENOSYS;
+#endif
+}
+
 int kvm_set_ioeventfd_pio_word(int fd, uint16_t addr, uint16_t val, bool assign)
 {
 #ifdef KVM_IOEVENTFD
diff --git a/kvm.h b/kvm.h
index 93f81871e..50b6c01ec 100644
--- a/kvm.h
+++ b/kvm.h
@@ -175,6 +175,7 @@ static inline void cpu_synchronize_post_init(CPUState *env)
 }
 
 #endif
+int kvm_set_ioeventfd_mmio_long(int fd, uint32_t adr, uint32_t val, bool assign);
 
 int kvm_set_ioeventfd_pio_word(int fd, uint16_t adr, uint16_t val, bool assign);
 #endif
-- 
cgit v1.2.3


From 9367dbbe6d4169ae58f356166329f9a42d88d624 Mon Sep 17 00:00:00 2001
From: Cam Macdonell <cam@cs.ualberta.ca>
Date: Mon, 26 Jul 2010 18:11:00 -0600
Subject: Support marking a device as non-migratable

A non-migratable device should be removed before migration and re-added after.

Signed-off-by: Cam Macdonell <cam@cs.ualberta.ca>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
(cherry picked from commit 2431296806bc7a40c29b7775e16f36dc1cda4d06)
---
 hw/hw.h  |  2 ++
 savevm.c | 44 +++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/hw/hw.h b/hw/hw.h
index e3c3db27f..4405092b5 100644
--- a/hw/hw.h
+++ b/hw/hw.h
@@ -264,6 +264,8 @@ int register_savevm_live(DeviceState *dev,
                          void *opaque);
 
 void unregister_savevm(DeviceState *dev, const char *idstr, void *opaque);
+void register_device_unmigratable(DeviceState *dev, const char *idstr,
+                                                                void *opaque);
 
 typedef void QEMUResetHandler(void *opaque);
 
diff --git a/savevm.c b/savevm.c
index 4c0e5d3b1..3b00a8458 100644
--- a/savevm.c
+++ b/savevm.c
@@ -1018,6 +1018,7 @@ typedef struct SaveStateEntry {
     const VMStateDescription *vmsd;
     void *opaque;
     CompatEntry *compat;
+    int no_migrate;
 } SaveStateEntry;
 
 
@@ -1081,6 +1082,7 @@ int register_savevm_live(DeviceState *dev,
     se->load_state = load_state;
     se->opaque = opaque;
     se->vmsd = NULL;
+    se->no_migrate = 0;
 
     if (dev && dev->parent_bus && dev->parent_bus->info->get_dev_path) {
         char *id = dev->parent_bus->info->get_dev_path(dev);
@@ -1147,6 +1149,31 @@ void unregister_savevm(DeviceState *dev, const char *idstr, void *opaque)
     }
 }
 
+/* mark a device as not to be migrated, that is the device should be
+   unplugged before migration */
+void register_device_unmigratable(DeviceState *dev, const char *idstr,
+                                                            void *opaque)
+{
+    SaveStateEntry *se;
+    char id[256] = "";
+
+    if (dev && dev->parent_bus && dev->parent_bus->info->get_dev_path) {
+        char *path = dev->parent_bus->info->get_dev_path(dev);
+        if (path) {
+            pstrcpy(id, sizeof(id), path);
+            pstrcat(id, sizeof(id), "/");
+            qemu_free(path);
+        }
+    }
+    pstrcat(id, sizeof(id), idstr);
+
+    QTAILQ_FOREACH(se, &savevm_handlers, entry) {
+        if (strcmp(se->idstr, id) == 0 && se->opaque == opaque) {
+            se->no_migrate = 1;
+        }
+    }
+}
+
 int vmstate_register_with_alias_id(DeviceState *dev, int instance_id,
                                    const VMStateDescription *vmsd,
                                    void *opaque, int alias_id,
@@ -1353,13 +1380,19 @@ static int vmstate_load(QEMUFile *f, SaveStateEntry *se, int version_id)
     return vmstate_load_state(f, se->vmsd, se->opaque, version_id);
 }
 
-static void vmstate_save(QEMUFile *f, SaveStateEntry *se)
+static int vmstate_save(QEMUFile *f, SaveStateEntry *se)
 {
+    if (se->no_migrate) {
+        return -1;
+    }
+
     if (!se->vmsd) {         /* Old style */
         se->save_state(f, se->opaque);
-        return;
+        return 0;
     }
     vmstate_save_state(f,se->vmsd, se->opaque);
+
+    return 0;
 }
 
 #define QEMU_VM_FILE_MAGIC           0x5145564d
@@ -1454,6 +1487,7 @@ int qemu_savevm_state_iterate(Monitor *mon, QEMUFile *f)
 int qemu_savevm_state_complete(Monitor *mon, QEMUFile *f)
 {
     SaveStateEntry *se;
+    int r;
 
     cpu_synchronize_all_states();
 
@@ -1486,7 +1520,11 @@ int qemu_savevm_state_complete(Monitor *mon, QEMUFile *f)
         qemu_put_be32(f, se->instance_id);
         qemu_put_be32(f, se->version_id);
 
-        vmstate_save(f, se);
+        r = vmstate_save(f, se);
+        if (r < 0) {
+            monitor_printf(mon, "cannot migrate with device '%s'\n", se->idstr);
+            return r;
+        }
     }
 
     qemu_put_byte(f, QEMU_VM_EOF);
-- 
cgit v1.2.3


From ec810f662a2944a3042426838c8715a016c855fc Mon Sep 17 00:00:00 2001
From: Cam Macdonell <cam@cs.ualberta.ca>
Date: Tue, 27 Jul 2010 10:54:13 -0600
Subject: RESEND: Inter-VM shared memory PCI device

resend for bug fix related to removal of irqfd

Support an inter-vm shared memory device that maps a shared-memory object as a
PCI device in the guest.  This patch also supports interrupts between guest by
communicating over a unix domain socket.  This patch applies to the qemu-kvm
repository.

    -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]

Interrupts are supported between multiple VMs by using a shared memory server
by using a chardev socket.

    -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
           [,chardev=<id>][,msi=on][,ioeventfd=on][,vectors=n][,role=peer|master]
    -chardev socket,path=<path>,id=<id>

The shared memory server, sample programs and init scripts are in a git repo here:

    www.gitorious.org/nahanni

Signed-off-by: Cam Macdonell <cam@cs.ualberta.ca>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
(cherry picked from commit 6cbf4c8c6416237e9c323661b87d60792a9d51af)
---
 Makefile.target |   3 +
 hw/ivshmem.c    | 828 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 qemu-char.c     |   6 +
 qemu-char.h     |   3 +
 qemu-doc.texi   |  43 +++
 5 files changed, 883 insertions(+)
 create mode 100644 hw/ivshmem.c

diff --git a/Makefile.target b/Makefile.target
index 8a9c427b5..b79149289 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -190,6 +190,9 @@ obj-$(CONFIG_USB_OHCI) += usb-ohci.o
 obj-y += rtl8139.o
 obj-y += e1000.o
 
+# Inter-VM PCI shared memory
+obj-y += ivshmem.o
+
 # Hardware support
 obj-i386-y += vga.o
 obj-i386-y += mc146818rtc.o i8259.o pc.o
diff --git a/hw/ivshmem.c b/hw/ivshmem.c
new file mode 100644
index 000000000..bbb5cbaa1
--- /dev/null
+++ b/hw/ivshmem.c
@@ -0,0 +1,828 @@
+/*
+ * Inter-VM Shared Memory PCI device.
+ *
+ * Author:
+ *      Cam Macdonell <cam@cs.ualberta.ca>
+ *
+ * Based On: cirrus_vga.c
+ *          Copyright (c) 2004 Fabrice Bellard
+ *          Copyright (c) 2004 Makoto Suzuki (suzu)
+ *
+ *      and rtl8139.c
+ *          Copyright (c) 2006 Igor Kovalenko
+ *
+ * This code is licensed under the GNU GPL v2.
+ */
+#include "hw.h"
+#include "pc.h"
+#include "pci.h"
+#include "msix.h"
+#include "kvm.h"
+
+#include <sys/mman.h>
+#include <sys/types.h>
+
+#define IVSHMEM_IOEVENTFD   0
+#define IVSHMEM_MSI     1
+
+#define IVSHMEM_PEER    0
+#define IVSHMEM_MASTER  1
+
+#define IVSHMEM_REG_BAR_SIZE 0x100
+
+//#define DEBUG_IVSHMEM
+#ifdef DEBUG_IVSHMEM
+#define IVSHMEM_DPRINTF(fmt, ...)        \
+    do {printf("IVSHMEM: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define IVSHMEM_DPRINTF(fmt, ...)
+#endif
+
+typedef struct Peer {
+    int nb_eventfds;
+    int *eventfds;
+} Peer;
+
+typedef struct EventfdEntry {
+    PCIDevice *pdev;
+    int vector;
+} EventfdEntry;
+
+typedef struct IVShmemState {
+    PCIDevice dev;
+    uint32_t intrmask;
+    uint32_t intrstatus;
+    uint32_t doorbell;
+
+    CharDriverState **eventfd_chr;
+    CharDriverState *server_chr;
+    int ivshmem_mmio_io_addr;
+
+    pcibus_t mmio_addr;
+    pcibus_t shm_pci_addr;
+    uint64_t ivshmem_offset;
+    uint64_t ivshmem_size; /* size of shared memory region */
+    int shm_fd; /* shared memory file descriptor */
+
+    Peer *peers;
+    int nb_peers; /* how many guests we have space for */
+    int max_peer; /* maximum numbered peer */
+
+    int vm_id;
+    uint32_t vectors;
+    uint32_t features;
+    EventfdEntry *eventfd_table;
+
+    char * shmobj;
+    char * sizearg;
+    char * role;
+    int role_val;   /* scalar to avoid multiple string comparisons */
+} IVShmemState;
+
+/* registers for the Inter-VM shared memory device */
+enum ivshmem_registers {
+    INTRMASK = 0,
+    INTRSTATUS = 4,
+    IVPOSITION = 8,
+    DOORBELL = 12,
+};
+
+static inline uint32_t ivshmem_has_feature(IVShmemState *ivs,
+                                                    unsigned int feature) {
+    return (ivs->features & (1 << feature));
+}
+
+static inline bool is_power_of_two(uint64_t x) {
+    return (x & (x - 1)) == 0;
+}
+
+static void ivshmem_map(PCIDevice *pci_dev, int region_num,
+                    pcibus_t addr, pcibus_t size, int type)
+{
+    IVShmemState *s = DO_UPCAST(IVShmemState, dev, pci_dev);
+
+    s->shm_pci_addr = addr;
+
+    if (s->ivshmem_offset > 0) {
+        cpu_register_physical_memory(s->shm_pci_addr, s->ivshmem_size,
+                                                            s->ivshmem_offset);
+    }
+
+    IVSHMEM_DPRINTF("guest pci addr = %" FMT_PCIBUS ", guest h/w addr = %"
+        PRIu64 ", size = %" FMT_PCIBUS "\n", addr, s->ivshmem_offset, size);
+
+}
+
+/* accessing registers - based on rtl8139 */
+static void ivshmem_update_irq(IVShmemState *s, int val)
+{
+    int isr;
+    isr = (s->intrstatus & s->intrmask) & 0xffffffff;
+
+    /* don't print ISR resets */
+    if (isr) {
+        IVSHMEM_DPRINTF("Set IRQ to %d (%04x %04x)\n",
+           isr ? 1 : 0, s->intrstatus, s->intrmask);
+    }
+
+    qemu_set_irq(s->dev.irq[0], (isr != 0));
+}
+
+static void ivshmem_IntrMask_write(IVShmemState *s, uint32_t val)
+{
+    IVSHMEM_DPRINTF("IntrMask write(w) val = 0x%04x\n", val);
+
+    s->intrmask = val;
+
+    ivshmem_update_irq(s, val);
+}
+
+static uint32_t ivshmem_IntrMask_read(IVShmemState *s)
+{
+    uint32_t ret = s->intrmask;
+
+    IVSHMEM_DPRINTF("intrmask read(w) val = 0x%04x\n", ret);
+
+    return ret;
+}
+
+static void ivshmem_IntrStatus_write(IVShmemState *s, uint32_t val)
+{
+    IVSHMEM_DPRINTF("IntrStatus write(w) val = 0x%04x\n", val);
+
+    s->intrstatus = val;
+
+    ivshmem_update_irq(s, val);
+    return;
+}
+
+static uint32_t ivshmem_IntrStatus_read(IVShmemState *s)
+{
+    uint32_t ret = s->intrstatus;
+
+    /* reading ISR clears all interrupts */
+    s->intrstatus = 0;
+
+    ivshmem_update_irq(s, 0);
+
+    return ret;
+}
+
+static void ivshmem_io_writew(void *opaque, target_phys_addr_t addr,
+                                                            uint32_t val)
+{
+
+    IVSHMEM_DPRINTF("We shouldn't be writing words\n");
+}
+
+static void ivshmem_io_writel(void *opaque, target_phys_addr_t addr,
+                                                            uint32_t val)
+{
+    IVShmemState *s = opaque;
+
+    uint64_t write_one = 1;
+    uint16_t dest = val >> 16;
+    uint16_t vector = val & 0xff;
+
+    addr &= 0xfc;
+
+    IVSHMEM_DPRINTF("writing to addr " TARGET_FMT_plx "\n", addr);
+    switch (addr)
+    {
+        case INTRMASK:
+            ivshmem_IntrMask_write(s, val);
+            break;
+
+        case INTRSTATUS:
+            ivshmem_IntrStatus_write(s, val);
+            break;
+
+        case DOORBELL:
+            /* check that dest VM ID is reasonable */
+            if ((dest < 0) || (dest > s->max_peer)) {
+                IVSHMEM_DPRINTF("Invalid destination VM ID (%d)\n", dest);
+                break;
+            }
+
+            /* check doorbell range */
+            if ((vector >= 0) && (vector < s->peers[dest].nb_eventfds)) {
+                IVSHMEM_DPRINTF("Writing %" PRId64 " to VM %d on vector %d\n",
+                                                    write_one, dest, vector);
+                if (write(s->peers[dest].eventfds[vector],
+                                                    &(write_one), 8) != 8) {
+                    IVSHMEM_DPRINTF("error writing to eventfd\n");
+                }
+            }
+            break;
+        default:
+            IVSHMEM_DPRINTF("Invalid VM Doorbell VM %d\n", dest);
+    }
+}
+
+static void ivshmem_io_writeb(void *opaque, target_phys_addr_t addr,
+                                                                uint32_t val)
+{
+    IVSHMEM_DPRINTF("We shouldn't be writing bytes\n");
+}
+
+static uint32_t ivshmem_io_readw(void *opaque, target_phys_addr_t addr)
+{
+
+    IVSHMEM_DPRINTF("We shouldn't be reading words\n");
+    return 0;
+}
+
+static uint32_t ivshmem_io_readl(void *opaque, target_phys_addr_t addr)
+{
+
+    IVShmemState *s = opaque;
+    uint32_t ret;
+
+    switch (addr)
+    {
+        case INTRMASK:
+            ret = ivshmem_IntrMask_read(s);
+            break;
+
+        case INTRSTATUS:
+            ret = ivshmem_IntrStatus_read(s);
+            break;
+
+        case IVPOSITION:
+            /* return my VM ID if the memory is mapped */
+            if (s->shm_fd > 0) {
+                ret = s->vm_id;
+            } else {
+                ret = -1;
+            }
+            break;
+
+        default:
+            IVSHMEM_DPRINTF("why are we reading " TARGET_FMT_plx "\n", addr);
+            ret = 0;
+    }
+
+    return ret;
+}
+
+static uint32_t ivshmem_io_readb(void *opaque, target_phys_addr_t addr)
+{
+    IVSHMEM_DPRINTF("We shouldn't be reading bytes\n");
+
+    return 0;
+}
+
+static CPUReadMemoryFunc * const ivshmem_mmio_read[3] = {
+    ivshmem_io_readb,
+    ivshmem_io_readw,
+    ivshmem_io_readl,
+};
+
+static CPUWriteMemoryFunc * const ivshmem_mmio_write[3] = {
+    ivshmem_io_writeb,
+    ivshmem_io_writew,
+    ivshmem_io_writel,
+};
+
+static void ivshmem_receive(void *opaque, const uint8_t *buf, int size)
+{
+    IVShmemState *s = opaque;
+
+    ivshmem_IntrStatus_write(s, *buf);
+
+    IVSHMEM_DPRINTF("ivshmem_receive 0x%02x\n", *buf);
+}
+
+static int ivshmem_can_receive(void * opaque)
+{
+    return 8;
+}
+
+static void ivshmem_event(void *opaque, int event)
+{
+    IVSHMEM_DPRINTF("ivshmem_event %d\n", event);
+}
+
+static void fake_irqfd(void *opaque, const uint8_t *buf, int size) {
+
+    EventfdEntry *entry = opaque;
+    PCIDevice *pdev = entry->pdev;
+
+    IVSHMEM_DPRINTF("interrupt on vector %p %d\n", pdev, entry->vector);
+    msix_notify(pdev, entry->vector);
+}
+
+static CharDriverState* create_eventfd_chr_device(void * opaque, int eventfd,
+                                                                    int vector)
+{
+    /* create a event character device based on the passed eventfd */
+    IVShmemState *s = opaque;
+    CharDriverState * chr;
+
+    chr = qemu_chr_open_eventfd(eventfd);
+
+    if (chr == NULL) {
+        fprintf(stderr, "creating eventfd for eventfd %d failed\n", eventfd);
+        exit(-1);
+    }
+
+    /* if MSI is supported we need multiple interrupts */
+    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
+        s->eventfd_table[vector].pdev = &s->dev;
+        s->eventfd_table[vector].vector = vector;
+
+        qemu_chr_add_handlers(chr, ivshmem_can_receive, fake_irqfd,
+                      ivshmem_event, &s->eventfd_table[vector]);
+    } else {
+        qemu_chr_add_handlers(chr, ivshmem_can_receive, ivshmem_receive,
+                      ivshmem_event, s);
+    }
+
+    return chr;
+
+}
+
+static int check_shm_size(IVShmemState *s, int fd) {
+    /* check that the guest isn't going to try and map more memory than the
+     * the object has allocated return -1 to indicate error */
+
+    struct stat buf;
+
+    fstat(fd, &buf);
+
+    if (s->ivshmem_size > buf.st_size) {
+        fprintf(stderr, "IVSHMEM ERROR: Requested memory size greater");
+        fprintf(stderr, " than shared object size (%" PRIu64 " > %ld)\n",
+                                          s->ivshmem_size, buf.st_size);
+        return -1;
+    } else {
+        return 0;
+    }
+}
+
+/* create the shared memory BAR when we are not using the server, so we can
+ * create the BAR and map the memory immediately */
+static void create_shared_memory_BAR(IVShmemState *s, int fd) {
+
+    void * ptr;
+
+    s->shm_fd = fd;
+
+    ptr = mmap(0, s->ivshmem_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+
+    s->ivshmem_offset = qemu_ram_alloc_from_ptr(&s->dev.qdev, "ivshmem.bar2",
+                                                        s->ivshmem_size, ptr);
+
+    /* region for shared memory */
+    pci_register_bar(&s->dev, 2, s->ivshmem_size,
+                                PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_map);
+}
+
+static void close_guest_eventfds(IVShmemState *s, int posn)
+{
+    int i, guest_curr_max;
+
+    guest_curr_max = s->peers[posn].nb_eventfds;
+
+    for (i = 0; i < guest_curr_max; i++) {
+        kvm_set_ioeventfd_mmio_long(s->peers[posn].eventfds[i],
+                    s->mmio_addr + DOORBELL, (posn << 16) | i, 0);
+        close(s->peers[posn].eventfds[i]);
+    }
+
+    qemu_free(s->peers[posn].eventfds);
+    s->peers[posn].nb_eventfds = 0;
+}
+
+static void setup_ioeventfds(IVShmemState *s) {
+
+    int i, j;
+
+    for (i = 0; i <= s->max_peer; i++) {
+        for (j = 0; j < s->peers[i].nb_eventfds; j++) {
+            kvm_set_ioeventfd_mmio_long(s->peers[i].eventfds[j],
+                    s->mmio_addr + DOORBELL, (i << 16) | j, 1);
+        }
+    }
+}
+
+/* this function increase the dynamic storage need to store data about other
+ * guests */
+static void increase_dynamic_storage(IVShmemState *s, int new_min_size) {
+
+    int j, old_nb_alloc;
+
+    old_nb_alloc = s->nb_peers;
+
+    while (new_min_size >= s->nb_peers)
+        s->nb_peers = s->nb_peers * 2;
+
+    IVSHMEM_DPRINTF("bumping storage to %d guests\n", s->nb_peers);
+    s->peers = qemu_realloc(s->peers, s->nb_peers * sizeof(Peer));
+
+    /* zero out new pointers */
+    for (j = old_nb_alloc; j < s->nb_peers; j++) {
+        s->peers[j].eventfds = NULL;
+        s->peers[j].nb_eventfds = 0;
+    }
+}
+
+static void ivshmem_read(void *opaque, const uint8_t * buf, int flags)
+{
+    IVShmemState *s = opaque;
+    int incoming_fd, tmp_fd;
+    int guest_max_eventfd;
+    long incoming_posn;
+
+    memcpy(&incoming_posn, buf, sizeof(long));
+    /* pick off s->server_chr->msgfd and store it, posn should accompany msg */
+    tmp_fd = qemu_chr_get_msgfd(s->server_chr);
+    IVSHMEM_DPRINTF("posn is %ld, fd is %d\n", incoming_posn, tmp_fd);
+
+    /* make sure we have enough space for this guest */
+    if (incoming_posn >= s->nb_peers) {
+        increase_dynamic_storage(s, incoming_posn);
+    }
+
+    if (tmp_fd == -1) {
+        /* if posn is positive and unseen before then this is our posn*/
+        if ((incoming_posn >= 0) &&
+                            (s->peers[incoming_posn].eventfds == NULL)) {
+            /* receive our posn */
+            s->vm_id = incoming_posn;
+            return;
+        } else {
+            /* otherwise an fd == -1 means an existing guest has gone away */
+            IVSHMEM_DPRINTF("posn %ld has gone away\n", incoming_posn);
+            close_guest_eventfds(s, incoming_posn);
+            return;
+        }
+    }
+
+    /* because of the implementation of get_msgfd, we need a dup */
+    incoming_fd = dup(tmp_fd);
+
+    if (incoming_fd == -1) {
+        fprintf(stderr, "could not allocate file descriptor %s\n",
+                                                            strerror(errno));
+        return;
+    }
+
+    /* if the position is -1, then it's shared memory region fd */
+    if (incoming_posn == -1) {
+
+        void * map_ptr;
+
+        s->max_peer = 0;
+
+        if (check_shm_size(s, incoming_fd) == -1) {
+            exit(-1);
+        }
+
+        /* mmap the region and map into the BAR2 */
+        map_ptr = mmap(0, s->ivshmem_size, PROT_READ|PROT_WRITE, MAP_SHARED,
+                                                            incoming_fd, 0);
+        s->ivshmem_offset = qemu_ram_alloc_from_ptr(&s->dev.qdev,
+                                    "ivshmem.bar2", s->ivshmem_size, map_ptr);
+
+        IVSHMEM_DPRINTF("guest pci addr = %" FMT_PCIBUS ", guest h/w addr = %"
+                         PRIu64 ", size = %" PRIu64 "\n", s->shm_pci_addr,
+                         s->ivshmem_offset, s->ivshmem_size);
+
+        if (s->shm_pci_addr > 0) {
+            /* map memory into BAR2 */
+            cpu_register_physical_memory(s->shm_pci_addr, s->ivshmem_size,
+                                                            s->ivshmem_offset);
+        }
+
+        /* only store the fd if it is successfully mapped */
+        s->shm_fd = incoming_fd;
+
+        return;
+    }
+
+    /* each guest has an array of eventfds, and we keep track of how many
+     * guests for each VM */
+    guest_max_eventfd = s->peers[incoming_posn].nb_eventfds;
+
+    if (guest_max_eventfd == 0) {
+        /* one eventfd per MSI vector */
+        s->peers[incoming_posn].eventfds = (int *) qemu_malloc(s->vectors *
+                                                                sizeof(int));
+    }
+
+    /* this is an eventfd for a particular guest VM */
+    IVSHMEM_DPRINTF("eventfds[%ld][%d] = %d\n", incoming_posn,
+                                            guest_max_eventfd, incoming_fd);
+    s->peers[incoming_posn].eventfds[guest_max_eventfd] = incoming_fd;
+
+    /* increment count for particular guest */
+    s->peers[incoming_posn].nb_eventfds++;
+
+    /* keep track of the maximum VM ID */
+    if (incoming_posn > s->max_peer) {
+        s->max_peer = incoming_posn;
+    }
+
+    if (incoming_posn == s->vm_id) {
+        s->eventfd_chr[guest_max_eventfd] = create_eventfd_chr_device(s,
+                   s->peers[s->vm_id].eventfds[guest_max_eventfd],
+                   guest_max_eventfd);
+    }
+
+    if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD)) {
+        if (kvm_set_ioeventfd_mmio_long(incoming_fd, s->mmio_addr + DOORBELL,
+                        (incoming_posn << 16) | guest_max_eventfd, 1) < 0) {
+            fprintf(stderr, "ivshmem: ioeventfd not available\n");
+        }
+    }
+
+    return;
+}
+
+static void ivshmem_reset(DeviceState *d)
+{
+    IVShmemState *s = DO_UPCAST(IVShmemState, dev.qdev, d);
+
+    s->intrstatus = 0;
+    return;
+}
+
+static void ivshmem_mmio_map(PCIDevice *pci_dev, int region_num,
+                       pcibus_t addr, pcibus_t size, int type)
+{
+    IVShmemState *s = DO_UPCAST(IVShmemState, dev, pci_dev);
+
+    s->mmio_addr = addr;
+    cpu_register_physical_memory(addr + 0, IVSHMEM_REG_BAR_SIZE,
+                                                s->ivshmem_mmio_io_addr);
+
+    if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD)) {
+        setup_ioeventfds(s);
+    }
+}
+
+static uint64_t ivshmem_get_size(IVShmemState * s) {
+
+    uint64_t value;
+    char *ptr;
+
+    value = strtoull(s->sizearg, &ptr, 10);
+    switch (*ptr) {
+        case 0: case 'M': case 'm':
+            value <<= 20;
+            break;
+        case 'G': case 'g':
+            value <<= 30;
+            break;
+        default:
+            fprintf(stderr, "qemu: invalid ram size: %s\n", s->sizearg);
+            exit(1);
+    }
+
+    /* BARs must be a power of 2 */
+    if (!is_power_of_two(value)) {
+        fprintf(stderr, "ivshmem: size must be power of 2\n");
+        exit(1);
+    }
+
+    return value;
+}
+
+static void ivshmem_setup_msi(IVShmemState * s) {
+
+    int i;
+
+    /* allocate the MSI-X vectors */
+
+    if (!msix_init(&s->dev, s->vectors, 1, 0)) {
+        pci_register_bar(&s->dev, 1,
+                         msix_bar_size(&s->dev),
+                         PCI_BASE_ADDRESS_SPACE_MEMORY,
+                         msix_mmio_map);
+        IVSHMEM_DPRINTF("msix initialized (%d vectors)\n", s->vectors);
+    } else {
+        IVSHMEM_DPRINTF("msix initialization failed\n");
+        exit(1);
+    }
+
+    /* 'activate' the vectors */
+    for (i = 0; i < s->vectors; i++) {
+        msix_vector_use(&s->dev, i);
+    }
+
+    /* allocate Qemu char devices for receiving interrupts */
+    s->eventfd_table = qemu_mallocz(s->vectors * sizeof(EventfdEntry));
+}
+
+static void ivshmem_save(QEMUFile* f, void *opaque)
+{
+    IVShmemState *proxy = opaque;
+
+    IVSHMEM_DPRINTF("ivshmem_save\n");
+    pci_device_save(&proxy->dev, f);
+
+    if (ivshmem_has_feature(proxy, IVSHMEM_MSI)) {
+        msix_save(&proxy->dev, f);
+    } else {
+        qemu_put_be32(f, proxy->intrstatus);
+        qemu_put_be32(f, proxy->intrmask);
+    }
+
+}
+
+static int ivshmem_load(QEMUFile* f, void *opaque, int version_id)
+{
+    IVSHMEM_DPRINTF("ivshmem_load\n");
+
+    IVShmemState *proxy = opaque;
+    int ret, i;
+
+    if (version_id > 0) {
+        return -EINVAL;
+    }
+
+    if (proxy->role_val == IVSHMEM_PEER) {
+        fprintf(stderr, "ivshmem: 'peer' devices are not migratable\n");
+        return -EINVAL;
+    }
+
+    ret = pci_device_load(&proxy->dev, f);
+    if (ret) {
+        return ret;
+    }
+
+    if (ivshmem_has_feature(proxy, IVSHMEM_MSI)) {
+        msix_load(&proxy->dev, f);
+        for (i = 0; i < proxy->vectors; i++) {
+            msix_vector_use(&proxy->dev, i);
+        }
+    } else {
+        proxy->intrstatus = qemu_get_be32(f);
+        proxy->intrmask = qemu_get_be32(f);
+    }
+
+    return 0;
+}
+
+static int pci_ivshmem_init(PCIDevice *dev)
+{
+    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
+    uint8_t *pci_conf;
+
+    if (s->sizearg == NULL)
+        s->ivshmem_size = 4 << 20; /* 4 MB default */
+    else {
+        s->ivshmem_size = ivshmem_get_size(s);
+    }
+
+    register_savevm(&s->dev.qdev, "ivshmem", 0, 0, ivshmem_save, ivshmem_load,
+                                                                        dev);
+
+    /* IRQFD requires MSI */
+    if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD) &&
+        !ivshmem_has_feature(s, IVSHMEM_MSI)) {
+        fprintf(stderr, "ivshmem: ioeventfd/irqfd requires MSI\n");
+        exit(1);
+    }
+
+    /* check that role is reasonable */
+    if (s->role) {
+        if (strncmp(s->role, "peer", 5) == 0) {
+            s->role_val = IVSHMEM_PEER;
+        } else if (strncmp(s->role, "master", 7) == 0) {
+            s->role_val = IVSHMEM_MASTER;
+        } else {
+            fprintf(stderr, "ivshmem: 'role' must be 'peer' or 'master'\n");
+            exit(1);
+        }
+    } else {
+        s->role_val = IVSHMEM_MASTER; /* default */
+    }
+
+    if (s->role_val == IVSHMEM_PEER) {
+        register_device_unmigratable(&s->dev.qdev, "ivshmem", s);
+    }
+
+    pci_conf = s->dev.config;
+    pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT_QUMRANET);
+    pci_conf[0x02] = 0x10;
+    pci_conf[0x03] = 0x11;
+    pci_conf[PCI_COMMAND] = PCI_COMMAND_IO | PCI_COMMAND_MEMORY;
+    pci_config_set_class(pci_conf, PCI_CLASS_MEMORY_RAM);
+    pci_conf[PCI_HEADER_TYPE] = PCI_HEADER_TYPE_NORMAL;
+
+    pci_config_set_interrupt_pin(pci_conf, 1);
+
+    s->shm_pci_addr = 0;
+    s->ivshmem_offset = 0;
+    s->shm_fd = 0;
+
+    s->ivshmem_mmio_io_addr = cpu_register_io_memory(ivshmem_mmio_read,
+                                    ivshmem_mmio_write, s);
+    /* region for registers*/
+    pci_register_bar(&s->dev, 0, IVSHMEM_REG_BAR_SIZE,
+                           PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_mmio_map);
+
+    if ((s->server_chr != NULL) &&
+                        (strncmp(s->server_chr->filename, "unix:", 5) == 0)) {
+        /* if we get a UNIX socket as the parameter we will talk
+         * to the ivshmem server to receive the memory region */
+
+        if (s->shmobj != NULL) {
+            fprintf(stderr, "WARNING: do not specify both 'chardev' "
+                                                "and 'shm' with ivshmem\n");
+        }
+
+        IVSHMEM_DPRINTF("using shared memory server (socket = %s)\n",
+                                                    s->server_chr->filename);
+
+        if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
+            ivshmem_setup_msi(s);
+        }
+
+        /* we allocate enough space for 16 guests and grow as needed */
+        s->nb_peers = 16;
+        s->vm_id = -1;
+
+        /* allocate/initialize space for interrupt handling */
+        s->peers = qemu_mallocz(s->nb_peers * sizeof(Peer));
+
+        pci_register_bar(&s->dev, 2, s->ivshmem_size,
+                                PCI_BASE_ADDRESS_SPACE_MEMORY, ivshmem_map);
+
+        s->eventfd_chr = qemu_mallocz(s->vectors * sizeof(CharDriverState *));
+
+        qemu_chr_add_handlers(s->server_chr, ivshmem_can_receive, ivshmem_read,
+                     ivshmem_event, s);
+    } else {
+        /* just map the file immediately, we're not using a server */
+        int fd;
+
+        if (s->shmobj == NULL) {
+            fprintf(stderr, "Must specify 'chardev' or 'shm' to ivshmem\n");
+        }
+
+        IVSHMEM_DPRINTF("using shm_open (shm object = %s)\n", s->shmobj);
+
+        /* try opening with O_EXCL and if it succeeds zero the memory
+         * by truncating to 0 */
+        if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR|O_EXCL,
+                        S_IRWXU|S_IRWXG|S_IRWXO)) > 0) {
+           /* truncate file to length PCI device's memory */
+            if (ftruncate(fd, s->ivshmem_size) != 0) {
+                fprintf(stderr, "ivshmem: could not truncate shared file\n");
+            }
+
+        } else if ((fd = shm_open(s->shmobj, O_CREAT|O_RDWR,
+                        S_IRWXU|S_IRWXG|S_IRWXO)) < 0) {
+            fprintf(stderr, "ivshmem: could not open shared file\n");
+            exit(-1);
+
+        }
+
+        if (check_shm_size(s, fd) == -1) {
+            exit(-1);
+        }
+
+        create_shared_memory_BAR(s, fd);
+
+    }
+
+    return 0;
+}
+
+static int pci_ivshmem_uninit(PCIDevice *dev)
+{
+    IVShmemState *s = DO_UPCAST(IVShmemState, dev, dev);
+
+    cpu_unregister_io_memory(s->ivshmem_mmio_io_addr);
+    unregister_savevm(&dev->qdev, "ivshmem", s);
+
+    return 0;
+}
+
+static PCIDeviceInfo ivshmem_info = {
+    .qdev.name  = "ivshmem",
+    .qdev.size  = sizeof(IVShmemState),
+    .qdev.reset = ivshmem_reset,
+    .init       = pci_ivshmem_init,
+    .exit       = pci_ivshmem_uninit,
+    .qdev.props = (Property[]) {
+        DEFINE_PROP_CHR("chardev", IVShmemState, server_chr),
+        DEFINE_PROP_STRING("size", IVShmemState, sizearg),
+        DEFINE_PROP_UINT32("vectors", IVShmemState, vectors, 1),
+        DEFINE_PROP_BIT("ioeventfd", IVShmemState, features, IVSHMEM_IOEVENTFD, false),
+        DEFINE_PROP_BIT("msi", IVShmemState, features, IVSHMEM_MSI, true),
+        DEFINE_PROP_STRING("shm", IVShmemState, shmobj),
+        DEFINE_PROP_STRING("role", IVShmemState, role),
+        DEFINE_PROP_END_OF_LIST(),
+    }
+};
+
+static void ivshmem_register_devices(void)
+{
+    pci_qdev_register(&ivshmem_info);
+}
+
+device_init(ivshmem_register_devices)
diff --git a/qemu-char.c b/qemu-char.c
index 9b69d928e..6a3952ccd 100644
--- a/qemu-char.c
+++ b/qemu-char.c
@@ -2087,6 +2087,12 @@ static void tcp_chr_read(void *opaque)
     }
 }
 
+CharDriverState *qemu_chr_open_eventfd(int eventfd){
+
+    return qemu_chr_open_fd(eventfd, eventfd);
+
+}
+
 static void tcp_chr_connect(void *opaque)
 {
     CharDriverState *chr = opaque;
diff --git a/qemu-char.h b/qemu-char.h
index e3a07838a..6ea01ba17 100644
--- a/qemu-char.h
+++ b/qemu-char.h
@@ -94,6 +94,9 @@ void qemu_chr_info_print(Monitor *mon, const QObject *ret_data);
 void qemu_chr_info(Monitor *mon, QObject **ret_data);
 CharDriverState *qemu_chr_find(const char *name);
 
+/* add an eventfd to the qemu devices that are polled */
+CharDriverState *qemu_chr_open_eventfd(int eventfd);
+
 extern int term_escape_char;
 
 /* async I/O support */
diff --git a/qemu-doc.texi b/qemu-doc.texi
index e67bf44ff..55a966fe7 100644
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -706,6 +706,49 @@ Using the @option{-net socket} option, it is possible to make VLANs
 that span several QEMU instances. See @ref{sec_invocation} to have a
 basic example.
 
+@section Other Devices
+
+@subsection Inter-VM Shared Memory device
+
+With KVM enabled on a Linux host, a shared memory device is available.  Guests
+map a POSIX shared memory region into the guest as a PCI device that enables
+zero-copy communication to the application level of the guests.  The basic
+syntax is:
+
+@example
+qemu -device ivshmem,size=<size in format accepted by -m>[,shm=<shm name>]
+@end example
+
+If desired, interrupts can be sent between guest VMs accessing the same shared
+memory region.  Interrupt support requires using a shared memory server and
+using a chardev socket to connect to it.  The code for the shared memory server
+is qemu.git/contrib/ivshmem-server.  An example syntax when using the shared
+memory server is:
+
+@example
+qemu -device ivshmem,size=<size in format accepted by -m>[,chardev=<id>]
+                        [,msi=on][,ioeventfd=on][,vectors=n][,role=peer|master]
+qemu -chardev socket,path=<path>,id=<id>
+@end example
+
+When using the server, the guest will be assigned a VM ID (>=0) that allows guests
+using the same server to communicate via interrupts.  Guests can read their
+VM ID from a device register (see example code).  Since receiving the shared
+memory region from the server is asynchronous, there is a (small) chance the
+guest may boot before the shared memory is attached.  To allow an application
+to ensure shared memory is attached, the VM ID register will return -1 (an
+invalid VM ID) until the memory is attached.  Once the shared memory is
+attached, the VM ID will return the guest's valid VM ID.  With these semantics,
+the guest application can check to ensure the shared memory is attached to the
+guest before proceeding.
+
+The @option{role} argument can be set to either master or peer and will affect
+how the shared memory is migrated.  With @option{role=master}, the guest will
+copy the shared memory on migration to the destination host.  With
+@option{role=peer}, the guest will not be able to migrate with the device attached.
+With the @option{peer} case, the device should be detached and then reattached
+after migration using the PCI hotplug support.
+
 @node direct_linux_boot
 @section Direct Linux Boot
 
-- 
cgit v1.2.3


From 41de2f0c864dd94908015ca6846e635f028b9b93 Mon Sep 17 00:00:00 2001
From: Cam Macdonell <cam@cs.ualberta.ca>
Date: Sat, 14 Aug 2010 17:47:30 -0600
Subject: Add kvm_set_ioeventfd_mmio_long definition for non-KVM systems

Signed-off-by: Cam Macdonell <cam@cs.ualberta.ca>
Signed-off-by: Blue Swirl <blauwirbel@gmail.com>
(cherry picked from commit 1fd74012750dcd8542708bdcc10becb8780f7493)
---
 kvm-stub.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kvm-stub.c b/kvm-stub.c
index 3378bd3b2..d45f9fa1d 100644
--- a/kvm-stub.c
+++ b/kvm-stub.c
@@ -136,3 +136,8 @@ int kvm_set_ioeventfd_pio_word(int fd, uint16_t addr, uint16_t val, bool assign)
 {
     return -ENOSYS;
 }
+
+int kvm_set_ioeventfd_mmio_long(int fd, uint32_t adr, uint32_t val, bool assign)
+{
+    return -ENOSYS;
+}
-- 
cgit v1.2.3


From a385adb70e848d15262f54bf12bdb33413a631f9 Mon Sep 17 00:00:00 2001
From: Cam Macdonell <cam@cs.ualberta.ca>
Date: Sat, 14 Aug 2010 17:47:31 -0600
Subject: Disable build of ivshmem on non-KVM systems

Signed-off-by: Cam Macdonell <cam@cs.ualberta.ca>
Signed-off-by: Blue Swirl <blauwirbel@gmail.com>
(cherry picked from commit 3dcbf8f9ca5f361a38bf1b699080daeb40d5185e)
---
 Makefile.target | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.target b/Makefile.target
index b79149289..c8281e90a 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -191,7 +191,7 @@ obj-y += rtl8139.o
 obj-y += e1000.o
 
 # Inter-VM PCI shared memory
-obj-y += ivshmem.o
+obj-$(CONFIG_KVM) += ivshmem.o
 
 # Hardware support
 obj-i386-y += vga.o
-- 
cgit v1.2.3


From d3c5b2e6703869bb86639b29102df82380cc2d63 Mon Sep 17 00:00:00 2001
From: Jes Sorensen <Jes.Sorensen@redhat.com>
Date: Mon, 30 Aug 2010 12:31:33 +0200
Subject: hw/ivshmem.c don't check for negative values on unsigned data types

There is no need to check for dest < 0 or vector >= 0 as both are
uint16_t.

This should fix problems with broken build with aggressive compiler
flags. Reported by Xudong Hao <xudong.hao@intel.com>

Signed-off-by: Jes Sorensen <Jes.Sorensen@redhat.com>
Acked-by: Cam Macdonell <cam@cs.ualberta.ca>
Signed-off-by: Blue Swirl <blauwirbel@gmail.com>
(cherry picked from commit 1b27d7a1e8609b2eeb6238f2c629eb82217523f6)
---
 hw/ivshmem.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/ivshmem.c b/hw/ivshmem.c
index bbb5cbaa1..afebbc365 100644
--- a/hw/ivshmem.c
+++ b/hw/ivshmem.c
@@ -199,13 +199,13 @@ static void ivshmem_io_writel(void *opaque, target_phys_addr_t addr,
 
         case DOORBELL:
             /* check that dest VM ID is reasonable */
-            if ((dest < 0) || (dest > s->max_peer)) {
+            if (dest > s->max_peer) {
                 IVSHMEM_DPRINTF("Invalid destination VM ID (%d)\n", dest);
                 break;
             }
 
             /* check doorbell range */
-            if ((vector >= 0) && (vector < s->peers[dest].nb_eventfds)) {
+            if (vector < s->peers[dest].nb_eventfds) {
                 IVSHMEM_DPRINTF("Writing %" PRId64 " to VM %d on vector %d\n",
                                                     write_one, dest, vector);
                 if (write(s->peers[dest].eventfds[vector],
-- 
cgit v1.2.3


From 0131c8c2ddeb115c423b8e505789025864d81f87 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 29 Aug 2010 12:43:15 +0300
Subject: Fix ivshmem build on 32-bit hosts

stat() fields can be more or less anything depending on configuration, cast
explicitly to uint64_t to avoid printf() format mismatches.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Blue Swirl <blauwirbel@gmail.com>
(cherry picked from commit ad0a4ac1c0e1859eb0c67900dba696cc459b42a7)
---
 hw/ivshmem.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/hw/ivshmem.c b/hw/ivshmem.c
index afebbc365..06dce70e7 100644
--- a/hw/ivshmem.c
+++ b/hw/ivshmem.c
@@ -351,9 +351,10 @@ static int check_shm_size(IVShmemState *s, int fd) {
     fstat(fd, &buf);
 
     if (s->ivshmem_size > buf.st_size) {
-        fprintf(stderr, "IVSHMEM ERROR: Requested memory size greater");
-        fprintf(stderr, " than shared object size (%" PRIu64 " > %ld)\n",
-                                          s->ivshmem_size, buf.st_size);
+        fprintf(stderr,
+                "IVSHMEM ERROR: Requested memory size greater"
+                " than shared object size (%" PRIu64 " > %" PRIu64")\n",
+                s->ivshmem_size, (uint64_t)buf.st_size);
         return -1;
     } else {
         return 0;
-- 
cgit v1.2.3


From 472de0c851af86d26e3ccebf4154a27393091053 Mon Sep 17 00:00:00 2001
From: Anthony Liguori <aliguori@us.ibm.com>
Date: Mon, 11 Oct 2010 16:37:35 -0500
Subject: Update version for 0.13.0-rc2

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 VERSION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/VERSION b/VERSION
index 0eb28840f..5cbf646ea 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.12.91
+0.12.92
-- 
cgit v1.2.3


From d25de8db83128d4c80f7d9fb570bfeba14ee21d2 Mon Sep 17 00:00:00 2001
From: Anthony Liguori <aliguori@us.ibm.com>
Date: Mon, 11 Oct 2010 17:09:10 -0500
Subject: Update for v0.13.0-rc3

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 VERSION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/VERSION b/VERSION
index 5cbf646ea..73b08a130 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.12.92
+0.12.93
-- 
cgit v1.2.3


From fc4e0c7018f023a415b39984c4b805992fd6cc17 Mon Sep 17 00:00:00 2001
From: Amit Shah <amit.shah@redhat.com>
Date: Tue, 27 Jul 2010 15:49:19 +0530
Subject: migration: Accept 'cont' only after successful incoming migration

When a 'cont' is issued on a VM that's just waiting for an incoming
migration, the VM reboots and boots into the guest, possibly corrupting
its storage since it could be shared with another VM running elsewhere.

Ensure that a VM started with '-incoming' is only run when an incoming
migration successfully completes.

A new qerror, QERR_MIGRATION_EXPECTED, is added to signal that 'cont'
failed due to no incoming migration has been attempted yet.

Reported-by: Laine Stump <laine@redhat.com>
Signed-off-by: Amit Shah <amit.shah@redhat.com>
Reviewed-by: Luiz Capitulino <lcapitulino@redhat.com>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
(cherry picked from commit 8e84865e54cb66fd7b57bb18c312ad3d56b6e276)
---
 migration.c | 2 ++
 monitor.c   | 4 ++++
 qerror.c    | 4 ++++
 qerror.h    | 3 +++
 sysemu.h    | 1 +
 vl.c        | 2 ++
 6 files changed, 16 insertions(+)

diff --git a/migration.c b/migration.c
index 650eb78d2..a160462df 100644
--- a/migration.c
+++ b/migration.c
@@ -67,6 +67,8 @@ void process_incoming_migration(QEMUFile *f)
     qemu_announce_self();
     DPRINTF("successfully loaded vm state\n");
 
+    incoming_expected = false;
+
     if (autostart)
         vm_start();
 }
diff --git a/monitor.c b/monitor.c
index 45fd48291..5366c3652 100644
--- a/monitor.c
+++ b/monitor.c
@@ -1056,6 +1056,10 @@ static int do_cont(Monitor *mon, const QDict *qdict, QObject **ret_data)
 {
     struct bdrv_iterate_context context = { mon, 0 };
 
+    if (incoming_expected) {
+        qerror_report(QERR_MIGRATION_EXPECTED);
+        return -1;
+    }
     bdrv_iterate(encrypted_bdrv_it, &context);
     /* only resume the vm if all keys are set and valid */
     if (!context.err) {
diff --git a/qerror.c b/qerror.c
index 2f6f59061..0af3ab32b 100644
--- a/qerror.c
+++ b/qerror.c
@@ -140,6 +140,10 @@ static const QErrorStringTable qerror_table[] = {
         .error_fmt = QERR_KVM_MISSING_CAP,
         .desc      = "Using KVM without %(capability), %(feature) unavailable",
     },
+    {
+        .error_fmt = QERR_MIGRATION_EXPECTED,
+        .desc      = "An incoming migration is expected before this command can be executed",
+    },
     {
         .error_fmt = QERR_MISSING_PARAMETER,
         .desc      = "Parameter '%(name)' is missing",
diff --git a/qerror.h b/qerror.h
index 9ad00b4b8..62802ea08 100644
--- a/qerror.h
+++ b/qerror.h
@@ -121,6 +121,9 @@ QError *qobject_to_qerror(const QObject *obj);
 #define QERR_KVM_MISSING_CAP \
     "{ 'class': 'KVMMissingCap', 'data': { 'capability': %s, 'feature': %s } }"
 
+#define QERR_MIGRATION_EXPECTED \
+    "{ 'class': 'MigrationExpected', 'data': {} }"
+
 #define QERR_MISSING_PARAMETER \
     "{ 'class': 'MissingParameter', 'data': { 'name': %s } }"
 
diff --git a/sysemu.h b/sysemu.h
index 9c988bb2a..a1f6466ac 100644
--- a/sysemu.h
+++ b/sysemu.h
@@ -99,6 +99,7 @@ typedef enum DisplayType
 } DisplayType;
 
 extern int autostart;
+extern int incoming_expected;
 extern int bios_size;
 
 typedef enum {
diff --git a/vl.c b/vl.c
index ba6ee11ec..c2e7cc1d6 100644
--- a/vl.c
+++ b/vl.c
@@ -182,6 +182,7 @@ int nb_nics;
 NICInfo nd_table[MAX_NICS];
 int vm_running;
 int autostart;
+int incoming_expected; /* Started with -incoming and waiting for incoming */
 static int rtc_utc = 1;
 static int rtc_date_offset = -1; /* -1 means no change */
 QEMUClock *rtc_clock;
@@ -2557,6 +2558,7 @@ int main(int argc, char **argv, char **envp)
                 break;
             case QEMU_OPTION_incoming:
                 incoming = optarg;
+                incoming_expected = true;
                 break;
             case QEMU_OPTION_nodefaults:
                 default_serial = 0;
-- 
cgit v1.2.3


From c4127fbf17744374b147b779a333e7bf444839a7 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Thu, 19 Aug 2010 10:18:39 -0300
Subject: set proper migration status on ->write error (v5)

If ->write fails, declare migration status as MIG_STATE_ERROR.

Also, in buffered_file.c, ->close the object in case of an
error.

Fixes "migrate -d "exec:dd of=file", where dd fails to open file.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
(cherry picked from commit e447b1a603091cbaa5eed36c0a3c9ed3f2224535)
---
 buffered_file.c | 4 +++-
 migration.c     | 8 +++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/buffered_file.c b/buffered_file.c
index 54dc6c29b..be147d62d 100644
--- a/buffered_file.c
+++ b/buffered_file.c
@@ -222,8 +222,10 @@ static void buffered_rate_tick(void *opaque)
 {
     QEMUFileBuffered *s = opaque;
 
-    if (s->has_error)
+    if (s->has_error) {
+        buffered_close(s);
         return;
+    }
 
     qemu_mod_timer(s->timer, qemu_get_clock(rt_clock) + 100);
 
diff --git a/migration.c b/migration.c
index a160462df..468d51749 100644
--- a/migration.c
+++ b/migration.c
@@ -316,8 +316,14 @@ ssize_t migrate_fd_put_buffer(void *opaque, const void *data, size_t size)
     if (ret == -1)
         ret = -(s->get_error(s));
 
-    if (ret == -EAGAIN)
+    if (ret == -EAGAIN) {
         qemu_set_fd_handler2(s->fd, NULL, NULL, migrate_fd_put_notify, s);
+    } else if (ret < 0) {
+        if (s->mon) {
+            monitor_resume(s->mon);
+        }
+        s->state = MIG_STATE_ERROR;
+    }
 
     return ret;
 }
-- 
cgit v1.2.3


From c082082c7669aa9fba4034484f56583dc2e0eaae Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Thu, 19 Aug 2010 10:18:42 -0300
Subject: savevm: Reset last block info at beginning of each save

If we save more than once we need to reset the last block info or else
only the first save has the actual block info and each subsequent save
will only use continue flags, making them unloadable independently.

Found-by: Miguel Di Ciurcio Filho <miguel.filho@gmail.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Glauber Costa <glommer@redhat.com>
Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
(cherry picked from commit 760e77eab53f5d92eb0c587e04fd942a905b46af)
---
 arch_init.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index 47bb4b2d8..e0bd18c88 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -104,10 +104,11 @@ static int is_dup_page(uint8_t *page, uint8_t ch)
     return 1;
 }
 
+static RAMBlock *last_block;
+static ram_addr_t last_offset;
+
 static int ram_save_block(QEMUFile *f)
 {
-    static RAMBlock *last_block = NULL;
-    static ram_addr_t last_offset = 0;
     RAMBlock *block = last_block;
     ram_addr_t offset = last_offset;
     ram_addr_t current_addr;
@@ -231,6 +232,8 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
     if (stage == 1) {
         RAMBlock *block;
         bytes_transferred = 0;
+        last_block = NULL;
+        last_offset = 0;
 
         /* Make sure all dirty bits are set */
         QLIST_FOREACH(block, &ram_list.blocks, next) {
-- 
cgit v1.2.3


From f8e3ee1a7dd8e8b342edfc8198f286403dd85b57 Mon Sep 17 00:00:00 2001
From: Miguel Di Ciurcio Filho <miguel.filho@gmail.com>
Date: Fri, 20 Aug 2010 16:42:29 -0300
Subject: QMP: update 'query-version' documentation

Update the documentation of 'query-version' to output the string version broken
down.

Signed-off-by: Miguel Di Ciurcio Filho <miguel.filho@gmail.com>
Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
(cherry picked from commit 6597e1a6dc2585c6e313b97fa750b7232dc904fd)
---
 qemu-monitor.hx | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/qemu-monitor.hx b/qemu-monitor.hx
index 2af3de6c2..9c27b3113 100644
--- a/qemu-monitor.hx
+++ b/qemu-monitor.hx
@@ -1623,13 +1623,25 @@ Show QEMU version.
 
 Return a json-object with the following information:
 
-- "qemu": QEMU's version (json-string)
+- "qemu": A json-object containing three integer values:
+    - "major": QEMU's major version (json-int)
+    - "minor": QEMU's minor version (json-int)
+    - "micro": QEMU's micro version (json-int)
 - "package": package's version (json-string)
 
 Example:
 
 -> { "execute": "query-version" }
-<- { "return": { "qemu": "0.11.50", "package": "" } }
+<- {
+      "return":{
+         "qemu":{
+            "major":0,
+            "minor":11,
+            "micro":5
+         },
+         "package":""
+      }
+   }
 
 EQMP
 
-- 
cgit v1.2.3


From b73e06943b201b04f28b3f0c8c30e6c8f22b6385 Mon Sep 17 00:00:00 2001
From: Miguel Di Ciurcio Filho <miguel.filho@gmail.com>
Date: Fri, 20 Aug 2010 16:42:30 -0300
Subject: QMP/monitor: update do_info_version() to output broken down version
 string

This code was originally developed by Daniel P. Berrange <berrange@redhat.com>

Signed-off-by: Miguel Di Ciurcio Filho <miguel.filho@gmail.com>
Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
(cherry picked from commit 0ec0291d67ccfcc8a488d24341ee138902510ea3)
---
 monitor.c | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/monitor.c b/monitor.c
index 5366c3652..159bbe7db 100644
--- a/monitor.c
+++ b/monitor.c
@@ -669,17 +669,32 @@ help:
 static void do_info_version_print(Monitor *mon, const QObject *data)
 {
     QDict *qdict;
+    QDict *qemu;
 
     qdict = qobject_to_qdict(data);
+    qemu = qdict_get_qdict(qdict, "qemu");
 
-    monitor_printf(mon, "%s%s\n", qdict_get_str(qdict, "qemu"),
-                                  qdict_get_str(qdict, "package"));
+    monitor_printf(mon, "%" PRId64 ".%" PRId64 ".%" PRId64 "%s\n",
+                  qdict_get_int(qemu, "major"),
+                  qdict_get_int(qemu, "minor"),
+                  qdict_get_int(qemu, "micro"),
+                  qdict_get_str(qdict, "package"));
 }
 
 static void do_info_version(Monitor *mon, QObject **ret_data)
 {
-    *ret_data = qobject_from_jsonf("{ 'qemu': %s, 'package': %s }",
-                                   QEMU_VERSION, QEMU_PKGVERSION);
+    const char *version = QEMU_VERSION;
+    int major = 0, minor = 0, micro = 0;
+    char *tmp;
+
+    major = strtol(version, &tmp, 10);
+    tmp++;
+    minor = strtol(tmp, &tmp, 10);
+    tmp++;
+    micro = strtol(tmp, &tmp, 10);
+
+    *ret_data = qobject_from_jsonf("{ 'qemu': { 'major': %d, 'minor': %d, \
+        'micro': %d }, 'package': %s }", major, minor, micro, QEMU_PKGVERSION);
 }
 
 static void do_info_name_print(Monitor *mon, const QObject *data)
-- 
cgit v1.2.3


From 0d88bb1babef77c970ec88ec73263631c7cf7d25 Mon Sep 17 00:00:00 2001
From: Luiz Capitulino <lcapitulino@redhat.com>
Date: Fri, 20 Aug 2010 16:42:31 -0300
Subject: QMP doc: Add 'Stability Considerations' section

Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
(cherry picked from commit 05705ce2f8dfd8dcb5622b66ece21c9737a8502c)
---
 qemu-monitor.hx | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/qemu-monitor.hx b/qemu-monitor.hx
index 9c27b3113..5c1da3398 100644
--- a/qemu-monitor.hx
+++ b/qemu-monitor.hx
@@ -35,7 +35,29 @@ information on the Server command and response formats.
 
 NOTE: This document is temporary and will be replaced soon.
 
-1. Regular Commands
+1. Stability Considerations
+===========================
+
+The current QMP command set (described in this file) may be useful for a
+number of use cases, however it's limited and several commands have bad
+defined semantics, specially with regard to command completion.
+
+These problems are going to be solved incrementally in the next QEMU releases
+and we're going to establish a deprecation policy for badly defined commands.
+
+If you're planning to adopt QMP, please observe the following:
+
+    1. The deprecation policy will take efect and be documented soon, please
+       check the documentation of each used command as soon as a new release of
+       QEMU is available
+
+    2. DO NOT rely on anything which is not explicit documented
+
+    3. Errors, in special, are not documented. Applications should NOT check
+       for specific errors classes or data (it's strongly recommended to only
+       check for the "error" key)
+
+2. Regular Commands
 ===================
 
 Server's responses in the examples below are always a success response, please
@@ -1592,7 +1614,7 @@ HXCOMM This is required for the QMP documentation layout.
 
 SQMP
 
-2. Query Commands
+3. Query Commands
 =================
 
 EQMP
-- 
cgit v1.2.3


From 30811f0d943f78bb5af00d0f32d5d9a1fd18a03e Mon Sep 17 00:00:00 2001
From: Luiz Capitulino <lcapitulino@redhat.com>
Date: Fri, 20 Aug 2010 16:42:32 -0300
Subject: QMP: Update README file

A number of changes I prefer to do in one shot:

- Fix example
- Small clarifications
- Add multiple monitors example
- Add 'Development Process' section

Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
(cherry picked from commit d29f3196afddd356a8169780fa506f565adb5036)
---
 QMP/README | 71 ++++++++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 48 insertions(+), 23 deletions(-)

diff --git a/QMP/README b/QMP/README
index 35a80c73e..948d4453d 100644
--- a/QMP/README
+++ b/QMP/README
@@ -7,60 +7,85 @@ Introduction
 The QEMU Monitor Protocol (QMP) allows applications to communicate with
 QEMU's Monitor.
 
-QMP is JSON[1] based and has the following features:
+QMP is JSON[1] based and currently has the following features:
 
 - Lightweight, text-based, easy to parse data format
-- Asynchronous events support 
-- Stability
+- Asynchronous messages support (ie. events)
+- Capabilities Negotiation
 
-For more information, please, refer to the following files:
+For detailed information on QMP's usage, please, refer to the following files:
 
 o qmp-spec.txt      QEMU Monitor Protocol current specification
-o qmp-commands.txt  QMP supported commands
+o qmp-commands.txt  QMP supported commands (auto-generated at build-time)
 o qmp-events.txt    List of available asynchronous events
 
 There are also two simple Python scripts available:
 
-o qmp-shell       A shell
-o vm-info         Show some information about the Virtual Machine
+o qmp-shell  A shell
+o vm-info    Show some information about the Virtual Machine
+
+IMPORTANT: It's strongly recommended to read the 'Stability Considerations'
+section in the qmp-commands.txt file before making any serious use of QMP.
+
 
 [1] http://www.json.org
 
 Usage
 -----
 
-To enable QMP, QEMU has to be started in "control mode". There are
-two ways of doing this, the simplest one is using the the '-qmp'
-command-line option.
+To enable QMP, you need a QEMU monitor instance in "control mode". There are
+two ways of doing this.
+
+The simplest one is using the '-qmp' command-line option. The following
+example makes QMP available on localhost port 4444:
 
-For example:
+  $ qemu [...] -qmp tcp:localhost:4444,server
 
-$ qemu [...] -qmp tcp:localhost:4444,server
+However, in order to have more complex combinations, like multiple monitors,
+the '-mon' command-line option should be used along with the '-chardev' one.
+For instance, the following example creates one user monitor on stdio and one
+QMP monitor on localhost port 4444.
 
-Will start QEMU in control mode, waiting for a client TCP connection
-on localhost port 4444.
+   $ qemu [...] -chardev stdio,id=mon0 -mon chardev=mon0,mode=readline \
+                -chardev socket,id=mon1,host=localhost,port=4444,server \
+                -mon chardev=mon1,mode=control
 
-It is also possible to use the '-mon' command-line option to have
-more complex combinations. Please, refer to the QEMU's manpage for
-more information.
+Please, refer to QEMU's manpage for more information.
 
 Simple Testing
 --------------
 
-To manually test QMP one can connect with telnet and issue commands:
+To manually test QMP one can connect with telnet and issue commands by hand:
 
 $ telnet localhost 4444
 Trying 127.0.0.1...
 Connected to localhost.
 Escape character is '^]'.
-{"QMP": {"version": {"qemu": "0.12.50", "package": ""}, "capabilities": []}}
+{"QMP": {"version": {"qemu": {"micro": 50, "minor": 13, "major": 0}, "package": ""}, "capabilities": []}}
 { "execute": "qmp_capabilities" }
 {"return": {}}
 { "execute": "query-version" }
-{"return": {"qemu": "0.12.50", "package": ""}}
+{"return": {"qemu": {"micro": 50, "minor": 13, "major": 0}, "package": ""}}
+
+Development Process
+-------------------
+
+When changing QMP's interface (by adding new commands, events or modifying
+existing ones) it's mandatory to update the relevant documentation, which is
+one (or more) of the files listed in the 'Introduction' section*.
+
+Also, it's strongly recommended to send the documentation patch first, before
+doing any code change. This is so because:
+
+  1. Avoids the code dictating the interface
+
+  2. Review can improve your interface.  Letting that happen before
+     you implement it can save you work.
+
+* The qmp-commands.txt file is generated from the qemu-monitor.hx one, which
+  is the file that should be edited.
 
-Contact
--------
+Homepage
+--------
 
 http://www.linux-kvm.org/page/MonitorProtocol
-Luiz Fernando N. Capitulino <lcapitulino@redhat.com>
-- 
cgit v1.2.3


From 7bd11bc311ecb25ad7f8426b30cad52fb90dd020 Mon Sep 17 00:00:00 2001
From: Eduardo Habkost <ehabkost@redhat.com>
Date: Tue, 14 Sep 2010 13:43:39 -0300
Subject: disable guest-provided stats on "info balloon" command

The addition of memory stats reporting to the virtio balloon causes
the 'info balloon' command to become asynchronous.  This is a regression
because in some cases it can hang the user monitor.

This is an alternative to Adam Litke's patch. Adam's patch disabled the
corresponding (guest-visible) virtio feature bit, causing issues for migration.
Original discussion is available at:
http://marc.info/?l=qemu-devel&m=128448124328314&w=2

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Acked-by: Adam Litke <agl@us.ibm.com
Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
(cherry picked from commit 07b0403dfc2b2ac179ae5b48105096cc2d03375a)
---
 hw/virtio-balloon.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/hw/virtio-balloon.c b/hw/virtio-balloon.c
index 9fe3886b0..15779d5dd 100644
--- a/hw/virtio-balloon.c
+++ b/hw/virtio-balloon.c
@@ -29,6 +29,10 @@
 #include <sys/mman.h>
 #endif
 
+/* Disable guest-provided stats by now (https://bugzilla.redhat.com/show_bug.cgi?id=623903) */
+#define ENABLE_GUEST_STATS   0
+
+
 typedef struct VirtIOBalloon
 {
     VirtIODevice vdev;
@@ -83,12 +87,14 @@ static QObject *get_stats_qobject(VirtIOBalloon *dev)
                                   VIRTIO_BALLOON_PFN_SHIFT);
 
     stat_put(dict, "actual", actual);
+#if ENABLE_GUEST_STATS
     stat_put(dict, "mem_swapped_in", dev->stats[VIRTIO_BALLOON_S_SWAP_IN]);
     stat_put(dict, "mem_swapped_out", dev->stats[VIRTIO_BALLOON_S_SWAP_OUT]);
     stat_put(dict, "major_page_faults", dev->stats[VIRTIO_BALLOON_S_MAJFLT]);
     stat_put(dict, "minor_page_faults", dev->stats[VIRTIO_BALLOON_S_MINFLT]);
     stat_put(dict, "free_mem", dev->stats[VIRTIO_BALLOON_S_MEMFREE]);
     stat_put(dict, "total_mem", dev->stats[VIRTIO_BALLOON_S_MEMTOT]);
+#endif
 
     return QOBJECT(dict);
 }
@@ -214,7 +220,7 @@ static void virtio_balloon_to_target(void *opaque, ram_addr_t target,
         }
         dev->stats_callback = cb;
         dev->stats_opaque_callback_data = cb_data; 
-        if (dev->vdev.guest_features & (1 << VIRTIO_BALLOON_F_STATS_VQ)) {
+        if (ENABLE_GUEST_STATS && (dev->vdev.guest_features & (1 << VIRTIO_BALLOON_F_STATS_VQ))) {
             virtqueue_push(dev->svq, &dev->stats_vq_elem, dev->stats_vq_offset);
             virtio_notify(&dev->vdev, dev->svq);
         } else {
-- 
cgit v1.2.3


From a0a90b92c94607c04a915d2a47c19e2cc5e85f85 Mon Sep 17 00:00:00 2001
From: Luiz Capitulino <lcapitulino@redhat.com>
Date: Fri, 10 Sep 2010 14:05:23 -0300
Subject: QMP/README: Update QMP homepage address

Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
(cherry picked from commit a18b2ce2ed9dc747c2ebab5a220517624e4ff3df)
---
 QMP/README | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/QMP/README b/QMP/README
index 948d4453d..d1430b0ab 100644
--- a/QMP/README
+++ b/QMP/README
@@ -88,4 +88,4 @@ doing any code change. This is so because:
 Homepage
 --------
 
-http://www.linux-kvm.org/page/MonitorProtocol
+http://wiki.qemu.org/QMP
-- 
cgit v1.2.3


From 9f20b55b9a909de3499d5993f8da6c9bc469653f Mon Sep 17 00:00:00 2001
From: Artyom Tarasenko <atar4qemu@googlemail.com>
Date: Mon, 2 Aug 2010 19:58:21 +0200
Subject: fix last cpu timer initialization

The timer #0 is the system timer, so the timer #num_cpu is the
timer of the last CPU, and it must be initialized in slavio_timer_reset.

Don't mark non-existing timers as running.

Signed-off-by: Artyom Tarasenko <atar4qemu@gmail.com>
Signed-off-by: Blue Swirl <blauwirbel@gmail.com>
(cherry picked from commit 5933e8a96ab9c59cb6b6c80c9db385364a68c959)
---
 hw/slavio_timer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/slavio_timer.c b/hw/slavio_timer.c
index d7875536b..c125de4b6 100644
--- a/hw/slavio_timer.c
+++ b/hw/slavio_timer.c
@@ -377,12 +377,12 @@ static void slavio_timer_reset(DeviceState *d)
         curr_timer->limit = 0;
         curr_timer->count = 0;
         curr_timer->reached = 0;
-        if (i < s->num_cpus) {
+        if (i <= s->num_cpus) {
             ptimer_set_limit(curr_timer->timer,
                              LIMIT_TO_PERIODS(TIMER_MAX_COUNT32), 1);
             ptimer_run(curr_timer->timer, 0);
+            curr_timer->running = 1;
         }
-        curr_timer->running = 1;
     }
     s->cputimer_mode = 0;
 }
-- 
cgit v1.2.3


From ddfe317152193e70a996d9447d5bf2507d52c662 Mon Sep 17 00:00:00 2001
From: Artyom Tarasenko <atar4qemu@googlemail.com>
Date: Sun, 15 Aug 2010 16:04:41 +0200
Subject: sparc escc IUS improvements (SunOS 4.1.4 fix)

According to scc_escc_um.pdf:
 - Reset Highest IUS must update irq status to allow processing
   of the next priority interrupt.
 - rx interrupt has always higher priority than tx on same channel

The documentation only explicitly says that Reset Highest IUS
command (0x38) clears IUS bits, not that it clears the corresponding
interrupt too, so don't clear interrupts on this command.

The patch allows SunOS 4.1.4 to use the serial ports

Signed-off-by: Artyom Tarasenko <atar4qemu@gmail.com>
Signed-off-by: Blue Swirl <blauwirbel@gmail.com>
(cherry picked from commit 9fc391f8b54ea0be47baece3983e0f2224958f25)
---
 hw/escc.c | 56 ++++++++++++++++++++++++++++++--------------------------
 1 file changed, 30 insertions(+), 26 deletions(-)

diff --git a/hw/escc.c b/hw/escc.c
index 6d2fd36b1..871423978 100644
--- a/hw/escc.c
+++ b/hw/escc.c
@@ -65,6 +65,8 @@
  *  2006-Aug-10  Igor Kovalenko :   Renamed KBDQueue to SERIOQueue, implemented
  *                                  serial mouse queue.
  *                                  Implemented serial mouse protocol.
+ *
+ *  2010-May-23  Artyom Tarasenko:  Reworked IUS logic
  */
 
 #ifdef DEBUG_SERIAL
@@ -279,7 +281,7 @@ static uint32_t get_queue(void *opaque)
 
 static int escc_update_irq_chn(ChannelState *s)
 {
-    if ((((s->wregs[W_INTR] & INTR_TXINT) && s->txint == 1) ||
+    if ((((s->wregs[W_INTR] & INTR_TXINT) && (s->txint == 1)) ||
          // tx ints enabled, pending
          ((((s->wregs[W_INTR] & INTR_RXMODEMSK) == INTR_RXINT1ST) ||
            ((s->wregs[W_INTR] & INTR_RXMODEMSK) == INTR_RXINTALL)) &&
@@ -342,24 +344,22 @@ static void escc_reset(DeviceState *d)
 static inline void set_rxint(ChannelState *s)
 {
     s->rxint = 1;
-    if (!s->txint_under_svc) {
-        s->rxint_under_svc = 1;
-        if (s->chn == chn_a) {
-            if (s->wregs[W_MINTR] & MINTR_STATUSHI)
-                s->otherchn->rregs[R_IVEC] = IVEC_HIRXINTA;
-            else
-                s->otherchn->rregs[R_IVEC] = IVEC_LORXINTA;
-        } else {
-            if (s->wregs[W_MINTR] & MINTR_STATUSHI)
-                s->rregs[R_IVEC] = IVEC_HIRXINTB;
-            else
-                s->rregs[R_IVEC] = IVEC_LORXINTB;
-        }
-    }
-    if (s->chn == chn_a)
+    /* XXX: missing daisy chainnig: chn_b rx should have a lower priority
+       than chn_a rx/tx/special_condition service*/
+    s->rxint_under_svc = 1;
+    if (s->chn == chn_a) {
         s->rregs[R_INTR] |= INTR_RXINTA;
-    else
+        if (s->wregs[W_MINTR] & MINTR_STATUSHI)
+            s->otherchn->rregs[R_IVEC] = IVEC_HIRXINTA;
+        else
+            s->otherchn->rregs[R_IVEC] = IVEC_LORXINTA;
+    } else {
         s->otherchn->rregs[R_INTR] |= INTR_RXINTB;
+        if (s->wregs[W_MINTR] & MINTR_STATUSHI)
+            s->rregs[R_IVEC] = IVEC_HIRXINTB;
+        else
+            s->rregs[R_IVEC] = IVEC_LORXINTB;
+    }
     escc_update_irq(s);
 }
 
@@ -369,19 +369,17 @@ static inline void set_txint(ChannelState *s)
     if (!s->rxint_under_svc) {
         s->txint_under_svc = 1;
         if (s->chn == chn_a) {
+            s->rregs[R_INTR] |= INTR_TXINTA;
             if (s->wregs[W_MINTR] & MINTR_STATUSHI)
                 s->otherchn->rregs[R_IVEC] = IVEC_HITXINTA;
             else
                 s->otherchn->rregs[R_IVEC] = IVEC_LOTXINTA;
         } else {
             s->rregs[R_IVEC] = IVEC_TXINTB;
+            s->otherchn->rregs[R_INTR] |= INTR_TXINTB;
         }
-    }
-    if (s->chn == chn_a)
-        s->rregs[R_INTR] |= INTR_TXINTA;
-    else
-        s->otherchn->rregs[R_INTR] |= INTR_TXINTB;
     escc_update_irq(s);
+    }
 }
 
 static inline void clr_rxint(ChannelState *s)
@@ -417,6 +415,7 @@ static inline void clr_txint(ChannelState *s)
             s->otherchn->rregs[R_IVEC] = IVEC_LONOINT;
         s->rregs[R_INTR] &= ~INTR_TXINTA;
     } else {
+        s->otherchn->rregs[R_INTR] &= ~INTR_TXINTB;
         if (s->wregs[W_MINTR] & MINTR_STATUSHI)
             s->rregs[R_IVEC] = IVEC_HINOINT;
         else
@@ -515,10 +514,15 @@ static void escc_mem_writeb(void *opaque, target_phys_addr_t addr, uint32_t val)
                 clr_txint(s);
                 break;
             case CMD_CLR_IUS:
-                if (s->rxint_under_svc)
-                    clr_rxint(s);
-                else if (s->txint_under_svc)
-                    clr_txint(s);
+                if (s->rxint_under_svc) {
+                    s->rxint_under_svc = 0;
+                    if (s->txint) {
+                        set_txint(s);
+                    }
+                } else if (s->txint_under_svc) {
+                    s->txint_under_svc = 0;
+                }
+                escc_update_irq(s);
                 break;
             default:
                 break;
-- 
cgit v1.2.3


From cb03355a26a09bae91a8e5dea51538c9cccc838b Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 7 Jul 2010 19:44:22 +0300
Subject: QEMUFileBuffered: indicate that we're ready when the underlying file
 is ready

QEMUFileBuffered stops writing when the underlying QEMUFile is not ready,
and tells its producer so.  However, when the underlying QEMUFile becomes
ready, it neglects to pass that information along, resulting in stoppage
of all data until the next tick (a tenths of a second).

Usually this doesn't matter, because most QEMUFiles used with QEMUFileBuffered
are almost always ready, but in the case of exec: migration this is not true,
due to the small pipe buffers used to connect to the target process.  The
result is very slow migration.

Fix by detecting the readiness notification and propagating it.  The detection
is a little ugly since QEMUFile overloads put_buffer() to send it, but that's
the suject for a different patch.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
(cherry picked from commit 5e77aaa0d7d2f4ceaa4fcaf50f3a26d5150f34a6)
---
 buffered_file.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/buffered_file.c b/buffered_file.c
index 54dc6c29b..a79264f45 100644
--- a/buffered_file.c
+++ b/buffered_file.c
@@ -156,6 +156,14 @@ static int buffered_put_buffer(void *opaque, const uint8_t *buf, int64_t pos, in
         offset = size;
     }
 
+    if (pos == 0 && size == 0) {
+        DPRINTF("file is ready\n");
+        if (s->bytes_xfer <= s->xfer_limit) {
+            DPRINTF("notifying client\n");
+            s->put_ready(s->opaque);
+        }
+    }
+
     return offset;
 }
 
-- 
cgit v1.2.3


From 14a0b95684d5e3588cab88c4900433746ab56c58 Mon Sep 17 00:00:00 2001
From: Anthony Liguori <aliguori@us.ibm.com>
Date: Wed, 8 Sep 2010 17:09:15 -0500
Subject: Revert "Make default invocation of block drivers safer (v3)"

This reverts commit 79368c81bf8cf93864d7afc88b81b05d8f0a2c90.

Conflicts:

	block.c

I haven't been able to come up with a solution yet for the corruption caused by
unaligned requests from the IDE disk so revert until a solution can be written.

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
(cherry picked from commit 8b33d9eeba91422ee2d73b6936ad57262d18cf5a)
---
 block.c     |   4 --
 block/raw.c | 130 ------------------------------------------------------------
 block_int.h |   1 -
 3 files changed, 135 deletions(-)

diff --git a/block.c b/block.c
index e6087f3cb..85c8d3e75 100644
--- a/block.c
+++ b/block.c
@@ -523,7 +523,6 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
               BlockDriver *drv)
 {
     int ret;
-    int probed = 0;
 
     if (flags & BDRV_O_SNAPSHOT) {
         BlockDriverState *bs1;
@@ -584,7 +583,6 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
     /* Find the right image format driver */
     if (!drv) {
         ret = find_image_format(filename, &drv);
-        probed = 1;
     }
 
     if (!drv) {
@@ -597,8 +595,6 @@ int bdrv_open(BlockDriverState *bs, const char *filename, int flags,
         goto unlink_and_fail;
     }
 
-    bs->probed = probed;
-
     /* If there is a backing file, use it */
     if ((flags & BDRV_O_NO_BACKING) == 0 && bs->backing_file[0] != '\0') {
         char backing_filename[PATH_MAX];
diff --git a/block/raw.c b/block/raw.c
index 61e674856..91087792f 100644
--- a/block/raw.c
+++ b/block/raw.c
@@ -9,82 +9,15 @@ static int raw_open(BlockDriverState *bs, int flags)
     return 0;
 }
 
-/* check for the user attempting to write something that looks like a
-   block format header to the beginning of the image and fail out.
-*/
-static int check_for_block_signature(BlockDriverState *bs, const uint8_t *buf)
-{
-    static const uint8_t signatures[][4] = {
-        { 'Q', 'F', 'I', 0xfb }, /* qcow/qcow2 */
-        { 'C', 'O', 'W', 'D' }, /* VMDK3 */
-        { 'V', 'M', 'D', 'K' }, /* VMDK4 */
-        { 'O', 'O', 'O', 'M' }, /* UML COW */
-        {}
-    };
-    int i;
-
-    for (i = 0; signatures[i][0] != 0; i++) {
-        if (memcmp(buf, signatures[i], 4) == 0) {
-            return 1;
-        }
-    }
-
-    return 0;
-}
-
-static int check_write_unsafe(BlockDriverState *bs, int64_t sector_num,
-                              const uint8_t *buf, int nb_sectors)
-{
-    /* assume that if the user specifies the format explicitly, then assume
-       that they will continue to do so and provide no safety net */
-    if (!bs->probed) {
-        return 0;
-    }
-
-    if (sector_num == 0 && nb_sectors > 0) {
-        return check_for_block_signature(bs, buf);
-    }
-
-    return 0;
-}
-
 static int raw_read(BlockDriverState *bs, int64_t sector_num,
                     uint8_t *buf, int nb_sectors)
 {
     return bdrv_read(bs->file, sector_num, buf, nb_sectors);
 }
 
-static int raw_write_scrubbed_bootsect(BlockDriverState *bs,
-                                       const uint8_t *buf)
-{
-    uint8_t bootsect[512];
-
-    /* scrub the dangerous signature */
-    memcpy(bootsect, buf, 512);
-    memset(bootsect, 0, 4);
-
-    return bdrv_write(bs->file, 0, bootsect, 1);
-}
-
 static int raw_write(BlockDriverState *bs, int64_t sector_num,
                      const uint8_t *buf, int nb_sectors)
 {
-    if (check_write_unsafe(bs, sector_num, buf, nb_sectors)) {
-        int ret;
-
-        ret = raw_write_scrubbed_bootsect(bs, buf);
-        if (ret < 0) {
-            return ret;
-        }
-
-        ret = bdrv_write(bs->file, 1, buf + 512, nb_sectors - 1);
-        if (ret < 0) {
-            return ret;
-        }
-
-        return ret + 512;
-    }
-
     return bdrv_write(bs->file, sector_num, buf, nb_sectors);
 }
 
@@ -95,73 +28,10 @@ static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs,
     return bdrv_aio_readv(bs->file, sector_num, qiov, nb_sectors, cb, opaque);
 }
 
-typedef struct RawScrubberBounce
-{
-    BlockDriverCompletionFunc *cb;
-    void *opaque;
-    QEMUIOVector qiov;
-} RawScrubberBounce;
-
-static void raw_aio_writev_scrubbed(void *opaque, int ret)
-{
-    RawScrubberBounce *b = opaque;
-
-    if (ret < 0) {
-        b->cb(b->opaque, ret);
-    } else {
-        b->cb(b->opaque, ret + 512);
-    }
-
-    qemu_iovec_destroy(&b->qiov);
-    qemu_free(b);
-}
-
 static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs,
     int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
     BlockDriverCompletionFunc *cb, void *opaque)
 {
-    const uint8_t *first_buf;
-    int first_buf_index = 0, i;
-
-    /* This is probably being paranoid, but handle cases of zero size
-       vectors. */
-    for (i = 0; i < qiov->niov; i++) {
-        if (qiov->iov[i].iov_len) {
-            assert(qiov->iov[i].iov_len >= 512);
-            first_buf_index = i;
-            break;
-        }
-    }
-
-    first_buf = qiov->iov[first_buf_index].iov_base;
-
-    if (check_write_unsafe(bs, sector_num, first_buf, nb_sectors)) {
-        RawScrubberBounce *b;
-        int ret;
-
-        /* write the first sector using sync I/O */
-        ret = raw_write_scrubbed_bootsect(bs, first_buf);
-        if (ret < 0) {
-            return NULL;
-        }
-
-        /* adjust request to be everything but first sector */
-
-        b = qemu_malloc(sizeof(*b));
-        b->cb = cb;
-        b->opaque = opaque;
-
-        qemu_iovec_init(&b->qiov, qiov->nalloc);
-        qemu_iovec_concat(&b->qiov, qiov, qiov->size);
-
-        b->qiov.size -= 512;
-        b->qiov.iov[first_buf_index].iov_base += 512;
-        b->qiov.iov[first_buf_index].iov_len -= 512;
-
-        return bdrv_aio_writev(bs->file, sector_num + 1, &b->qiov,
-                               nb_sectors - 1, raw_aio_writev_scrubbed, b);
-    }
-
     return bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, cb, opaque);
 }
 
diff --git a/block_int.h b/block_int.h
index b86345177..e8e7156c9 100644
--- a/block_int.h
+++ b/block_int.h
@@ -148,7 +148,6 @@ struct BlockDriverState {
     int encrypted; /* if true, the media is encrypted */
     int valid_key; /* if true, a valid encryption key has been set */
     int sg;        /* if true, the device is a /dev/sg* */
-    int probed;    /* if true, format was probed automatically */
     /* event callback when inserting/removing */
     void (*change_cb)(void *opaque);
     void *change_opaque;
-- 
cgit v1.2.3


From 8b84b68e7d6125dbfc092b806210d5940468d530 Mon Sep 17 00:00:00 2001
From: Eduardo Habkost <ehabkost@redhat.com>
Date: Tue, 14 Sep 2010 13:43:39 -0300
Subject: disable guest-provided stats on "info balloon" command

The addition of memory stats reporting to the virtio balloon causes
the 'info balloon' command to become asynchronous.  This is a regression
because in some cases it can hang the user monitor.

This is an alternative to Adam Litke's patch. Adam's patch disabled the
corresponding (guest-visible) virtio feature bit, causing issues for migration.
Original discussion is available at:
http://marc.info/?l=qemu-devel&m=128448124328314&w=2

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Acked-by: Adam Litke <agl@us.ibm.com
Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
(cherry picked from commit 07b0403dfc2b2ac179ae5b48105096cc2d03375a)
---
 hw/virtio-balloon.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/hw/virtio-balloon.c b/hw/virtio-balloon.c
index 9fe3886b0..15779d5dd 100644
--- a/hw/virtio-balloon.c
+++ b/hw/virtio-balloon.c
@@ -29,6 +29,10 @@
 #include <sys/mman.h>
 #endif
 
+/* Disable guest-provided stats by now (https://bugzilla.redhat.com/show_bug.cgi?id=623903) */
+#define ENABLE_GUEST_STATS   0
+
+
 typedef struct VirtIOBalloon
 {
     VirtIODevice vdev;
@@ -83,12 +87,14 @@ static QObject *get_stats_qobject(VirtIOBalloon *dev)
                                   VIRTIO_BALLOON_PFN_SHIFT);
 
     stat_put(dict, "actual", actual);
+#if ENABLE_GUEST_STATS
     stat_put(dict, "mem_swapped_in", dev->stats[VIRTIO_BALLOON_S_SWAP_IN]);
     stat_put(dict, "mem_swapped_out", dev->stats[VIRTIO_BALLOON_S_SWAP_OUT]);
     stat_put(dict, "major_page_faults", dev->stats[VIRTIO_BALLOON_S_MAJFLT]);
     stat_put(dict, "minor_page_faults", dev->stats[VIRTIO_BALLOON_S_MINFLT]);
     stat_put(dict, "free_mem", dev->stats[VIRTIO_BALLOON_S_MEMFREE]);
     stat_put(dict, "total_mem", dev->stats[VIRTIO_BALLOON_S_MEMTOT]);
+#endif
 
     return QOBJECT(dict);
 }
@@ -214,7 +220,7 @@ static void virtio_balloon_to_target(void *opaque, ram_addr_t target,
         }
         dev->stats_callback = cb;
         dev->stats_opaque_callback_data = cb_data; 
-        if (dev->vdev.guest_features & (1 << VIRTIO_BALLOON_F_STATS_VQ)) {
+        if (ENABLE_GUEST_STATS && (dev->vdev.guest_features & (1 << VIRTIO_BALLOON_F_STATS_VQ))) {
             virtqueue_push(dev->svq, &dev->stats_vq_elem, dev->stats_vq_offset);
             virtio_notify(&dev->vdev, dev->svq);
         } else {
-- 
cgit v1.2.3


From c2ccc98ceb07c009d64b7ee801f8966194510021 Mon Sep 17 00:00:00 2001
From: Yoshiaki Tamura <tamura.yoshiaki@lab.ntt.co.jp>
Date: Fri, 20 Aug 2010 19:10:41 +0900
Subject: vnc: check fd before calling qemu_set_fd_handler2() in
 vnc_client_write()

Setting fd = -1 to qemu_set_fd_handler2() causes bus error at FD_SET
in main_loop_wait().

Signed-off-by: Yoshiaki Tamura <tamura.yoshiaki@lab.ntt.co.jp>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
(cherry picked from commit ac71103dc6b408775ae72067790ab367912f75ec)
---
 ui/vnc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ui/vnc.c b/ui/vnc.c
index 7fc40acae..c7a18311f 100644
--- a/ui/vnc.c
+++ b/ui/vnc.c
@@ -1184,7 +1184,7 @@ void vnc_client_write(void *opaque)
     vnc_lock_output(vs);
     if (vs->output.offset) {
         vnc_client_write_locked(opaque);
-    } else {
+    } else if (vs->csock != -1) {
         qemu_set_fd_handler2(vs->csock, NULL, vnc_client_read, NULL, vs);
     }
     vnc_unlock_output(vs);
-- 
cgit v1.2.3