From 356d83725675c0140db27b24afed3a2c0c7d9702 Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Thu, 13 Sep 2012 10:50:31 +0200
Subject: add pc-1.3 machine type

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 hw/pc_piix.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/hw/pc_piix.c b/hw/pc_piix.c
index 88ff0411c..5a0796bed 100644
--- a/hw/pc_piix.c
+++ b/hw/pc_piix.c
@@ -349,8 +349,8 @@ static void pc_xen_hvm_init(ram_addr_t ram_size,
 }
 #endif
 
-static QEMUMachine pc_machine_v1_2 = {
-    .name = "pc-1.2",
+static QEMUMachine pc_machine_v1_3 = {
+    .name = "pc-1.3",
     .alias = "pc",
     .desc = "Standard PC",
     .init = pc_init_pci,
@@ -358,6 +358,13 @@ static QEMUMachine pc_machine_v1_2 = {
     .is_default = 1,
 };
 
+static QEMUMachine pc_machine_v1_2 = {
+    .name = "pc-1.2",
+    .desc = "Standard PC",
+    .init = pc_init_pci,
+    .max_cpus = 255,
+};
+
 #define PC_COMPAT_1_1 \
         {\
             .driver   = "virtio-scsi-pci",\
@@ -655,6 +662,7 @@ static QEMUMachine xenfv_machine = {
 
 static void pc_machine_init(void)
 {
+    qemu_register_machine(&pc_machine_v1_3);
     qemu_register_machine(&pc_machine_v1_2);
     qemu_register_machine(&pc_machine_v1_1);
     qemu_register_machine(&pc_machine_v1_0);
-- 
cgit v1.2.3


From a2879190ab08b2b75d65b576fad7ff95d7d7d641 Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Thu, 13 Sep 2012 10:53:23 +0200
Subject: compat: turn off msi/msix on xhci for old machine types

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 hw/pc_piix.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/hw/pc_piix.c b/hw/pc_piix.c
index 5a0796bed..afd8361df 100644
--- a/hw/pc_piix.c
+++ b/hw/pc_piix.c
@@ -358,14 +358,30 @@ static QEMUMachine pc_machine_v1_3 = {
     .is_default = 1,
 };
 
+#define PC_COMPAT_1_2 \
+        {\
+            .driver   = "nec-usb-xhci",\
+            .property = "msi",\
+            .value    = "off",\
+        },{\
+            .driver   = "nec-usb-xhci",\
+            .property = "msix",\
+            .value    = "off",\
+        }
+
 static QEMUMachine pc_machine_v1_2 = {
     .name = "pc-1.2",
     .desc = "Standard PC",
     .init = pc_init_pci,
     .max_cpus = 255,
+    .compat_props = (GlobalProperty[]) {
+        PC_COMPAT_1_2,
+        { /* end of list */ }
+    },
 };
 
 #define PC_COMPAT_1_1 \
+        PC_COMPAT_1_2,\
         {\
             .driver   = "virtio-scsi-pci",\
             .property = "hotplug",\
-- 
cgit v1.2.3


From d95e74eaed8b74b0c75ab343e9cb826b1f5c9007 Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Thu, 20 Sep 2012 09:55:49 +0200
Subject: xhci: tweak limits

Set maxports to 15.  This is what the usb3 route string can handle.

Set maxslots to 64.  This is more than the number of root ports we
can have, but with additional hubs you can end up with more devices.

Set maxintrs (aka msi vectors) to 16.  Should be enough, especially
considering that vectors are a limited resource.  Linux guests use
only three at the moment.

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 hw/usb/hcd-xhci.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/hw/usb/hcd-xhci.c b/hw/usb/hcd-xhci.c
index e0ca69044..14148262a 100644
--- a/hw/usb/hcd-xhci.c
+++ b/hw/usb/hcd-xhci.c
@@ -37,12 +37,12 @@
 #define FIXME() do { fprintf(stderr, "FIXME %s:%d\n", \
                              __func__, __LINE__); abort(); } while (0)
 
-#define MAXPORTS_2 8
-#define MAXPORTS_3 8
+#define MAXPORTS_2 15
+#define MAXPORTS_3 15
 
 #define MAXPORTS (MAXPORTS_2+MAXPORTS_3)
-#define MAXSLOTS MAXPORTS
-#define MAXINTRS MAXPORTS
+#define MAXSLOTS 64
+#define MAXINTRS 16
 
 #define TD_QUEUE 24
 
-- 
cgit v1.2.3


From ccaf87a085c748910efddbcfb5077f6a67cc354a Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Tue, 18 Sep 2012 17:30:52 +0200
Subject: xhci: route string & usb hub support

Parse route string in slot contexts and
support devices connected via hub.
---
 hw/usb/hcd-xhci.c | 86 +++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 55 insertions(+), 31 deletions(-)

diff --git a/hw/usb/hcd-xhci.c b/hw/usb/hcd-xhci.c
index 14148262a..8c0155bb1 100644
--- a/hw/usb/hcd-xhci.c
+++ b/hw/usb/hcd-xhci.c
@@ -363,7 +363,7 @@ typedef struct XHCIEPContext {
 typedef struct XHCISlot {
     bool enabled;
     dma_addr_t ctx;
-    unsigned int port;
+    USBPort *uport;
     unsigned int devaddr;
     XHCIEPContext * eps[31];
 } XHCISlot;
@@ -1230,7 +1230,7 @@ static TRBCCode xhci_reset_ep(XHCIState *xhci, unsigned int slotid,
         ep |= 0x80;
     }
 
-    dev = xhci->ports[xhci->slots[slotid-1].port-1].uport->dev;
+    dev = xhci->slots[slotid-1].uport ? xhci->slots[slotid-1].uport->dev : NULL;
     if (!dev) {
         return CC_USB_TRANSACTION_ERROR;
     }
@@ -1412,18 +1412,9 @@ static void xhci_stall_ep(XHCITransfer *xfer)
 static int xhci_submit(XHCIState *xhci, XHCITransfer *xfer,
                        XHCIEPContext *epctx);
 
-static USBDevice *xhci_find_device(XHCIPort *port, uint8_t addr)
-{
-    if (!(port->portsc & PORTSC_PED)) {
-        return NULL;
-    }
-    return usb_find_device(port->uport, addr);
-}
-
 static int xhci_setup_packet(XHCITransfer *xfer)
 {
     XHCIState *xhci = xfer->xhci;
-    XHCIPort *port;
     USBDevice *dev;
     USBEndpoint *ep;
     int dir;
@@ -1434,13 +1425,12 @@ static int xhci_setup_packet(XHCITransfer *xfer)
         ep = xfer->packet.ep;
         dev = ep->dev;
     } else {
-        port = &xhci->ports[xhci->slots[xfer->slotid-1].port-1];
-        dev = xhci_find_device(port, xhci->slots[xfer->slotid-1].devaddr);
-        if (!dev) {
-            fprintf(stderr, "xhci: slot %d port %d has no device\n",
-                    xfer->slotid, xhci->slots[xfer->slotid-1].port);
+        if (!xhci->slots[xfer->slotid-1].uport) {
+            fprintf(stderr, "xhci: slot %d has no device\n",
+                    xfer->slotid);
             return -1;
         }
+        dev = xhci->slots[xfer->slotid-1].uport->dev;
         ep = usb_ep_get(dev, dir, xfer->epid >> 1);
     }
 
@@ -1772,7 +1762,7 @@ static TRBCCode xhci_enable_slot(XHCIState *xhci, unsigned int slotid)
     trace_usb_xhci_slot_enable(slotid);
     assert(slotid >= 1 && slotid <= MAXSLOTS);
     xhci->slots[slotid-1].enabled = 1;
-    xhci->slots[slotid-1].port = 0;
+    xhci->slots[slotid-1].uport = NULL;
     memset(xhci->slots[slotid-1].eps, 0, sizeof(XHCIEPContext*)*31);
 
     return CC_SUCCESS;
@@ -1795,17 +1785,59 @@ static TRBCCode xhci_disable_slot(XHCIState *xhci, unsigned int slotid)
     return CC_SUCCESS;
 }
 
+/*
+ * Find the USBPort that a slot context refers to.
+ *
+ * Bits 16-23 of slot_ctx[1] hold the root port number, which selects an
+ * entry of xhci->ports[]; the index of that entry's USBPort plus one is the
+ * first path component.  The low 20 bits of slot_ctx[0] hold the route
+ * string: up to five 4-bit hub port numbers, ended by the first zero
+ * nibble, each appended as ".<n>".  The resulting path is compared with
+ * the path of every port on the bus' "used" list.
+ *
+ * Returns the matching port, or NULL if the root port number is out of
+ * range or no used port has that path.
+ */
+static USBPort *xhci_lookup_uport(XHCIState *xhci, uint32_t *slot_ctx)
+{
+    USBPort *uport;
+    char path[32];
+    int i, pos, port;
+
+    port = (slot_ctx[1]>>16) & 0xFF;
+    /* The port number is used as an array index below, so range-check it */
+    if (port < 1 || port > xhci->numports) {
+        return NULL;
+    }
+    port = xhci->ports[port-1].uport->index+1;
+    pos = snprintf(path, sizeof(path), "%d", port);
+    for (i = 0; i < 5; i++) {
+        port = (slot_ctx[0] >> 4*i) & 0x0f;
+        if (!port) {
+            break;
+        }
+        pos += snprintf(path + pos, sizeof(path) - pos, ".%d", port);
+    }
+
+    QTAILQ_FOREACH(uport, &xhci->bus.used, next) {
+        if (strcmp(uport->path, path) == 0) {
+            return uport;
+        }
+    }
+    return NULL;
+}
+
 static TRBCCode xhci_address_slot(XHCIState *xhci, unsigned int slotid,
                                   uint64_t pictx, bool bsr)
 {
     XHCISlot *slot;
+    USBPort *uport;
     USBDevice *dev;
     dma_addr_t ictx, octx, dcbaap;
     uint64_t poctx;
     uint32_t ictl_ctx[2];
     uint32_t slot_ctx[4];
     uint32_t ep0_ctx[5];
-    unsigned int port;
     int i;
     TRBCCode res;
 
@@ -1837,27 +1852,28 @@ static TRBCCode xhci_address_slot(XHCIState *xhci, unsigned int slotid,
     DPRINTF("xhci: input ep0 context: %08x %08x %08x %08x %08x\n",
             ep0_ctx[0], ep0_ctx[1], ep0_ctx[2], ep0_ctx[3], ep0_ctx[4]);
 
-    port = (slot_ctx[1]>>16) & 0xFF;
-    dev = xhci->ports[port-1].uport->dev;
-
-    if (port < 1 || port > xhci->numports) {
-        fprintf(stderr, "xhci: bad port %d\n", port);
+    uport = xhci_lookup_uport(xhci, slot_ctx);
+    if (uport == NULL) {
+        fprintf(stderr, "xhci: port not found\n");
         return CC_TRB_ERROR;
-    } else if (!dev) {
-        fprintf(stderr, "xhci: port %d not connected\n", port);
+    }
+
+    dev = uport->dev;
+    if (!dev) {
+        fprintf(stderr, "xhci: port %s not connected\n", uport->path);
         return CC_USB_TRANSACTION_ERROR;
     }
 
     for (i = 0; i < MAXSLOTS; i++) {
-        if (xhci->slots[i].port == port) {
-            fprintf(stderr, "xhci: port %d already assigned to slot %d\n",
-                    port, i+1);
+        if (xhci->slots[i].uport == uport) {
+            fprintf(stderr, "xhci: port %s already assigned to slot %d\n",
+                    uport->path, i+1);
             return CC_TRB_ERROR;
         }
     }
 
     slot = &xhci->slots[slotid-1];
-    slot->port = port;
+    slot->uport = uport;
     slot->ctx = octx;
 
     if (bsr) {
@@ -2821,9 +2837,17 @@ static void xhci_complete(USBPort *port, USBPacket *packet)
     xhci_kick_ep(xfer->xhci, xfer->slotid, xfer->epid);
 }
 
-static void xhci_child_detach(USBPort *port, USBDevice *child)
+static void xhci_child_detach(USBPort *uport, USBDevice *child)
 {
-    FIXME();
+    USBBus *bus = usb_bus_from_device(child);
+    XHCIState *xhci = container_of(bus, XHCIState, bus);
+    int i;
+
+    for (i = 0; i < MAXSLOTS; i++) {
+        if (xhci->slots[i].uport == uport) {
+            xhci->slots[i].uport = NULL;
+        }
+    }
 }
 
 static USBPortOps xhci_port_ops = {
-- 
cgit v1.2.3


From 1d8a4e69eeda7e474d1a6b50951b0b1680f8186e Mon Sep 17 00:00:00 2001
From: Gerd Hoffmann <kraxel@redhat.com>
Date: Thu, 20 Sep 2012 13:36:04 +0200
Subject: xhci: create a memory region for each port

Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 hw/usb/hcd-xhci.c | 85 ++++++++++++++++++++++++++++---------------------------
 1 file changed, 43 insertions(+), 42 deletions(-)

diff --git a/hw/usb/hcd-xhci.c b/hw/usb/hcd-xhci.c
index 8c0155bb1..e79a8724c 100644
--- a/hw/usb/hcd-xhci.c
+++ b/hw/usb/hcd-xhci.c
@@ -285,6 +285,8 @@ typedef enum TRBCCode {
 #define SLOT_CONTEXT_ENTRIES_MASK 0x1f
 #define SLOT_CONTEXT_ENTRIES_SHIFT 27
 
+typedef struct XHCIState XHCIState;
+
 typedef enum EPType {
     ET_INVALID = 0,
     ET_ISO_OUT,
@@ -303,15 +305,15 @@ typedef struct XHCIRing {
 } XHCIRing;
 
 typedef struct XHCIPort {
+    XHCIState *xhci;
     uint32_t portsc;
     uint32_t portnr;
     USBPort  *uport;
     uint32_t speedmask;
+    char name[16];
+    MemoryRegion mem;
 } XHCIPort;
 
-struct XHCIState;
-typedef struct XHCIState XHCIState;
-
 typedef struct XHCITransfer {
     XHCIState *xhci;
     USBPacket packet;
@@ -2430,20 +2432,14 @@ static uint64_t xhci_cap_read(void *ptr, target_phys_addr_t reg, unsigned size)
     return ret;
 }
 
-static uint32_t xhci_port_read(XHCIState *xhci, uint32_t reg)
+static uint64_t xhci_port_read(void *ptr, target_phys_addr_t reg, unsigned size)
 {
-    uint32_t port = reg >> 4;
+    XHCIPort *port = ptr;
     uint32_t ret;
 
-    if (port >= xhci->numports) {
-        fprintf(stderr, "xhci_port_read: port %d out of bounds\n", port);
-        ret = 0;
-        goto out;
-    }
-
-    switch (reg & 0xf) {
+    switch (reg) {
     case 0x00: /* PORTSC */
-        ret = xhci->ports[port].portsc;
+        ret = port->portsc;
         break;
     case 0x04: /* PORTPMSC */
     case 0x08: /* PORTLI */
@@ -2452,30 +2448,25 @@ static uint32_t xhci_port_read(XHCIState *xhci, uint32_t reg)
     case 0x0c: /* reserved */
     default:
         fprintf(stderr, "xhci_port_read (port %d): reg 0x%x unimplemented\n",
-                port, reg);
+                port->portnr, (uint32_t)reg);
         ret = 0;
     }
 
-out:
-    trace_usb_xhci_port_read(port, reg & 0x0f, ret);
+    trace_usb_xhci_port_read(port->portnr, reg, ret);
     return ret;
 }
 
-static void xhci_port_write(XHCIState *xhci, uint32_t reg, uint32_t val)
+static void xhci_port_write(void *ptr, target_phys_addr_t reg,
+                            uint64_t val, unsigned size)
 {
-    uint32_t port = reg >> 4;
+    XHCIPort *port = ptr;
     uint32_t portsc;
 
-    trace_usb_xhci_port_write(port, reg & 0x0f, val);
+    trace_usb_xhci_port_write(port->portnr, reg, val);
 
-    if (port >= xhci->numports) {
-        fprintf(stderr, "xhci_port_read: port %d out of bounds\n", port);
-        return;
-    }
-
-    switch (reg & 0xf) {
+    switch (reg) {
     case 0x00: /* PORTSC */
-        portsc = xhci->ports[port].portsc;
+        portsc = port->portsc;
         /* write-1-to-clear bits*/
         portsc &= ~(val & (PORTSC_CSC|PORTSC_PEC|PORTSC_WRC|PORTSC_OCC|
                            PORTSC_PRC|PORTSC_PLC|PORTSC_CEC));
@@ -2490,16 +2481,16 @@ static void xhci_port_write(XHCIState *xhci, uint32_t reg, uint32_t val)
         /* write-1-to-start bits */
         if (val & PORTSC_PR) {
-            DPRINTF("xhci: port %d reset\n", port);
-            usb_device_reset(xhci->ports[port].uport->dev);
+            DPRINTF("xhci: port %d reset\n", port->portnr);
+            usb_device_reset(port->uport->dev);
             portsc |= PORTSC_PRC | PORTSC_PED;
         }
-        xhci->ports[port].portsc = portsc;
+        port->portsc = portsc;
         break;
     case 0x04: /* PORTPMSC */
     case 0x08: /* PORTLI */
     default:
         fprintf(stderr, "xhci_port_write (port %d): reg 0x%x unimplemented\n",
-                port, reg);
+                port->portnr, (uint32_t)reg);
     }
 }
 
 
@@ -2508,10 +2499,6 @@ static uint64_t xhci_oper_read(void *ptr, target_phys_addr_t reg, unsigned size)
     XHCIState *xhci = ptr;
     uint32_t ret;
 
-    if (reg >= 0x400) {
-        return xhci_port_read(xhci, reg - 0x400);
-    }
-
     switch (reg) {
     case 0x00: /* USBCMD */
         ret = xhci->usbcmd;
@@ -2554,11 +2541,6 @@ static void xhci_oper_write(void *ptr, target_phys_addr_t reg,
 {
     XHCIState *xhci = ptr;
 
-    if (reg >= 0x400) {
-        xhci_port_write(xhci, reg - 0x400, val);
-        return;
-    }
-
     trace_usb_xhci_oper_write(reg, val);
 
     switch (reg) {
@@ -2777,6 +2759,14 @@ static const MemoryRegionOps xhci_oper_ops = {
     .endianness = DEVICE_LITTLE_ENDIAN,
 };
 
+static const MemoryRegionOps xhci_port_ops = {
+    .read = xhci_port_read,
+    .write = xhci_port_write,
+    .valid.min_access_size = 4,
+    .valid.max_access_size = 4,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
 static const MemoryRegionOps xhci_runtime_ops = {
     .read = xhci_runtime_read,
     .write = xhci_runtime_write,
@@ -2850,7 +2840,7 @@ static void xhci_child_detach(USBPort *uport, USBDevice *child)
     }
 }
 
-static USBPortOps xhci_port_ops = {
+static USBPortOps xhci_uport_ops = {
     .attach   = xhci_attach,
     .detach   = xhci_detach,
     .wakeup   = xhci_wakeup,
@@ -2930,6 +2920,7 @@ static void usb_xhci_init(XHCIState *xhci, DeviceState *dev)
                 USB_SPEED_MASK_LOW  |
                 USB_SPEED_MASK_FULL |
                 USB_SPEED_MASK_HIGH;
+            snprintf(port->name, sizeof(port->name), "usb2 port #%d", i+1);
             speedmask |= port->speedmask;
         }
         if (i < xhci->numports_3) {
@@ -2937,16 +2928,17 @@ static void usb_xhci_init(XHCIState *xhci, DeviceState *dev)
             port->portnr = i + 1 + xhci->numports_2;
             port->uport = &xhci->uports[i];
             port->speedmask = USB_SPEED_MASK_SUPER;
+            snprintf(port->name, sizeof(port->name), "usb3 port #%d", i+1);
             speedmask |= port->speedmask;
         }
         usb_register_port(&xhci->bus, &xhci->uports[i], xhci, i,
-                          &xhci_port_ops, speedmask);
+                          &xhci_uport_ops, speedmask);
     }
 }
 
 static int usb_xhci_initfn(struct PCIDevice *dev)
 {
-    int ret;
+    int i, ret;
 
     XHCIState *xhci = DO_UPCAST(XHCIState, pci_dev, dev);
 
@@ -2965,7 +2957,7 @@ static int usb_xhci_initfn(struct PCIDevice *dev)
     memory_region_init_io(&xhci->mem_cap, &xhci_cap_ops, xhci,
                           "capabilities", LEN_CAP);
     memory_region_init_io(&xhci->mem_oper, &xhci_oper_ops, xhci,
-                          "operational", 0x400 + 0x10 * xhci->numports);
+                          "operational", 0x400);
     memory_region_init_io(&xhci->mem_runtime, &xhci_runtime_ops, xhci,
                           "runtime", LEN_RUNTIME);
     memory_region_init_io(&xhci->mem_doorbell, &xhci_doorbell_ops, xhci,
@@ -2976,6 +2968,15 @@ static int usb_xhci_initfn(struct PCIDevice *dev)
     memory_region_add_subregion(&xhci->mem, OFF_RUNTIME,  &xhci->mem_runtime);
     memory_region_add_subregion(&xhci->mem, OFF_DOORBELL, &xhci->mem_doorbell);
 
+    for (i = 0; i < xhci->numports; i++) {
+        XHCIPort *port = &xhci->ports[i];
+        uint32_t offset = OFF_OPER + 0x400 + 0x10 * i;
+        port->xhci = xhci;
+        memory_region_init_io(&port->mem, &xhci_port_ops, port,
+                              port->name, 0x10);
+        memory_region_add_subregion(&xhci->mem, offset, &port->mem);
+    }
+
     pci_register_bar(&xhci->pci_dev, 0,
                      PCI_BASE_ADDRESS_SPACE_MEMORY|PCI_BASE_ADDRESS_MEM_TYPE_64,
                      &xhci->mem);
-- 
cgit v1.2.3


From cae5d3f4b3fbe9b681c0c4046008af424bd1d6a5 Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Thu, 20 Sep 2012 17:38:07 +0200
Subject: ehci: Fix interrupt packet MULT handling

There are several issues with our handling of the MULT epcap field
of interrupt qhs, which this patch fixes.

1) When we don't execute a transaction because of the transaction counter
being 0, p->async stays EHCI_ASYNC_NONE, and the next time we process the
same qtd we hit an assert in ehci_state_fetchqtd because of this. Even though
I believe that this is caused by 3 below, this patch still removes the assert,
as that can still happen without 3, when multiple packets are queued for the
same interrupt ep.

2) We only *check* the transaction counter from ehci_state_execute, any
packets queued up by fill_queue bypass this check. This is fixed by not calling
fill_queue for interrupt packets.

3) Some versions of Windows set the MULT field of the qh to 0, which is a
clear violation of the EHCI spec, but still they do it. This means that we
will never execute a qtd for these, making interrupt ep-s on USB-2 devices
not work, and after recent changes, triggering 1).

So far we've stored the transaction counter in our copy of the mult field,
but with this beginning at 0 already when dealing with these versions of
Windows this won't work. So this patch adds a transact_ctr field to our qh
struct, and sets this to the MULT field value on fetchqh. When the MULT field
value is 0, we set it to 4. Assuming that Windows gets away with setting it
to 0, by the actual hardware going horizontal on a 1 -> 0 transition, which
will give it 4 transactions (MULT goes from 0 - 3).

Note that we cannot stop on detecting the 1 -> 0 transition, as our decrement
of the transaction counter, and checking for it are done in 2 different places.

Reported-by: Shawn Starr <shawn.starr@rogers.com>
Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 hw/usb/hcd-ehci.c | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/hw/usb/hcd-ehci.c b/hw/usb/hcd-ehci.c
index 6a5da8413..8bdb806b9 100644
--- a/hw/usb/hcd-ehci.c
+++ b/hw/usb/hcd-ehci.c
@@ -373,6 +373,7 @@ struct EHCIQueue {
     uint32_t seen;
     uint64_t ts;
     int async;
+    int transact_ctr;
 
     /* cached data from guest - needs to be flushed
      * when guest removes an entry (doorbell, handshake sequence)
@@ -1837,6 +1838,11 @@ static EHCIQueue *ehci_state_fetchqh(EHCIState *ehci, int async)
     }
     q->qh = qh;
 
+    q->transact_ctr = get_field(q->qh.epcap, QH_EPCAP_MULT);
+    if (q->transact_ctr == 0) { /* Guest bug in some versions of windows */
+        q->transact_ctr = 4;
+    }
+
     if (q->dev == NULL) {
         q->dev = ehci_find_device(q->ehci, devaddr);
     }
@@ -2014,11 +2020,8 @@ static int ehci_state_fetchqtd(EHCIQueue *q)
     } else if (p != NULL) {
         switch (p->async) {
         case EHCI_ASYNC_NONE:
-            /* Should never happen packet should at least be initialized */
-            assert(0);
-            break;
         case EHCI_ASYNC_INITIALIZED:
-            /* Previously nacked packet (likely interrupt ep) */
+            /* Not yet executed (MULT), or previously nacked (int) packet */
             ehci_set_state(q->ehci, q->async, EST_EXECUTE);
             break;
         case EHCI_ASYNC_INFLIGHT:
@@ -2107,15 +2110,12 @@ static int ehci_state_execute(EHCIQueue *q)
 
     // TODO verify enough time remains in the uframe as in 4.4.1.1
     // TODO write back ptr to async list when done or out of time
-    // TODO Windows does not seem to ever set the MULT field
 
-    if (!q->async) {
-        int transactCtr = get_field(q->qh.epcap, QH_EPCAP_MULT);
-        if (!transactCtr) {
-            ehci_set_state(q->ehci, q->async, EST_HORIZONTALQH);
-            again = 1;
-            goto out;
-        }
+    /* 4.10.3, bottom of page 82, go horizontal on transaction counter == 0 */
+    if (!q->async && q->transact_ctr == 0) {
+        ehci_set_state(q->ehci, q->async, EST_HORIZONTALQH);
+        again = 1;
+        goto out;
     }
 
     if (q->async) {
@@ -2132,7 +2132,11 @@ static int ehci_state_execute(EHCIQueue *q)
         trace_usb_ehci_packet_action(p->queue, p, "async");
         p->async = EHCI_ASYNC_INFLIGHT;
         ehci_set_state(q->ehci, q->async, EST_HORIZONTALQH);
-        again = (ehci_fill_queue(p) == USB_RET_PROCERR) ? -1 : 1;
+        if (q->async) {
+            again = (ehci_fill_queue(p) == USB_RET_PROCERR) ? -1 : 1;
+        } else {
+            again = 1;
+        }
         goto out;
     }
 
@@ -2152,13 +2156,9 @@ static int ehci_state_executing(EHCIQueue *q)
 
     ehci_execute_complete(q);
 
-    // 4.10.3
-    if (!q->async) {
-        int transactCtr = get_field(q->qh.epcap, QH_EPCAP_MULT);
-        transactCtr--;
-        set_field(&q->qh.epcap, transactCtr, QH_EPCAP_MULT);
-        // 4.10.3, bottom of page 82, should exit this state when transaction
-        // counter decrements to 0
+    /* 4.10.3 */
+    if (!q->async && q->transact_ctr > 0) {
+        q->transact_ctr--;
     }
 
     /* 4.10.5 */
-- 
cgit v1.2.3


From 8b626aa7841ef79b70066c880b3b6c29496797af Mon Sep 17 00:00:00 2001
From: Hans de Goede <hdegoede@redhat.com>
Date: Tue, 25 Sep 2012 13:22:21 +0200
Subject: usb-redir: Adjust pkg-config check for usbredirparser .pc file rename
 (v2)

The usbredir 0.5 release introduced the new API for 64 bit packet ids, but
it kept the libusbredirparser.pc name as is, meaning that older versions of
qemu will still have their pkg-config check for usbredirparser fulfilled,
and build with the usb-redir device. Due to the API change there will be
some compiler warnings, but the build will succeed, however the usb-redir
device will be broken on 32 bit machines.

To solve this a new usbredir-0.5.2 release is coming, which renames the
libusbredirparser.pc file to libusbredirparser-0.5.pc, so that it will no
longer fulfill the pkg-config check of the qemu-1.2 and older releases,
stopping the (silent) breakage. This patch adjusts qemu master's configure
to properly detect the new usbredir release.

Changes in v2:
-Not only use the new .pc name in the check but also when getting cflags
 and libs!

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 configure | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/configure b/configure
index 1b865174e..4f240620d 100755
--- a/configure
+++ b/configure
@@ -2752,10 +2752,10 @@ fi
 
 # check for usbredirparser for usb network redirection support
 if test "$usb_redir" != "no" ; then
-    if $pkg_config --atleast-version=0.5 libusbredirparser >/dev/null 2>&1 ; then
+    if $pkg_config --atleast-version=0.5 libusbredirparser-0.5 >/dev/null 2>&1 ; then
         usb_redir="yes"
-        usb_redir_cflags=$($pkg_config --cflags libusbredirparser 2>/dev/null)
-        usb_redir_libs=$($pkg_config --libs libusbredirparser 2>/dev/null)
+        usb_redir_cflags=$($pkg_config --cflags libusbredirparser-0.5 2>/dev/null)
+        usb_redir_libs=$($pkg_config --libs libusbredirparser-0.5 2>/dev/null)
         QEMU_CFLAGS="$QEMU_CFLAGS $usb_redir_cflags"
         libs_softmmu="$libs_softmmu $usb_redir_libs"
     else
-- 
cgit v1.2.3


From 39c138c8420f51a7da7b35233a8d7400a0b589ac Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 26 Sep 2012 12:59:31 +1000
Subject: usb: Fix usb_packet_map() in the presence of IOMMUs

With the IOMMU infrastructure introduced before 1.2, we need to use
dma_memory_map() to obtain a qemu pointer to memory from an IO bus address.
However, dma_memory_map() alters the given length to reflect the length
over which the used DMA translation is valid - which could be either more
or less than the requested length.

usb_packet_map() does not correctly handle these cases, simply failing if
dma_memory_map() alters the requested length.  If dma_memory_map()
increased the length, we just need to use the requested length for the
qemu_iovec_add().  However, if it decreased the length, it means that a
single DMA translation is not valid for the whole sglist element, and so
we need to loop, splitting it up into multiple iovec entries for each
piece with a DMA translation (in practice >2 pieces is unlikely).

This patch implements the correct behaviour.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
 hw/usb/libhw.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/hw/usb/libhw.c b/hw/usb/libhw.c
index c0de30ea8..703e2d213 100644
--- a/hw/usb/libhw.c
+++ b/hw/usb/libhw.c
@@ -28,19 +28,25 @@ int usb_packet_map(USBPacket *p, QEMUSGList *sgl)
 {
     DMADirection dir = (p->pid == USB_TOKEN_IN) ?
         DMA_DIRECTION_FROM_DEVICE : DMA_DIRECTION_TO_DEVICE;
-    dma_addr_t len;
     void *mem;
     int i;
 
     for (i = 0; i < sgl->nsg; i++) {
-        len = sgl->sg[i].len;
-        mem = dma_memory_map(sgl->dma, sgl->sg[i].base, &len, dir);
-        if (!mem) {
-            goto err;
-        }
-        qemu_iovec_add(&p->iov, mem, len);
-        if (len != sgl->sg[i].len) {
-            goto err;
+        dma_addr_t base = sgl->sg[i].base;
+        dma_addr_t len = sgl->sg[i].len;
+
+        while (len) {
+            dma_addr_t xlen = len;
+            mem = dma_memory_map(sgl->dma, base, &xlen, dir);
+            if (!mem) {
+                goto err;
+            }
+            if (xlen > len) { /* mapping covers more than requested */
+                xlen = len;
+            }
+            qemu_iovec_add(&p->iov, mem, xlen);
+            len -= xlen;
+            base += xlen; /* continue mapping where this piece ended */
         }
     }
     return 0;
-- 
cgit v1.2.3


From e9d17b6890ae772f3652c8cacf4e1f72f576f907 Mon Sep 17 00:00:00 2001
From: Ryota Ozaki <ozaki.ryota@gmail.com>
Date: Fri, 14 Sep 2012 21:44:20 +0900
Subject: Make negotiation optional in QEMUMonitorProtocol

This is a preparation for qemu-ga-client which uses
QEMUMonitorProtocol class. The class tries to
negotiate capabilities on connect, however, qemu-ga
doesn't support it and fails.

This change makes the negotiation optional, though
it's still performed by default for compatibility.

Signed-off-by: Ryota Ozaki <ozaki.ryota@gmail.com>
Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
---
 QMP/qmp.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/QMP/qmp.py b/QMP/qmp.py
index 36ecc1dfa..5a573e148 100644
--- a/QMP/qmp.py
+++ b/QMP/qmp.py
@@ -49,7 +49,6 @@ class QEMUMonitorProtocol:
         return socket.socket(family, socket.SOCK_STREAM)
 
     def __negotiate_capabilities(self):
-        self.__sockfile = self.__sock.makefile()
         greeting = self.__json_read()
         if greeting is None or not greeting.has_key('QMP'):
             raise QMPConnectError
@@ -73,7 +72,7 @@ class QEMUMonitorProtocol:
 
     error = socket.error
 
-    def connect(self):
+    def connect(self, negotiate=True):
         """
         Connect to the QMP Monitor and perform capabilities negotiation.
 
@@ -83,7 +82,9 @@ class QEMUMonitorProtocol:
         @raise QMPCapabilitiesError if fails to negotiate capabilities
         """
         self.__sock.connect(self.__address)
-        return self.__negotiate_capabilities()
+        self.__sockfile = self.__sock.makefile()
+        if negotiate:
+            return self.__negotiate_capabilities()
 
     def accept(self):
         """
-- 
cgit v1.2.3


From e37b350aa8766ae36ec9bf40fab665455d4a5530 Mon Sep 17 00:00:00 2001
From: Ryota Ozaki <ozaki.ryota@gmail.com>
Date: Fri, 14 Sep 2012 21:44:21 +0900
Subject: Support settimeout in QEMUMonitorProtocol

This method is used in the following qemu-ga-client script
to implement non-blocking operations.

Signed-off-by: Ryota Ozaki <ozaki.ryota@gmail.com>
Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
---
 QMP/qmp.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/QMP/qmp.py b/QMP/qmp.py
index 5a573e148..33c7d36d9 100644
--- a/QMP/qmp.py
+++ b/QMP/qmp.py
@@ -162,3 +162,8 @@ class QEMUMonitorProtocol:
     def close(self):
         self.__sock.close()
         self.__sockfile.close()
+
+    timeout = socket.timeout
+
+    def settimeout(self, timeout):
+        self.__sock.settimeout(timeout)
-- 
cgit v1.2.3


From f513cbf7503d8db3778df436beaf25f3d8260317 Mon Sep 17 00:00:00 2001
From: Ryota Ozaki <ozaki.ryota@gmail.com>
Date: Fri, 14 Sep 2012 21:44:22 +0900
Subject: Add qemu-ga-client script

This is an easy-to-use QEMU guest agent client written in
Python. It simply provides commands to call guest agent
functions like ping, fsfreeze and shutdown. Additionally,
it provides extra useful commands, e.g., cat, ifconfig and
reboot, by using guest agent functions.

Examples:
  $ export QGA_CLIENT_ADDRESS=/tmp/qga.sock
  $ qemu-ga-client ping

  $ qemu-ga-client cat /etc/resolv.conf
  # Generated by NetworkManager
  nameserver 10.0.2.3

  $ qemu-ga-client fsfreeze status
  thawed
  $ qemu-ga-client fsfreeze freeze
  2 filesystems frozen

The script communicates with a guest agent by means of
qmp.QEMUMonitorProtocol. Every command is called with a
timeout (3 sec.) to avoid blocking. The script always
calls the sync command prior to issuing an actual command
(except for ping which doesn't need sync).

Signed-off-by: Ryota Ozaki <ozaki.ryota@gmail.com>
Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
---
 QMP/qemu-ga-client | 299 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 299 insertions(+)
 create mode 100755 QMP/qemu-ga-client

diff --git a/QMP/qemu-ga-client b/QMP/qemu-ga-client
new file mode 100755
index 000000000..46676c375
--- /dev/null
+++ b/QMP/qemu-ga-client
@@ -0,0 +1,299 @@
+#!/usr/bin/python
+
+# QEMU Guest Agent Client
+#
+# Copyright (C) 2012 Ryota Ozaki <ozaki.ryota@gmail.com>
+#
+# This work is licensed under the terms of the GNU GPL, version 2.  See
+# the COPYING file in the top-level directory.
+#
+# Usage:
+#
+# Start QEMU with:
+#
+# # qemu [...] -chardev socket,path=/tmp/qga.sock,server,nowait,id=qga0 \
+#   -device virtio-serial -device virtserialport,chardev=qga0,name=org.qemu.guest_agent.0
+#
+# Run the script:
+#
+# $ qemu-ga-client --address=/tmp/qga.sock <command> [args...]
+#
+# or
+#
+# $ export QGA_CLIENT_ADDRESS=/tmp/qga.sock
+# $ qemu-ga-client <command> [args...]
+#
+# For example:
+#
+# $ qemu-ga-client cat /etc/resolv.conf
+# # Generated by NetworkManager
+# nameserver 10.0.2.3
+# $ qemu-ga-client fsfreeze status
+# thawed
+# $ qemu-ga-client fsfreeze freeze
+# 2 filesystems frozen
+#
+# See also: http://wiki.qemu.org/Features/QAPI/GuestAgent
+#
+
+import base64
+import random
+
+import qmp
+
+
+class QemuGuestAgent(qmp.QEMUMonitorProtocol):
+    def __getattr__(self, name):
+        def wrapper(**kwds):
+            return self.command('guest-' + name.replace('_', '-'), **kwds)
+        return wrapper
+
+
+class QemuGuestAgentClient:
+    error = QemuGuestAgent.error
+
+    def __init__(self, address):
+        self.qga = QemuGuestAgent(address)
+        self.qga.connect(negotiate=False)
+
+    def sync(self, timeout=3):
+        # Avoid being blocked forever
+        if not self.ping(timeout):
+            raise EnvironmentError('Agent seems not alive')
+        uid = random.randint(0, (1 << 32) - 1)
+        while True:
+            ret = self.qga.sync(id=uid)
+            if isinstance(ret, int) and int(ret) == uid:
+                break
+
+    def __file_read_all(self, handle):
+        eof = False
+        data = ''
+        while not eof:
+            ret = self.qga.file_read(handle=handle, count=1024)
+            _data = base64.b64decode(ret['buf-b64'])
+            data += _data
+            eof = ret['eof']
+        return data
+
+    def read(self, path):
+        handle = self.qga.file_open(path=path)
+        try:
+            data = self.__file_read_all(handle)
+        finally:
+            self.qga.file_close(handle=handle)
+        return data
+
+    def info(self):
+        info = self.qga.info()
+
+        msgs = []
+        msgs.append('version: ' + info['version'])
+        msgs.append('supported_commands:')
+        enabled = [c['name'] for c in info['supported_commands'] if c['enabled']]
+        msgs.append('\tenabled: ' + ', '.join(enabled))
+        disabled = [c['name'] for c in info['supported_commands'] if not c['enabled']]
+        msgs.append('\tdisabled: ' + ', '.join(disabled))
+
+        return '\n'.join(msgs)
+
+    def __gen_ipv4_netmask(self, prefixlen):
+        mask = int('1' * prefixlen + '0' * (32 - prefixlen), 2)
+        return '.'.join([str(mask >> 24),
+                         str((mask >> 16) & 0xff),
+                         str((mask >> 8) & 0xff),
+                         str(mask & 0xff)])
+
+    def ifconfig(self):
+        nifs = self.qga.network_get_interfaces()
+
+        msgs = []
+        for nif in nifs:
+            msgs.append(nif['name'] + ':')
+            if 'ip-addresses' in nif:
+                for ipaddr in nif['ip-addresses']:
+                    if ipaddr['ip-address-type'] == 'ipv4':
+                        addr = ipaddr['ip-address']
+                        mask = self.__gen_ipv4_netmask(int(ipaddr['prefix']))
+                        msgs.append("\tinet %s  netmask %s" % (addr, mask))
+                    elif ipaddr['ip-address-type'] == 'ipv6':
+                        addr = ipaddr['ip-address']
+                        prefix = ipaddr['prefix']
+                        msgs.append("\tinet6 %s  prefixlen %s" % (addr, prefix))
+            if nif['hardware-address'] != '00:00:00:00:00:00':
+                msgs.append("\tether " + nif['hardware-address'])
+
+        return '\n'.join(msgs)
+
+    def ping(self, timeout):
+        self.qga.settimeout(timeout)
+        try:
+            self.qga.ping()
+        except self.qga.timeout:
+            return False
+        return True
+
+    def fsfreeze(self, cmd):
+        if cmd not in ['status', 'freeze', 'thaw']:
+            raise StandardError('Invalid command: ' + cmd)
+
+        return getattr(self.qga, 'fsfreeze' + '_' + cmd)()
+
+    def fstrim(self, minimum=0):
+        return getattr(self.qga, 'fstrim')(minimum=minimum)
+
+    def suspend(self, mode):
+        if mode not in ['disk', 'ram', 'hybrid']:
+            raise StandardError('Invalid mode: ' + mode)
+
+        try:
+            getattr(self.qga, 'suspend' + '_' + mode)()
+            # On error exception will raise
+        except self.qga.timeout:
+            # On success command will timed out
+            return
+
+    def shutdown(self, mode='powerdown'):
+        if mode not in ['powerdown', 'halt', 'reboot']:
+            raise StandardError('Invalid mode: ' + mode)
+
+        try:
+            self.qga.shutdown(mode=mode)
+        except self.qga.timeout:
+            return
+
+
+def _cmd_cat(client, args):
+    if len(args) != 1:
+        print('Invalid argument')
+        print('Usage: cat <file>')
+        sys.exit(1)
+    print(client.read(args[0]))
+
+
+def _cmd_fsfreeze(client, args):
+    usage = 'Usage: fsfreeze status|freeze|thaw'
+    if len(args) != 1:
+        print('Invalid argument')
+        print(usage)
+        sys.exit(1)
+    if args[0] not in ['status', 'freeze', 'thaw']:
+        print('Invalid command: ' + args[0])
+        print(usage)
+        sys.exit(1)
+    cmd = args[0]
+    ret = client.fsfreeze(cmd)
+    if cmd == 'status':
+        print(ret)
+    elif cmd == 'freeze':
+        print("%d filesystems frozen" % ret)
+    else:
+        print("%d filesystems thawed" % ret)
+
+
+def _cmd_fstrim(client, args):
+    if len(args) == 0:
+        minimum = 0
+    else:
+        minimum = int(args[0])
+    print(client.fstrim(minimum))
+
+
+def _cmd_ifconfig(client, args):
+    print(client.ifconfig())
+
+
+def _cmd_info(client, args):
+    print(client.info())
+
+
+def _cmd_ping(client, args):
+    if len(args) == 0:
+        timeout = 3
+    else:
+        timeout = float(args[0])
+    alive = client.ping(timeout)
+    if not alive:
+        print("Not responded in %s sec" % args[0])
+        sys.exit(1)
+
+
+def _cmd_suspend(client, args):
+    usage = 'Usage: suspend disk|ram|hybrid'
+    if len(args) != 1:
+        print('Less argument')
+        print(usage)
+        sys.exit(1)
+    if args[0] not in ['disk', 'ram', 'hybrid']:
+        print('Invalid command: ' + args[0])
+        print(usage)
+        sys.exit(1)
+    client.suspend(args[0])
+
+
+def _cmd_shutdown(client, args):
+    client.shutdown()
+_cmd_powerdown = _cmd_shutdown
+
+
+def _cmd_halt(client, args):
+    client.shutdown('halt')
+
+
+def _cmd_reboot(client, args):
+    client.shutdown('reboot')
+
+
+commands = [m.replace('_cmd_', '') for m in dir() if '_cmd_' in m]
+
+
+def main(address, cmd, args):
+    if not os.path.exists(address):
+        print('%s not found' % address)
+        sys.exit(1)
+
+    if cmd not in commands:
+        print('Invalid command: ' + cmd)
+        print('Available commands: ' + ', '.join(commands))
+        sys.exit(1)
+
+    try:
+        client = QemuGuestAgentClient(address)
+    except QemuGuestAgent.error, e:
+        import errno
+
+        print(e)
+        if e.errno == errno.ECONNREFUSED:
+            print('Hint: qemu is not running?')
+        sys.exit(1)
+
+    if cmd != 'ping':
+        client.sync()
+
+    globals()['_cmd_' + cmd](client, args)
+
+
+if __name__ == '__main__':
+    import sys
+    import os
+    import optparse
+
+    address = os.environ['QGA_CLIENT_ADDRESS'] if 'QGA_CLIENT_ADDRESS' in os.environ else None
+
+    usage = "%prog [--address=<unix_path>|<ipv4_address>] <command> [args...]\n"
+    usage += '<command>: ' + ', '.join(commands)
+    parser = optparse.OptionParser(usage=usage)
+    parser.add_option('--address', action='store', type='string',
+                      default=address, help='Specify a ip:port pair or a unix socket path')
+    options, args = parser.parse_args()
+
+    address = options.address
+    if address is None:
+        parser.error('address is not specified')
+        sys.exit(1)
+
+    if len(args) == 0:
+        parser.error('Less argument')
+        sys.exit(1)
+
+    main(address, args[0], args[1:])
-- 
cgit v1.2.3


From eda50a656f52a5172fa8a95f7b217565b90d413e Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 19 Sep 2012 16:31:06 +0200
Subject: qapi: do not protect enum values from namespace pollution

Enum values are always preceded by the uppercase name of the enum, so
they do not conflict with reserved words.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
---
 scripts/qapi-types.py | 4 ++--
 scripts/qapi-visit.py | 2 +-
 scripts/qapi.py       | 8 ++++----
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/scripts/qapi-types.py b/scripts/qapi-types.py
index 49ef569a2..1b8483495 100644
--- a/scripts/qapi-types.py
+++ b/scripts/qapi-types.py
@@ -91,9 +91,9 @@ const char *%(name)s_lookup[] = {
 
 def generate_enum_name(name):
     if name.isupper():
-        return c_fun(name)
+        return c_fun(name, False)
     new_name = ''
-    for c in c_fun(name):
+    for c in c_fun(name, False):
         if c.isupper():
             new_name += '_'
         new_name += c
diff --git a/scripts/qapi-visit.py b/scripts/qapi-visit.py
index e2093e894..a360de719 100644
--- a/scripts/qapi-visit.py
+++ b/scripts/qapi-visit.py
@@ -173,7 +173,7 @@ void visit_type_%(name)s(Visitor *m, %(name)s ** obj, const char *name, Error **
                 break;
 ''',
                 abbrev = de_camel_case(name).upper(),
-                enum = c_fun(de_camel_case(key)).upper(),
+                enum = c_fun(de_camel_case(key),False).upper(),
                 c_type=members[key],
                 c_name=c_fun(key))
 
diff --git a/scripts/qapi.py b/scripts/qapi.py
index 122b4cb6d..057332e4c 100644
--- a/scripts/qapi.py
+++ b/scripts/qapi.py
@@ -141,7 +141,7 @@ def camel_case(name):
             new_name += ch.lower()
     return new_name
 
-def c_var(name):
+def c_var(name, protect=True):
     # ANSI X3J11/88-090, 3.1.1
     c89_words = set(['auto', 'break', 'case', 'char', 'const', 'continue',
                      'default', 'do', 'double', 'else', 'enum', 'extern', 'float',
@@ -156,12 +156,12 @@ def c_var(name):
     # GCC http://gcc.gnu.org/onlinedocs/gcc-4.7.1/gcc/C-Extensions.html
     # excluding _.*
     gcc_words = set(['asm', 'typeof'])
-    if name in c89_words | c99_words | c11_words | gcc_words:
+    if protect and (name in c89_words | c99_words | c11_words | gcc_words):
         return "q_" + name
     return name.replace('-', '_').lstrip("*")
 
-def c_fun(name):
-    return c_var(name).replace('.', '_')
+def c_fun(name, protect=True):
+    return c_var(name, protect).replace('.', '_')
 
 def c_list_type(name):
     return '%sList' % name
-- 
cgit v1.2.3


From 1057725f6629fc2771a294a92ce8eedb92c86fe8 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 19 Sep 2012 16:31:07 +0200
Subject: qapi: add "unix" to the set of reserved words

It is #defined to 1.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
---
 scripts/qapi.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/qapi.py b/scripts/qapi.py
index 057332e4c..afc5f32ae 100644
--- a/scripts/qapi.py
+++ b/scripts/qapi.py
@@ -156,7 +156,9 @@ def c_var(name, protect=True):
     # GCC http://gcc.gnu.org/onlinedocs/gcc-4.7.1/gcc/C-Extensions.html
     # excluding _.*
     gcc_words = set(['asm', 'typeof'])
-    if protect and (name in c89_words | c99_words | c11_words | gcc_words):
+    # namespace pollution:
+    polluted_words = set(['unix'])
+    if protect and (name in c89_words | c99_words | c11_words | gcc_words | polluted_words):
         return "q_" + name
     return name.replace('-', '_').lstrip("*")
 
-- 
cgit v1.2.3


From 9a3a88956c7f32130ad20011f7c3c161fa5876d8 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 20 Sep 2012 16:50:32 +0200
Subject: pci-assign: use monitor_handle_fd_param

There is no need to open-code the choice between a file descriptor
number or a named one.  Just use monitor_handle_fd_param, which
also takes care of printing the error message.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
---
 hw/kvm/pci-assign.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/hw/kvm/pci-assign.c b/hw/kvm/pci-assign.c
index 05b93d9a5..7a0998c51 100644
--- a/hw/kvm/pci-assign.c
+++ b/hw/kvm/pci-assign.c
@@ -579,15 +579,9 @@ static int get_real_device(AssignedDevice *pci_dev, uint16_t r_seg,
     snprintf(name, sizeof(name), "%sconfig", dir);
 
     if (pci_dev->configfd_name && *pci_dev->configfd_name) {
-        if (qemu_isdigit(pci_dev->configfd_name[0])) {
-            dev->config_fd = strtol(pci_dev->configfd_name, NULL, 0);
-        } else {
-            dev->config_fd = monitor_get_fd(cur_mon, pci_dev->configfd_name);
-            if (dev->config_fd < 0) {
-                error_report("%s: (%s) unkown", __func__,
-                             pci_dev->configfd_name);
-                return 1;
-            }
+        dev->config_fd = monitor_handle_fd_param(cur_mon, pci_dev->configfd_name);
+        if (dev->config_fd < 0) {
+            return 1;
         }
     } else {
         dev->config_fd = open(name, O_RDWR);
-- 
cgit v1.2.3


From a9940fc4cba811adfb296fe07b247ee707265f90 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Thu, 20 Sep 2012 16:50:32 +0200
Subject: monitor: add Error * argument to monitor_get_fd

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
---
 dump.c         |  3 +--
 migration-fd.c |  2 +-
 monitor.c      | 15 +++++++++------
 monitor.h      |  2 +-
 4 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/dump.c b/dump.c
index 2bf8d8d99..1a3c7164b 100644
--- a/dump.c
+++ b/dump.c
@@ -836,9 +836,8 @@ void qmp_dump_guest_memory(bool paging, const char *file, bool has_begin,
 
 #if !defined(WIN32)
     if (strstart(file, "fd:", &p)) {
-        fd = monitor_get_fd(cur_mon, p);
+        fd = monitor_get_fd(cur_mon, p, errp);
         if (fd == -1) {
-            error_set(errp, QERR_FD_NOT_FOUND, p);
             return;
         }
     }
diff --git a/migration-fd.c b/migration-fd.c
index 50138edb3..73351678e 100644
--- a/migration-fd.c
+++ b/migration-fd.c
@@ -75,7 +75,7 @@ static int fd_close(MigrationState *s)
 
 int fd_start_outgoing_migration(MigrationState *s, const char *fdname)
 {
-    s->fd = monitor_get_fd(cur_mon, fdname);
+    s->fd = monitor_get_fd(cur_mon, fdname, NULL);
     if (s->fd == -1) {
         DPRINTF("fd_migration: invalid file descriptor identifier\n");
         goto err_after_get_fd;
diff --git a/monitor.c b/monitor.c
index 67064e270..c24235e90 100644
--- a/monitor.c
+++ b/monitor.c
@@ -951,7 +951,7 @@ static int add_graphics_client(Monitor *mon, const QDict *qdict, QObject **ret_d
     CharDriverState *s;
 
     if (strcmp(protocol, "spice") == 0) {
-        int fd = monitor_get_fd(mon, fdname);
+        int fd = monitor_get_fd(mon, fdname, NULL);
         int skipauth = qdict_get_try_bool(qdict, "skipauth", 0);
         int tls = qdict_get_try_bool(qdict, "tls", 0);
         if (!using_spice) {
@@ -965,13 +965,13 @@ static int add_graphics_client(Monitor *mon, const QDict *qdict, QObject **ret_d
         return 0;
 #ifdef CONFIG_VNC
     } else if (strcmp(protocol, "vnc") == 0) {
-	int fd = monitor_get_fd(mon, fdname);
+	int fd = monitor_get_fd(mon, fdname, NULL);
         int skipauth = qdict_get_try_bool(qdict, "skipauth", 0);
 	vnc_display_add_client(NULL, fd, skipauth);
 	return 0;
 #endif
     } else if ((s = qemu_chr_find(protocol)) != NULL) {
-	int fd = monitor_get_fd(mon, fdname);
+	int fd = monitor_get_fd(mon, fdname, NULL);
 	if (qemu_chr_add_client(s, fd) < 0) {
 	    qerror_report(QERR_ADD_CLIENT_FAILED);
 	    return -1;
@@ -2118,7 +2118,7 @@ static void do_loadvm(Monitor *mon, const QDict *qdict)
     }
 }
 
-int monitor_get_fd(Monitor *mon, const char *fdname)
+int monitor_get_fd(Monitor *mon, const char *fdname, Error **errp)
 {
     mon_fd_t *monfd;
 
@@ -2139,6 +2139,7 @@ int monitor_get_fd(Monitor *mon, const char *fdname)
         return fd;
     }
 
+    error_setg(errp, "File descriptor named '%s' has not been found", fdname);
     return -1;
 }
 
@@ -2410,12 +2411,14 @@ int monitor_fdset_dup_fd_remove(int dup_fd)
 int monitor_handle_fd_param(Monitor *mon, const char *fdname)
 {
     int fd;
+    Error *local_err = NULL;
 
     if (!qemu_isdigit(fdname[0]) && mon) {
 
-        fd = monitor_get_fd(mon, fdname);
+        fd = monitor_get_fd(mon, fdname, &local_err);
         if (fd == -1) {
-            error_report("No file descriptor named %s found", fdname);
+            qerror_report_err(local_err);
+            error_free(local_err);
             return -1;
         }
     } else {
diff --git a/monitor.h b/monitor.h
index 64c156184..e240c3f42 100644
--- a/monitor.h
+++ b/monitor.h
@@ -66,7 +66,7 @@ int monitor_read_block_device_key(Monitor *mon, const char *device,
                                   BlockDriverCompletionFunc *completion_cb,
                                   void *opaque);
 
-int monitor_get_fd(Monitor *mon, const char *fdname);
+int monitor_get_fd(Monitor *mon, const char *fdname, Error **errp);
 int monitor_handle_fd_param(Monitor *mon, const char *fdname);
 
 void monitor_vprintf(Monitor *mon, const char *fmt, va_list ap)
-- 
cgit v1.2.3


From b224e5e2162a767dd56dbc366f796fbe45ca5baa Mon Sep 17 00:00:00 2001
From: Luiz Capitulino <lcapitulino@redhat.com>
Date: Thu, 13 Sep 2012 16:52:20 -0300
Subject: qapi: convert add_client

Also fixes a few issues while there:

 1. The fd returned by monitor_get_fd() leaks in most error conditions
 2. monitor_get_fd() return value is not checked. Best case we get
    an error that is not correctly reported, worst case one of the
    functions using the fd (with value of -1) will explode
 3. A few error conditions aren't reported
 4. We now "use up" @fdname always.  Before, it was left alone for
    invalid @protocol

Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
---
 monitor.c        | 39 ---------------------------------------
 qapi-schema.json | 25 +++++++++++++++++++++++++
 qmp-commands.hx  |  5 +----
 qmp.c            | 43 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 69 insertions(+), 43 deletions(-)

diff --git a/monitor.c b/monitor.c
index c24235e90..c9f460ac4 100644
--- a/monitor.c
+++ b/monitor.c
@@ -944,45 +944,6 @@ static void do_trace_print_events(Monitor *mon)
     trace_print_events((FILE *)mon, &monitor_fprintf);
 }
 
-static int add_graphics_client(Monitor *mon, const QDict *qdict, QObject **ret_data)
-{
-    const char *protocol  = qdict_get_str(qdict, "protocol");
-    const char *fdname = qdict_get_str(qdict, "fdname");
-    CharDriverState *s;
-
-    if (strcmp(protocol, "spice") == 0) {
-        int fd = monitor_get_fd(mon, fdname, NULL);
-        int skipauth = qdict_get_try_bool(qdict, "skipauth", 0);
-        int tls = qdict_get_try_bool(qdict, "tls", 0);
-        if (!using_spice) {
-            /* correct one? spice isn't a device ,,, */
-            qerror_report(QERR_DEVICE_NOT_ACTIVE, "spice");
-            return -1;
-        }
-        if (qemu_spice_display_add_client(fd, skipauth, tls) < 0) {
-            close(fd);
-        }
-        return 0;
-#ifdef CONFIG_VNC
-    } else if (strcmp(protocol, "vnc") == 0) {
-	int fd = monitor_get_fd(mon, fdname, NULL);
-        int skipauth = qdict_get_try_bool(qdict, "skipauth", 0);
-	vnc_display_add_client(NULL, fd, skipauth);
-	return 0;
-#endif
-    } else if ((s = qemu_chr_find(protocol)) != NULL) {
-	int fd = monitor_get_fd(mon, fdname, NULL);
-	if (qemu_chr_add_client(s, fd) < 0) {
-	    qerror_report(QERR_ADD_CLIENT_FAILED);
-	    return -1;
-	}
-	return 0;
-    }
-
-    qerror_report(QERR_INVALID_PARAMETER, "protocol");
-    return -1;
-}
-
 static int client_migrate_info(Monitor *mon, const QDict *qdict,
                                MonitorCompletion cb, void *opaque)
 {
diff --git a/qapi-schema.json b/qapi-schema.json
index 14e44199b..191d92194 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -32,6 +32,31 @@
             'DeviceNotActive', 'DeviceNotFound', 'KVMMissingCap',
             'MigrationExpected' ] }
 
+##
+# @add_client
+#
+# Allow client connections for VNC, Spice and socket based
+# character devices to be passed in to QEMU via SCM_RIGHTS.
+#
+# @protocol: protocol name. Valid names are "vnc", "spice" or the
+#            name of a character device (eg. from -chardev id=XXXX)
+#
+# @fdname: file descriptor name previously passed via 'getfd' command
+#
+# @skipauth: #optional whether to skip authentication. Only applies
+#            to "vnc" and "spice" protocols
+#
+# @tls: #optional whether to perform TLS. Only applies to the "spice"
+#       protocol
+#
+# Returns: nothing on success.
+#
+# Since: 0.14.0
+##
+{ 'command': 'add_client',
+  'data': { 'protocol': 'str', 'fdname': 'str', '*skipauth': 'bool',
+            '*tls': 'bool' } }
+
 ##
 # @NameInfo:
 #
diff --git a/qmp-commands.hx b/qmp-commands.hx
index 6e21ddba6..36e08d9ff 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -1231,10 +1231,7 @@ EQMP
     {
         .name       = "add_client",
         .args_type  = "protocol:s,fdname:s,skipauth:b?,tls:b?",
-        .params     = "protocol fdname skipauth tls",
-        .help       = "add a graphics client",
-        .user_print = monitor_user_noop,
-        .mhandler.cmd_new = add_graphics_client,
+        .mhandler.cmd_new = qmp_marshal_input_add_client,
     },
 
 SQMP
diff --git a/qmp.c b/qmp.c
index 84639220d..36c54c57c 100644
--- a/qmp.c
+++ b/qmp.c
@@ -479,3 +479,46 @@ CpuDefinitionInfoList *qmp_query_cpu_definitions(Error **errp)
     return arch_query_cpu_definitions(errp);
 }
 
+void qmp_add_client(const char *protocol, const char *fdname,
+                    bool has_skipauth, bool skipauth, bool has_tls, bool tls,
+                    Error **errp)
+{
+    CharDriverState *s;
+    int fd;
+
+    fd = monitor_get_fd(cur_mon, fdname, errp);
+    if (fd < 0) {
+        return;
+    }
+
+    if (strcmp(protocol, "spice") == 0) {
+        if (!using_spice) {
+            error_set(errp, QERR_DEVICE_NOT_ACTIVE, "spice");
+            close(fd);
+            return;
+        }
+        skipauth = has_skipauth ? skipauth : false;
+        tls = has_tls ? tls : false;
+        if (qemu_spice_display_add_client(fd, skipauth, tls) < 0) {
+            error_setg(errp, "spice failed to add client");
+            close(fd);
+        }
+        return;
+#ifdef CONFIG_VNC
+    } else if (strcmp(protocol, "vnc") == 0) {
+        skipauth = has_skipauth ? skipauth : false;
+        vnc_display_add_client(NULL, fd, skipauth);
+        return;
+#endif
+    } else if ((s = qemu_chr_find(protocol)) != NULL) {
+        if (qemu_chr_add_client(s, fd) < 0) {
+            error_setg(errp, "failed to add client");
+            close(fd);
+            return;
+        }
+        return;
+    }
+
+    error_setg(errp, "protocol '%s' is invalid", protocol);
+    close(fd);
+}
-- 
cgit v1.2.3


From d691180e41f58cc3c1f9fa848c2ab89193503160 Mon Sep 17 00:00:00 2001
From: Luiz Capitulino <lcapitulino@redhat.com>
Date: Fri, 21 Sep 2012 13:10:58 -0300
Subject: qmp: dump-guest-memory: improve schema doc (again)

 o Add a note about memory allocation with paging=true
 o Fix indentation

Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
---
 qapi-schema.json | 32 ++++++++++++++++++++------------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/qapi-schema.json b/qapi-schema.json
index 191d92194..c6a676783 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -2007,26 +2007,33 @@
 # supported on i386 and x86_64.
 #
 # @paging: if true, do paging to get guest's memory mapping. This allows
-# using gdb to process the core file. However, setting @paging to false
-# may be desirable because of two reasons:
+#          using gdb to process the core file.
 #
-#   1. The guest may be in a catastrophic state or can have corrupted
-#      memory, which cannot be trusted
-#   2. The guest can be in real-mode even if paging is enabled. For example,
-#      the guest uses ACPI to sleep, and ACPI sleep state goes in real-mode
+#          IMPORTANT: this option can make QEMU allocate several gigabytes
+#                     of RAM. This can happen for a large guest, or a
+#                     malicious guest pretending to be large.
+#
+#          Also, paging=true has the following limitations:
+#
+#             1. The guest may be in a catastrophic state or can have corrupted
+#                memory, which cannot be trusted
+#             2. The guest can be in real-mode even if paging is enabled. For
+#                example, the guest uses ACPI to sleep, and ACPI sleep state
+#                goes in real-mode
 #
 # @protocol: the filename or file descriptor of the vmcore. The supported
-# protocols are:
+#            protocols are:
 #
-#   1. file: the protocol starts with "file:", and the following string is
-#      the file's path.
-#   2. fd: the protocol starts with "fd:", and the following string is the
-#      fd's name.
+#            1. file: the protocol starts with "file:", and the following
+#               string is the file's path.
+#            2. fd: the protocol starts with "fd:", and the following string
+#               is the fd's name.
 #
 # @begin: #optional if specified, the starting physical address.
 #
 # @length: #optional if specified, the memory size, in bytes. If you don't
-# want to dump all guest's memory, please specify the start @begin and @length
+#          want to dump all guest's memory, please specify the start @begin
+#          and @length
 #
 # Returns: nothing on success
 #
@@ -2035,6 +2042,7 @@
 { 'command': 'dump-guest-memory',
   'data': { 'paging': 'bool', 'protocol': 'str', '*begin': 'int',
             '*length': 'int' } }
+
 ##
 # @netdev_add:
 #
-- 
cgit v1.2.3


From 2f61652d660ec1ffdadf926401a174c11f5c13a7 Mon Sep 17 00:00:00 2001
From: Luiz Capitulino <lcapitulino@redhat.com>
Date: Fri, 21 Sep 2012 13:17:55 -0300
Subject: qmp: dump-guest-memory: don't spin if non-blocking fd would block

fd_write_vmcore() will indefinitely spin for a non-blocking
file-descriptor that would block. However, if the fd is non-blocking,
how does it make sense to spin?

Change this behavior to return an error instead.

Note that this can only happen with an fd provided by a management
application. The fd opened internally by dump-guest-memory is blocking.

While there, also fix 'writen_size' variable name.

Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
---
 dump.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/dump.c b/dump.c
index 1a3c7164b..6b7c12790 100644
--- a/dump.c
+++ b/dump.c
@@ -100,18 +100,11 @@ static void dump_error(DumpState *s, const char *reason)
 static int fd_write_vmcore(void *buf, size_t size, void *opaque)
 {
     DumpState *s = opaque;
-    int fd = s->fd;
-    size_t writen_size;
+    size_t written_size;
 
-    /* The fd may be passed from user, and it can be non-blocked */
-    while (size) {
-        writen_size = qemu_write_full(fd, buf, size);
-        if (writen_size != size && errno != EAGAIN) {
-            return -1;
-        }
-
-        buf += writen_size;
-        size -= writen_size;
+    written_size = qemu_write_full(s->fd, buf, size);
+    if (written_size != size) {
+        return -1;
     }
 
     return 0;
-- 
cgit v1.2.3


From 753637695bcf6b80ea96614de5d31161603ad50f Mon Sep 17 00:00:00 2001
From: Luiz Capitulino <lcapitulino@redhat.com>
Date: Fri, 21 Sep 2012 13:53:00 -0300
Subject: hmp: dump-guest-memory: hardcode protocol argument to "file:"

Today, it's necessary to specify the protocol you want to use
when dumping the guest memory, for example:

 (qemu) dump-guest-memory file:/tmp/guest-memory

This has a few issues:

 1. It's cumbersome to type
 2. We lose file path autocompletion
 3. Being able to specify fd:X in HMP makes little sense for humans

Because of these reasons, hardcode the 'protocol' argument to
'file:' in HMP.

Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
---
 hmp-commands.hx | 8 +++-----
 hmp.c           | 8 ++++++--
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/hmp-commands.hx b/hmp-commands.hx
index ed67e997f..0302458df 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -914,12 +914,11 @@ ETEXI
 #if defined(CONFIG_HAVE_CORE_DUMP)
     {
         .name       = "dump-guest-memory",
-        .args_type  = "paging:-p,protocol:s,begin:i?,length:i?",
-        .params     = "[-p] protocol [begin] [length]",
+        .args_type  = "paging:-p,filename:F,begin:i?,length:i?",
+        .params     = "[-p] filename [begin] [length]",
         .help       = "dump guest memory to file"
                       "\n\t\t\t begin(optional): the starting physical address"
                       "\n\t\t\t length(optional): the memory size, in bytes",
-        .user_print = monitor_user_noop,
         .mhandler.cmd = hmp_dump_guest_memory,
     },
 
@@ -929,8 +928,7 @@ STEXI
 @findex dump-guest-memory
 Dump guest memory to @var{protocol}. The file can be processed with crash or
 gdb.
-  protocol: destination file(started with "file:") or destination file
-            descriptor (started with "fd:")
+  filename: dump file name
     paging: do paging to get guest's memory mapping
      begin: the starting physical address. It's optional, and should be
             specified with length together.
diff --git a/hmp.c b/hmp.c
index ba6fbd3dc..2de31401e 100644
--- a/hmp.c
+++ b/hmp.c
@@ -1042,11 +1042,12 @@ void hmp_dump_guest_memory(Monitor *mon, const QDict *qdict)
 {
     Error *errp = NULL;
     int paging = qdict_get_try_bool(qdict, "paging", 0);
-    const char *file = qdict_get_str(qdict, "protocol");
+    const char *file = qdict_get_str(qdict, "filename");
     bool has_begin = qdict_haskey(qdict, "begin");
     bool has_length = qdict_haskey(qdict, "length");
     int64_t begin = 0;
     int64_t length = 0;
+    char *prot;
 
     if (has_begin) {
         begin = qdict_get_int(qdict, "begin");
@@ -1055,9 +1056,12 @@ void hmp_dump_guest_memory(Monitor *mon, const QDict *qdict)
         length = qdict_get_int(qdict, "length");
     }
 
-    qmp_dump_guest_memory(paging, file, has_begin, begin, has_length, length,
+    prot = g_strconcat("file:", file, NULL);
+
+    qmp_dump_guest_memory(paging, prot, has_begin, begin, has_length, length,
                           &errp);
     hmp_handle_error(mon, &errp);
+    g_free(prot);
 }
 
 void hmp_netdev_add(Monitor *mon, const QDict *qdict)
-- 
cgit v1.2.3


From 05a3543dbddd03d6be723be4074e2e661b00b851 Mon Sep 17 00:00:00 2001
From: Luiz Capitulino <lcapitulino@redhat.com>
Date: Thu, 20 Sep 2012 13:44:28 -0300
Subject: input: qmp_send_key(): simplify

The current code duplicates the QKeyCodeList keys in order to store
the key values for release_keys() to use when it runs later. This is a
bit complicated though, as we have to take care of correct ordering, and
release_keys() then has to index key_defs[] all over again.

Switch to an array of integers, which is dynamically allocated and stores
the already converted key value.

This simplifies the current code and the next commit.

Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
---
 input.c | 36 ++++++++++++++----------------------
 1 file changed, 14 insertions(+), 22 deletions(-)

diff --git a/input.c b/input.c
index c4b0619a7..32c605748 100644
--- a/input.c
+++ b/input.c
@@ -224,30 +224,31 @@ int index_from_keycode(int code)
     return i;
 }
 
-static QKeyCodeList *keycodes;
+static int *keycodes;
+static int keycodes_size;
 static QEMUTimer *key_timer;
 
 static void release_keys(void *opaque)
 {
-    int keycode;
-    QKeyCodeList *p;
+    int i;
 
-    for (p = keycodes; p != NULL; p = p->next) {
-        keycode = key_defs[p->value];
-        if (keycode & 0x80) {
+    for (i = 0; i < keycodes_size; i++) {
+        if (keycodes[i] & 0x80) {
             kbd_put_keycode(0xe0);
         }
-        kbd_put_keycode(keycode | 0x80);
+        kbd_put_keycode(keycodes[i]| 0x80);
     }
-    qapi_free_QKeyCodeList(keycodes);
+
+    g_free(keycodes);
     keycodes = NULL;
+    keycodes_size = 0;
 }
 
 void qmp_send_key(QKeyCodeList *keys, bool has_hold_time, int64_t hold_time,
                   Error **errp)
 {
     int keycode;
-    QKeyCodeList *p, *keylist, *head = NULL, *tmp = NULL;
+    QKeyCodeList *p;
 
     if (!key_timer) {
         key_timer = qemu_new_timer_ns(vm_clock, release_keys, NULL);
@@ -257,31 +258,22 @@ void qmp_send_key(QKeyCodeList *keys, bool has_hold_time, int64_t hold_time,
         qemu_del_timer(key_timer);
         release_keys(NULL);
     }
+
     if (!has_hold_time) {
         hold_time = 100;
     }
 
     for (p = keys; p != NULL; p = p->next) {
-        keylist = g_malloc0(sizeof(*keylist));
-        keylist->value = p->value;
-        keylist->next = NULL;
-
-        if (!head) {
-            head = keylist;
-        }
-        if (tmp) {
-            tmp->next = keylist;
-        }
-        tmp = keylist;
-
         /* key down events */
         keycode = key_defs[p->value];
         if (keycode & 0x80) {
             kbd_put_keycode(0xe0);
         }
         kbd_put_keycode(keycode & 0x7f);
+
+        keycodes = g_realloc(keycodes, sizeof(int) * (keycodes_size + 1));
+        keycodes[keycodes_size++] = keycode;
     }
-    keycodes = head;
 
     /* delayed key up events */
     qemu_mod_timer(key_timer, qemu_get_clock_ns(vm_clock) +
-- 
cgit v1.2.3


From 9f32897768064841fe9a99145c9d15ab6667ffed Mon Sep 17 00:00:00 2001
From: Luiz Capitulino <lcapitulino@redhat.com>
Date: Thu, 20 Sep 2012 14:19:47 -0300
Subject: qmp: qmp_send_key(): accept key codes in hex

Before the qapi conversion, the sendkey command could be used to
send key codes in hex directly to the guest. In HMP, this would
be like:

 (qemu) sendkey 0xdc

However, the qapi conversion broke this, as it only supports sending
QKeyCode values to the guest. That's a regression.

This commit fixes the problem by adding hex value support down
the QMP interface, qmp_send_key().

In more detail, this commit:

 1. Adds the KeyValue union. This can represent a hex value or
    a QKeyCode value

 2. *Changes* the QMP send-key command to take a KeyValue argument
    instead of a QKeyCode one

 3. Adapts hmp_send_key() to the QMP interface changes

Item 2 is an incompatible change, but as we're in the development phase
(and this command was merged only a few weeks ago) this shouldn't be
a problem.

Finally, it's not possible to split this commit without breaking the
build.

Reported-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
---
 hmp.c            | 43 +++++++++++++++++++++++++++++--------------
 input.c          | 33 +++++++++++++++++++++++++++------
 qapi-schema.json | 20 +++++++++++++++++---
 3 files changed, 73 insertions(+), 23 deletions(-)

diff --git a/hmp.c b/hmp.c
index 2de31401e..3306bcdbb 100644
--- a/hmp.c
+++ b/hmp.c
@@ -1113,13 +1113,13 @@ void hmp_closefd(Monitor *mon, const QDict *qdict)
 void hmp_send_key(Monitor *mon, const QDict *qdict)
 {
     const char *keys = qdict_get_str(qdict, "keys");
-    QKeyCodeList *keylist, *head = NULL, *tmp = NULL;
+    KeyValueList *keylist, *head = NULL, *tmp = NULL;
     int has_hold_time = qdict_haskey(qdict, "hold-time");
     int hold_time = qdict_get_try_int(qdict, "hold-time", -1);
     Error *err = NULL;
     char keyname_buf[16];
     char *separator;
-    int keyname_len, idx;
+    int keyname_len;
 
     while (1) {
         separator = strchr(keys, '-');
@@ -1133,15 +1133,8 @@ void hmp_send_key(Monitor *mon, const QDict *qdict)
         }
         keyname_buf[keyname_len] = 0;
 
-        idx = index_from_key(keyname_buf);
-        if (idx == Q_KEY_CODE_MAX) {
-            monitor_printf(mon, "invalid parameter: %s\n", keyname_buf);
-            break;
-        }
-
         keylist = g_malloc0(sizeof(*keylist));
-        keylist->value = idx;
-        keylist->next = NULL;
+        keylist->value = g_malloc0(sizeof(*keylist->value));
 
         if (!head) {
             head = keylist;
@@ -1151,17 +1144,39 @@ void hmp_send_key(Monitor *mon, const QDict *qdict)
         }
         tmp = keylist;
 
+        if (strstart(keyname_buf, "0x", NULL)) {
+            char *endp;
+            int value = strtoul(keyname_buf, &endp, 0);
+            if (*endp != '\0') {
+                goto err_out;
+            }
+            keylist->value->kind = KEY_VALUE_KIND_NUMBER;
+            keylist->value->number = value;
+        } else {
+            int idx = index_from_key(keyname_buf);
+            if (idx == Q_KEY_CODE_MAX) {
+                goto err_out;
+            }
+            keylist->value->kind = KEY_VALUE_KIND_QCODE;
+            keylist->value->qcode = idx;
+        }
+
         if (!separator) {
             break;
         }
         keys = separator + 1;
     }
 
-    if (idx != Q_KEY_CODE_MAX) {
-        qmp_send_key(head, has_hold_time, hold_time, &err);
-    }
+    qmp_send_key(head, has_hold_time, hold_time, &err);
     hmp_handle_error(mon, &err);
-    qapi_free_QKeyCodeList(head);
+
+out:
+    qapi_free_KeyValueList(head);
+    return;
+
+err_out:
+    monitor_printf(mon, "invalid parameter: %s\n", keyname_buf);
+    goto out;
 }
 
 void hmp_screen_dump(Monitor *mon, const QDict *qdict)
diff --git a/input.c b/input.c
index 32c605748..76ade64d0 100644
--- a/input.c
+++ b/input.c
@@ -228,6 +228,23 @@ static int *keycodes;
 static int keycodes_size;
 static QEMUTimer *key_timer;
 
+static int keycode_from_keyvalue(const KeyValue *value)
+{
+    if (value->kind == KEY_VALUE_KIND_QCODE) {
+        return key_defs[value->qcode];
+    } else {
+        assert(value->kind == KEY_VALUE_KIND_NUMBER);
+        return value->number;
+    }
+}
+
+static void free_keycodes(void)
+{
+    g_free(keycodes);
+    keycodes = NULL;
+    keycodes_size = 0;
+}
+
 static void release_keys(void *opaque)
 {
     int i;
@@ -239,16 +256,14 @@ static void release_keys(void *opaque)
         kbd_put_keycode(keycodes[i]| 0x80);
     }
 
-    g_free(keycodes);
-    keycodes = NULL;
-    keycodes_size = 0;
+    free_keycodes();
 }
 
-void qmp_send_key(QKeyCodeList *keys, bool has_hold_time, int64_t hold_time,
+void qmp_send_key(KeyValueList *keys, bool has_hold_time, int64_t hold_time,
                   Error **errp)
 {
     int keycode;
-    QKeyCodeList *p;
+    KeyValueList *p;
 
     if (!key_timer) {
         key_timer = qemu_new_timer_ns(vm_clock, release_keys, NULL);
@@ -265,7 +280,13 @@ void qmp_send_key(QKeyCodeList *keys, bool has_hold_time, int64_t hold_time,
 
     for (p = keys; p != NULL; p = p->next) {
         /* key down events */
-        keycode = key_defs[p->value];
+        keycode = keycode_from_keyvalue(p->value);
+        if (keycode < 0x01 || keycode > 0xff) {
+            error_setg(errp, "invalid hex keycode 0x%x\n", keycode);
+            free_keycodes();
+            return;
+        }
+
         if (keycode & 0x80) {
             kbd_put_keycode(0xe0);
         }
diff --git a/qapi-schema.json b/qapi-schema.json
index c6a676783..28d8815df 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -2620,13 +2620,27 @@
             'props', 'undo', 'front', 'copy', 'open', 'paste', 'find', 'cut',
              'lf', 'help', 'meta_l', 'meta_r', 'compose' ] }
 
+##
+# @KeyValue
+#
+# Represents a keyboard key.
+#
+# Since: 1.3.0
+##
+{ 'union': 'KeyValue',
+  'data': {
+    'number': 'int',
+    'qcode': 'QKeyCode' } }
+
 ##
 # @send-key:
 #
 # Send keys to guest.
 #
-# @keys: key sequence. 'keys' is the name of the key. Use a JSON array to
-#        press several keys simultaneously.
+# @keys: An array of @KeyValue elements. All @KeyValues in this array are
+#        simultaneously sent to the guest. A @KeyValue.number value is sent
+#        directly to the guest, while @KeyValue.qcode must be a valid
+#        @QKeyCode value
 #
 # @hold-time: #optional time to delay key up events, milliseconds. Defaults
 #             to 100
@@ -2638,7 +2652,7 @@
 #
 ##
 { 'command': 'send-key',
-  'data': { 'keys': ['QKeyCode'], '*hold-time': 'int' } }
+  'data': { 'keys': ['KeyValue'], '*hold-time': 'int' } }
 
 ##
 # @screendump:
-- 
cgit v1.2.3


From 9d537c9019e6a05713b44900c78447a0cfb06567 Mon Sep 17 00:00:00 2001
From: Luiz Capitulino <lcapitulino@redhat.com>
Date: Thu, 20 Sep 2012 14:47:02 -0300
Subject: input: index_from_key(): drop unused code

The hex key conversion has been unused since the last commit.

Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Markus Armbruster <armbru@redhat.com>
---
 input.c | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/input.c b/input.c
index 76ade64d0..25d3973e2 100644
--- a/input.c
+++ b/input.c
@@ -186,8 +186,7 @@ static const int key_defs[] = {
 
 int index_from_key(const char *key)
 {
-    int i, keycode;
-    char *endp;
+    int i;
 
     for (i = 0; QKeyCode_lookup[i] != NULL; i++) {
         if (!strcmp(key, QKeyCode_lookup[i])) {
@@ -195,17 +194,6 @@ int index_from_key(const char *key)
         }
     }
 
-    if (strstart(key, "0x", NULL)) {
-        keycode = strtoul(key, &endp, 0);
-        if (*endp == '\0' && keycode >= 0x01 && keycode <= 0xff) {
-            for (i = 0; i < Q_KEY_CODE_MAX; i++) {
-                if (keycode == key_defs[i]) {
-                    break;
-                }
-            }
-        }
-    }
-
     /* Return Q_KEY_CODE_MAX if the key is invalid */
     return i;
 }
-- 
cgit v1.2.3


From 8bde9b6f8892c15e46e1f37a37ac038313be4d58 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Wed, 26 Sep 2012 16:34:29 +0200
Subject: block: live snapshot documentation tweaks

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
---
 qapi-schema.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/qapi-schema.json b/qapi-schema.json
index 28d8815df..f4c21855c 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -1399,7 +1399,7 @@
 # @format: #optional the format of the snapshot image, default is 'qcow2'.
 #
 # @mode: #optional whether and how QEMU should create a new image, default is
-# 'absolute-paths'.
+#        'absolute-paths'.
 ##
 { 'type': 'BlockdevSnapshot',
   'data': { 'device': 'str', 'snapshot-file': 'str', '*format': 'str',
@@ -1453,7 +1453,7 @@
 # @format: #optional the format of the snapshot image, default is 'qcow2'.
 #
 # @mode: #optional whether and how QEMU should create a new image, default is
-# 'absolute-paths'.
+#        'absolute-paths'.
 #
 # Returns: nothing on success
 #          If @device is not a valid block device, DeviceNotFound
-- 
cgit v1.2.3


From 0cea71a207508c2b8f563b2644ac46009832c8f4 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Mon, 24 Sep 2012 15:09:30 +0200
Subject: virtio: don't mark unaccessed memory as dirty

The offset into the accessed buffer is advanced by iov_len, so it
can exceed the accessed len. If that happens, the
math in len - offset wraps around, and size becomes wrong.
As the real value is 0, this is harmless but unnecessary.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/virtio.c b/hw/virtio.c
index 209c76375..b5764bb8f 100644
--- a/hw/virtio.c
+++ b/hw/virtio.c
@@ -241,7 +241,7 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem,
                                   elem->in_sg[i].iov_len,
                                   1, size);
 
-        offset += elem->in_sg[i].iov_len;
+        offset += size;
     }
 
     for (i = 0; i < elem->out_num; i++)
-- 
cgit v1.2.3


From 40bad8f3deba15e2074ff34cfe923c12916b1cc5 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Mon, 24 Sep 2012 15:15:43 +0200
Subject: virtio-net: fix used len for tx

There is no out sg for TX, so used buf length for tx
should always be 0.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio-net.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index 649074329..247d7bef5 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -690,7 +690,7 @@ static void virtio_net_tx_complete(NetClientState *nc, ssize_t len)
 {
     VirtIONet *n = DO_UPCAST(NICState, nc, nc)->opaque;
 
-    virtqueue_push(n->tx_vq, &n->async_tx.elem, n->async_tx.len);
+    virtqueue_push(n->tx_vq, &n->async_tx.elem, 0);
     virtio_notify(&n->vdev, n->tx_vq);
 
     n->async_tx.elem.out_num = n->async_tx.len = 0;
@@ -754,7 +754,7 @@ static int32_t virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
 
         len += ret;
 
-        virtqueue_push(vq, &elem, len);
+        virtqueue_push(vq, &elem, 0);
         virtio_notify(&n->vdev, vq);
 
         if (++num_packets >= n->tx_burst) {
-- 
cgit v1.2.3


From 844b5cea8ea6cbe964670a26d1b34037067569df Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Mon, 24 Sep 2012 12:50:32 +0200
Subject: iov: add const annotation

iov_from_buf does not change iov, make it const.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 iov.c | 2 +-
 iov.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/iov.c b/iov.c
index 60705c73a..c6a66f0af 100644
--- a/iov.c
+++ b/iov.c
@@ -26,7 +26,7 @@
 # include <sys/socket.h>
 #endif
 
-size_t iov_from_buf(struct iovec *iov, unsigned int iov_cnt,
+size_t iov_from_buf(const struct iovec *iov, unsigned int iov_cnt,
                     size_t offset, const void *buf, size_t bytes)
 {
     size_t done;
diff --git a/iov.h b/iov.h
index 381f37a54..a73569f94 100644
--- a/iov.h
+++ b/iov.h
@@ -36,7 +36,7 @@ size_t iov_size(const struct iovec *iov, const unsigned int iov_cnt);
  * such "large" value is -1 (sinice size_t is unsigned),
  * so specifying `-1' as `bytes' means 'up to the end of iovec'.
  */
-size_t iov_from_buf(struct iovec *iov, unsigned int iov_cnt,
+size_t iov_from_buf(const struct iovec *iov, unsigned int iov_cnt,
                     size_t offset, const void *buf, size_t bytes);
 size_t iov_to_buf(const struct iovec *iov, const unsigned int iov_cnt,
                   size_t offset, void *buf, size_t bytes);
-- 
cgit v1.2.3


From 385ce95d9d060f20870402c8b2b503d0b6ab8af0 Mon Sep 17 00:00:00 2001
From: Amit Shah <amit.shah@redhat.com>
Date: Tue, 25 Sep 2012 00:05:14 +0530
Subject: virtio: use unsigned int for counting bytes in vq

The virtqueue_avail_bytes() function counts bytes in an int.  Use an
unsigned int instead.

Signed-off-by: Amit Shah <amit.shah@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/virtio.c b/hw/virtio.c
index b5764bb8f..cfad36361 100644
--- a/hw/virtio.c
+++ b/hw/virtio.c
@@ -338,7 +338,7 @@ static unsigned virtqueue_next_desc(target_phys_addr_t desc_pa,
 int virtqueue_avail_bytes(VirtQueue *vq, int in_bytes, int out_bytes)
 {
     unsigned int idx;
-    int total_bufs, in_total, out_total;
+    unsigned int total_bufs, in_total, out_total;
 
     idx = vq->last_avail_idx;
 
-- 
cgit v1.2.3


From 0d8d7690850eb0cf2b2b60933cf47669a6b6f18f Mon Sep 17 00:00:00 2001
From: Amit Shah <amit.shah@redhat.com>
Date: Tue, 25 Sep 2012 00:05:15 +0530
Subject: virtio: Introduce virtqueue_get_avail_bytes()

The current virtqueue_avail_bytes() is oddly named, and checks if a
particular number of bytes are available in a vq.  A better API is to
fetch the number of bytes available in the vq, and let the caller do
what's interesting with the numbers.

Introduce virtqueue_get_avail_bytes(), which returns the number of bytes
available in both the in and the out buffers.  virtqueue_avail_bytes()
is made a wrapper over this new function.

Signed-off-by: Amit Shah <amit.shah@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio.c | 28 +++++++++++++++++++++-------
 hw/virtio.h |  5 ++++-
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/hw/virtio.c b/hw/virtio.c
index cfad36361..6821092df 100644
--- a/hw/virtio.c
+++ b/hw/virtio.c
@@ -335,7 +335,8 @@ static unsigned virtqueue_next_desc(target_phys_addr_t desc_pa,
     return next;
 }
 
-int virtqueue_avail_bytes(VirtQueue *vq, int in_bytes, int out_bytes)
+void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes,
+                               unsigned int *out_bytes)
 {
     unsigned int idx;
     unsigned int total_bufs, in_total, out_total;
@@ -380,13 +381,9 @@ int virtqueue_avail_bytes(VirtQueue *vq, int in_bytes, int out_bytes)
             }
 
             if (vring_desc_flags(desc_pa, i) & VRING_DESC_F_WRITE) {
-                if (in_bytes > 0 &&
-                    (in_total += vring_desc_len(desc_pa, i)) >= in_bytes)
-                    return 1;
+                in_total += vring_desc_len(desc_pa, i);
             } else {
-                if (out_bytes > 0 &&
-                    (out_total += vring_desc_len(desc_pa, i)) >= out_bytes)
-                    return 1;
+                out_total += vring_desc_len(desc_pa, i);
             }
         } while ((i = virtqueue_next_desc(desc_pa, i, max)) != max);
 
@@ -395,7 +392,24 @@ int virtqueue_avail_bytes(VirtQueue *vq, int in_bytes, int out_bytes)
         else
             total_bufs++;
     }
+    if (in_bytes) {
+        *in_bytes = in_total;
+    }
+    if (out_bytes) {
+        *out_bytes = out_total;
+    }
+}
 
+int virtqueue_avail_bytes(VirtQueue *vq, unsigned int in_bytes,
+                          unsigned int out_bytes)
+{
+    unsigned int in_total, out_total;
+
+    virtqueue_get_avail_bytes(vq, &in_total, &out_total);
+    if ((in_bytes && in_bytes < in_total)
+        || (out_bytes && out_bytes < out_total)) {
+        return 1;
+    }
     return 0;
 }
 
diff --git a/hw/virtio.h b/hw/virtio.h
index 7a4f56452..80de3757e 100644
--- a/hw/virtio.h
+++ b/hw/virtio.h
@@ -147,7 +147,10 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem,
 void virtqueue_map_sg(struct iovec *sg, target_phys_addr_t *addr,
     size_t num_sg, int is_write);
 int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem);
-int virtqueue_avail_bytes(VirtQueue *vq, int in_bytes, int out_bytes);
+int virtqueue_avail_bytes(VirtQueue *vq, unsigned int in_bytes,
+                          unsigned int out_bytes);
+void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes,
+                               unsigned int *out_bytes);
 
 void virtio_notify(VirtIODevice *vdev, VirtQueue *vq);
 
-- 
cgit v1.2.3


From ad3005ad8c70a69705149d3ce6d1e51fb76edb15 Mon Sep 17 00:00:00 2001
From: Amit Shah <amit.shah@redhat.com>
Date: Tue, 25 Sep 2012 00:05:16 +0530
Subject: virtio-serial-bus: let chardev know the exact number of bytes
 requested

Using the virtqueue_avail_bytes() function had an unnecessarily
crippling effect on the number of bytes needed by the guest as reported
to the chardev layer in the can_read() callback.

Using the new virtqueue_get_avail_bytes() function will let us advertise
the exact number of bytes we can send to the guest.

Signed-off-by: Amit Shah <amit.shah@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio-serial-bus.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/hw/virtio-serial-bus.c b/hw/virtio-serial-bus.c
index 82073f5dc..d20bd8bf7 100644
--- a/hw/virtio-serial-bus.c
+++ b/hw/virtio-serial-bus.c
@@ -287,6 +287,7 @@ ssize_t virtio_serial_write(VirtIOSerialPort *port, const uint8_t *buf,
 size_t virtio_serial_guest_ready(VirtIOSerialPort *port)
 {
     VirtQueue *vq = port->ivq;
+    unsigned int bytes;
 
     if (!virtio_queue_ready(vq) ||
         !(port->vser->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK) ||
@@ -296,14 +297,8 @@ size_t virtio_serial_guest_ready(VirtIOSerialPort *port)
     if (use_multiport(port->vser) && !port->guest_connected) {
         return 0;
     }
-
-    if (virtqueue_avail_bytes(vq, 4096, 0)) {
-        return 4096;
-    }
-    if (virtqueue_avail_bytes(vq, 1, 0)) {
-        return 1;
-    }
-    return 0;
+    virtqueue_get_avail_bytes(vq, &bytes, NULL);
+    return bytes;
 }
 
 static void flush_queued_data_bh(void *opaque)
-- 
cgit v1.2.3


From 1ceee0d5cc841fc9ca8e72b81450b598ab307f14 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 28 Sep 2012 17:22:56 +0200
Subject: iostatus: change is_read to a bool

Do this while we are touching this part of the code, before introducing
more uses of "int is_read".

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c         | 4 ++--
 block.h         | 4 ++--
 blockdev.c      | 2 +-
 hw/ide/core.c   | 2 +-
 hw/ide/pci.c    | 4 ++--
 hw/scsi-disk.c  | 2 +-
 hw/virtio-blk.c | 4 ++--
 7 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/block.c b/block.c
index 1c3ebd785..0bae0461d 100644
--- a/block.c
+++ b/block.c
@@ -1387,7 +1387,7 @@ void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
 }
 
 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
-                               BlockErrorAction action, int is_read)
+                               BlockErrorAction action, bool is_read)
 {
     QObject *data;
     const char *action_str;
@@ -2481,7 +2481,7 @@ void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
     bs->on_write_error = on_write_error;
 }
 
-BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, int is_read)
+BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
 {
     return is_read ? bs->on_read_error : bs->on_write_error;
 }
diff --git a/block.h b/block.h
index ee8112945..47dd905bc 100644
--- a/block.h
+++ b/block.h
@@ -109,7 +109,7 @@ void bdrv_iostatus_disable(BlockDriverState *bs);
 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs);
 void bdrv_iostatus_set_err(BlockDriverState *bs, int error);
 void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
-                               BlockErrorAction action, int is_read);
+                               BlockErrorAction action, bool is_read);
 void bdrv_info_print(Monitor *mon, const QObject *data);
 void bdrv_info(Monitor *mon, QObject **ret_data);
 void bdrv_stats_print(Monitor *mon, const QObject *data);
@@ -281,7 +281,7 @@ int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
 
 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
                        BlockdevOnError on_write_error);
-BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, int is_read);
+BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read);
 int bdrv_is_read_only(BlockDriverState *bs);
 int bdrv_is_sg(BlockDriverState *bs);
 int bdrv_enable_write_cache(BlockDriverState *bs);
diff --git a/blockdev.c b/blockdev.c
index 63307154b..d52a83076 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -238,7 +238,7 @@ static void drive_put_ref_bh_schedule(DriveInfo *dinfo)
     qemu_bh_schedule(s->bh);
 }
 
-static int parse_block_error_action(const char *buf, int is_read)
+static int parse_block_error_action(const char *buf, bool is_read)
 {
     if (!strcmp(buf, "ignore")) {
         return BLOCKDEV_ON_ERROR_IGNORE;
diff --git a/hw/ide/core.c b/hw/ide/core.c
index 2620e87ae..c03db4a3a 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -556,7 +556,7 @@ void ide_dma_error(IDEState *s)
 
 static int ide_handle_rw_error(IDEState *s, int error, int op)
 {
-    int is_read = (op & BM_STATUS_RETRY_READ);
+    bool is_read = (op & BM_STATUS_RETRY_READ) != 0;
     BlockdevOnError action = bdrv_get_on_error(s->bs, is_read);
 
     if (action == BLOCKDEV_ON_ERROR_IGNORE) {
diff --git a/hw/ide/pci.c b/hw/ide/pci.c
index 88c0942e3..644533f77 100644
--- a/hw/ide/pci.c
+++ b/hw/ide/pci.c
@@ -188,7 +188,7 @@ static void bmdma_restart_bh(void *opaque)
 {
     BMDMAState *bm = opaque;
     IDEBus *bus = bm->bus;
-    int is_read;
+    bool is_read;
     int error_status;
 
     qemu_bh_delete(bm->bh);
@@ -198,7 +198,7 @@ static void bmdma_restart_bh(void *opaque)
         return;
     }
 
-    is_read = !!(bus->error_status & BM_STATUS_RETRY_READ);
+    is_read = (bus->error_status & BM_STATUS_RETRY_READ) != 0;
 
     /* The error status must be cleared before resubmitting the request: The
      * request may fail again, and this case can only be distinguished if the
diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index c295326e9..2dd99a90a 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -386,7 +386,7 @@ static void scsi_read_data(SCSIRequest *req)
  */
 static int scsi_handle_rw_error(SCSIDiskReq *r, int error)
 {
-    int is_read = (r->req.cmd.xfer == SCSI_XFER_FROM_DEV);
+    bool is_read = (r->req.cmd.xfer == SCSI_XFER_FROM_DEV);
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
     BlockdevOnError action = bdrv_get_on_error(s->qdev.conf.bs, is_read);
 
diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index f178fa86c..1ac2483b5 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -64,7 +64,7 @@ static void virtio_blk_req_complete(VirtIOBlockReq *req, int status)
 }
 
 static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
-    int is_read)
+    bool is_read)
 {
     BlockdevOnError action = bdrv_get_on_error(req->dev->bs, is_read);
     VirtIOBlock *s = req->dev;
@@ -98,7 +98,7 @@ static void virtio_blk_rw_complete(void *opaque, int ret)
     trace_virtio_blk_rw_complete(req, ret);
 
     if (ret) {
-        int is_read = !(ldl_p(&req->out->type) & VIRTIO_BLK_T_OUT);
+        bool is_read = !(ldl_p(&req->out->type) & VIRTIO_BLK_T_OUT);
         if (virtio_blk_handle_rw_error(req, -ret, is_read))
             return;
     }
-- 
cgit v1.2.3


From 3e1caa5f76a9104a0d574b0f28b3dafe986a8408 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 28 Sep 2012 17:22:57 +0200
Subject: iostatus: reorganize io error code

Move the common part of IDE/SCSI/virtio error handling to the block
layer.  The new function bdrv_error_action subsumes all three of
bdrv_emit_qmp_error_event, vm_stop, bdrv_iostatus_set_err.

The same scheme will be used for errors in block jobs.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c         | 46 ++++++++++++++++++++++++++++++++++++++--------
 block.h         |  5 +++--
 hw/ide/core.c   | 20 +++++---------------
 hw/scsi-disk.c  | 23 +++++++----------------
 hw/virtio-blk.c | 19 +++++--------------
 qemu-tool.c     |  6 ++++++
 6 files changed, 64 insertions(+), 55 deletions(-)

diff --git a/block.c b/block.c
index 0bae0461d..8b0ba6722 100644
--- a/block.c
+++ b/block.c
@@ -29,6 +29,7 @@
 #include "blockjob.h"
 #include "module.h"
 #include "qjson.h"
+#include "sysemu.h"
 #include "qemu-coroutine.h"
 #include "qmp-commands.h"
 #include "qemu-timer.h"
@@ -1386,8 +1387,8 @@ void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
     }
 }
 
-void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
-                               BlockErrorAction action, bool is_read)
+static void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
+                                      BlockErrorAction action, bool is_read)
 {
     QObject *data;
     const char *action_str;
@@ -2486,6 +2487,39 @@ BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read)
     return is_read ? bs->on_read_error : bs->on_write_error;
 }
 
+BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error)
+{
+    BlockdevOnError on_err = is_read ? bs->on_read_error : bs->on_write_error;
+
+    switch (on_err) {
+    case BLOCKDEV_ON_ERROR_ENOSPC:
+        return (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
+    case BLOCKDEV_ON_ERROR_STOP:
+        return BDRV_ACTION_STOP;
+    case BLOCKDEV_ON_ERROR_REPORT:
+        return BDRV_ACTION_REPORT;
+    case BLOCKDEV_ON_ERROR_IGNORE:
+        return BDRV_ACTION_IGNORE;
+    default:
+        abort();
+    }
+}
+
+/* This is done by device models because, while the block layer knows
+ * about the error, it does not know whether an operation comes from
+ * the device or the block layer (from a job, for example).
+ */
+void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
+                       bool is_read, int error)
+{
+    assert(error >= 0);
+    bdrv_emit_qmp_error_event(bs, action, is_read);
+    if (action == BDRV_ACTION_STOP) {
+        vm_stop(RUN_STATE_IO_ERROR);
+        bdrv_iostatus_set_err(bs, error);
+    }
+}
+
 int bdrv_is_read_only(BlockDriverState *bs)
 {
     return bs->read_only;
@@ -4226,14 +4260,10 @@ void bdrv_iostatus_reset(BlockDriverState *bs)
     }
 }
 
-/* XXX: Today this is set by device models because it makes the implementation
-   quite simple. However, the block layer knows about the error, so it's
-   possible to implement this without device models being involved */
 void bdrv_iostatus_set_err(BlockDriverState *bs, int error)
 {
-    if (bdrv_iostatus_is_enabled(bs) &&
-        bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
-        assert(error >= 0);
+    assert(bdrv_iostatus_is_enabled(bs));
+    if (bs->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
         bs->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                          BLOCK_DEVICE_IO_STATUS_FAILED;
     }
diff --git a/block.h b/block.h
index 47dd905bc..e2d89d7bc 100644
--- a/block.h
+++ b/block.h
@@ -108,8 +108,6 @@ void bdrv_iostatus_reset(BlockDriverState *bs);
 void bdrv_iostatus_disable(BlockDriverState *bs);
 bool bdrv_iostatus_is_enabled(const BlockDriverState *bs);
 void bdrv_iostatus_set_err(BlockDriverState *bs, int error);
-void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
-                               BlockErrorAction action, bool is_read);
 void bdrv_info_print(Monitor *mon, const QObject *data);
 void bdrv_info(Monitor *mon, QObject **ret_data);
 void bdrv_stats_print(Monitor *mon, const QObject *data);
@@ -282,6 +280,9 @@ int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
 void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error,
                        BlockdevOnError on_write_error);
 BlockdevOnError bdrv_get_on_error(BlockDriverState *bs, bool is_read);
+BlockErrorAction bdrv_get_error_action(BlockDriverState *bs, bool is_read, int error);
+void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
+                       bool is_read, int error);
 int bdrv_is_read_only(BlockDriverState *bs);
 int bdrv_is_sg(BlockDriverState *bs);
 int bdrv_enable_write_cache(BlockDriverState *bs);
diff --git a/hw/ide/core.c b/hw/ide/core.c
index c03db4a3a..d683a8cc8 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -557,31 +557,21 @@ void ide_dma_error(IDEState *s)
 static int ide_handle_rw_error(IDEState *s, int error, int op)
 {
     bool is_read = (op & BM_STATUS_RETRY_READ) != 0;
-    BlockdevOnError action = bdrv_get_on_error(s->bs, is_read);
+    BlockErrorAction action = bdrv_get_error_action(s->bs, is_read, error);
 
-    if (action == BLOCKDEV_ON_ERROR_IGNORE) {
-        bdrv_emit_qmp_error_event(s->bs, BDRV_ACTION_IGNORE, is_read);
-        return 0;
-    }
-
-    if ((error == ENOSPC && action == BLOCKDEV_ON_ERROR_ENOSPC)
-            || action == BLOCKDEV_ON_ERROR_STOP) {
+    if (action == BDRV_ACTION_STOP) {
         s->bus->dma->ops->set_unit(s->bus->dma, s->unit);
         s->bus->error_status = op;
-        bdrv_emit_qmp_error_event(s->bs, BDRV_ACTION_STOP, is_read);
-        vm_stop(RUN_STATE_IO_ERROR);
-        bdrv_iostatus_set_err(s->bs, error);
-    } else {
+    } else if (action == BDRV_ACTION_REPORT) {
         if (op & BM_STATUS_DMA_RETRY) {
             dma_buf_commit(s);
             ide_dma_error(s);
         } else {
             ide_rw_error(s);
         }
-        bdrv_emit_qmp_error_event(s->bs, BDRV_ACTION_REPORT, is_read);
     }
-
-    return 1;
+    bdrv_error_action(s->bs, action, is_read, error);
+    return action != BDRV_ACTION_IGNORE;
 }
 
 void ide_dma_cb(void *opaque, int ret)
diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index 2dd99a90a..99bb02ebf 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -388,21 +388,9 @@ static int scsi_handle_rw_error(SCSIDiskReq *r, int error)
 {
     bool is_read = (r->req.cmd.xfer == SCSI_XFER_FROM_DEV);
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
-    BlockdevOnError action = bdrv_get_on_error(s->qdev.conf.bs, is_read);
+    BlockErrorAction action = bdrv_get_error_action(s->qdev.conf.bs, is_read, error);
 
-    if (action == BLOCKDEV_ON_ERROR_IGNORE) {
-        bdrv_emit_qmp_error_event(s->qdev.conf.bs, BDRV_ACTION_IGNORE, is_read);
-        return 0;
-    }
-
-    if ((error == ENOSPC && action == BLOCKDEV_ON_ERROR_ENOSPC)
-            || action == BLOCKDEV_ON_ERROR_STOP) {
-
-        bdrv_emit_qmp_error_event(s->qdev.conf.bs, BDRV_ACTION_STOP, is_read);
-        vm_stop(RUN_STATE_IO_ERROR);
-        bdrv_iostatus_set_err(s->qdev.conf.bs, error);
-        scsi_req_retry(&r->req);
-    } else {
+    if (action == BDRV_ACTION_REPORT) {
         switch (error) {
         case ENOMEDIUM:
             scsi_check_condition(r, SENSE_CODE(NO_MEDIUM));
@@ -417,9 +405,12 @@ static int scsi_handle_rw_error(SCSIDiskReq *r, int error)
             scsi_check_condition(r, SENSE_CODE(IO_ERROR));
             break;
         }
-        bdrv_emit_qmp_error_event(s->qdev.conf.bs, BDRV_ACTION_REPORT, is_read);
     }
-    return 1;
+    bdrv_error_action(s->qdev.conf.bs, action, is_read, error);
+    if (action == BDRV_ACTION_STOP) {
+        scsi_req_retry(&r->req);
+    }
+    return action != BDRV_ACTION_IGNORE;
 }
 
 static void scsi_write_complete(void * opaque, int ret)
diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index 1ac2483b5..e25cc9647 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -66,29 +66,20 @@ static void virtio_blk_req_complete(VirtIOBlockReq *req, int status)
 static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
     bool is_read)
 {
-    BlockdevOnError action = bdrv_get_on_error(req->dev->bs, is_read);
+    BlockErrorAction action = bdrv_get_error_action(req->dev->bs, is_read, error);
     VirtIOBlock *s = req->dev;
 
-    if (action == BLOCKDEV_ON_ERROR_IGNORE) {
-        bdrv_emit_qmp_error_event(s->bs, BDRV_ACTION_IGNORE, is_read);
-        return 0;
-    }
-
-    if ((error == ENOSPC && action == BLOCKDEV_ON_ERROR_ENOSPC)
-            || action == BLOCKDEV_ON_ERROR_STOP) {
+    if (action == BDRV_ACTION_STOP) {
         req->next = s->rq;
         s->rq = req;
-        bdrv_emit_qmp_error_event(s->bs, BDRV_ACTION_STOP, is_read);
-        vm_stop(RUN_STATE_IO_ERROR);
-        bdrv_iostatus_set_err(s->bs, error);
-    } else {
+    } else if (action == BDRV_ACTION_REPORT) {
         virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
         bdrv_acct_done(s->bs, &req->acct);
         g_free(req);
-        bdrv_emit_qmp_error_event(s->bs, BDRV_ACTION_REPORT, is_read);
     }
 
-    return 1;
+    bdrv_error_action(s->bs, action, is_read, error);
+    return action != BDRV_ACTION_IGNORE;
 }
 
 static void virtio_blk_rw_complete(void *opaque, int ret)
diff --git a/qemu-tool.c b/qemu-tool.c
index 18205baba..f2f98138c 100644
--- a/qemu-tool.c
+++ b/qemu-tool.c
@@ -19,6 +19,7 @@
 #include "qemu-log.h"
 #include "migration.h"
 #include "main-loop.h"
+#include "sysemu.h"
 #include "qemu_socket.h"
 #include "slirp/libslirp.h"
 
@@ -37,6 +38,11 @@ const char *qemu_get_vm_name(void)
 
 Monitor *cur_mon;
 
+void vm_stop(RunState state)
+{
+    abort();
+}
+
 int monitor_cur_is_qmp(void)
 {
     return 0;
-- 
cgit v1.2.3


From 32c81a4a6ecc3f50efc9c270a269e4d3d8a9fbd5 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 28 Sep 2012 17:22:58 +0200
Subject: block: introduce block job error

The following behaviors are possible:

'report': The behavior is the same as in 1.1.  An I/O error,
respectively during a read or a write, will complete the job immediately
with an error code.

'ignore': An I/O error, respectively during a read or a write, will be
ignored.  For streaming, the job will complete with an error and the
backing file will be left in place.  For mirroring, the sector will be
marked again as dirty and re-examined later.

'stop': The job will be paused and the job iostatus will be set to
failed or nospace, while the VM will keep running.  This can only be
specified if the block device has rerror=stop and werror=stop or enospc.

'enospc': Behaves as 'stop' for ENOSPC errors, 'report' for others.

In all cases, even for 'report', the I/O error is reported as a QMP
event BLOCK_JOB_ERROR, with the same arguments as BLOCK_IO_ERROR.

It is possible that while stopping the VM a BLOCK_IO_ERROR event will be
reported and will clobber the event from BLOCK_JOB_ERROR, or vice versa.
This is not really avoidable since stopping the VM completes all pending
I/O requests.  In fact, it is already possible now that a series of
BLOCK_IO_ERROR events are reported with rerror=stop, because vm_stop
calls bdrv_drain_all and this can generate further errors.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 QMP/qmp-events.txt | 22 +++++++++++++++++++
 block.c            |  9 ++++----
 block_int.h        |  4 ++++
 blockjob.c         | 63 ++++++++++++++++++++++++++++++++++++++++++++++++------
 blockjob.h         | 25 ++++++++++++++++++++++
 monitor.c          |  1 +
 monitor.h          |  1 +
 qapi-schema.json   |  7 +++++-
 8 files changed, 120 insertions(+), 12 deletions(-)

diff --git a/QMP/qmp-events.txt b/QMP/qmp-events.txt
index 449102042..987c5756b 100644
--- a/QMP/qmp-events.txt
+++ b/QMP/qmp-events.txt
@@ -96,6 +96,28 @@ Example:
                "speed": 0 },
      "timestamp": { "seconds": 1267061043, "microseconds": 959568 } }
 
+BLOCK_JOB_ERROR
+---------------
+
+Emitted when a block job encounters an error.
+
+Data:
+
+- "device": device name (json-string)
+- "operation": I/O operation (json-string, "read" or "write")
+- "action": action that has been taken, it's one of the following (json-string):
+    "ignore": error has been ignored, the job may fail later
+    "report": error will be reported and the job canceled
+    "stop": error caused job to be paused
+
+Example:
+
+{ "event": "BLOCK_JOB_ERROR",
+    "data": { "device": "ide0-hd1",
+              "operation": "write",
+              "action": "stop" },
+    "timestamp": { "seconds": 1265044230, "microseconds": 450486 } }
+
 DEVICE_TRAY_MOVED
 -----------------
 
diff --git a/block.c b/block.c
index 8b0ba6722..c108a7695 100644
--- a/block.c
+++ b/block.c
@@ -1387,8 +1387,9 @@ void bdrv_set_dev_ops(BlockDriverState *bs, const BlockDevOps *ops,
     }
 }
 
-static void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
-                                      BlockErrorAction action, bool is_read)
+void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
+                               enum MonitorEvent ev,
+                               BlockErrorAction action, bool is_read)
 {
     QObject *data;
     const char *action_str;
@@ -1411,7 +1412,7 @@ static void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
                               bdrv->device_name,
                               action_str,
                               is_read ? "read" : "write");
-    monitor_protocol_event(QEVENT_BLOCK_IO_ERROR, data);
+    monitor_protocol_event(ev, data);
 
     qobject_decref(data);
 }
@@ -2513,7 +2514,7 @@ void bdrv_error_action(BlockDriverState *bs, BlockErrorAction action,
                        bool is_read, int error)
 {
     assert(error >= 0);
-    bdrv_emit_qmp_error_event(bs, action, is_read);
+    bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IO_ERROR, action, is_read);
     if (action == BDRV_ACTION_STOP) {
         vm_stop(RUN_STATE_IO_ERROR);
         bdrv_iostatus_set_err(bs, error);
diff --git a/block_int.h b/block_int.h
index 615aafca8..785d43d61 100644
--- a/block_int.h
+++ b/block_int.h
@@ -31,6 +31,7 @@
 #include "qemu-timer.h"
 #include "qapi-types.h"
 #include "qerror.h"
+#include "monitor.h"
 
 #define BLOCK_FLAG_ENCRYPT          1
 #define BLOCK_FLAG_COMPAT6          4
@@ -286,6 +287,9 @@ void bdrv_set_io_limits(BlockDriverState *bs,
 #ifdef _WIN32
 int is_windows_drive(const char *filename);
 #endif
+void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
+                               enum MonitorEvent ev,
+                               BlockErrorAction action, bool is_read);
 
 /**
  * stream_start:
diff --git a/blockjob.c b/blockjob.c
index 8219f7397..f55f55a19 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -112,6 +112,7 @@ bool block_job_is_paused(BlockJob *job)
 void block_job_resume(BlockJob *job)
 {
     job->paused = false;
+    block_job_iostatus_reset(job);
     if (job->co && !job->busy) {
         qemu_coroutine_enter(job->co, NULL);
     }
@@ -128,6 +129,11 @@ bool block_job_is_cancelled(BlockJob *job)
     return job->cancelled;
 }
 
+void block_job_iostatus_reset(BlockJob *job)
+{
+    job->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
+}
+
 struct BlockCancelData {
     BlockJob *job;
     BlockDriverCompletionFunc *cb;
@@ -189,12 +195,55 @@ void block_job_sleep_ns(BlockJob *job, QEMUClock *clock, int64_t ns)
 BlockJobInfo *block_job_query(BlockJob *job)
 {
     BlockJobInfo *info = g_new0(BlockJobInfo, 1);
-    info->type   = g_strdup(job->job_type->job_type);
-    info->device = g_strdup(bdrv_get_device_name(job->bs));
-    info->len    = job->len;
-    info->busy   = job->busy;
-    info->paused = job->paused;
-    info->offset = job->offset;
-    info->speed  = job->speed;
+    info->type      = g_strdup(job->job_type->job_type);
+    info->device    = g_strdup(bdrv_get_device_name(job->bs));
+    info->len       = job->len;
+    info->busy      = job->busy;
+    info->paused    = job->paused;
+    info->offset    = job->offset;
+    info->speed     = job->speed;
+    info->io_status = job->iostatus;
     return info;
 }
+
+static void block_job_iostatus_set_err(BlockJob *job, int error)
+{
+    if (job->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
+        job->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
+                                          BLOCK_DEVICE_IO_STATUS_FAILED;
+    }
+}
+
+
+BlockErrorAction block_job_error_action(BlockJob *job, BlockDriverState *bs,
+                                        BlockdevOnError on_err,
+                                        int is_read, int error)
+{
+    BlockErrorAction action;
+
+    switch (on_err) {
+    case BLOCKDEV_ON_ERROR_ENOSPC:
+        action = (error == ENOSPC) ? BDRV_ACTION_STOP : BDRV_ACTION_REPORT;
+        break;
+    case BLOCKDEV_ON_ERROR_STOP:
+        action = BDRV_ACTION_STOP;
+        break;
+    case BLOCKDEV_ON_ERROR_REPORT:
+        action = BDRV_ACTION_REPORT;
+        break;
+    case BLOCKDEV_ON_ERROR_IGNORE:
+        action = BDRV_ACTION_IGNORE;
+        break;
+    default:
+        abort();
+    }
+    bdrv_emit_qmp_error_event(job->bs, QEVENT_BLOCK_JOB_ERROR, action, is_read);
+    if (action == BDRV_ACTION_STOP) {
+        block_job_pause(job);
+        block_job_iostatus_set_err(job, error);
+        if (bs != job->bs) {
+            bdrv_iostatus_set_err(bs, error);
+        }
+    }
+    return action;
+}
diff --git a/blockjob.h b/blockjob.h
index ece5afa75..930cc3c46 100644
--- a/blockjob.h
+++ b/blockjob.h
@@ -82,6 +82,9 @@ struct BlockJob {
      */
     bool busy;
 
+    /** Status that is published by the query-block-jobs QMP API */
+    BlockDeviceIoStatus iostatus;
+
     /** Offset that is published by the query-block-jobs QMP API */
     int64_t offset;
 
@@ -215,4 +218,26 @@ bool block_job_is_paused(BlockJob *job);
  */
 int block_job_cancel_sync(BlockJob *job);
 
+/**
+ * block_job_iostatus_reset:
+ * @job: The job whose I/O status should be reset.
+ *
+ * Reset I/O status on @job.
+ */
+void block_job_iostatus_reset(BlockJob *job);
+
+/**
+ * block_job_error_action:
+ * @job: The job to signal an error for.
+ * @bs: The block device on which to set an I/O error.
+ * @on_err: The error action setting.
+ * @is_read: Whether the operation was a read.
+ * @error: The error that was reported.
+ *
+ * Report an I/O error for a block job and possibly pause the job.  Return the
+ * action that was selected based on @on_err and @error.
+ */
+BlockErrorAction block_job_error_action(BlockJob *job, BlockDriverState *bs,
+                                        BlockdevOnError on_err,
+                                        int is_read, int error);
 #endif
diff --git a/monitor.c b/monitor.c
index 67064e270..d4bd5feb6 100644
--- a/monitor.c
+++ b/monitor.c
@@ -450,6 +450,7 @@ static const char *monitor_event_names[] = {
     [QEVENT_SPICE_DISCONNECTED] = "SPICE_DISCONNECTED",
     [QEVENT_BLOCK_JOB_COMPLETED] = "BLOCK_JOB_COMPLETED",
     [QEVENT_BLOCK_JOB_CANCELLED] = "BLOCK_JOB_CANCELLED",
+    [QEVENT_BLOCK_JOB_ERROR] = "BLOCK_JOB_ERROR",
     [QEVENT_DEVICE_TRAY_MOVED] = "DEVICE_TRAY_MOVED",
     [QEVENT_SUSPEND] = "SUSPEND",
     [QEVENT_SUSPEND_DISK] = "SUSPEND_DISK",
diff --git a/monitor.h b/monitor.h
index 64c156184..43040af1c 100644
--- a/monitor.h
+++ b/monitor.h
@@ -38,6 +38,7 @@ typedef enum MonitorEvent {
     QEVENT_SPICE_DISCONNECTED,
     QEVENT_BLOCK_JOB_COMPLETED,
     QEVENT_BLOCK_JOB_CANCELLED,
+    QEVENT_BLOCK_JOB_ERROR,
     QEVENT_DEVICE_TRAY_MOVED,
     QEVENT_SUSPEND,
     QEVENT_SUSPEND_DISK,
diff --git a/qapi-schema.json b/qapi-schema.json
index a7264135a..14e7a0d68 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -1131,11 +1131,14 @@
 #
 # @speed: the rate limit, bytes per second
 #
+# @io-status: the status of the job (since 1.3)
+#
 # Since: 1.1
 ##
 { 'type': 'BlockJobInfo',
   'data': {'type': 'str', 'device': 'str', 'len': 'int',
-           'offset': 'int', 'busy': 'bool', 'paused': 'bool', 'speed': 'int'} }
+           'offset': 'int', 'busy': 'bool', 'paused': 'bool', 'speed': 'int',
+           'io-status': 'BlockDeviceIoStatus'} }
 
 ##
 # @query-block-jobs:
@@ -1958,6 +1961,8 @@
 # operation.  It is an error to call this command if no operation is in
 # progress.  Resuming an already running job is not an error.
 #
+# This command also clears the error status of the job.
+#
 # @device: the device name
 #
 # Returns: Nothing on success
-- 
cgit v1.2.3


From 1d809098aa9518cda41c2cf6e660d3d602614907 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 28 Sep 2012 17:22:59 +0200
Subject: stream: add on-error argument

This patch adds support for error management to streaming.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/stream.c   | 28 +++++++++++++++++++++++++++-
 block_int.h      |  3 ++-
 blockdev.c       | 11 ++++++++---
 hmp.c            |  3 ++-
 qapi-schema.json |  9 +++++++--
 qmp-commands.hx  |  2 +-
 6 files changed, 47 insertions(+), 9 deletions(-)

diff --git a/block/stream.c b/block/stream.c
index 57e4be7c6..792665276 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -31,6 +31,7 @@ typedef struct StreamBlockJob {
     BlockJob common;
     RateLimit limit;
     BlockDriverState *base;
+    BlockdevOnError on_error;
     char backing_file_id[1024];
 } StreamBlockJob;
 
@@ -78,6 +79,7 @@ static void coroutine_fn stream_run(void *opaque)
     BlockDriverState *bs = s->common.bs;
     BlockDriverState *base = s->base;
     int64_t sector_num, end;
+    int error = 0;
     int ret = 0;
     int n = 0;
     void *buf;
@@ -142,7 +144,19 @@ wait:
             ret = stream_populate(bs, sector_num, n, buf);
         }
         if (ret < 0) {
-            break;
+            BlockErrorAction action =
+                block_job_error_action(&s->common, s->common.bs, s->on_error,
+                                       true, -ret);
+            if (action == BDRV_ACTION_STOP) {
+                n = 0;
+                continue;
+            }
+            if (error == 0) {
+                error = ret;
+            }
+            if (action == BDRV_ACTION_REPORT) {
+                break;
+            }
         }
         ret = 0;
 
@@ -154,6 +168,9 @@ wait:
         bdrv_disable_copy_on_read(bs);
     }
 
+    /* Do not remove the backing file if an error was there but ignored.  */
+    ret = error;
+
     if (!block_job_is_cancelled(&s->common) && sector_num == end && ret == 0) {
         const char *base_id = NULL, *base_fmt = NULL;
         if (base) {
@@ -189,11 +206,19 @@ static BlockJobType stream_job_type = {
 
 void stream_start(BlockDriverState *bs, BlockDriverState *base,
                   const char *base_id, int64_t speed,
+                  BlockdevOnError on_error,
                   BlockDriverCompletionFunc *cb,
                   void *opaque, Error **errp)
 {
     StreamBlockJob *s;
 
+    if ((on_error == BLOCKDEV_ON_ERROR_STOP ||
+         on_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
+        !bdrv_iostatus_is_enabled(bs)) {
+        error_set(errp, QERR_INVALID_PARAMETER, "on-error");
+        return;
+    }
+
     s = block_job_create(&stream_job_type, bs, speed, cb, opaque, errp);
     if (!s) {
         return;
@@ -204,6 +229,7 @@ void stream_start(BlockDriverState *bs, BlockDriverState *base,
         pstrcpy(s->backing_file_id, sizeof(s->backing_file_id), base_id);
     }
 
+    s->on_error = on_error;
     s->common.co = qemu_coroutine_create(stream_run);
     trace_stream_start(bs, base, s, s->common.co, opaque);
     qemu_coroutine_enter(s->common.co, s);
diff --git a/block_int.h b/block_int.h
index 785d43d61..f4bae0440 100644
--- a/block_int.h
+++ b/block_int.h
@@ -299,6 +299,7 @@ void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
  * @base_id: The file name that will be written to @bs as the new
  * backing file if the job completes.  Ignored if @base is %NULL.
  * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @on_error: The action to take upon error.
  * @cb: Completion function for the job.
  * @opaque: Opaque pointer value passed to @cb.
  * @errp: Error object.
@@ -310,7 +311,7 @@ void bdrv_emit_qmp_error_event(const BlockDriverState *bdrv,
  * @base_id in the written image and to @base in the live BlockDriverState.
  */
 void stream_start(BlockDriverState *bs, BlockDriverState *base,
-                  const char *base_id, int64_t speed,
+                  const char *base_id, int64_t speed, BlockdevOnError on_error,
                   BlockDriverCompletionFunc *cb,
                   void *opaque, Error **errp);
 
diff --git a/blockdev.c b/blockdev.c
index d52a83076..5f18dfa97 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1096,13 +1096,18 @@ static void block_job_cb(void *opaque, int ret)
 }
 
 void qmp_block_stream(const char *device, bool has_base,
-                      const char *base, bool has_speed,
-                      int64_t speed, Error **errp)
+                      const char *base, bool has_speed, int64_t speed,
+                      bool has_on_error, BlockdevOnError on_error,
+                      Error **errp)
 {
     BlockDriverState *bs;
     BlockDriverState *base_bs = NULL;
     Error *local_err = NULL;
 
+    if (!has_on_error) {
+        on_error = BLOCKDEV_ON_ERROR_REPORT;
+    }
+
     bs = bdrv_find(device);
     if (!bs) {
         error_set(errp, QERR_DEVICE_NOT_FOUND, device);
@@ -1118,7 +1123,7 @@ void qmp_block_stream(const char *device, bool has_base,
     }
 
     stream_start(bs, base_bs, base, has_speed ? speed : 0,
-                 block_job_cb, bs, &local_err);
+                 on_error, block_job_cb, bs, &local_err);
     if (error_is_set(&local_err)) {
         error_propagate(errp, local_err);
         return;
diff --git a/hmp.c b/hmp.c
index 55601f7da..df789b235 100644
--- a/hmp.c
+++ b/hmp.c
@@ -930,7 +930,8 @@ void hmp_block_stream(Monitor *mon, const QDict *qdict)
     int64_t speed = qdict_get_try_int(qdict, "speed", 0);
 
     qmp_block_stream(device, base != NULL, base,
-                     qdict_haskey(qdict, "speed"), speed, &error);
+                     qdict_haskey(qdict, "speed"), speed,
+                     true, BLOCKDEV_ON_ERROR_REPORT, &error);
 
     hmp_handle_error(mon, &error);
 }
diff --git a/qapi-schema.json b/qapi-schema.json
index 14e7a0d68..768b4c785 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -1869,13 +1869,18 @@
 #
 # @speed:  #optional the maximum speed, in bytes per second
 #
+# @on-error: #optional the action to take on an error (default report).
+#            'stop' and 'enospc' can only be used if the block device
+#            supports io-status (see BlockInfo).  Since 1.3.
+#
 # Returns: Nothing on success
 #          If @device does not exist, DeviceNotFound
 #
 # Since: 1.1
 ##
-{ 'command': 'block-stream', 'data': { 'device': 'str', '*base': 'str',
-                                       '*speed': 'int' } }
+{ 'command': 'block-stream',
+  'data': { 'device': 'str', '*base': 'str', '*speed': 'int',
+            '*on-error': 'BlockdevOnError' } }
 
 ##
 # @block-job-set-speed:
diff --git a/qmp-commands.hx b/qmp-commands.hx
index 71d7c25f5..ea93b1d4a 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -787,7 +787,7 @@ EQMP
 
     {
         .name       = "block-stream",
-        .args_type  = "device:B,base:s?,speed:o?",
+        .args_type  = "device:B,base:s?,speed:o?,on-error:s?",
         .mhandler.cmd_new = qmp_marshal_input_block_stream,
     },
 
-- 
cgit v1.2.3


From 8f96b5be92fbd74798b97b1dc1ff5fbbe249ed11 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 28 Sep 2012 17:23:00 +0200
Subject: blkdebug: process all set_state rules in the old state

Currently it is impossible to write a blkdebug script that ping-pongs
between two states, because the second set-state rule will use the
state that is set in the first.  If you have

    [set-state]
    event = "..."
    state = "1"
    new_state = "2"

    [set-state]
    event = "..."
    state = "2"
    new_state = "1"

for example the state will remain locked at 1.  This can be fixed
by first processing all rules, and then setting the state.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/blkdebug.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/block/blkdebug.c b/block/blkdebug.c
index 59dcea065..1206d5256 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -28,6 +28,7 @@
 
 typedef struct BDRVBlkdebugState {
     int state;
+    int new_state;
     QLIST_HEAD(, BlkdebugRule) rules[BLKDBG_EVENT_MAX];
     QSIMPLEQ_HEAD(, BlkdebugRule) active_rules;
 } BDRVBlkdebugState;
@@ -403,12 +404,12 @@ static void blkdebug_close(BlockDriverState *bs)
 }
 
 static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule,
-    int old_state, bool injected)
+    bool injected)
 {
     BDRVBlkdebugState *s = bs->opaque;
 
     /* Only process rules for the current state */
-    if (rule->state && rule->state != old_state) {
+    if (rule->state && rule->state != s->state) {
         return injected;
     }
 
@@ -423,7 +424,7 @@ static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule,
         break;
 
     case ACTION_SET_STATE:
-        s->state = rule->options.set_state.new_state;
+        s->new_state = rule->options.set_state.new_state;
         break;
     }
     return injected;
@@ -433,15 +434,16 @@ static void blkdebug_debug_event(BlockDriverState *bs, BlkDebugEvent event)
 {
     BDRVBlkdebugState *s = bs->opaque;
     struct BlkdebugRule *rule;
-    int old_state = s->state;
     bool injected;
 
     assert((int)event >= 0 && event < BLKDBG_EVENT_MAX);
 
     injected = false;
+    s->new_state = s->state;
     QLIST_FOREACH(rule, &s->rules[event], next) {
-        injected = process_rule(bs, rule, old_state, injected);
+        injected = process_rule(bs, rule, injected);
     }
+    s->state = s->new_state;
 }
 
 static int64_t blkdebug_getlength(BlockDriverState *bs)
-- 
cgit v1.2.3


From 4f45056841abced5d57485edf0ff1d2ffc042cb1 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 28 Sep 2012 17:23:01 +0200
Subject: qemu-iotests: map underscore to dash in QMP argument names

iotests.py provides a convenience function that uses Python keyword
arguments to represent QMP command arguments.  However, almost all
QMP commands use dashes for argument names (the sole exception is
block_set_io_throttle), and dashes are not allowed in a keyword
argument name.  Hence provide automatic conversion of underscores
to dashes.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/qemu-iotests/iotests.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index e05b1d640..a94ea75bb 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -19,6 +19,7 @@
 import os
 import re
 import subprocess
+import string
 import unittest
 import sys; sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'QMP'))
 import qmp
@@ -96,9 +97,14 @@ class VM(object):
             os.remove(self._qemu_log_path)
             self._popen = None
 
+    underscore_to_dash = string.maketrans('_', '-')
     def qmp(self, cmd, **args):
         '''Invoke a QMP command and return the result dict'''
-        return self._qmp.cmd(cmd, args=args)
+        qmp_args = dict()
+        for k in args.keys():
+            qmp_args[k.translate(self.underscore_to_dash)] = args[k]
+
+        return self._qmp.cmd(cmd, args=qmp_args)
 
     def get_qmp_events(self, wait=False):
         '''Poll for queued QMP events and return a list of dicts'''
-- 
cgit v1.2.3


From 90f0b71153c6a85d03967244b9889f892841d835 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Fri, 28 Sep 2012 17:23:02 +0200
Subject: qemu-iotests: add tests for streaming error handling

Add a test for each of report/ignore/stop.  The tests use blkdebug
to generate an error in the middle of a script.  The error is
recoverable (once = "on") so that we can test resuming a job after
stopping for an error.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 tests/qemu-iotests/030        | 220 ++++++++++++++++++++++++++++++++++++++++++
 tests/qemu-iotests/030.out    |   4 +-
 tests/qemu-iotests/iotests.py |   7 ++
 3 files changed, 229 insertions(+), 2 deletions(-)

diff --git a/tests/qemu-iotests/030 b/tests/qemu-iotests/030
index dfacdf11a..dd4ef1199 100755
--- a/tests/qemu-iotests/030
+++ b/tests/qemu-iotests/030
@@ -195,6 +195,226 @@ class TestSmallerBackingFile(ImageStreamingTestCase):
         self.assert_no_active_streams()
         self.vm.shutdown()
 
+class TestErrors(ImageStreamingTestCase):
+    image_len = 2 * 1024 * 1024 # MB
+
+    # in bytes; this should match STREAM_BUFFER_SIZE in block/stream.c
+    STREAM_BUFFER_SIZE = 512 * 1024
+
+    def create_blkdebug_file(self, name, event, errno):
+        file = open(name, 'w')
+        file.write('''
+[inject-error]
+state = "1"
+event = "%s"
+errno = "%d"
+immediately = "off"
+once = "on"
+sector = "%d"
+
+[set-state]
+state = "1"
+event = "%s"
+new_state = "2"
+
+[set-state]
+state = "2"
+event = "%s"
+new_state = "1"
+''' % (event, errno, self.STREAM_BUFFER_SIZE / 512, event, event))
+        file.close()
+
+class TestEIO(TestErrors):
+    def setUp(self):
+        self.blkdebug_file = backing_img + ".blkdebug"
+        self.create_image(backing_img, TestErrors.image_len)
+        self.create_blkdebug_file(self.blkdebug_file, "read_aio", 5)
+        qemu_img('create', '-f', iotests.imgfmt,
+                 '-o', 'backing_file=blkdebug:%s:%s,backing_fmt=raw'
+                       % (self.blkdebug_file, backing_img),
+                 test_img)
+        self.vm = iotests.VM().add_drive(test_img)
+        self.vm.launch()
+
+    def tearDown(self):
+        self.vm.shutdown()
+        os.remove(test_img)
+        os.remove(backing_img)
+        os.remove(self.blkdebug_file)
+
+    def test_report(self):
+        self.assert_no_active_streams()
+
+        result = self.vm.qmp('block-stream', device='drive0')
+        self.assert_qmp(result, 'return', {})
+
+        completed = False
+        error = False
+        while not completed:
+            for event in self.vm.get_qmp_events(wait=True):
+                if event['event'] == 'BLOCK_JOB_ERROR':
+                    self.assert_qmp(event, 'data/device', 'drive0')
+                    self.assert_qmp(event, 'data/operation', 'read')
+                    error = True
+                elif event['event'] == 'BLOCK_JOB_COMPLETED':
+                    self.assertTrue(error, 'job completed unexpectedly')
+                    self.assert_qmp(event, 'data/type', 'stream')
+                    self.assert_qmp(event, 'data/device', 'drive0')
+                    self.assert_qmp(event, 'data/error', 'Input/output error')
+                    self.assert_qmp(event, 'data/offset', self.STREAM_BUFFER_SIZE)
+                    self.assert_qmp(event, 'data/len', self.image_len)
+                    completed = True
+
+        self.assert_no_active_streams()
+        self.vm.shutdown()
+
+    def test_ignore(self):
+        self.assert_no_active_streams()
+
+        result = self.vm.qmp('block-stream', device='drive0', on_error='ignore')
+        self.assert_qmp(result, 'return', {})
+
+        error = False
+        completed = False
+        while not completed:
+            for event in self.vm.get_qmp_events(wait=True):
+                if event['event'] == 'BLOCK_JOB_ERROR':
+                    self.assert_qmp(event, 'data/device', 'drive0')
+                    self.assert_qmp(event, 'data/operation', 'read')
+                    result = self.vm.qmp('query-block-jobs')
+                    self.assert_qmp(result, 'return[0]/paused', False)
+                    error = True
+                elif event['event'] == 'BLOCK_JOB_COMPLETED':
+                    self.assertTrue(error, 'job completed unexpectedly')
+                    self.assert_qmp(event, 'data/type', 'stream')
+                    self.assert_qmp(event, 'data/device', 'drive0')
+                    self.assert_qmp(event, 'data/error', 'Input/output error')
+                    self.assert_qmp(event, 'data/offset', self.image_len)
+                    self.assert_qmp(event, 'data/len', self.image_len)
+                    completed = True
+
+        self.assert_no_active_streams()
+        self.vm.shutdown()
+
+    def test_stop(self):
+        self.assert_no_active_streams()
+
+        result = self.vm.qmp('block-stream', device='drive0', on_error='stop')
+        self.assert_qmp(result, 'return', {})
+
+        error = False
+        completed = False
+        while not completed:
+            for event in self.vm.get_qmp_events(wait=True):
+                if event['event'] == 'BLOCK_JOB_ERROR':
+                    self.assert_qmp(event, 'data/device', 'drive0')
+                    self.assert_qmp(event, 'data/operation', 'read')
+
+                    result = self.vm.qmp('query-block-jobs')
+                    self.assert_qmp(result, 'return[0]/paused', True)
+                    self.assert_qmp(result, 'return[0]/offset', self.STREAM_BUFFER_SIZE)
+                    self.assert_qmp(result, 'return[0]/io-status', 'failed')
+
+                    result = self.vm.qmp('block-job-resume', device='drive0')
+                    self.assert_qmp(result, 'return', {})
+
+                    result = self.vm.qmp('query-block-jobs')
+                    self.assert_qmp(result, 'return[0]/paused', False)
+                    self.assert_qmp(result, 'return[0]/io-status', 'ok')
+                    error = True
+                elif event['event'] == 'BLOCK_JOB_COMPLETED':
+                    self.assertTrue(error, 'job completed unexpectedly')
+                    self.assert_qmp(event, 'data/type', 'stream')
+                    self.assert_qmp(event, 'data/device', 'drive0')
+                    self.assert_qmp_absent(event, 'data/error')
+                    self.assert_qmp(event, 'data/offset', self.image_len)
+                    self.assert_qmp(event, 'data/len', self.image_len)
+                    completed = True
+
+        self.assert_no_active_streams()
+        self.vm.shutdown()
+
+    def test_enospc(self):
+        self.assert_no_active_streams()
+
+        result = self.vm.qmp('block-stream', device='drive0', on_error='enospc')
+        self.assert_qmp(result, 'return', {})
+
+        completed = False
+        error = False
+        while not completed:
+            for event in self.vm.get_qmp_events(wait=True):
+                if event['event'] == 'BLOCK_JOB_ERROR':
+                    self.assert_qmp(event, 'data/device', 'drive0')
+                    self.assert_qmp(event, 'data/operation', 'read')
+                    error = True
+                elif event['event'] == 'BLOCK_JOB_COMPLETED':
+                    self.assertTrue(error, 'job completed unexpectedly')
+                    self.assert_qmp(event, 'data/type', 'stream')
+                    self.assert_qmp(event, 'data/device', 'drive0')
+                    self.assert_qmp(event, 'data/error', 'Input/output error')
+                    self.assert_qmp(event, 'data/offset', self.STREAM_BUFFER_SIZE)
+                    self.assert_qmp(event, 'data/len', self.image_len)
+                    completed = True
+
+        self.assert_no_active_streams()
+        self.vm.shutdown()
+
+class TestENOSPC(TestErrors):
+    def setUp(self):
+        self.blkdebug_file = backing_img + ".blkdebug"
+        self.create_image(backing_img, TestErrors.image_len)
+        self.create_blkdebug_file(self.blkdebug_file, "read_aio", 28)
+        qemu_img('create', '-f', iotests.imgfmt,
+                 '-o', 'backing_file=blkdebug:%s:%s,backing_fmt=raw'
+                       % (self.blkdebug_file, backing_img),
+                 test_img)
+        self.vm = iotests.VM().add_drive(test_img)
+        self.vm.launch()
+
+    def tearDown(self):
+        self.vm.shutdown()
+        os.remove(test_img)
+        os.remove(backing_img)
+        os.remove(self.blkdebug_file)
+
+    def test_enospc(self):
+        self.assert_no_active_streams()
+
+        result = self.vm.qmp('block-stream', device='drive0', on_error='enospc')
+        self.assert_qmp(result, 'return', {})
+
+        error = False
+        completed = False
+        while not completed:
+            for event in self.vm.get_qmp_events(wait=True):
+                if event['event'] == 'BLOCK_JOB_ERROR':
+                    self.assert_qmp(event, 'data/device', 'drive0')
+                    self.assert_qmp(event, 'data/operation', 'read')
+
+                    result = self.vm.qmp('query-block-jobs')
+                    self.assert_qmp(result, 'return[0]/paused', True)
+                    self.assert_qmp(result, 'return[0]/offset', self.STREAM_BUFFER_SIZE)
+                    self.assert_qmp(result, 'return[0]/io-status', 'nospace')
+
+                    result = self.vm.qmp('block-job-resume', device='drive0')
+                    self.assert_qmp(result, 'return', {})
+
+                    result = self.vm.qmp('query-block-jobs')
+                    self.assert_qmp(result, 'return[0]/paused', False)
+                    self.assert_qmp(result, 'return[0]/io-status', 'ok')
+                    error = True
+                elif event['event'] == 'BLOCK_JOB_COMPLETED':
+                    self.assertTrue(error, 'job completed unexpectedly')
+                    self.assert_qmp(event, 'data/type', 'stream')
+                    self.assert_qmp(event, 'data/device', 'drive0')
+                    self.assert_qmp_absent(event, 'data/error')
+                    self.assert_qmp(event, 'data/offset', self.image_len)
+                    self.assert_qmp(event, 'data/len', self.image_len)
+                    completed = True
+
+        self.assert_no_active_streams()
+        self.vm.shutdown()
 
 class TestStreamStop(ImageStreamingTestCase):
     image_len = 8 * 1024 * 1024 * 1024 # GB
diff --git a/tests/qemu-iotests/030.out b/tests/qemu-iotests/030.out
index 594c16f49..fa16b5cce 100644
--- a/tests/qemu-iotests/030.out
+++ b/tests/qemu-iotests/030.out
@@ -1,5 +1,5 @@
-........
+.............
 ----------------------------------------------------------------------
-Ran 8 tests
+Ran 13 tests
 
 OK
diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index a94ea75bb..3c60b2d16 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -138,6 +138,13 @@ class QMPTestCase(unittest.TestCase):
                     self.fail('invalid index "%s" in path "%s" in "%s"' % (idx, path, str(d)))
         return d
 
+    def assert_qmp_absent(self, d, path):
+        try:
+            result = self.dictpath(d, path)
+        except AssertionError:
+            return
+        self.fail('path "%s" has value "%s"' % (path, str(result)))
+
     def assert_qmp(self, d, path, value):
         '''Assert that the value for a specific path in a QMP dict matches'''
         result = self.dictpath(d, path)
-- 
cgit v1.2.3


From b1f4679392a03f2b26a37bfa52e95d6cc4f73d82 Mon Sep 17 00:00:00 2001
From: Eduardo Habkost <ehabkost@redhat.com>
Date: Thu, 6 Sep 2012 10:05:35 +0000
Subject: i386: kvm: bit 10 of CPUID[8000_0001].EDX is reserved

Bit 10 of CPUID[8000_0001].EDX is not defined as an alias of
CPUID[1].EDX[10], so do not duplicate it on
kvm_arch_get_supported_cpuid().

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Reviewed-By: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Don Slutz <Don@CloudSwitch.com>
Signed-off-by: Blue Swirl <blauwirbel@gmail.com>
---
 target-i386/kvm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 6790180b0..acb93693b 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -165,7 +165,7 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
                      * so add missing bits according to the AMD spec:
                      */
                     cpuid_1_edx = kvm_arch_get_supported_cpuid(s, 1, 0, R_EDX);
-                    ret |= cpuid_1_edx & 0x183f7ff;
+                    ret |= cpuid_1_edx & 0x183f3ff;
                     break;
                 }
                 break;
-- 
cgit v1.2.3


From 8fad4b44a0f71cd404f95f109657c0ccbf11f8f9 Mon Sep 17 00:00:00 2001
From: Eduardo Habkost <ehabkost@redhat.com>
Date: Thu, 6 Sep 2012 10:05:36 +0000
Subject: i386: kvm: use a #define for the set of alias feature bits

Instead of using a hardcoded hex constant, define CPUID_EXT2_AMD_ALIASES
as the set of CPUID[8000_0001].EDX bits that on AMD are the same as the
bits of CPUID[1].EDX.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Reviewed-By: Igor Mammedov <imammedo@redhat.com>
Reviewed-by: Don Slutz <Don@CloudSwitch.com>
Signed-off-by: Blue Swirl <blauwirbel@gmail.com>
---
 target-i386/cpu.h | 12 ++++++++++++
 target-i386/kvm.c |  2 +-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index d7ea2f92a..49950843f 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -409,6 +409,7 @@
 #define CPUID_EXT_HYPERVISOR  (1 << 31)
 
 #define CPUID_EXT2_FPU     (1 << 0)
+#define CPUID_EXT2_VME     (1 << 1)
 #define CPUID_EXT2_DE      (1 << 2)
 #define CPUID_EXT2_PSE     (1 << 3)
 #define CPUID_EXT2_TSC     (1 << 4)
@@ -436,6 +437,17 @@
 #define CPUID_EXT2_3DNOWEXT (1 << 30)
 #define CPUID_EXT2_3DNOW   (1 << 31)
 
+/* CPUID[8000_0001].EDX bits that are aliases of CPUID[1].EDX bits on AMD CPUs */
+#define CPUID_EXT2_AMD_ALIASES (CPUID_EXT2_FPU | CPUID_EXT2_VME | \
+                                CPUID_EXT2_DE | CPUID_EXT2_PSE | \
+                                CPUID_EXT2_TSC | CPUID_EXT2_MSR | \
+                                CPUID_EXT2_PAE | CPUID_EXT2_MCE | \
+                                CPUID_EXT2_CX8 | CPUID_EXT2_APIC | \
+                                CPUID_EXT2_MTRR | CPUID_EXT2_PGE | \
+                                CPUID_EXT2_MCA | CPUID_EXT2_CMOV | \
+                                CPUID_EXT2_PAT | CPUID_EXT2_PSE36 | \
+                                CPUID_EXT2_MMX | CPUID_EXT2_FXSR)
+
 #define CPUID_EXT3_LAHF_LM (1 << 0)
 #define CPUID_EXT3_CMP_LEG (1 << 1)
 #define CPUID_EXT3_SVM     (1 << 2)
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index acb93693b..5b18383d8 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -165,7 +165,7 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
                      * so add missing bits according to the AMD spec:
                      */
                     cpuid_1_edx = kvm_arch_get_supported_cpuid(s, 1, 0, R_EDX);
-                    ret |= cpuid_1_edx & 0x183f3ff;
+                    ret |= cpuid_1_edx & CPUID_EXT2_AMD_ALIASES;
                     break;
                 }
                 break;
-- 
cgit v1.2.3


From 60032ac04c675cf8950497f9d06e681b2dc7085c Mon Sep 17 00:00:00 2001
From: Eduardo Habkost <ehabkost@redhat.com>
Date: Thu, 6 Sep 2012 10:05:37 +0000
Subject: i386: cpu: replace EXT2_FEATURE_MASK with CPUID_EXT2_AMD_ALIASES

Both constants have the same value, but CPUID_EXT2_AMD_ALIASES is
defined without using magic numbers.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Reviewed-by: Don Slutz <Don@CloudSwitch.com>
Signed-off-by: Blue Swirl <blauwirbel@gmail.com>
---
 target-i386/cpu.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/target-i386/cpu.c b/target-i386/cpu.c
index fd4fe2898..7cad3b5ac 100644
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -258,7 +258,6 @@ typedef struct x86_def_t {
           CPUID_MSR | CPUID_MCE | CPUID_CX8 | CPUID_PGE | CPUID_CMOV | \
           CPUID_PAT | CPUID_FXSR | CPUID_MMX | CPUID_SSE | CPUID_SSE2 | \
           CPUID_PAE | CPUID_SEP | CPUID_APIC)
-#define EXT2_FEATURE_MASK 0x0183F3FF
 
 #define TCG_FEATURES (CPUID_FP87 | CPUID_PSE | CPUID_TSC | CPUID_MSR | \
           CPUID_PAE | CPUID_MCE | CPUID_CX8 | CPUID_APIC | CPUID_SEP | \
@@ -276,7 +275,7 @@ typedef struct x86_def_t {
           /* missing:
           CPUID_EXT_DTES64, CPUID_EXT_DSCPL, CPUID_EXT_VMX, CPUID_EXT_EST,
           CPUID_EXT_TM2, CPUID_EXT_XTPR, CPUID_EXT_PDCM, CPUID_EXT_XSAVE */
-#define TCG_EXT2_FEATURES ((TCG_FEATURES & EXT2_FEATURE_MASK) | \
+#define TCG_EXT2_FEATURES ((TCG_FEATURES & CPUID_EXT2_AMD_ALIASES) | \
           CPUID_EXT2_NX | CPUID_EXT2_MMXEXT | CPUID_EXT2_RDTSCP | \
           CPUID_EXT2_3DNOW | CPUID_EXT2_3DNOWEXT)
           /* missing:
@@ -305,7 +304,7 @@ static x86_def_t builtin_x86_defs[] = {
             CPUID_MTRR | CPUID_CLFLUSH | CPUID_MCA |
             CPUID_PSE36,
         .ext_features = CPUID_EXT_SSE3 | CPUID_EXT_CX16 | CPUID_EXT_POPCNT,
-        .ext2_features = (PPRO_FEATURES & EXT2_FEATURE_MASK) |
+        .ext2_features = (PPRO_FEATURES & CPUID_EXT2_AMD_ALIASES) |
             CPUID_EXT2_LM | CPUID_EXT2_SYSCALL | CPUID_EXT2_NX,
         .ext3_features = CPUID_EXT3_LAHF_LM | CPUID_EXT3_SVM |
             CPUID_EXT3_ABM | CPUID_EXT3_SSE4A,
@@ -325,7 +324,7 @@ static x86_def_t builtin_x86_defs[] = {
             CPUID_PSE36 | CPUID_VME | CPUID_HT,
         .ext_features = CPUID_EXT_SSE3 | CPUID_EXT_MONITOR | CPUID_EXT_CX16 |
             CPUID_EXT_POPCNT,
-        .ext2_features = (PPRO_FEATURES & EXT2_FEATURE_MASK) |
+        .ext2_features = (PPRO_FEATURES & CPUID_EXT2_AMD_ALIASES) |
             CPUID_EXT2_LM | CPUID_EXT2_SYSCALL | CPUID_EXT2_NX |
             CPUID_EXT2_3DNOW | CPUID_EXT2_3DNOWEXT | CPUID_EXT2_MMXEXT |
             CPUID_EXT2_FFXSR | CPUID_EXT2_PDPE1GB | CPUID_EXT2_RDTSCP,
@@ -373,7 +372,7 @@ static x86_def_t builtin_x86_defs[] = {
         /* Missing: CPUID_EXT_POPCNT, CPUID_EXT_MONITOR */
         .ext_features = CPUID_EXT_SSE3 | CPUID_EXT_CX16,
         /* Missing: CPUID_EXT2_PDPE1GB, CPUID_EXT2_RDTSCP */
-        .ext2_features = (PPRO_FEATURES & EXT2_FEATURE_MASK) |
+        .ext2_features = (PPRO_FEATURES & CPUID_EXT2_AMD_ALIASES) |
             CPUID_EXT2_LM | CPUID_EXT2_SYSCALL | CPUID_EXT2_NX,
         /* Missing: CPUID_EXT3_LAHF_LM, CPUID_EXT3_CMP_LEG, CPUID_EXT3_EXTAPIC,
                     CPUID_EXT3_CR8LEG, CPUID_EXT3_ABM, CPUID_EXT3_SSE4A,
@@ -402,7 +401,7 @@ static x86_def_t builtin_x86_defs[] = {
         .features = PPRO_FEATURES |
             CPUID_MTRR | CPUID_CLFLUSH | CPUID_MCA | CPUID_PSE36,
         .ext_features = CPUID_EXT_SSE3,
-        .ext2_features = PPRO_FEATURES & EXT2_FEATURE_MASK,
+        .ext2_features = PPRO_FEATURES & CPUID_EXT2_AMD_ALIASES,
         .ext3_features = 0,
         .xlevel = 0x80000008,
         .model_id = "Common 32-bit KVM processor"
@@ -467,8 +466,10 @@ static x86_def_t builtin_x86_defs[] = {
         .family = 6,
         .model = 2,
         .stepping = 3,
-        .features = PPRO_FEATURES | CPUID_PSE36 | CPUID_VME | CPUID_MTRR | CPUID_MCA,
-        .ext2_features = (PPRO_FEATURES & EXT2_FEATURE_MASK) | CPUID_EXT2_MMXEXT | CPUID_EXT2_3DNOW | CPUID_EXT2_3DNOWEXT,
+        .features = PPRO_FEATURES | CPUID_PSE36 | CPUID_VME | CPUID_MTRR |
+            CPUID_MCA,
+        .ext2_features = (PPRO_FEATURES & CPUID_EXT2_AMD_ALIASES) |
+            CPUID_EXT2_MMXEXT | CPUID_EXT2_3DNOW | CPUID_EXT2_3DNOWEXT,
         .xlevel = 0x80000008,
     },
     {
@@ -484,7 +485,8 @@ static x86_def_t builtin_x86_defs[] = {
             /* Some CPUs got no CPUID_SEP */
         .ext_features = CPUID_EXT_SSE3 | CPUID_EXT_MONITOR | CPUID_EXT_SSSE3 |
             CPUID_EXT_DSCPL | CPUID_EXT_EST | CPUID_EXT_TM2 | CPUID_EXT_XTPR,
-        .ext2_features = (PPRO_FEATURES & EXT2_FEATURE_MASK) | CPUID_EXT2_NX,
+        .ext2_features = (PPRO_FEATURES & CPUID_EXT2_AMD_ALIASES) |
+            CPUID_EXT2_NX,
         .ext3_features = CPUID_EXT3_LAHF_LM,
         .xlevel = 0x8000000A,
         .model_id = "Intel(R) Atom(TM) CPU N270   @ 1.60GHz",
-- 
cgit v1.2.3


From 3b671a40cab2404bc63e57db8cd3afa4ec70bfab Mon Sep 17 00:00:00 2001
From: Eduardo Habkost <ehabkost@redhat.com>
Date: Thu, 6 Sep 2012 10:05:38 +0000
Subject: i386: cpu: eliminate duplicate feature names

Instead of having duplicate feature names on the ext2_feature array for
the AMD feature bit aliases, we keep the feature names only on the
feature_name[] array, and copy the corresponding bits to
cpuid_ext2_features in case the CPU vendor is AMD.

This will:

- Make sure we don't set the feature bit aliases on Intel CPUs;
- Make it easier to convert feature bits to CPU properties, as now we
  have a single bit on the x86_def_t struct for each CPU feature.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Reviewed-by: Don Slutz <Don@CloudSwitch.com>
Signed-off-by: Blue Swirl <blauwirbel@gmail.com>
---
 target-i386/cpu.c | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/target-i386/cpu.c b/target-i386/cpu.c
index 7cad3b5ac..7577381b9 100644
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -61,15 +61,19 @@ static const char *ext_feature_name[] = {
     "tsc-deadline", "aes", "xsave", "osxsave",
     "avx", NULL, NULL, "hypervisor",
 };
+/* Feature names that are already defined on feature_name[] but are set on
+ * CPUID[8000_0001].EDX on AMD CPUs don't have their names on
+ * ext2_feature_name[]. They are copied automatically to cpuid_ext2_features
+ * if and only if CPU vendor is AMD.
+ */
 static const char *ext2_feature_name[] = {
-    "fpu", "vme", "de", "pse",
-    "tsc", "msr", "pae", "mce",
-    "cx8" /* AMD CMPXCHG8B */, "apic", NULL, "syscall",
-    "mtrr", "pge", "mca", "cmov",
-    "pat", "pse36", NULL, NULL /* Linux mp */,
-    "nx|xd", NULL, "mmxext", "mmx",
-    "fxsr", "fxsr_opt|ffxsr", "pdpe1gb" /* AMD Page1GB */, "rdtscp",
-    NULL, "lm|i64", "3dnowext", "3dnow",
+    NULL /* fpu */, NULL /* vme */, NULL /* de */, NULL /* pse */,
+    NULL /* tsc */, NULL /* msr */, NULL /* pae */, NULL /* mce */,
+    NULL /* cx8 */ /* AMD CMPXCHG8B */, NULL /* apic */, NULL, "syscall",
+    NULL /* mtrr */, NULL /* pge */, NULL /* mca */, NULL /* cmov */,
+    NULL /* pat */, NULL /* pse36 */, NULL, NULL /* Linux mp */,
+    "nx|xd", NULL, "mmxext", NULL /* mmx */,
+    NULL /* fxsr */, "fxsr_opt|ffxsr", "pdpe1gb" /* AMD Page1GB */, "rdtscp",
 };
 static const char *ext3_feature_name[] = {
     "lahf_lm" /* AMD LahfSahf */, "cmp_legacy", "svm", "extapic" /* AMD ExtApicSpace */,
@@ -1374,6 +1378,17 @@ int cpu_x86_register(X86CPU *cpu, const char *cpu_model)
     env->cpuid_xlevel2 = def->xlevel2;
     object_property_set_int(OBJECT(cpu), (int64_t)def->tsc_khz * 1000,
                             "tsc-frequency", &error);
+
+    /* On AMD CPUs, some CPUID[8000_0001].EDX bits must match the bits on
+     * CPUID[1].EDX.
+     */
+    if (env->cpuid_vendor1 == CPUID_VENDOR_AMD_1 &&
+            env->cpuid_vendor2 == CPUID_VENDOR_AMD_2 &&
+            env->cpuid_vendor3 == CPUID_VENDOR_AMD_3) {
+        env->cpuid_ext2_features &= ~CPUID_EXT2_AMD_ALIASES;
+        env->cpuid_ext2_features |= (def->features & CPUID_EXT2_AMD_ALIASES);
+    }
+
     if (!kvm_enabled()) {
         env->cpuid_features &= TCG_FEATURES;
         env->cpuid_ext_features &= TCG_EXT_FEATURES;
-- 
cgit v1.2.3


From 4a19e505df659dd25a77fb790399744f3e1f971c Mon Sep 17 00:00:00 2001
From: Eduardo Habkost <ehabkost@redhat.com>
Date: Thu, 6 Sep 2012 10:05:39 +0000
Subject: i386: -cpu help: remove reference to specific CPUID leaves/registers

The -cpu configuration interface is based on a list of feature names or
properties, on a single namespace, so there's no need to mention on
which CPUID leaf/register each flag is located.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Reviewed-by: Don Slutz <Don@CloudSwitch.com>
Signed-off-by: Blue Swirl <blauwirbel@gmail.com>
---
 target-i386/cpu.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/target-i386/cpu.c b/target-i386/cpu.c
index 7577381b9..c4e6163c9 100644
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -1312,13 +1312,13 @@ void x86_cpu_list(FILE *f, fprintf_function cpu_fprintf)
     }
     (*cpu_fprintf)(f, "\nRecognized CPUID flags:\n");
     listflags(buf, sizeof(buf), (uint32_t)~0, feature_name, 1);
-    (*cpu_fprintf)(f, "  f_edx: %s\n", buf);
+    (*cpu_fprintf)(f, "  %s\n", buf);
     listflags(buf, sizeof(buf), (uint32_t)~0, ext_feature_name, 1);
-    (*cpu_fprintf)(f, "  f_ecx: %s\n", buf);
+    (*cpu_fprintf)(f, "  %s\n", buf);
     listflags(buf, sizeof(buf), (uint32_t)~0, ext2_feature_name, 1);
-    (*cpu_fprintf)(f, "  extf_edx: %s\n", buf);
+    (*cpu_fprintf)(f, "  %s\n", buf);
     listflags(buf, sizeof(buf), (uint32_t)~0, ext3_feature_name, 1);
-    (*cpu_fprintf)(f, "  extf_ecx: %s\n", buf);
+    (*cpu_fprintf)(f, "  %s\n", buf);
 }
 
 CpuDefinitionInfoList *arch_query_cpu_definitions(Error **errp)
-- 
cgit v1.2.3


From a9321a4d49d65d29c2926a51aedc5b91a01f3591 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 26 Sep 2012 13:18:43 -0700
Subject: x86: Implement SMEP and SMAP

This patch implements Supervisor Mode Execution Prevention (SMEP) and
Supervisor Mode Access Prevention (SMAP) for x86.  The purpose of the
patch, obviously, is to help kernel developers debug the support for
those features.

A fair bit of the code relates to the handling of CPUID features.  The
CPUID code probably would get greatly simplified if all the feature
bit words were unified into a single vector object, but in the
interest of producing a minimal patch for SMEP/SMAP, and because I had
very limited time for this project, I followed the existing style.

[ v2: don't change the definition of the qemu64 CPU shorthand, since
  that breaks loading old snapshots.  Per Anthony Liguori this can be
  fixed once the CPU feature set is snapshot.

  Change the coding style slightly to conform to checkpatch.pl. ]

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 target-i386/cc_helper.c |  10 ++++
 target-i386/cpu.c       |  34 ++++++++---
 target-i386/cpu.h       |  33 ++++++++---
 target-i386/helper.c    | 150 ++++++++++++++++++++++++++++++++++++++----------
 target-i386/helper.h    |   2 +
 target-i386/translate.c |  27 +++++++--
 6 files changed, 207 insertions(+), 49 deletions(-)

diff --git a/target-i386/cc_helper.c b/target-i386/cc_helper.c
index 07892f904..9422003f2 100644
--- a/target-i386/cc_helper.c
+++ b/target-i386/cc_helper.c
@@ -353,6 +353,16 @@ void helper_sti(CPUX86State *env)
     env->eflags |= IF_MASK;
 }
 
+void helper_clac(CPUX86State *env)
+{
+    env->eflags &= ~AC_MASK;
+}
+
+void helper_stac(CPUX86State *env)
+{
+    env->eflags |= AC_MASK;
+}
+
 #if 0
 /* vm86plus instructions */
 void helper_cli_vm(CPUX86State *env)
diff --git a/target-i386/cpu.c b/target-i386/cpu.c
index c4e6163c9..bb1e44eb0 100644
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -104,6 +104,13 @@ static const char *svm_feature_name[] = {
     NULL, NULL, NULL, NULL,
 };
 
+static const char *cpuid_7_0_ebx_feature_name[] = {
+    NULL, NULL, NULL, NULL, NULL, NULL, NULL, "smep",
+    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+    NULL, NULL, NULL, NULL, "smap", NULL, NULL, NULL,
+    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+};
+
 /* collects per-function cpuid data
  */
 typedef struct model_features_t {
@@ -219,14 +226,17 @@ static void add_flagname_to_bitmaps(const char *flagname, uint32_t *features,
                                     uint32_t *ext2_features,
                                     uint32_t *ext3_features,
                                     uint32_t *kvm_features,
-                                    uint32_t *svm_features)
+                                    uint32_t *svm_features,
+                                    uint32_t *cpuid_7_0_ebx_features)
 {
     if (!lookup_feature(features, flagname, NULL, feature_name) &&
         !lookup_feature(ext_features, flagname, NULL, ext_feature_name) &&
         !lookup_feature(ext2_features, flagname, NULL, ext2_feature_name) &&
         !lookup_feature(ext3_features, flagname, NULL, ext3_feature_name) &&
         !lookup_feature(kvm_features, flagname, NULL, kvm_feature_name) &&
-        !lookup_feature(svm_features, flagname, NULL, svm_feature_name))
+        !lookup_feature(svm_features, flagname, NULL, svm_feature_name) &&
+        !lookup_feature(cpuid_7_0_ebx_features, flagname, NULL,
+                        cpuid_7_0_ebx_feature_name))
             fprintf(stderr, "CPU feature %s not found\n", flagname);
 }
 
@@ -287,6 +297,7 @@ typedef struct x86_def_t {
 #define TCG_EXT3_FEATURES (CPUID_EXT3_LAHF_LM | CPUID_EXT3_SVM | \
           CPUID_EXT3_CR8LEG | CPUID_EXT3_ABM | CPUID_EXT3_SSE4A)
 #define TCG_SVM_FEATURES 0
+#define TCG_7_0_EBX_FEATURES (CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_SMAP)
 
 /* maintains list of cpu model definitions
  */
@@ -1097,10 +1108,12 @@ static int cpu_x86_find_by_name(x86_def_t *x86_cpu_def, const char *cpu_model)
     uint32_t plus_features = 0, plus_ext_features = 0;
     uint32_t plus_ext2_features = 0, plus_ext3_features = 0;
     uint32_t plus_kvm_features = 0, plus_svm_features = 0;
+    uint32_t plus_7_0_ebx_features = 0;
     /* Features to be removed */
     uint32_t minus_features = 0, minus_ext_features = 0;
     uint32_t minus_ext2_features = 0, minus_ext3_features = 0;
     uint32_t minus_kvm_features = 0, minus_svm_features = 0;
+    uint32_t minus_7_0_ebx_features = 0;
     uint32_t numvalue;
 
     for (def = x86_defs; def; def = def->next)
@@ -1127,8 +1140,8 @@ static int cpu_x86_find_by_name(x86_def_t *x86_cpu_def, const char *cpu_model)
 #endif
 
     add_flagname_to_bitmaps("hypervisor", &plus_features,
-        &plus_ext_features, &plus_ext2_features, &plus_ext3_features,
-        &plus_kvm_features, &plus_svm_features);
+            &plus_ext_features, &plus_ext2_features, &plus_ext3_features,
+            &plus_kvm_features, &plus_svm_features,  &plus_7_0_ebx_features);
 
     featurestr = strtok(NULL, ",");
 
@@ -1138,12 +1151,12 @@ static int cpu_x86_find_by_name(x86_def_t *x86_cpu_def, const char *cpu_model)
             add_flagname_to_bitmaps(featurestr + 1, &plus_features,
                             &plus_ext_features, &plus_ext2_features,
                             &plus_ext3_features, &plus_kvm_features,
-                            &plus_svm_features);
+                            &plus_svm_features, &plus_7_0_ebx_features);
         } else if (featurestr[0] == '-') {
             add_flagname_to_bitmaps(featurestr + 1, &minus_features,
                             &minus_ext_features, &minus_ext2_features,
                             &minus_ext3_features, &minus_kvm_features,
-                            &minus_svm_features);
+                            &minus_svm_features, &minus_7_0_ebx_features);
         } else if ((val = strchr(featurestr, '='))) {
             *val = 0; val++;
             if (!strcmp(featurestr, "family")) {
@@ -1249,16 +1262,21 @@ static int cpu_x86_find_by_name(x86_def_t *x86_cpu_def, const char *cpu_model)
     x86_cpu_def->ext3_features |= plus_ext3_features;
     x86_cpu_def->kvm_features |= plus_kvm_features;
     x86_cpu_def->svm_features |= plus_svm_features;
+    x86_cpu_def->cpuid_7_0_ebx_features |= plus_7_0_ebx_features;
     x86_cpu_def->features &= ~minus_features;
     x86_cpu_def->ext_features &= ~minus_ext_features;
     x86_cpu_def->ext2_features &= ~minus_ext2_features;
     x86_cpu_def->ext3_features &= ~minus_ext3_features;
     x86_cpu_def->kvm_features &= ~minus_kvm_features;
     x86_cpu_def->svm_features &= ~minus_svm_features;
+    x86_cpu_def->cpuid_7_0_ebx_features &= ~minus_7_0_ebx_features;
     if (check_cpuid) {
         if (check_features_against_host(x86_cpu_def) && enforce_cpuid)
             goto error;
     }
+    if (x86_cpu_def->cpuid_7_0_ebx_features && x86_cpu_def->level < 7) {
+        x86_cpu_def->level = 7;
+    }
     g_free(s);
     return 0;
 
@@ -1374,7 +1392,7 @@ int cpu_x86_register(X86CPU *cpu, const char *cpu_model)
     env->cpuid_kvm_features = def->kvm_features;
     env->cpuid_svm_features = def->svm_features;
     env->cpuid_ext4_features = def->ext4_features;
-    env->cpuid_7_0_ebx = def->cpuid_7_0_ebx_features;
+    env->cpuid_7_0_ebx_features = def->cpuid_7_0_ebx_features;
     env->cpuid_xlevel2 = def->xlevel2;
     object_property_set_int(OBJECT(cpu), (int64_t)def->tsc_khz * 1000,
                             "tsc-frequency", &error);
@@ -1562,7 +1580,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, uint32_t count,
         /* Structured Extended Feature Flags Enumeration Leaf */
         if (count == 0) {
             *eax = 0; /* Maximum ECX value for sub-leaves */
-            *ebx = env->cpuid_7_0_ebx; /* Feature flags */
+            *ebx = env->cpuid_7_0_ebx_features; /* Feature flags */
             *ecx = 0; /* Reserved */
             *edx = 0; /* Reserved */
         } else {
diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 49950843f..e4a7d5b6e 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -123,8 +123,8 @@
 
 /* hidden flags - used internally by qemu to represent additional cpu
    states. Only the CPL, INHIBIT_IRQ, SMM and SVMI are not
-   redundant. We avoid using the IOPL_MASK, TF_MASK and VM_MASK bit
-   position to ease oring with eflags. */
+   redundant. We avoid using the IOPL_MASK, TF_MASK, VM_MASK and AC_MASK
+   bit positions to ease oring with eflags. */
 /* current cpl */
 #define HF_CPL_SHIFT         0
 /* true if soft mmu is being used */
@@ -147,10 +147,12 @@
 #define HF_CS64_SHIFT       15 /* only used on x86_64: 64 bit code segment  */
 #define HF_RF_SHIFT         16 /* must be same as eflags */
 #define HF_VM_SHIFT         17 /* must be same as eflags */
+#define HF_AC_SHIFT         18 /* must be same as eflags */
 #define HF_SMM_SHIFT        19 /* CPU in SMM mode */
 #define HF_SVME_SHIFT       20 /* SVME enabled (copy of EFER.SVME) */
 #define HF_SVMI_SHIFT       21 /* SVM intercepts are active */
 #define HF_OSFXSR_SHIFT     22 /* CR4.OSFXSR */
+#define HF_SMAP_SHIFT       23 /* CR4.SMAP */
 
 #define HF_CPL_MASK          (3 << HF_CPL_SHIFT)
 #define HF_SOFTMMU_MASK      (1 << HF_SOFTMMU_SHIFT)
@@ -168,10 +170,12 @@
 #define HF_CS64_MASK         (1 << HF_CS64_SHIFT)
 #define HF_RF_MASK           (1 << HF_RF_SHIFT)
 #define HF_VM_MASK           (1 << HF_VM_SHIFT)
+#define HF_AC_MASK           (1 << HF_AC_SHIFT)
 #define HF_SMM_MASK          (1 << HF_SMM_SHIFT)
 #define HF_SVME_MASK         (1 << HF_SVME_SHIFT)
 #define HF_SVMI_MASK         (1 << HF_SVMI_SHIFT)
 #define HF_OSFXSR_MASK       (1 << HF_OSFXSR_SHIFT)
+#define HF_SMAP_MASK         (1 << HF_SMAP_SHIFT)
 
 /* hflags2 */
 
@@ -210,6 +214,13 @@
 #define CR4_OSFXSR_SHIFT 9
 #define CR4_OSFXSR_MASK (1 << CR4_OSFXSR_SHIFT)
 #define CR4_OSXMMEXCPT_MASK  (1 << 10)
+#define CR4_VMXE_MASK   (1 << 13)
+#define CR4_SMXE_MASK   (1 << 14)
+#define CR4_FSGSBASE_MASK (1 << 16)
+#define CR4_PCIDE_MASK  (1 << 17)
+#define CR4_OSXSAVE_MASK (1 << 18)
+#define CR4_SMEP_MASK   (1 << 20)
+#define CR4_SMAP_MASK   (1 << 21)
 
 #define DR6_BD          (1 << 13)
 #define DR6_BS          (1 << 14)
@@ -474,6 +485,9 @@
 #define CPUID_SVM_PAUSEFILTER  (1 << 10)
 #define CPUID_SVM_PFTHRESHOLD  (1 << 12)
 
+#define CPUID_7_0_EBX_SMEP     (1 << 7)
+#define CPUID_7_0_EBX_SMAP     (1 << 20)
+
 #define CPUID_VENDOR_INTEL_1 0x756e6547 /* "Genu" */
 #define CPUID_VENDOR_INTEL_2 0x49656e69 /* "ineI" */
 #define CPUID_VENDOR_INTEL_3 0x6c65746e /* "ntel" */
@@ -649,7 +663,7 @@ typedef struct {
 #define CPU_NB_REGS CPU_NB_REGS32
 #endif
 
-#define NB_MMU_MODES 2
+#define NB_MMU_MODES 3
 
 typedef enum TPRAccess {
     TPR_ACCESS_READ,
@@ -779,7 +793,7 @@ typedef struct CPUX86State {
     uint32_t cpuid_xlevel2;
     uint32_t cpuid_ext4_features;
     /* Flags from CPUID[EAX=7,ECX=0].EBX */
-    uint32_t cpuid_7_0_ebx;
+    uint32_t cpuid_7_0_ebx_features;
 
     /* MTRRs */
     uint64_t mtrr_fixed[11];
@@ -1018,10 +1032,15 @@ static inline CPUX86State *cpu_init(const char *cpu_model)
 /* MMU modes definitions */
 #define MMU_MODE0_SUFFIX _kernel
 #define MMU_MODE1_SUFFIX _user
-#define MMU_USER_IDX 1
+#define MMU_MODE2_SUFFIX _ksmap /* Kernel with SMAP override */
+#define MMU_KERNEL_IDX  0
+#define MMU_USER_IDX    1
+#define MMU_KSMAP_IDX   2
 static inline int cpu_mmu_index (CPUX86State *env)
 {
-    return (env->hflags & HF_CPL_MASK) == 3 ? 1 : 0;
+    return (env->hflags & HF_CPL_MASK) == 3 ? MMU_USER_IDX :
+        ((env->hflags & HF_SMAP_MASK) && (env->eflags & AC_MASK))
+        ? MMU_KSMAP_IDX : MMU_KERNEL_IDX;
 }
 
 #undef EAX
@@ -1107,7 +1126,7 @@ static inline void cpu_get_tb_cpu_state(CPUX86State *env, target_ulong *pc,
     *cs_base = env->segs[R_CS].base;
     *pc = *cs_base + env->eip;
     *flags = env->hflags |
-        (env->eflags & (IOPL_MASK | TF_MASK | RF_MASK | VM_MASK));
+        (env->eflags & (IOPL_MASK | TF_MASK | RF_MASK | VM_MASK | AC_MASK));
 }
 
 void do_cpu_init(X86CPU *cpu);
diff --git a/target-i386/helper.c b/target-i386/helper.c
index 8a5da3d7c..c635667d6 100644
--- a/target-i386/helper.c
+++ b/target-i386/helper.c
@@ -443,17 +443,27 @@ void cpu_x86_update_cr4(CPUX86State *env, uint32_t new_cr4)
 #if defined(DEBUG_MMU)
     printf("CR4 update: CR4=%08x\n", (uint32_t)env->cr[4]);
 #endif
-    if ((new_cr4 & (CR4_PGE_MASK | CR4_PAE_MASK | CR4_PSE_MASK)) !=
-        (env->cr[4] & (CR4_PGE_MASK | CR4_PAE_MASK | CR4_PSE_MASK))) {
+    if ((new_cr4 ^ env->cr[4]) &
+        (CR4_PGE_MASK | CR4_PAE_MASK | CR4_PSE_MASK |
+         CR4_SMEP_MASK | CR4_SMAP_MASK)) {
         tlb_flush(env, 1);
     }
     /* SSE handling */
-    if (!(env->cpuid_features & CPUID_SSE))
+    if (!(env->cpuid_features & CPUID_SSE)) {
         new_cr4 &= ~CR4_OSFXSR_MASK;
-    if (new_cr4 & CR4_OSFXSR_MASK)
+    }
+    env->hflags &= ~HF_OSFXSR_MASK;
+    if (new_cr4 & CR4_OSFXSR_MASK) {
         env->hflags |= HF_OSFXSR_MASK;
-    else
-        env->hflags &= ~HF_OSFXSR_MASK;
+    }
+
+    if (!(env->cpuid_7_0_ebx_features & CPUID_7_0_EBX_SMAP)) {
+        new_cr4 &= ~CR4_SMAP_MASK;
+    }
+    env->hflags &= ~HF_SMAP_MASK;
+    if (new_cr4 & CR4_SMAP_MASK) {
+        env->hflags |= HF_SMAP_MASK;
+    }
 
     env->cr[4] = new_cr4;
 }
@@ -591,17 +601,38 @@ int cpu_x86_handle_mmu_fault(CPUX86State *env, target_ulong addr,
             /* 2 MB page */
             page_size = 2048 * 1024;
             ptep ^= PG_NX_MASK;
-            if ((ptep & PG_NX_MASK) && is_write1 == 2)
+            if ((ptep & PG_NX_MASK) && is_write1 == 2) {
                 goto do_fault_protect;
-            if (is_user) {
-                if (!(ptep & PG_USER_MASK))
+            }
+            switch (mmu_idx) {
+            case MMU_USER_IDX:
+                if (!(ptep & PG_USER_MASK)) {
                     goto do_fault_protect;
-                if (is_write && !(ptep & PG_RW_MASK))
+                }
+                if (is_write && !(ptep & PG_RW_MASK)) {
                     goto do_fault_protect;
-            } else {
+                }
+                break;
+
+            case MMU_KERNEL_IDX:
+                if (is_write1 != 2 && (env->cr[4] & CR4_SMAP_MASK) &&
+                    (ptep & PG_USER_MASK)) {
+                    goto do_fault_protect;
+                }
+                /* fall through */
+            case MMU_KSMAP_IDX:
+                if (is_write1 == 2 && (env->cr[4] & CR4_SMEP_MASK) &&
+                    (ptep & PG_USER_MASK)) {
+                    goto do_fault_protect;
+                }
                 if ((env->cr[0] & CR0_WP_MASK) &&
-                    is_write && !(ptep & PG_RW_MASK))
+                    is_write && !(ptep & PG_RW_MASK)) {
                     goto do_fault_protect;
+                }
+                break;
+
+            default: /* cannot happen */
+                break;
             }
             is_dirty = is_write && !(pde & PG_DIRTY_MASK);
             if (!(pde & PG_ACCESSED_MASK) || is_dirty) {
@@ -635,15 +666,35 @@ int cpu_x86_handle_mmu_fault(CPUX86State *env, target_ulong addr,
             ptep ^= PG_NX_MASK;
             if ((ptep & PG_NX_MASK) && is_write1 == 2)
                 goto do_fault_protect;
-            if (is_user) {
-                if (!(ptep & PG_USER_MASK))
+            switch (mmu_idx) {
+            case MMU_USER_IDX:
+                if (!(ptep & PG_USER_MASK)) {
                     goto do_fault_protect;
-                if (is_write && !(ptep & PG_RW_MASK))
+                }
+                if (is_write && !(ptep & PG_RW_MASK)) {
                     goto do_fault_protect;
-            } else {
+                }
+                break;
+
+            case MMU_KERNEL_IDX:
+                if (is_write1 != 2 && (env->cr[4] & CR4_SMAP_MASK) &&
+                    (ptep & PG_USER_MASK)) {
+                    goto do_fault_protect;
+                }
+                /* fall through */
+            case MMU_KSMAP_IDX:
+                if (is_write1 == 2 && (env->cr[4] & CR4_SMEP_MASK) &&
+                    (ptep & PG_USER_MASK)) {
+                    goto do_fault_protect;
+                }
                 if ((env->cr[0] & CR0_WP_MASK) &&
-                    is_write && !(ptep & PG_RW_MASK))
+                    is_write && !(ptep & PG_RW_MASK)) {
                     goto do_fault_protect;
+                }
+                break;
+
+            default: /* cannot happen */
+                break;
             }
             is_dirty = is_write && !(pte & PG_DIRTY_MASK);
             if (!(pte & PG_ACCESSED_MASK) || is_dirty) {
@@ -670,15 +721,35 @@ int cpu_x86_handle_mmu_fault(CPUX86State *env, target_ulong addr,
         /* if PSE bit is set, then we use a 4MB page */
         if ((pde & PG_PSE_MASK) && (env->cr[4] & CR4_PSE_MASK)) {
             page_size = 4096 * 1024;
-            if (is_user) {
-                if (!(pde & PG_USER_MASK))
+            switch (mmu_idx) {
+            case MMU_USER_IDX:
+                if (!(pde & PG_USER_MASK)) {
                     goto do_fault_protect;
-                if (is_write && !(pde & PG_RW_MASK))
+                }
+                if (is_write && !(pde & PG_RW_MASK)) {
                     goto do_fault_protect;
-            } else {
+                }
+                break;
+
+            case MMU_KERNEL_IDX:
+                if (is_write1 != 2 && (env->cr[4] & CR4_SMAP_MASK) &&
+                    (pde & PG_USER_MASK)) {
+                    goto do_fault_protect;
+                }
+                /* fall through */
+            case MMU_KSMAP_IDX:
+                if (is_write1 == 2 && (env->cr[4] & CR4_SMEP_MASK) &&
+                    (pde & PG_USER_MASK)) {
+                    goto do_fault_protect;
+                }
                 if ((env->cr[0] & CR0_WP_MASK) &&
-                    is_write && !(pde & PG_RW_MASK))
+                    is_write && !(pde & PG_RW_MASK)) {
                     goto do_fault_protect;
+                }
+                break;
+
+            default: /* cannot happen */
+                break;
             }
             is_dirty = is_write && !(pde & PG_DIRTY_MASK);
             if (!(pde & PG_ACCESSED_MASK) || is_dirty) {
@@ -707,15 +778,35 @@ int cpu_x86_handle_mmu_fault(CPUX86State *env, target_ulong addr,
             }
             /* combine pde and pte user and rw protections */
             ptep = pte & pde;
-            if (is_user) {
-                if (!(ptep & PG_USER_MASK))
+            switch (mmu_idx) {
+            case MMU_USER_IDX:
+                if (!(ptep & PG_USER_MASK)) {
                     goto do_fault_protect;
-                if (is_write && !(ptep & PG_RW_MASK))
+                }
+                if (is_write && !(ptep & PG_RW_MASK)) {
                     goto do_fault_protect;
-            } else {
+                }
+                break;
+
+            case MMU_KERNEL_IDX:
+                if (is_write1 != 2 && (env->cr[4] & CR4_SMAP_MASK) &&
+                    (ptep & PG_USER_MASK)) {
+                    goto do_fault_protect;
+                }
+                /* fall through */
+            case MMU_KSMAP_IDX:
+                if (is_write1 == 2 && (env->cr[4] & CR4_SMEP_MASK) &&
+                    (ptep & PG_USER_MASK)) {
+                    goto do_fault_protect;
+                }
                 if ((env->cr[0] & CR0_WP_MASK) &&
-                    is_write && !(ptep & PG_RW_MASK))
+                    is_write && !(ptep & PG_RW_MASK)) {
                     goto do_fault_protect;
+                }
+                break;
+
+            default: /* cannot happen */
+                break;
             }
             is_dirty = is_write && !(pte & PG_DIRTY_MASK);
             if (!(pte & PG_ACCESSED_MASK) || is_dirty) {
@@ -762,8 +853,9 @@ int cpu_x86_handle_mmu_fault(CPUX86State *env, target_ulong addr,
     if (is_user)
         error_code |= PG_ERROR_U_MASK;
     if (is_write1 == 2 &&
-        (env->efer & MSR_EFER_NXE) &&
-        (env->cr[4] & CR4_PAE_MASK))
+        (((env->efer & MSR_EFER_NXE) &&
+          (env->cr[4] & CR4_PAE_MASK)) ||
+         (env->cr[4] & CR4_SMEP_MASK)))
         error_code |= PG_ERROR_I_D_MASK;
     if (env->intercept_exceptions & (1 << EXCP0E_PAGE)) {
         /* cr2 is not modified in case of exceptions */
diff --git a/target-i386/helper.h b/target-i386/helper.h
index ab6af638e..93850ceec 100644
--- a/target-i386/helper.h
+++ b/target-i386/helper.h
@@ -67,6 +67,8 @@ DEF_HELPER_3(raise_interrupt, void, env, int, int)
 DEF_HELPER_2(raise_exception, void, env, int)
 DEF_HELPER_1(cli, void, env)
 DEF_HELPER_1(sti, void, env)
+DEF_HELPER_1(clac, void, env)
+DEF_HELPER_1(stac, void, env)
 DEF_HELPER_1(set_inhibit_irq, void, env)
 DEF_HELPER_1(reset_inhibit_irq, void, env)
 DEF_HELPER_3(boundw, void, env, tl, int)
diff --git a/target-i386/translate.c b/target-i386/translate.c
index 323869d87..0a7e4e348 100644
--- a/target-i386/translate.c
+++ b/target-i386/translate.c
@@ -107,6 +107,7 @@ typedef struct DisasContext {
     int cpuid_ext_features;
     int cpuid_ext2_features;
     int cpuid_ext3_features;
+    int cpuid_7_0_ebx_features;
 } DisasContext;
 
 static void gen_eob(DisasContext *s);
@@ -6556,7 +6557,7 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
             }
             gen_pop_update(s);
             s->cc_op = CC_OP_EFLAGS;
-            /* abort translation because TF flag may change */
+            /* abort translation because TF/AC flag may change */
             gen_jmp_im(s->pc - s->cs_base);
             gen_eob(s);
         }
@@ -7206,6 +7207,24 @@ static target_ulong disas_insn(DisasContext *s, target_ulong pc_start)
                     gen_helper_mwait(cpu_env, tcg_const_i32(s->pc - pc_start));
                     gen_eob(s);
                     break;
+                case 2: /* clac */
+                    if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_SMAP) ||
+                        s->cpl != 0) {
+                        goto illegal_op;
+                    }
+                    gen_helper_clac(cpu_env);
+                    gen_jmp_im(s->pc - s->cs_base);
+                    gen_eob(s);
+                    break;
+                case 3: /* stac */
+                    if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_SMAP) ||
+                        s->cpl != 0) {
+                        goto illegal_op;
+                    }
+                    gen_helper_stac(cpu_env);
+                    gen_jmp_im(s->pc - s->cs_base);
+                    gen_eob(s);
+                    break;
                 default:
                     goto illegal_op;
                 }
@@ -7901,15 +7920,13 @@ static inline void gen_intermediate_code_internal(CPUX86State *env,
     /* select memory access functions */
     dc->mem_index = 0;
     if (flags & HF_SOFTMMU_MASK) {
-        if (dc->cpl == 3)
-            dc->mem_index = 2 * 4;
-        else
-            dc->mem_index = 1 * 4;
+        dc->mem_index = (cpu_mmu_index(env) + 1) << 2;
     }
     dc->cpuid_features = env->cpuid_features;
     dc->cpuid_ext_features = env->cpuid_ext_features;
     dc->cpuid_ext2_features = env->cpuid_ext2_features;
     dc->cpuid_ext3_features = env->cpuid_ext3_features;
+    dc->cpuid_7_0_ebx_features = env->cpuid_7_0_ebx_features;
 #ifdef TARGET_X86_64
     dc->lma = (flags >> HF_LMA_SHIFT) & 1;
     dc->code64 = (flags >> HF_CS64_SHIFT) & 1;
-- 
cgit v1.2.3


From df8c1b0207689a4bcc2396bf5327d5ea135a45bb Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Wed, 26 Sep 2012 11:19:18 -0600
Subject: Update kernel header script to include vfio

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 scripts/update-linux-headers.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh
index 53a6f8765..67be2efec 100755
--- a/scripts/update-linux-headers.sh
+++ b/scripts/update-linux-headers.sh
@@ -57,7 +57,7 @@ done
 
 rm -rf "$output/linux-headers/linux"
 mkdir -p "$output/linux-headers/linux"
-for header in kvm.h kvm_para.h vhost.h virtio_config.h virtio_ring.h; do
+for header in kvm.h kvm_para.h vfio.h vhost.h virtio_config.h virtio_ring.h; do
     cp "$tmpdir/include/linux/$header" "$output/linux-headers/linux"
 done
 rm -rf "$output/linux-headers/asm-generic"
-- 
cgit v1.2.3


From 883f0b85f0cbb8c31baeadfcbfd144a1ed29d935 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Wed, 26 Sep 2012 11:19:25 -0600
Subject: Update Linux kernel headers

Based on Linux as of 1a95620.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 linux-headers/linux/vfio.h | 368 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 368 insertions(+)
 create mode 100644 linux-headers/linux/vfio.h

diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
new file mode 100644
index 000000000..f787b727a
--- /dev/null
+++ b/linux-headers/linux/vfio.h
@@ -0,0 +1,368 @@
+/*
+ * VFIO API definition
+ *
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef VFIO_H
+#define VFIO_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define VFIO_API_VERSION	0
+
+
+/* Kernel & User level defines for VFIO IOCTLs. */
+
+/* Extensions */
+
+#define VFIO_TYPE1_IOMMU		1
+
+/*
+ * The IOCTL interface is designed for extensibility by embedding the
+ * structure length (argsz) and flags into structures passed between
+ * kernel and userspace.  We therefore use the _IO() macro for these
+ * defines to avoid implicitly embedding a size into the ioctl request.
+ * As structure fields are added, argsz will increase to match and flag
+ * bits will be defined to indicate additional fields with valid data.
+ * It's *always* the caller's responsibility to indicate the size of
+ * the structure passed by setting argsz appropriately.
+ */
+
+#define VFIO_TYPE	(';')
+#define VFIO_BASE	100
+
+/* -------- IOCTLs for VFIO file descriptor (/dev/vfio/vfio) -------- */
+
+/**
+ * VFIO_GET_API_VERSION - _IO(VFIO_TYPE, VFIO_BASE + 0)
+ *
+ * Report the version of the VFIO API.  This allows us to bump the entire
+ * API version should we later need to add or change features in incompatible
+ * ways.
+ * Return: VFIO_API_VERSION
+ * Availability: Always
+ */
+#define VFIO_GET_API_VERSION		_IO(VFIO_TYPE, VFIO_BASE + 0)
+
+/**
+ * VFIO_CHECK_EXTENSION - _IOW(VFIO_TYPE, VFIO_BASE + 1, __u32)
+ *
+ * Check whether an extension is supported.
+ * Return: 0 if not supported, 1 (or some other positive integer) if supported.
+ * Availability: Always
+ */
+#define VFIO_CHECK_EXTENSION		_IO(VFIO_TYPE, VFIO_BASE + 1)
+
+/**
+ * VFIO_SET_IOMMU - _IOW(VFIO_TYPE, VFIO_BASE + 2, __s32)
+ *
+ * Set the iommu to the given type.  The type must be supported by an
+ * iommu driver as verified by calling CHECK_EXTENSION using the same
+ * type.  A group must be set to this file descriptor before this
+ * ioctl is available.  The IOMMU interfaces enabled by this call are
+ * specific to the value set.
+ * Return: 0 on success, -errno on failure
+ * Availability: When VFIO group attached
+ */
+#define VFIO_SET_IOMMU			_IO(VFIO_TYPE, VFIO_BASE + 2)
+
+/* -------- IOCTLs for GROUP file descriptors (/dev/vfio/$GROUP) -------- */
+
+/**
+ * VFIO_GROUP_GET_STATUS - _IOR(VFIO_TYPE, VFIO_BASE + 3,
+ *						struct vfio_group_status)
+ *
+ * Retrieve information about the group.  Fills in provided
+ * struct vfio_group_status.  Caller sets argsz.
+ * Return: 0 on success, -errno on failure.
+ * Availability: Always
+ */
+struct vfio_group_status {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_GROUP_FLAGS_VIABLE		(1 << 0)
+#define VFIO_GROUP_FLAGS_CONTAINER_SET	(1 << 1)
+};
+#define VFIO_GROUP_GET_STATUS		_IO(VFIO_TYPE, VFIO_BASE + 3)
+
+/**
+ * VFIO_GROUP_SET_CONTAINER - _IOW(VFIO_TYPE, VFIO_BASE + 4, __s32)
+ *
+ * Set the container for the VFIO group to the open VFIO file
+ * descriptor provided.  Groups may only belong to a single
+ * container.  Containers may, at their discretion, support multiple
+ * groups.  Only when a container is set are all of the interfaces
+ * of the VFIO file descriptor and the VFIO group file descriptor
+ * available to the user.
+ * Return: 0 on success, -errno on failure.
+ * Availability: Always
+ */
+#define VFIO_GROUP_SET_CONTAINER	_IO(VFIO_TYPE, VFIO_BASE + 4)
+
+/**
+ * VFIO_GROUP_UNSET_CONTAINER - _IO(VFIO_TYPE, VFIO_BASE + 5)
+ *
+ * Remove the group from the attached container.  This is the
+ * opposite of the SET_CONTAINER call and returns the group to
+ * an initial state.  All device file descriptors must be released
+ * prior to calling this interface.  When removing the last group
+ * from a container, the IOMMU will be disabled and all state lost,
+ * effectively also returning the VFIO file descriptor to an initial
+ * state.
+ * Return: 0 on success, -errno on failure.
+ * Availability: When attached to container
+ */
+#define VFIO_GROUP_UNSET_CONTAINER	_IO(VFIO_TYPE, VFIO_BASE + 5)
+
+/**
+ * VFIO_GROUP_GET_DEVICE_FD - _IOW(VFIO_TYPE, VFIO_BASE + 6, char)
+ *
+ * Return a new file descriptor for the device object described by
+ * the provided string.  The string should match a device listed in
+ * the devices subdirectory of the IOMMU group sysfs entry.  The
+ * group containing the device must already be added to this context.
+ * Return: new file descriptor on success, -errno on failure.
+ * Availability: When attached to container
+ */
+#define VFIO_GROUP_GET_DEVICE_FD	_IO(VFIO_TYPE, VFIO_BASE + 6)
+
+/* --------------- IOCTLs for DEVICE file descriptors --------------- */
+
+/**
+ * VFIO_DEVICE_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 7,
+ *						struct vfio_device_info)
+ *
+ * Retrieve information about the device.  Fills in provided
+ * struct vfio_device_info.  Caller sets argsz.
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_device_info {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_DEVICE_FLAGS_RESET	(1 << 0)	/* Device supports reset */
+#define VFIO_DEVICE_FLAGS_PCI	(1 << 1)	/* vfio-pci device */
+	__u32	num_regions;	/* Max region index + 1 */
+	__u32	num_irqs;	/* Max IRQ index + 1 */
+};
+#define VFIO_DEVICE_GET_INFO		_IO(VFIO_TYPE, VFIO_BASE + 7)
+
+/**
+ * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8,
+ *				       struct vfio_region_info)
+ *
+ * Retrieve information about a device region.  Caller provides
+ * struct vfio_region_info with index value set.  Caller sets argsz.
+ * Implementation of region mapping is bus driver specific.  This is
+ * intended to describe MMIO, I/O port, as well as bus specific
+ * regions (ex. PCI config space).  Zero sized regions may be used
+ * to describe unimplemented regions (ex. unimplemented PCI BARs).
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_region_info {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_REGION_INFO_FLAG_READ	(1 << 0) /* Region supports read */
+#define VFIO_REGION_INFO_FLAG_WRITE	(1 << 1) /* Region supports write */
+#define VFIO_REGION_INFO_FLAG_MMAP	(1 << 2) /* Region supports mmap */
+	__u32	index;		/* Region index */
+	__u32	resv;		/* Reserved for alignment */
+	__u64	size;		/* Region size (bytes) */
+	__u64	offset;		/* Region offset from start of device fd */
+};
+#define VFIO_DEVICE_GET_REGION_INFO	_IO(VFIO_TYPE, VFIO_BASE + 8)
+
+/**
+ * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
+ *				    struct vfio_irq_info)
+ *
+ * Retrieve information about a device IRQ.  Caller provides
+ * struct vfio_irq_info with index value set.  Caller sets argsz.
+ * Implementation of IRQ mapping is bus driver specific.  Indexes
+ * using multiple IRQs are primarily intended to support MSI-like
+ * interrupt blocks.  Zero count irq blocks may be used to describe
+ * unimplemented interrupt types.
+ *
+ * The EVENTFD flag indicates the interrupt index supports eventfd based
+ * signaling.
+ *
+ * The MASKABLE flag indicates the index supports MASK and UNMASK
+ * actions described below.
+ *
+ * AUTOMASKED indicates that after signaling, the interrupt line is
+ * automatically masked by VFIO and the user needs to unmask the line
+ * to receive new interrupts.  This is primarily intended to distinguish
+ * level triggered interrupts.
+ *
+ * The NORESIZE flag indicates that the interrupt lines within the index
+ * are setup as a set and new subindexes cannot be enabled without first
+ * disabling the entire index.  This is used for interrupts like PCI MSI
+ * and MSI-X where the driver may only use a subset of the available
+ * indexes, but VFIO needs to enable a specific number of vectors
+ * upfront.  In the case of MSI-X, where the user can enable MSI-X and
+ * then add and unmask vectors, it's up to userspace to make the decision
+ * whether to allocate the maximum supported number of vectors or tear
+ * down setup and incrementally increase the vectors as each is enabled.
+ */
+struct vfio_irq_info {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_IRQ_INFO_EVENTFD		(1 << 0)
+#define VFIO_IRQ_INFO_MASKABLE		(1 << 1)
+#define VFIO_IRQ_INFO_AUTOMASKED	(1 << 2)
+#define VFIO_IRQ_INFO_NORESIZE		(1 << 3)
+	__u32	index;		/* IRQ index */
+	__u32	count;		/* Number of IRQs within this index */
+};
+#define VFIO_DEVICE_GET_IRQ_INFO	_IO(VFIO_TYPE, VFIO_BASE + 9)
+
+/**
+ * VFIO_DEVICE_SET_IRQS - _IOW(VFIO_TYPE, VFIO_BASE + 10, struct vfio_irq_set)
+ *
+ * Set signaling, masking, and unmasking of interrupts.  Caller provides
+ * struct vfio_irq_set with all fields set.  'start' and 'count' indicate
+ * the range of subindexes being specified.
+ *
+ * The DATA flags specify the type of data provided.  If DATA_NONE, the
+ * operation performs the specified action immediately on the specified
+ * interrupt(s).  For example, to unmask AUTOMASKED interrupt [0,0]:
+ * flags = (DATA_NONE|ACTION_UNMASK), index = 0, start = 0, count = 1.
+ *
+ * DATA_BOOL allows sparse support for the same on arrays of interrupts.
+ * For example, to mask interrupts [0,1] and [0,3] (but not [0,2]):
+ * flags = (DATA_BOOL|ACTION_MASK), index = 0, start = 1, count = 3,
+ * data = {1,0,1}
+ *
+ * DATA_EVENTFD binds the specified ACTION to the provided __s32 eventfd.
+ * A value of -1 can be used to either de-assign interrupts if already
+ * assigned or skip un-assigned interrupts.  For example, to set an eventfd
+ * to be triggered for interrupts [0,0] and [0,2]:
+ * flags = (DATA_EVENTFD|ACTION_TRIGGER), index = 0, start = 0, count = 3,
+ * data = {fd1, -1, fd2}
+ * If index [0,1] is previously set, two count = 1 ioctl calls would be
+ * required to set [0,0] and [0,2] without changing [0,1].
+ *
+ * Once a signaling mechanism is set, DATA_BOOL or DATA_NONE can be used
+ * with ACTION_TRIGGER to perform kernel level interrupt loopback testing
+ * from userspace (ie. simulate hardware triggering).
+ *
+ * Setting of an event triggering mechanism to userspace for ACTION_TRIGGER
+ * enables the interrupt index for the device.  Individual subindex interrupts
+ * can be disabled using the -1 value for DATA_EVENTFD or the index can be
+ * disabled as a whole with: flags = (DATA_NONE|ACTION_TRIGGER), count = 0.
+ *
+ * Note that ACTION_[UN]MASK specify user->kernel signaling (irqfds) while
+ * ACTION_TRIGGER specifies kernel->user signaling.
+ */
+struct vfio_irq_set {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_IRQ_SET_DATA_NONE		(1 << 0) /* Data not present */
+#define VFIO_IRQ_SET_DATA_BOOL		(1 << 1) /* Data is bool (u8) */
+#define VFIO_IRQ_SET_DATA_EVENTFD	(1 << 2) /* Data is eventfd (s32) */
+#define VFIO_IRQ_SET_ACTION_MASK	(1 << 3) /* Mask interrupt */
+#define VFIO_IRQ_SET_ACTION_UNMASK	(1 << 4) /* Unmask interrupt */
+#define VFIO_IRQ_SET_ACTION_TRIGGER	(1 << 5) /* Trigger interrupt */
+	__u32	index;
+	__u32	start;
+	__u32	count;
+	__u8	data[];
+};
+#define VFIO_DEVICE_SET_IRQS		_IO(VFIO_TYPE, VFIO_BASE + 10)
+
+#define VFIO_IRQ_SET_DATA_TYPE_MASK	(VFIO_IRQ_SET_DATA_NONE | \
+					 VFIO_IRQ_SET_DATA_BOOL | \
+					 VFIO_IRQ_SET_DATA_EVENTFD)
+#define VFIO_IRQ_SET_ACTION_TYPE_MASK	(VFIO_IRQ_SET_ACTION_MASK | \
+					 VFIO_IRQ_SET_ACTION_UNMASK | \
+					 VFIO_IRQ_SET_ACTION_TRIGGER)
+/**
+ * VFIO_DEVICE_RESET - _IO(VFIO_TYPE, VFIO_BASE + 11)
+ *
+ * Reset a device.
+ */
+#define VFIO_DEVICE_RESET		_IO(VFIO_TYPE, VFIO_BASE + 11)
+
+/*
+ * The VFIO-PCI bus driver makes use of the following fixed region and
+ * IRQ index mapping.  Unimplemented regions return a size of zero.
+ * Unimplemented IRQ types return a count of zero.
+ */
+
+enum {
+	VFIO_PCI_BAR0_REGION_INDEX,
+	VFIO_PCI_BAR1_REGION_INDEX,
+	VFIO_PCI_BAR2_REGION_INDEX,
+	VFIO_PCI_BAR3_REGION_INDEX,
+	VFIO_PCI_BAR4_REGION_INDEX,
+	VFIO_PCI_BAR5_REGION_INDEX,
+	VFIO_PCI_ROM_REGION_INDEX,
+	VFIO_PCI_CONFIG_REGION_INDEX,
+	VFIO_PCI_NUM_REGIONS
+};
+
+enum {
+	VFIO_PCI_INTX_IRQ_INDEX,
+	VFIO_PCI_MSI_IRQ_INDEX,
+	VFIO_PCI_MSIX_IRQ_INDEX,
+	VFIO_PCI_NUM_IRQS
+};
+
+/* -------- API for Type1 VFIO IOMMU -------- */
+
+/**
+ * VFIO_IOMMU_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 12, struct vfio_iommu_info)
+ *
+ * Retrieve information about the IOMMU object. Fills in provided
+ * struct vfio_iommu_info. Caller sets argsz.
+ *
+ * XXX Should we do these by CHECK_EXTENSION too?
+ */
+struct vfio_iommu_type1_info {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_IOMMU_INFO_PGSIZES (1 << 0)	/* supported page sizes info */
+	__u64	iova_pgsizes;		/* Bitmap of supported page sizes */
+};
+
+#define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/**
+ * VFIO_IOMMU_MAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 13, struct vfio_dma_map)
+ *
+ * Map process virtual addresses to IO virtual addresses using the
+ * provided struct vfio_dma_map. Caller sets argsz. READ &/ WRITE required.
+ */
+struct vfio_iommu_type1_dma_map {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_DMA_MAP_FLAG_READ (1 << 0)		/* readable from device */
+#define VFIO_DMA_MAP_FLAG_WRITE (1 << 1)	/* writable from device */
+	__u64	vaddr;				/* Process virtual address */
+	__u64	iova;				/* IO virtual address */
+	__u64	size;				/* Size of mapping (bytes) */
+};
+
+#define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
+
+/**
+ * VFIO_IOMMU_UNMAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 14, struct vfio_dma_unmap)
+ *
+ * Unmap IO virtual addresses using the provided struct vfio_dma_unmap.
+ * Caller sets argsz.
+ */
+struct vfio_iommu_type1_dma_unmap {
+	__u32	argsz;
+	__u32	flags;
+	__u64	iova;				/* IO virtual address */
+	__u64	size;				/* Size of mapping (bytes) */
+};
+
+#define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
+
+#endif /* VFIO_H */
-- 
cgit v1.2.3


From 65501a745dbaf28284e099f724aa5fc478eae0e7 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Wed, 26 Sep 2012 11:19:32 -0600
Subject: vfio: vfio-pci device assignment driver

This adds the core of the QEMU VFIO-based PCI device assignment driver.
To make use of this driver, enable CONFIG_VFIO, CONFIG_VFIO_IOMMU_TYPE1,
and CONFIG_VFIO_PCI in your host Linux kernel config.  Load the vfio-pci
module.  To assign device 0000:05:00.0 to a guest, do the following:

for dev in $(ls /sys/bus/pci/devices/0000:05:00.0/iommu_group/devices); do
    vendor=$(cat /sys/bus/pci/devices/$dev/vendor)
    device=$(cat /sys/bus/pci/devices/$dev/device)
    if [ -e /sys/bus/pci/devices/$dev/driver ]; then
        echo $dev > /sys/bus/pci/devices/$dev/driver/unbind
    fi
    echo $vendor $device > /sys/bus/pci/drivers/vfio-pci/new_id
done

See Documentation/vfio.txt in the Linux kernel tree for further
description of IOMMU groups and VFIO.

Then launch qemu including the option:

-device vfio-pci,host=0000:05:00.0

Legacy PCI interrupts (INTx) currently make use of a kludge where we
trap BAR accesses and assume the access is in response to an interrupt,
therefore de-asserting and unmasking the interrupt.  It's not quite as
targeted as using the EOI for this, but it's self-contained and seems
to work across all architectures.  The side-effect is a significant
performance slow-down for devices in INTx mode.  Some devices, like
graphics cards, don't really use their interrupt, so this can be turned
off with the x-intx=off option, which disables INTx altogether.  This
should be considered an experimental option until we refine this code.
Both MSI and MSI-X are supported and avoid these issues.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 hw/vfio_pci.c     | 1864 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/vfio_pci_int.h |  114 ++++
 2 files changed, 1978 insertions(+)
 create mode 100644 hw/vfio_pci.c
 create mode 100644 hw/vfio_pci_int.h

diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
new file mode 100644
index 000000000..a24558aaa
--- /dev/null
+++ b/hw/vfio_pci.c
@@ -0,0 +1,1864 @@
+/*
+ * vfio based device assignment support
+ *
+ * Copyright Red Hat, Inc. 2012
+ *
+ * Authors:
+ *  Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Based on qemu-kvm device-assignment:
+ *  Adapted for KVM by Qumranet.
+ *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
+ *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
+ *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
+ *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
+ *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
+ */
+
+#include <dirent.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <linux/vfio.h>
+
+#include "config.h"
+#include "event_notifier.h"
+#include "exec-memory.h"
+#include "kvm.h"
+#include "memory.h"
+#include "msi.h"
+#include "msix.h"
+#include "qemu-error.h"
+#include "range.h"
+#include "vfio_pci_int.h"
+
+/* #define DEBUG_VFIO */
+#ifdef DEBUG_VFIO
+#define DPRINTF(fmt, ...) \
+    do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+    do { } while (0)
+#endif
+
+#define MSIX_CAP_LENGTH 12
+
+static QLIST_HEAD(, VFIOContainer)
+    container_list = QLIST_HEAD_INITIALIZER(container_list);
+
+static QLIST_HEAD(, VFIOGroup)
+    group_list = QLIST_HEAD_INITIALIZER(group_list);
+
+static void vfio_disable_interrupts(VFIODevice *vdev);
+static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
+static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled);
+
+/*
+ * Common VFIO interrupt disable
+ */
+static void vfio_disable_irqindex(VFIODevice *vdev, int index)
+{
+    struct vfio_irq_set irq_set = {
+        .argsz = sizeof(irq_set),
+        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
+        .index = index,
+        .start = 0,
+        .count = 0,
+    };
+
+    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+
+    vdev->interrupt = VFIO_INT_NONE;
+}
+
+/*
+ * INTx
+ */
+static void vfio_unmask_intx(VFIODevice *vdev)
+{
+    struct vfio_irq_set irq_set = {
+        .argsz = sizeof(irq_set),
+        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
+        .index = VFIO_PCI_INTX_IRQ_INDEX,
+        .start = 0,
+        .count = 1,
+    };
+
+    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+}
+
+static void vfio_intx_interrupt(void *opaque)
+{
+    VFIODevice *vdev = opaque;
+
+    if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
+        return;
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) Pin %c\n", __func__, vdev->host.domain,
+            vdev->host.bus, vdev->host.slot, vdev->host.function,
+            'A' + vdev->intx.pin);
+
+    vdev->intx.pending = true;
+    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 1);
+}
+
+static void vfio_eoi(VFIODevice *vdev)
+{
+    if (!vdev->intx.pending) {
+        return;
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) EOI\n", __func__, vdev->host.domain,
+            vdev->host.bus, vdev->host.slot, vdev->host.function);
+
+    vdev->intx.pending = false;
+    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
+    vfio_unmask_intx(vdev);
+}
+
+typedef struct QEMU_PACKED VFIOIRQSetFD {
+    struct vfio_irq_set irq_set;
+    int32_t fd;
+} VFIOIRQSetFD;
+
+/* Wire host INTx to an eventfd handler; returns 0 on success or skip. */
+static int vfio_enable_intx(VFIODevice *vdev)
+{
+    VFIOIRQSetFD irq_set_fd = {
+        .irq_set = {
+            .argsz = sizeof(irq_set_fd),
+            .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
+            .index = VFIO_PCI_INTX_IRQ_INDEX,
+            .start = 0,
+            .count = 1,
+        },
+    };
+    uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
+    int ret;
+
+    if (vdev->intx.disabled || !pin) {
+        return 0;
+    }
+
+    vfio_disable_interrupts(vdev);
+    vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
+    ret = event_notifier_init(&vdev->intx.interrupt, 0);
+    if (ret) {
+        error_report("vfio: Error: event_notifier_init failed\n");
+        return ret;
+    }
+
+    irq_set_fd.fd = event_notifier_get_fd(&vdev->intx.interrupt);
+    qemu_set_fd_handler(irq_set_fd.fd, vfio_intx_interrupt, NULL, vdev);
+
+    if (ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set_fd)) {
+        ret = -errno; /* save it: the cleanup below may clobber errno */
+        error_report("vfio: Error: Failed to setup INTx fd: %m\n");
+        qemu_set_fd_handler(irq_set_fd.fd, NULL, NULL, vdev);
+        event_notifier_cleanup(&vdev->intx.interrupt);
+        return ret;
+    }
+
+    /*
+     * Disable mmaps so we can trap on BAR accesses.  Any access is taken as
+     * a response to an interrupt and unmasks the physical device, which
+     * re-asserts if still pending.  Expect several host interrupts per guest
+     * interrupt; acceleration paths through KVM will avoid this.
+     */
+    vfio_mmap_set_enabled(vdev, false);
+    vdev->interrupt = VFIO_INT_INTx;
+
+    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
+            vdev->host.bus, vdev->host.slot, vdev->host.function);
+
+    return 0;
+}
+
+static void vfio_disable_intx(VFIODevice *vdev)
+{
+    int fd;
+
+    vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
+    vdev->intx.pending = false;
+    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
+    vfio_mmap_set_enabled(vdev, true);
+
+    fd = event_notifier_get_fd(&vdev->intx.interrupt);
+    qemu_set_fd_handler(fd, NULL, NULL, vdev);
+    event_notifier_cleanup(&vdev->intx.interrupt);
+
+    vdev->interrupt = VFIO_INT_NONE;
+
+    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
+            vdev->host.bus, vdev->host.slot, vdev->host.function);
+}
+
+/*
+ * MSI/X
+ */
+static void vfio_msi_interrupt(void *opaque)
+{
+    VFIOMSIVector *vector = opaque;
+    VFIODevice *vdev = vector->vdev;
+    int nr = vector - vdev->msi_vectors;
+
+    if (!event_notifier_test_and_clear(&vector->interrupt)) {
+        return;
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) vector %d\n", __func__,
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, nr);
+
+    if (vdev->interrupt == VFIO_INT_MSIX) {
+        msix_notify(&vdev->pdev, nr);
+    } else if (vdev->interrupt == VFIO_INT_MSI) {
+        msi_notify(&vdev->pdev, nr);
+    } else {
+        error_report("vfio: MSI interrupt receieved, but not enabled?\n");
+    }
+}
+
+static int vfio_enable_vectors(VFIODevice *vdev, bool msix)
+{
+    struct vfio_irq_set *irq_set;
+    int ret = 0, i, argsz;
+    int32_t *fds;
+
+    argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
+
+    irq_set = g_malloc0(argsz);
+    irq_set->argsz = argsz;
+    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
+    irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
+    irq_set->start = 0;
+    irq_set->count = vdev->nr_vectors;
+    fds = (int32_t *)&irq_set->data;
+
+    for (i = 0; i < vdev->nr_vectors; i++) {
+        if (!vdev->msi_vectors[i].use) {
+            fds[i] = -1;
+            continue;
+        }
+
+        fds[i] = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
+    }
+
+    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+    g_free(irq_set);
+
+    if (!ret) {
+        vdev->interrupt = msix ? VFIO_INT_MSIX : VFIO_INT_MSI;
+    }
+
+    return ret;
+}
+
+static int vfio_msix_vector_use(PCIDevice *pdev,
+                                unsigned int nr, MSIMessage msg)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    VFIOMSIVector *vector;
+    int ret;
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) vector %d used\n", __func__,
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, nr);
+
+    if (vdev->interrupt != VFIO_INT_MSIX) {
+        vfio_disable_interrupts(vdev);
+    }
+
+    if (!vdev->msi_vectors) {
+        vdev->msi_vectors = g_malloc0(vdev->msix->entries *
+                                      sizeof(VFIOMSIVector));
+    }
+
+    vector = &vdev->msi_vectors[nr];
+    vector->vdev = vdev;
+    vector->use = true;
+
+    msix_vector_use(pdev, nr);
+
+    if (event_notifier_init(&vector->interrupt, 0)) {
+        error_report("vfio: Error: event_notifier_init failed\n");
+    }
+
+    /*
+     * Attempt to enable route through KVM irqchip,
+     * default to userspace handling if unavailable.
+     */
+    vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg);
+    if (vector->virq < 0 ||
+        kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
+                                       vector->virq) < 0) {
+        if (vector->virq >= 0) {
+            kvm_irqchip_release_virq(kvm_state, vector->virq);
+            vector->virq = -1;
+        }
+        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
+                            vfio_msi_interrupt, NULL, vector);
+    }
+
+    /*
+     * We don't want to have the host allocate all possible MSI vectors
+     * for a device if they're not in use, so we shutdown and incrementally
+     * increase them as needed.
+     */
+    if (vdev->nr_vectors < nr + 1) {
+        int i;
+
+        vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX);
+        vdev->nr_vectors = nr + 1;
+        ret = vfio_enable_vectors(vdev, true);
+        if (ret) {
+            error_report("vfio: failed to enable vectors, %d\n", ret);
+        }
+
+        /* We don't know if we've missed interrupts in the interim... */
+        for (i = 0; i < vdev->msix->entries; i++) {
+            if (vdev->msi_vectors[i].use) {
+                msix_notify(&vdev->pdev, i);
+            }
+        }
+    } else {
+        VFIOIRQSetFD irq_set_fd = {
+            .irq_set = {
+                .argsz = sizeof(irq_set_fd),
+                .flags = VFIO_IRQ_SET_DATA_EVENTFD |
+                         VFIO_IRQ_SET_ACTION_TRIGGER,
+                .index = VFIO_PCI_MSIX_IRQ_INDEX,
+                .start = nr,
+                .count = 1,
+            },
+            .fd = event_notifier_get_fd(&vector->interrupt),
+        };
+        ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set_fd);
+        if (ret) {
+            error_report("vfio: failed to modify vector, %d\n", ret);
+        }
+
+        /*
+         * If we were connected to the hardware PBA we could skip this,
+         * until then, a spurious interrupt is better than starvation.
+         */
+        msix_notify(&vdev->pdev, nr);
+    }
+
+    return 0;
+}
+
+static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    VFIOMSIVector *vector = &vdev->msi_vectors[nr];
+    VFIOIRQSetFD irq_set_fd = {
+        .irq_set = {
+            .argsz = sizeof(irq_set_fd),
+            .flags = VFIO_IRQ_SET_DATA_EVENTFD |
+                     VFIO_IRQ_SET_ACTION_TRIGGER,
+            .index = VFIO_PCI_MSIX_IRQ_INDEX,
+            .start = nr,
+            .count = 1,
+        },
+        .fd = -1,
+    };
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) vector %d released\n", __func__,
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, nr);
+
+    /*
+     * XXX What's the right thing to do here?  This turns off the interrupt
+     * completely, but do we really just want to switch the interrupt to
+     * bouncing through userspace and let msix.c drop it?  Not sure.
+     */
+    msix_vector_unuse(pdev, nr);
+    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set_fd);
+
+    if (vector->virq < 0) {
+        qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
+                            NULL, NULL, NULL);
+    } else {
+        kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->interrupt,
+                                          vector->virq);
+        kvm_irqchip_release_virq(kvm_state, vector->virq);
+        vector->virq = -1;
+    }
+
+    event_notifier_cleanup(&vector->interrupt);
+    vector->use = false;
+}
+
+/* TODO This should move to msi.c */
+static MSIMessage msi_get_msg(PCIDevice *pdev, unsigned int vector)
+{
+    uint16_t flags = pci_get_word(pdev->config + pdev->msi_cap + PCI_MSI_FLAGS);
+    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
+    MSIMessage msg;
+
+    if (msi64bit) {
+        msg.address = pci_get_quad(pdev->config +
+                                   pdev->msi_cap + PCI_MSI_ADDRESS_LO);
+    } else {
+        msg.address = pci_get_long(pdev->config +
+                                   pdev->msi_cap + PCI_MSI_ADDRESS_LO);
+    }
+
+    msg.data = pci_get_word(pdev->config + pdev->msi_cap +
+                            (msi64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32));
+    msg.data += vector;
+
+    return msg;
+}
+
+/*
+ * Set the QSIZE field (bits 6:4) of the emulated MSI flags word; the pci
+ * word accessors handle byte order.  TODO This should move to msi.c too.
+ */
+static void msi_set_qsize(PCIDevice *pdev, uint8_t size)
+{
+    uint8_t *config = pdev->config + pdev->msi_cap;
+    uint16_t flags = pci_get_word(config + PCI_MSI_FLAGS); /* host-endian */
+
+    flags &= ~PCI_MSI_FLAGS_QSIZE;
+    flags |= (size & 0x7) << 4;
+    pci_set_word(config + PCI_MSI_FLAGS, flags);
+}
+
+/* Enable MSI: one eventfd per allocated vector, retrying with fewer. */
+static void vfio_enable_msi(VFIODevice *vdev)
+{
+    int ret, i;
+
+    vfio_disable_interrupts(vdev);
+
+    vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
+retry:
+    vdev->msi_vectors = g_malloc0(vdev->nr_vectors * sizeof(VFIOMSIVector));
+
+    for (i = 0; i < vdev->nr_vectors; i++) {
+        MSIMessage msg = msi_get_msg(&vdev->pdev, i);
+        VFIOMSIVector *vector = &vdev->msi_vectors[i];
+
+        vector->vdev = vdev;
+        vector->use = true;
+
+        if (event_notifier_init(&vector->interrupt, 0)) {
+            error_report("vfio: Error: event_notifier_init failed\n");
+        }
+
+        /* Try a KVM irqchip route, else fall back to userspace handling */
+        vector->virq = kvm_irqchip_add_msi_route(kvm_state, msg);
+        if (vector->virq < 0 ||
+            kvm_irqchip_add_irqfd_notifier(kvm_state, &vector->interrupt,
+                                           vector->virq) < 0) {
+            if (vector->virq >= 0) { /* route made but irqfd attach failed */
+                kvm_irqchip_release_virq(kvm_state, vector->virq);
+                vector->virq = -1;
+            }
+            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
+                                vfio_msi_interrupt, NULL, vector);
+        }
+    }
+
+    ret = vfio_enable_vectors(vdev, false);
+    if (ret) {
+        if (ret < 0) {
+            error_report("vfio: Error: Failed to setup MSI fds: %m\n");
+        } else if (ret != vdev->nr_vectors) {
+            error_report("vfio: Error: Failed to enable %d "
+                         "MSI vectors, retry with %d\n", vdev->nr_vectors, ret);
+        }
+
+        for (i = 0; i < vdev->nr_vectors; i++) {
+            VFIOMSIVector *vector = &vdev->msi_vectors[i];
+            if (vector->virq >= 0) {
+                kvm_irqchip_remove_irqfd_notifier(kvm_state, &vector->interrupt,
+                                                  vector->virq);
+                kvm_irqchip_release_virq(kvm_state, vector->virq);
+                vector->virq = -1;
+            } else {
+                qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
+                                    NULL, NULL, NULL);
+            }
+            event_notifier_cleanup(&vector->interrupt);
+        }
+
+        g_free(vdev->msi_vectors);
+        vdev->msi_vectors = NULL; /* no dangling pointer for later teardown */
+
+        if (ret > 0 && ret != vdev->nr_vectors) {
+            vdev->nr_vectors = ret;
+            goto retry;
+        }
+        vdev->nr_vectors = 0;
+        return;
+    }
+
+    msi_set_qsize(&vdev->pdev, vdev->nr_vectors);
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) Enabled %d MSI vectors\n", __func__,
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, vdev->nr_vectors);
+}
+
+static void vfio_disable_msi_x(VFIODevice *vdev, bool msix)
+{
+    int i;
+
+    vfio_disable_irqindex(vdev, msix ? VFIO_PCI_MSIX_IRQ_INDEX :
+                                       VFIO_PCI_MSI_IRQ_INDEX);
+
+    for (i = 0; i < vdev->nr_vectors; i++) {
+        VFIOMSIVector *vector = &vdev->msi_vectors[i];
+
+        if (!vector->use) {
+            continue;
+        }
+
+        if (vector->virq >= 0) {
+            kvm_irqchip_remove_irqfd_notifier(kvm_state,
+                                              &vector->interrupt, vector->virq);
+            kvm_irqchip_release_virq(kvm_state, vector->virq);
+            vector->virq = -1;
+        } else {
+            qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
+                                NULL, NULL, NULL);
+        }
+
+        if (msix) {
+            msix_vector_unuse(&vdev->pdev, i);
+        }
+
+        event_notifier_cleanup(&vector->interrupt);
+    }
+
+    g_free(vdev->msi_vectors);
+    vdev->msi_vectors = NULL;
+    vdev->nr_vectors = 0;
+
+    if (!msix) {
+        msi_set_qsize(&vdev->pdev, 0); /* Actually still means 1 vector */
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x, msi%s)\n", __func__,
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, msix ? "x" : "");
+
+    vfio_enable_intx(vdev);
+}
+
+/*
+ * IO Port/MMIO - Beware of the endians, VFIO is always little endian
+ */
+static void vfio_bar_write(void *opaque, target_phys_addr_t addr,
+                           uint64_t data, unsigned size)
+{
+    VFIOBAR *bar = opaque;
+    union {
+        uint8_t byte;
+        uint16_t word;
+        uint32_t dword;
+        uint64_t qword;
+    } buf;
+
+    switch (size) {
+    case 1:
+        buf.byte = data;
+        break;
+    case 2:
+        buf.word = cpu_to_le16(data);
+        break;
+    case 4:
+        buf.dword = cpu_to_le32(data);
+        break;
+    default:
+        hw_error("vfio: unsupported write size, %d bytes\n", size);
+        break;
+    }
+
+    if (pwrite(bar->fd, &buf, size, bar->fd_offset + addr) != size) {
+        error_report("%s(,0x%"TARGET_PRIxPHYS", 0x%"PRIx64", %d) failed: %m\n",
+                     __func__, addr, data, size);
+    }
+
+    DPRINTF("%s(BAR%d+0x%"TARGET_PRIxPHYS", 0x%"PRIx64", %d)\n",
+            __func__, bar->nr, addr, data, size);
+
+    /*
+     * A read or write to a BAR always signals an INTx EOI.  This will
+     * do nothing if not pending (including not in INTx mode).  We assume
+     * that a BAR access is in response to an interrupt and that BAR
+     * accesses will service the interrupt.  Unfortunately, we don't know
+     * which access will service the interrupt, so we're potentially
+     * getting quite a few host interrupts per guest interrupt.
+     */
+    vfio_eoi(DO_UPCAST(VFIODevice, bars[bar->nr], bar));
+}
+
+static uint64_t vfio_bar_read(void *opaque,
+                              target_phys_addr_t addr, unsigned size)
+{
+    VFIOBAR *bar = opaque;
+    union {
+        uint8_t byte;
+        uint16_t word;
+        uint32_t dword;
+        uint64_t qword;
+    } buf;
+    uint64_t data = 0;
+
+    if (pread(bar->fd, &buf, size, bar->fd_offset + addr) != size) {
+        error_report("%s(,0x%"TARGET_PRIxPHYS", %d) failed: %m\n",
+                     __func__, addr, size);
+        return (uint64_t)-1;
+    }
+
+    switch (size) {
+    case 1:
+        data = buf.byte;
+        break;
+    case 2:
+        data = le16_to_cpu(buf.word);
+        break;
+    case 4:
+        data = le32_to_cpu(buf.dword);
+        break;
+    default:
+        hw_error("vfio: unsupported read size, %d bytes\n", size);
+        break;
+    }
+
+    DPRINTF("%s(BAR%d+0x%"TARGET_PRIxPHYS", %d) = 0x%"PRIx64"\n",
+            __func__, bar->nr, addr, size, data);
+
+    /* Same as write above */
+    vfio_eoi(DO_UPCAST(VFIODevice, bars[bar->nr], bar));
+
+    return data;
+}
+
+static const MemoryRegionOps vfio_bar_ops = {
+    .read = vfio_bar_read,
+    .write = vfio_bar_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+/*
+ * PCI config space
+ */
+static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    uint32_t val = 0; /* zero-init: pread fills only len bytes */
+
+    /*
+     * We only need QEMU PCI config support for the ROM BAR, the MSI and MSIX
+     * capabilities, and the multifunction bit below.  We let VFIO handle
+     * virtualizing everything else.  Performance is not a concern here.
+     */
+    if (ranges_overlap(addr, len, PCI_ROM_ADDRESS, 4) ||
+        (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
+         ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) ||
+        (pdev->cap_present & QEMU_PCI_CAP_MSI &&
+         ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size))) {
+
+        val = pci_default_read_config(pdev, addr, len);
+    } else {
+        if (pread(vdev->fd, &val, len, vdev->config_offset + addr) != len) {
+            error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %m\n",
+                         __func__, vdev->host.domain, vdev->host.bus,
+                         vdev->host.slot, vdev->host.function, addr, len);
+            return -errno; /* NOTE: negated errno comes back as the data */
+        }
+        val = le32_to_cpu(val); /* VFIO data is little endian */
+    }
+
+    /* Multifunction bit is virtualized in QEMU */
+    if (unlikely(ranges_overlap(addr, len, PCI_HEADER_TYPE, 1))) {
+        uint32_t mask = PCI_HEADER_TYPE_MULTI_FUNCTION;
+
+        if (len == 4) {
+            mask <<= 16; /* assumes a dword read at 0x0c - TODO confirm */
+        }
+
+        if (pdev->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
+            val |= mask;
+        } else {
+            val &= ~mask;
+        }
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, len=0x%x) %x\n", __func__,
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, addr, len, val);
+
+    return val;
+}
+
+static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
+                                  uint32_t val, int len)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    uint32_t val_le = cpu_to_le32(val);
+
+    DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, 0x%x, len=0x%x)\n", __func__,
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, addr, val, len);
+
+    /* Write everything to VFIO, let it filter out what we can't write */
+    if (pwrite(vdev->fd, &val_le, len, vdev->config_offset + addr) != len) {
+        error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %m\n",
+                     __func__, vdev->host.domain, vdev->host.bus,
+                     vdev->host.slot, vdev->host.function, addr, val, len);
+    }
+
+    /* Write standard header bits to emulation */
+    if (addr < PCI_CONFIG_HEADER_SIZE) {
+        pci_default_write_config(pdev, addr, val, len);
+        return;
+    }
+
+    /* MSI/MSI-X Enabling/Disabling */
+    if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
+        ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
+        int is_enabled, was_enabled = msi_enabled(pdev);
+
+        pci_default_write_config(pdev, addr, val, len);
+
+        is_enabled = msi_enabled(pdev);
+
+        if (!was_enabled && is_enabled) {
+            vfio_enable_msi(vdev);
+        } else if (was_enabled && !is_enabled) {
+            vfio_disable_msi_x(vdev, false);
+        }
+    }
+
+    if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
+        ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
+        int is_enabled, was_enabled = msix_enabled(pdev);
+
+        pci_default_write_config(pdev, addr, val, len);
+
+        is_enabled = msix_enabled(pdev);
+
+        if (!was_enabled && is_enabled) {
+            /* vfio_msix_vector_use handles this automatically */
+        } else if (was_enabled && !is_enabled) {
+            vfio_disable_msi_x(vdev, true);
+        }
+    }
+}
+
+/*
+ * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
+ */
+/* Map size bytes at vaddr to IOVA iova for device DMA; 0 or -errno. */
+static int vfio_dma_map(VFIOContainer *container, target_phys_addr_t iova,
+                        ram_addr_t size, void *vaddr, bool readonly)
+{
+    struct vfio_iommu_type1_dma_map map = {
+        .argsz = sizeof(map),
+        .flags = VFIO_DMA_MAP_FLAG_READ,
+        .vaddr = (__u64)(uintptr_t)vaddr, /* safe when pointers are 32-bit */
+        .iova = iova,
+        .size = size,
+    };
+    int ret = 0;
+
+    if (!readonly) {
+        map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
+    }
+
+    if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map)) {
+        ret = -errno; /* read errno before DPRINTF can disturb it */
+        DPRINTF("VFIO_MAP_DMA: %d\n", ret);
+    }
+    return ret;
+}
+
+static int vfio_dma_unmap(VFIOContainer *container,
+                          target_phys_addr_t iova, ram_addr_t size)
+{
+    struct vfio_iommu_type1_dma_unmap unmap = {
+        .argsz = sizeof(unmap),
+        .flags = 0,
+        .iova = iova,
+        .size = size,
+    };
+
+    if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
+        DPRINTF("VFIO_UNMAP_DMA: %d\n", -errno);
+        return -errno;
+    }
+
+    return 0;
+}
+
+static void vfio_listener_dummy1(MemoryListener *listener)
+{
+    /* We don't do batching (begin/commit) or care about logging */
+}
+
+static void vfio_listener_dummy2(MemoryListener *listener,
+                                 MemoryRegionSection *section)
+{
+    /* We don't do logging or care about nops */
+}
+
+static void vfio_listener_dummy3(MemoryListener *listener,
+                                 MemoryRegionSection *section,
+                                 bool match_data, uint64_t data,
+                                 EventNotifier *e)
+{
+    /* We don't care about eventfds */
+}
+
+static bool vfio_listener_skipped_section(MemoryRegionSection *section)
+{
+    return !memory_region_is_ram(section->mr);
+}
+
+static void vfio_listener_region_add(MemoryListener *listener,
+                                     MemoryRegionSection *section)
+{
+    VFIOContainer *container = container_of(listener, VFIOContainer,
+                                            iommu_data.listener);
+    target_phys_addr_t iova, end;
+    void *vaddr;
+    int ret;
+
+    if (vfio_listener_skipped_section(section)) {
+        DPRINTF("vfio: SKIPPING region_add %"TARGET_PRIxPHYS" - %"PRIx64"\n",
+                section->offset_within_address_space,
+                section->offset_within_address_space + section->size - 1);
+        return;
+    }
+
+    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
+                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
+        error_report("%s received unaligned region\n", __func__);
+        return;
+    }
+
+    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
+    end = (section->offset_within_address_space + section->size) &
+          TARGET_PAGE_MASK;
+
+    if (iova >= end) {
+        return;
+    }
+
+    vaddr = memory_region_get_ram_ptr(section->mr) +
+            section->offset_within_region +
+            (iova - section->offset_within_address_space);
+
+    DPRINTF("vfio: region_add %"TARGET_PRIxPHYS" - %"TARGET_PRIxPHYS" [%p]\n",
+            iova, end - 1, vaddr);
+
+    ret = vfio_dma_map(container, iova, end - iova, vaddr, section->readonly);
+    if (ret) {
+        error_report("vfio_dma_map(%p, 0x%"TARGET_PRIxPHYS", "
+                     "0x%"TARGET_PRIxPHYS", %p) = %d (%m)\n",
+                     container, iova, end - iova, vaddr, ret);
+    }
+}
+
+static void vfio_listener_region_del(MemoryListener *listener,
+                                     MemoryRegionSection *section)
+{
+    VFIOContainer *container = container_of(listener, VFIOContainer,
+                                            iommu_data.listener);
+    target_phys_addr_t iova, end;
+    int ret;
+
+    if (vfio_listener_skipped_section(section)) {
+        DPRINTF("vfio: SKIPPING region_del %"TARGET_PRIxPHYS" - %"PRIx64"\n",
+                section->offset_within_address_space,
+                section->offset_within_address_space + section->size - 1);
+        return;
+    }
+
+    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
+                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
+        error_report("%s received unaligned region\n", __func__);
+        return;
+    }
+
+    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
+    end = (section->offset_within_address_space + section->size) &
+          TARGET_PAGE_MASK;
+
+    if (iova >= end) {
+        return;
+    }
+
+    DPRINTF("vfio: region_del %"TARGET_PRIxPHYS" - %"TARGET_PRIxPHYS"\n",
+            iova, end - 1);
+
+    ret = vfio_dma_unmap(container, iova, end - iova);
+    if (ret) {
+        error_report("vfio_dma_unmap(%p, 0x%"TARGET_PRIxPHYS", "
+                     "0x%"TARGET_PRIxPHYS") = %d (%m)\n",
+                     container, iova, end - iova, ret);
+    }
+}
+
+static MemoryListener vfio_memory_listener = {
+    .begin = vfio_listener_dummy1,
+    .commit = vfio_listener_dummy1,
+    .region_add = vfio_listener_region_add,
+    .region_del = vfio_listener_region_del,
+    .region_nop = vfio_listener_dummy2,
+    .log_start = vfio_listener_dummy2,
+    .log_stop = vfio_listener_dummy2,
+    .log_sync = vfio_listener_dummy2,
+    .log_global_start = vfio_listener_dummy1,
+    .log_global_stop = vfio_listener_dummy1,
+    .eventfd_add = vfio_listener_dummy3,
+    .eventfd_del = vfio_listener_dummy3,
+};
+
+static void vfio_listener_release(VFIOContainer *container)
+{
+    memory_listener_unregister(&container->iommu_data.listener);
+}
+
+/*
+ * Interrupt setup
+ */
+static void vfio_disable_interrupts(VFIODevice *vdev)
+{
+    switch (vdev->interrupt) {
+    case VFIO_INT_INTx:
+        vfio_disable_intx(vdev);
+        break;
+    case VFIO_INT_MSI:
+        vfio_disable_msi_x(vdev, false);
+        break;
+    case VFIO_INT_MSIX:
+        vfio_disable_msi_x(vdev, true);
+        break;
+    }
+}
+
+static int vfio_setup_msi(VFIODevice *vdev, int pos)
+{
+    uint16_t ctrl;
+    bool msi_64bit, msi_maskbit;
+    int ret, entries;
+
+    /*
+     * TODO: don't peek into msi_supported, let msi_init fail and
+     * check for ENOTSUP
+     */
+    if (!msi_supported) {
+        return 0;
+    }
+
+    if (pread(vdev->fd, &ctrl, sizeof(ctrl),
+              vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
+        return -errno;
+    }
+    ctrl = le16_to_cpu(ctrl);
+
+    msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
+    msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
+    entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
+
+    DPRINTF("%04x:%02x:%02x.%x PCI MSI CAP @0x%x\n", vdev->host.domain,
+            vdev->host.bus, vdev->host.slot, vdev->host.function, pos);
+
+    ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit);
+    if (ret < 0) {
+        error_report("vfio: msi_init failed\n");
+        return ret;
+    }
+    vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
+
+    return 0;
+}
+
+/*
+ * We don't have any control over how pci_add_capability() inserts
+ * capabilities into the chain.  In order to setup MSI-X we need a
+ * MemoryRegion for the BAR.  In order to setup the BAR and not
+ * attempt to mmap the MSI-X table area, which VFIO won't allow, we
+ * need to first look for where the MSI-X table lives.  So we
+ * unfortunately split MSI-X setup across two functions.
+ */
+static int vfio_early_setup_msix(VFIODevice *vdev)
+{
+    uint8_t pos;
+    uint16_t ctrl;
+    uint32_t table, pba;
+
+    pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
+    if (!pos) {
+        return 0;
+    }
+
+    if (pread(vdev->fd, &ctrl, sizeof(ctrl),
+              vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
+        return -errno;
+    }
+
+    if (pread(vdev->fd, &table, sizeof(table),
+              vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
+        return -errno;
+    }
+
+    if (pread(vdev->fd, &pba, sizeof(pba),
+              vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
+        return -errno;
+    }
+
+    ctrl = le16_to_cpu(ctrl);
+    table = le32_to_cpu(table);
+    pba = le32_to_cpu(pba);
+
+    vdev->msix = g_malloc0(sizeof(*(vdev->msix)));
+    vdev->msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
+    vdev->msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
+    vdev->msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
+    vdev->msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
+    vdev->msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
+
+    DPRINTF("%04x:%02x:%02x.%x "
+            "PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d\n",
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, pos, vdev->msix->table_bar,
+            vdev->msix->table_offset, vdev->msix->entries);
+
+    return 0;
+}
+
+static int vfio_setup_msix(VFIODevice *vdev, int pos)
+{
+    int ret;
+
+    /*
+     * TODO: don't peek into msi_supported, let msix_init fail and
+     * check for ENOTSUP
+     */
+    if (!msi_supported) {
+        return 0;
+    }
+
+    ret = msix_init(&vdev->pdev, vdev->msix->entries,
+                    &vdev->bars[vdev->msix->table_bar].mem,
+                    vdev->msix->table_bar, vdev->msix->table_offset,
+                    &vdev->bars[vdev->msix->pba_bar].mem,
+                    vdev->msix->pba_bar, vdev->msix->pba_offset, pos);
+    if (ret < 0) {
+        error_report("vfio: msix_init failed\n");
+        return ret;
+    }
+
+    ret = msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
+                                    vfio_msix_vector_release);
+    if (ret) {
+        error_report("vfio: msix_set_vector_notifiers failed %d\n", ret);
+        msix_uninit(&vdev->pdev, &vdev->bars[vdev->msix->table_bar].mem,
+                    &vdev->bars[vdev->msix->pba_bar].mem);
+        return ret;
+    }
+
+    return 0;
+}
+
+static void vfio_teardown_msi(VFIODevice *vdev)
+{
+    msi_uninit(&vdev->pdev);
+
+    if (vdev->msix) {
+        /* FIXME: Why can't unset just silently do nothing?? */
+        if (vdev->pdev.msix_vector_use_notifier &&
+            vdev->pdev.msix_vector_release_notifier) {
+            msix_unset_vector_notifiers(&vdev->pdev);
+        }
+
+        msix_uninit(&vdev->pdev, &vdev->bars[vdev->msix->table_bar].mem,
+                    &vdev->bars[vdev->msix->pba_bar].mem);
+    }
+}
+
+/*
+ * Resource setup
+ */
+static void vfio_mmap_set_enabled(VFIODevice *vdev, bool enabled)
+{
+    int i;
+
+    for (i = 0; i < PCI_ROM_SLOT; i++) {
+        VFIOBAR *bar = &vdev->bars[i];
+
+        if (!bar->size) {
+            continue;
+        }
+
+        memory_region_set_enabled(&bar->mmap_mem, enabled);
+        if (vdev->msix && vdev->msix->table_bar == i) {
+            memory_region_set_enabled(&vdev->msix->mmap_mem, enabled);
+        }
+    }
+}
+
+static void vfio_unmap_bar(VFIODevice *vdev, int nr)
+{
+    VFIOBAR *bar = &vdev->bars[nr];
+
+    if (!bar->size) {
+        return;
+    }
+
+    memory_region_del_subregion(&bar->mem, &bar->mmap_mem);
+    munmap(bar->mmap, memory_region_size(&bar->mmap_mem));
+
+    if (vdev->msix && vdev->msix->table_bar == nr) {
+        memory_region_del_subregion(&bar->mem, &vdev->msix->mmap_mem);
+        munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem));
+    }
+
+    memory_region_destroy(&bar->mem);
+}
+
+static int vfio_mmap_bar(VFIOBAR *bar, MemoryRegion *mem, MemoryRegion *submem,
+                         void **map, size_t size, off_t offset,
+                         const char *name)
+{
+    int ret = 0; /* stays 0 unless mmap() itself fails */
+
+    if (size && bar->flags & VFIO_REGION_INFO_FLAG_MMAP) {
+        int prot = 0; /* built from the region's read/write flags */
+
+        if (bar->flags & VFIO_REGION_INFO_FLAG_READ) {
+            prot |= PROT_READ;
+        }
+
+        if (bar->flags & VFIO_REGION_INFO_FLAG_WRITE) {
+            prot |= PROT_WRITE;
+        }
+
+        *map = mmap(NULL, size, prot, MAP_SHARED,
+                    bar->fd, bar->fd_offset + offset);
+        if (*map == MAP_FAILED) {
+            *map = NULL; /* callers see NULL rather than MAP_FAILED */
+            ret = -errno;
+            goto empty_region; /* still register an empty subregion */
+        }
+
+        memory_region_init_ram_ptr(submem, name, size, *map);
+    } else {
+empty_region:
+        /* Zero sized sub-region: @submem is always valid for cleanup. */
+        memory_region_init(submem, name, 0);
+    }
+
+    memory_region_add_subregion(mem, offset, submem); /* done on every path */
+
+    return ret; /* 0, or -errno from the failed mmap() */
+}
+
+static void vfio_map_bar(VFIODevice *vdev, int nr)
+{
+    VFIOBAR *bar = &vdev->bars[nr];
+    unsigned size = bar->size;
+    char name[64];
+    uint32_t pci_bar;
+    uint8_t type;
+    int ret;
+
+    /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
+    if (!size) {
+        return;
+    }
+
+    snprintf(name, sizeof(name), "VFIO %04x:%02x:%02x.%x BAR %d",
+             vdev->host.domain, vdev->host.bus, vdev->host.slot,
+             vdev->host.function, nr);
+
+    /* Determine what type of BAR this is for registration */
+    ret = pread(vdev->fd, &pci_bar, sizeof(pci_bar),
+                vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
+    if (ret != sizeof(pci_bar)) {
+        error_report("vfio: Failed to read BAR %d (%m)\n", nr);
+        return;
+    }
+
+    pci_bar = le32_to_cpu(pci_bar);
+    type = pci_bar & (pci_bar & PCI_BASE_ADDRESS_SPACE_IO ?
+           ~PCI_BASE_ADDRESS_IO_MASK : ~PCI_BASE_ADDRESS_MEM_MASK);
+
+    /* A "slow" read/write mapping underlies all BARs */
+    memory_region_init_io(&bar->mem, &vfio_bar_ops, bar, name, size);
+    pci_register_bar(&vdev->pdev, nr, type, &bar->mem);
+
+    /*
+     * We can't mmap areas overlapping the MSIX vector table, so we
+     * potentially insert a direct-mapped subregion before and after it.
+     */
+    if (vdev->msix && vdev->msix->table_bar == nr) {
+        size = vdev->msix->table_offset & TARGET_PAGE_MASK;
+    }
+
+    strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
+    if (vfio_mmap_bar(bar, &bar->mem,
+                      &bar->mmap_mem, &bar->mmap, size, 0, name)) {
+        error_report("%s unsupported. Performance may be slow\n", name);
+    }
+
+    if (vdev->msix && vdev->msix->table_bar == nr) {
+        unsigned start;
+
+        start = TARGET_PAGE_ALIGN(vdev->msix->table_offset +
+                                  (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));
+
+        size = start < bar->size ? bar->size - start : 0;
+        strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1);
+        /* VFIOMSIXInfo contains another MemoryRegion for this mapping */
+        if (vfio_mmap_bar(bar, &bar->mem, &vdev->msix->mmap_mem,
+                          &vdev->msix->mmap, size, start, name)) {
+            error_report("%s unsupported. Performance may be slow\n", name);
+        }
+    }
+}
+
+static void vfio_map_bars(VFIODevice *vdev)
+{
+    int i;
+
+    for (i = 0; i < PCI_ROM_SLOT; i++) {
+        vfio_map_bar(vdev, i);
+    }
+}
+
+static void vfio_unmap_bars(VFIODevice *vdev)
+{
+    int i;
+
+    for (i = 0; i < PCI_ROM_SLOT; i++) {
+        vfio_unmap_bar(vdev, i);
+    }
+}
+
+/*
+ * General setup
+ */
+static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
+{
+    uint8_t tmp, next = 0xff; /* 0xff when nothing is found above pos */
+
+    for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
+         tmp = pdev->config[tmp + 1]) { /* byte after the ID links onward */
+        if (tmp > pos && tmp < next) {
+            next = tmp; /* lowest capability offset above pos so far */
+        }
+    }
+
+    return next - pos; /* space between pos and that next offset */
+}
+
+static int vfio_add_std_cap(VFIODevice *vdev, uint8_t pos)
+{
+    PCIDevice *pdev = &vdev->pdev;
+    uint8_t cap_id, next, size;
+    int ret;
+
+    cap_id = pdev->config[pos]; /* read before the chain is rewritten */
+    next = pdev->config[pos + 1];
+
+    /*
+     * If it becomes important to configure capabilities to their actual
+     * size, use this as the default when it's something we don't recognize.
+     * Since QEMU doesn't actually handle many of the config accesses,
+     * exact size doesn't seem worthwhile.
+     */
+    size = vfio_std_cap_max_size(pdev, pos);
+
+    /*
+     * pci_add_capability always inserts the new capability at the head
+     * of the chain.  Therefore to end up with a chain that matches the
+     * physical device, we insert from the end by making this recursive.
+     * This is also why we pre-calculate size above as cached config space
+     * will be changed as we unwind the stack.
+     */
+    if (next) {
+        ret = vfio_add_std_cap(vdev, next); /* add the tail of the chain first */
+        if (ret) {
+            return ret;
+        }
+    } else {
+        pdev->config[PCI_CAPABILITY_LIST] = 0; /* Begin the rebuild */
+    }
+
+    switch (cap_id) {
+    case PCI_CAP_ID_MSI:
+        ret = vfio_setup_msi(vdev, pos);
+        break;
+    case PCI_CAP_ID_MSIX:
+        ret = vfio_setup_msix(vdev, pos);
+        break;
+    default:
+        ret = pci_add_capability(pdev, cap_id, pos, size);
+        break;
+    }
+
+    if (ret < 0) { /* only negative values are treated as failure here */
+        error_report("vfio: %04x:%02x:%02x.%x Error adding PCI capability "
+                     "0x%x[0x%x]@0x%x: %d\n", vdev->host.domain,
+                     vdev->host.bus, vdev->host.slot, vdev->host.function,
+                     cap_id, size, pos, ret);
+        return ret;
+    }
+
+    return 0;
+}
+
+static int vfio_add_capabilities(VFIODevice *vdev)
+{
+    PCIDevice *pdev = &vdev->pdev;
+
+    if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
+        !pdev->config[PCI_CAPABILITY_LIST]) {
+        return 0; /* Nothing to add */
+    }
+
+    return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
+}
+
+static int vfio_load_rom(VFIODevice *vdev)
+{
+    uint64_t size = vdev->rom_size;
+    char name[32];
+    off_t off = 0, voff = vdev->rom_offset;
+    ssize_t bytes;
+    void *ptr;
+
+    /* If loading ROM from file, pci handles it */
+    if (vdev->pdev.romfile || !vdev->pdev.rom_bar || !size) {
+        return 0;
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
+            vdev->host.bus, vdev->host.slot, vdev->host.function);
+
+    snprintf(name, sizeof(name), "vfio[%04x:%02x:%02x.%x].rom",
+             vdev->host.domain, vdev->host.bus, vdev->host.slot,
+             vdev->host.function);
+    memory_region_init_ram(&vdev->pdev.rom, name, size);
+    ptr = memory_region_get_ram_ptr(&vdev->pdev.rom);
+    memset(ptr, 0xff, size);
+
+    while (size) {
+        bytes = pread(vdev->fd, ptr + off, size, voff + off);
+        if (bytes == 0) {
+            break; /* expect that we could get back less than the ROM BAR */
+        } else if (bytes > 0) {
+            off += bytes;
+            size -= bytes;
+        } else {
+            if (errno == EINTR || errno == EAGAIN) {
+                continue;
+            }
+            error_report("vfio: Error reading device ROM: %m\n");
+            memory_region_destroy(&vdev->pdev.rom);
+            return -errno;
+        }
+    }
+
+    pci_register_bar(&vdev->pdev, PCI_ROM_SLOT, 0, &vdev->pdev.rom);
+    vdev->pdev.has_rom = true;
+    return 0;
+}
+
+static int vfio_connect_container(VFIOGroup *group)
+{
+    VFIOContainer *container;
+    int ret, fd;
+
+    if (group->container) {
+        return 0;
+    }
+
+    QLIST_FOREACH(container, &container_list, next) {
+        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
+            group->container = container;
+            QLIST_INSERT_HEAD(&container->group_list, group, container_next);
+            return 0;
+        }
+    }
+
+    fd = qemu_open("/dev/vfio/vfio", O_RDWR);
+    if (fd < 0) {
+        error_report("vfio: failed to open /dev/vfio/vfio: %m\n");
+        return -errno;
+    }
+
+    ret = ioctl(fd, VFIO_GET_API_VERSION);
+    if (ret != VFIO_API_VERSION) {
+        error_report("vfio: supported vfio version: %d, "
+                     "reported version: %d\n", VFIO_API_VERSION, ret);
+        close(fd);
+        return -EINVAL;
+    }
+
+    container = g_malloc0(sizeof(*container));
+    container->fd = fd;
+
+    if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
+        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
+        if (ret) {
+            error_report("vfio: failed to set group container: %m\n");
+            g_free(container);
+            close(fd);
+            return -errno;
+        }
+
+        ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
+        if (ret) {
+            error_report("vfio: failed to set iommu for container: %m\n");
+            g_free(container);
+            close(fd);
+            return -errno;
+        }
+
+        container->iommu_data.listener = vfio_memory_listener;
+        container->iommu_data.release = vfio_listener_release;
+
+        memory_listener_register(&container->iommu_data.listener,
+                                 get_system_memory());
+    } else {
+        error_report("vfio: No available IOMMU models\n");
+        g_free(container);
+        close(fd);
+        return -EINVAL;
+    }
+
+    QLIST_INIT(&container->group_list);
+    QLIST_INSERT_HEAD(&container_list, container, next);
+
+    group->container = container;
+    QLIST_INSERT_HEAD(&container->group_list, group, container_next);
+
+    return 0;
+}
+
+static void vfio_disconnect_container(VFIOGroup *group)
+{
+    VFIOContainer *container = group->container;
+
+    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
+        error_report("vfio: error disconnecting group %d from container\n",
+                     group->groupid);
+    }
+
+    QLIST_REMOVE(group, container_next);
+    group->container = NULL;
+
+    if (QLIST_EMPTY(&container->group_list)) {
+        if (container->iommu_data.release) {
+            container->iommu_data.release(container);
+        }
+        QLIST_REMOVE(container, next);
+        DPRINTF("vfio_disconnect_container: close container->fd\n");
+        close(container->fd);
+        g_free(container);
+    }
+}
+
+static VFIOGroup *vfio_get_group(int groupid)
+{
+    VFIOGroup *group;
+    char path[32];
+    struct vfio_group_status status = { .argsz = sizeof(status) };
+
+    QLIST_FOREACH(group, &group_list, next) {
+        if (group->groupid == groupid) {
+            return group;
+        }
+    }
+
+    group = g_malloc0(sizeof(*group));
+
+    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
+    group->fd = qemu_open(path, O_RDWR);
+    if (group->fd < 0) {
+        error_report("vfio: error opening %s: %m\n", path);
+        g_free(group);
+        return NULL;
+    }
+
+    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
+        error_report("vfio: error getting group status: %m\n");
+        close(group->fd);
+        g_free(group);
+        return NULL;
+    }
+
+    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+        error_report("vfio: error, group %d is not viable, please ensure "
+                     "all devices within the iommu_group are bound to their "
+                     "vfio bus driver.\n", groupid);
+        close(group->fd);
+        g_free(group);
+        return NULL;
+    }
+
+    group->groupid = groupid;
+    QLIST_INIT(&group->device_list);
+
+    if (vfio_connect_container(group)) {
+        error_report("vfio: failed to setup container for group %d\n", groupid);
+        close(group->fd);
+        g_free(group);
+        return NULL;
+    }
+
+    QLIST_INSERT_HEAD(&group_list, group, next);
+
+    return group;
+}
+
+static void vfio_put_group(VFIOGroup *group)
+{
+    if (!QLIST_EMPTY(&group->device_list)) {
+        return; /* devices still attached: keep the group alive */
+    }
+
+    vfio_disconnect_container(group);
+    QLIST_REMOVE(group, next); /* unlink from group_list */
+    DPRINTF("vfio_put_group: close group->fd\n");
+    close(group->fd);
+    g_free(group); /* group must not be used after this call */
+}
+
+static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
+{
+    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
+    struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
+    int ret, i;
+    /* Open @name via @group and cache its regions; returns 0 or an error. */
+    ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
+    if (ret < 0) {
+        error_report("vfio: error getting device %s from group %d: %m\n",
+                     name, group->groupid);
+        error_report("Verify all devices in group %d are bound to vfio-pci "
+                     "or pci-stub and not already in use\n", group->groupid);
+        return ret;
+    }
+
+    vdev->fd = ret; /* on success the ioctl result is the device fd */
+    vdev->group = group;
+    QLIST_INSERT_HEAD(&group->device_list, vdev, next);
+
+    /* Sanity check device */
+    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &dev_info);
+    if (ret) {
+        error_report("vfio: error getting device info: %m\n");
+        goto error;
+    }
+
+    DPRINTF("Device %s flags: %u, regions: %u, irqs: %u\n", name,
+            dev_info.flags, dev_info.num_regions, dev_info.num_irqs);
+    ret = -EINVAL; /* sanity-check failures below must not return 0 */
+    if (!(dev_info.flags & VFIO_DEVICE_FLAGS_PCI)) {
+        error_report("vfio: Um, this isn't a PCI device\n");
+        goto error;
+    }
+
+    vdev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
+    if (!vdev->reset_works) {
+        error_report("Warning, device %s does not support reset\n", name);
+    }
+
+    if (dev_info.num_regions != VFIO_PCI_NUM_REGIONS) {
+        error_report("vfio: unexpected number of io regions %u\n",
+                     dev_info.num_regions);
+        goto error;
+    }
+
+    if (dev_info.num_irqs != VFIO_PCI_NUM_IRQS) {
+        error_report("vfio: unexpected number of irqs %u\n", dev_info.num_irqs);
+        goto error;
+    }
+
+    for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
+        reg_info.index = i;
+
+        ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
+        if (ret) {
+            error_report("vfio: Error getting region %d info: %m\n", i);
+            goto error;
+        }
+
+        DPRINTF("Device %s region %d:\n", name, i);
+        DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
+                (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
+                (unsigned long)reg_info.flags);
+
+        vdev->bars[i].flags = reg_info.flags;
+        vdev->bars[i].size = reg_info.size;
+        vdev->bars[i].fd_offset = reg_info.offset;
+        vdev->bars[i].fd = vdev->fd;
+        vdev->bars[i].nr = i;
+    }
+
+    reg_info.index = VFIO_PCI_ROM_REGION_INDEX;
+
+    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
+    if (ret) {
+        error_report("vfio: Error getting ROM info: %m\n");
+        goto error;
+    }
+
+    DPRINTF("Device %s ROM:\n", name);
+    DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
+            (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
+            (unsigned long)reg_info.flags);
+
+    vdev->rom_size = reg_info.size;
+    vdev->rom_offset = reg_info.offset;
+
+    reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX;
+
+    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
+    if (ret) {
+        error_report("vfio: Error getting config info: %m\n");
+        goto error;
+    }
+
+    DPRINTF("Device %s config:\n", name);
+    DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
+            (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
+            (unsigned long)reg_info.flags);
+
+    vdev->config_size = reg_info.size;
+    vdev->config_offset = reg_info.offset;
+
+error: /* also reached on success; ret decides whether to unwind */
+    if (ret) {
+        QLIST_REMOVE(vdev, next);
+        vdev->group = NULL;
+        close(vdev->fd);
+    }
+    return ret;
+}
+
+static void vfio_put_device(VFIODevice *vdev)
+{
+    QLIST_REMOVE(vdev, next);
+    vdev->group = NULL;
+    DPRINTF("vfio_put_device: close vdev->fd\n");
+    close(vdev->fd);
+    if (vdev->msix) {
+        g_free(vdev->msix);
+        vdev->msix = NULL;
+    }
+}
+
+static int vfio_initfn(PCIDevice *pdev)
+{
+    VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    VFIOGroup *group;
+    char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
+    ssize_t len;
+    struct stat st;
+    int groupid;
+    int ret;
+    /* PCI init: bind vdev to its host device through VFIO; 0 on success. */
+    /* Check that the host device exists */
+    snprintf(path, sizeof(path),
+             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
+             vdev->host.domain, vdev->host.bus, vdev->host.slot,
+             vdev->host.function);
+    if (stat(path, &st) < 0) {
+        error_report("vfio: error: no such host device: %s\n", path);
+        return -errno;
+    }
+
+    strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
+    /* readlink() does not terminate; keep one byte free for the NUL below. */
+    len = readlink(path, iommu_group_path, sizeof(iommu_group_path) - 1);
+    if (len <= 0) {
+        error_report("vfio: error no iommu_group for device\n");
+        return -errno;
+    }
+
+    iommu_group_path[len] = 0;
+    group_name = basename(iommu_group_path); /* last path component */
+
+    if (sscanf(group_name, "%d", &groupid) != 1) {
+        error_report("vfio: error reading %s: %m\n", path);
+        return -EINVAL; /* a failed match need not set errno */
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain,
+            vdev->host.bus, vdev->host.slot, vdev->host.function, groupid);
+
+    group = vfio_get_group(groupid);
+    if (!group) {
+        error_report("vfio: failed to get group %d\n", groupid);
+        return -ENOENT;
+    }
+
+    snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x",
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function);
+
+    QLIST_FOREACH(pvdev, &group->device_list, next) {
+        if (pvdev->host.domain == vdev->host.domain &&
+            pvdev->host.bus == vdev->host.bus &&
+            pvdev->host.slot == vdev->host.slot &&
+            pvdev->host.function == vdev->host.function) {
+
+            error_report("vfio: error: device %s is already attached\n", path);
+            vfio_put_group(group);
+            return -EBUSY;
+        }
+    }
+
+    ret = vfio_get_device(group, path, vdev);
+    if (ret) {
+        error_report("vfio: failed to get device %s\n", path);
+        vfio_put_group(group);
+        return ret;
+    }
+
+    /* Get a copy of config space */
+    ret = pread(vdev->fd, vdev->pdev.config,
+                MIN(pci_config_size(&vdev->pdev), vdev->config_size),
+                vdev->config_offset);
+    if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
+        ret = ret < 0 ? -errno : -EFAULT; /* short read is an error too */
+        error_report("vfio: Failed to read device config space\n");
+        goto out_put;
+    }
+
+    /*
+     * Clear host resource mapping info.  If we choose not to register a
+     * BAR, such as might be the case with the option ROM, we can get
+     * confusing, unwritable, residual addresses from the host here.
+     */
+    memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
+    memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
+
+    vfio_load_rom(vdev); /* result ignored: init continues regardless */
+
+    ret = vfio_early_setup_msix(vdev);
+    if (ret) {
+        goto out_put;
+    }
+
+    vfio_map_bars(vdev);
+
+    ret = vfio_add_capabilities(vdev);
+    if (ret) {
+        goto out_teardown;
+    }
+
+    if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
+        if (vdev->intx.intx && strcmp(vdev->intx.intx, "off")) {
+            error_report("vfio: Unknown option x-intx=%s, "
+                         "valid options: \"off\".\n", vdev->intx.intx);
+            ret = -EINVAL;
+            goto out_teardown;
+        }
+
+        if (vdev->intx.intx && !strcmp(vdev->intx.intx, "off")) {
+            vdev->intx.disabled = true;
+        }
+
+        ret = vfio_enable_intx(vdev);
+        if (ret) {
+            goto out_teardown;
+        }
+    }
+
+    return 0;
+
+out_teardown: /* BARs are mapped by now; undo them before releasing */
+    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
+    vfio_teardown_msi(vdev);
+    vfio_unmap_bars(vdev);
+out_put:
+    vfio_put_device(vdev);
+    vfio_put_group(group);
+    return ret;
+}
+
+static void vfio_exitfn(PCIDevice *pdev)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    VFIOGroup *group = vdev->group;
+
+    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
+    vfio_disable_interrupts(vdev);
+    vfio_teardown_msi(vdev);
+    vfio_unmap_bars(vdev);
+    vfio_put_device(vdev);
+    vfio_put_group(group);
+}
+
+static void vfio_pci_reset(DeviceState *dev)
+{
+    PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+
+    if (!vdev->reset_works) {
+        return;
+    }
+
+    if (ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
+        error_report("vfio: Error unable to reset physical device "
+                     "(%04x:%02x:%02x.%x): %m\n", vdev->host.domain,
+                     vdev->host.bus, vdev->host.slot, vdev->host.function);
+    }
+}
+
+static Property vfio_pci_dev_properties[] = {
+    DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIODevice, host),
+    DEFINE_PROP_STRING("x-intx", VFIODevice, intx.intx),
+    /*
+     * TODO - support passed fds... is this necessary?
+     * DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name),
+     * DEFINE_PROP_STRING("vfiogroupfd", VFIODevice, vfiogroupfd_name),
+     */
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+
+static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
+
+    dc->reset = vfio_pci_reset;
+    dc->props = vfio_pci_dev_properties;
+    pdc->init = vfio_initfn;
+    pdc->exit = vfio_exitfn;
+    pdc->config_read = vfio_pci_read_config;
+    pdc->config_write = vfio_pci_write_config;
+}
+
+static const TypeInfo vfio_pci_dev_info = {
+    .name = "vfio-pci",
+    .parent = TYPE_PCI_DEVICE,
+    .instance_size = sizeof(VFIODevice),
+    .class_init = vfio_pci_dev_class_init,
+};
+
+static void register_vfio_pci_dev_type(void)
+{
+    type_register_static(&vfio_pci_dev_info);
+}
+
+type_init(register_vfio_pci_dev_type)
diff --git a/hw/vfio_pci_int.h b/hw/vfio_pci_int.h
new file mode 100644
index 000000000..3812d8d7f
--- /dev/null
+++ b/hw/vfio_pci_int.h
@@ -0,0 +1,114 @@
+/*
+ * vfio based device assignment support
+ *
+ * Copyright Red Hat, Inc. 2012
+ *
+ * Authors:
+ *  Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#ifndef HW_VFIO_PCI_INT_H
+#define HW_VFIO_PCI_INT_H
+
+#include "qemu-common.h"
+#include "qemu-queue.h"
+#include "pci.h"
+#include "event_notifier.h"
+
+typedef struct VFIOBAR {
+    off_t fd_offset; /* offset of BAR within device fd */
+    int fd; /* device fd, allows us to pass VFIOBAR as opaque data */
+    MemoryRegion mem; /* slow, read/write access */
+    MemoryRegion mmap_mem; /* direct mapped access */
+    void *mmap;
+    size_t size;
+    uint32_t flags; /* VFIO region flags (rd/wr/mmap) */
+    uint8_t nr; /* cache the BAR number for debug */
+} VFIOBAR;
+
+typedef struct VFIOINTx {
+    bool pending; /* interrupt pending */
+    bool kvm_accel; /* set when QEMU bypass through KVM enabled */
+    uint8_t pin; /* which pin to pull for qemu_set_irq */
+    EventNotifier interrupt; /* eventfd triggered on interrupt */
+    EventNotifier unmask; /* eventfd for unmask on QEMU bypass */
+    PCIINTxRoute route; /* routing info for QEMU bypass */
+    bool disabled; /* set at init when x-intx is "off" */
+    char *intx; /* "x-intx" property string; NULL-checked before use */
+} VFIOINTx;
+
+struct VFIODevice;
+
+typedef struct VFIOMSIVector {
+    EventNotifier interrupt; /* eventfd triggered on interrupt */
+    struct VFIODevice *vdev; /* back pointer to device */
+    int virq; /* KVM irqchip route for QEMU bypass */
+    bool use;
+} VFIOMSIVector;
+
+enum {
+    VFIO_INT_NONE = 0,
+    VFIO_INT_INTx = 1,
+    VFIO_INT_MSI  = 2,
+    VFIO_INT_MSIX = 3,
+};
+
+struct VFIOGroup;
+
+typedef struct VFIOContainer {
+    int fd; /* /dev/vfio/vfio, empowered by the attached groups */
+    struct {
+        /* enable abstraction to support various iommu backends */
+        union {
+            MemoryListener listener; /* Used by type1 iommu */
+        };
+        void (*release)(struct VFIOContainer *);
+    } iommu_data;
+    QLIST_HEAD(, VFIOGroup) group_list;
+    QLIST_ENTRY(VFIOContainer) next;
+} VFIOContainer;
+
+/* Cache of MSI-X setup plus extra mmap and memory region for split BAR map */
+typedef struct VFIOMSIXInfo {
+    uint8_t table_bar; /* BAR index taken from the table register */
+    uint8_t pba_bar; /* BAR index taken from the PBA register */
+    uint16_t entries; /* size field of the control word, plus one */
+    uint32_t table_offset; /* table register with the BAR index masked off */
+    uint32_t pba_offset; /* PBA register with the BAR index masked off */
+    MemoryRegion mmap_mem; /* subregion for the BAR area after the table */
+    void *mmap; /* mapping backing mmap_mem, NULL if not mapped */
+} VFIOMSIXInfo;
+
+typedef struct VFIODevice {
+    PCIDevice pdev;
+    int fd; /* device fd obtained with VFIO_GROUP_GET_DEVICE_FD */
+    VFIOINTx intx;
+    unsigned int config_size; /* size of the config space region */
+    off_t config_offset; /* Offset of config space region within device fd */
+    unsigned int rom_size; /* size of the ROM region */
+    off_t rom_offset; /* Offset of ROM region within device fd */
+    int msi_cap_size; /* computed in vfio_setup_msi from the MSI flags */
+    VFIOMSIVector *msi_vectors;
+    VFIOMSIXInfo *msix; /* allocated by vfio_early_setup_msix, else NULL */
+    int nr_vectors; /* Number of MSI/MSIX vectors currently in use */
+    int interrupt; /* Current interrupt type */
+    VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
+    PCIHostDeviceAddress host; /* host PCI address from the "host" property */
+    QLIST_ENTRY(VFIODevice) next; /* link in VFIOGroup.device_list */
+    struct VFIOGroup *group; /* owning group, NULL once released */
+    bool reset_works; /* device reported VFIO_DEVICE_FLAGS_RESET */
+} VFIODevice;
+
+typedef struct VFIOGroup {
+    int fd; /* open fd for /dev/vfio/<groupid> */
+    int groupid; /* number parsed from the device's iommu_group link */
+    VFIOContainer *container; /* NULL while not attached to a container */
+    QLIST_HEAD(, VFIODevice) device_list; /* devices opened via this group */
+    QLIST_ENTRY(VFIOGroup) next; /* link in group_list */
+    QLIST_ENTRY(VFIOGroup) container_next; /* link in container's group_list */
+} VFIOGroup;
+
+#endif /* HW_VFIO_PCI_INT_H */
-- 
cgit v1.2.3


From 92e1fb5ed1a26d09ee754845d5691d473410c1dc Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Wed, 26 Sep 2012 11:19:39 -0600
Subject: vfio: Enable vfio-pci and mark supported

Enabled for all softmmu guests supporting PCI on Linux hosts.  Note
that currently only x86 hosts have the kernel side VFIO IOMMU support
for this.  PPC (g3beige) is the only non-x86 guest known to work.
ARM (versatile) hangs in firmware, others untested.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 MAINTAINERS      | 5 +++++
 hw/Makefile.objs | 3 ++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 25733fc86..29aac4fce 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -474,6 +474,11 @@ M: Gerd Hoffmann <kraxel@redhat.com>
 S: Maintained
 F: hw/usb*
 
+VFIO
+M: Alex Williamson <alex.williamson@redhat.com>
+S: Supported
+F: hw/vfio*
+
 vhost
 M: Michael S. Tsirkin <mst@redhat.com>
 S: Supported
diff --git a/hw/Makefile.objs b/hw/Makefile.objs
index 6dfebd297..ecdbe44dd 100644
--- a/hw/Makefile.objs
+++ b/hw/Makefile.objs
@@ -198,7 +198,8 @@ obj-$(CONFIG_VGA) += vga.o
 obj-$(CONFIG_SOFTMMU) += device-hotplug.o
 obj-$(CONFIG_XEN) += xen_domainbuild.o xen_machine_pv.o
 
-# Inter-VM PCI shared memory
+# Inter-VM PCI shared memory & VFIO PCI device assignment
 ifeq ($(CONFIG_PCI), y)
 obj-$(CONFIG_KVM) += ivshmem.o
+obj-$(CONFIG_LINUX) += vfio_pci.o
 endif
-- 
cgit v1.2.3


From 0f41dc182c7e4e2ae160641552101a2a199f7c4c Mon Sep 17 00:00:00 2001
From: Anthony Liguori <aliguori@us.ibm.com>
Date: Mon, 1 Oct 2012 13:40:15 -0500
Subject: vfio_pci: fix build on 32-bit systems

We cannot cast directly from pointer to uint64.

Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Alex Barcelo <abarcelo@ac.upc.edu>
Reported-by: Alex Barcelo <abarcelo@ac.upc.edu>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 hw/vfio_pci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
index a24558aaa..a1eeced8f 100644
--- a/hw/vfio_pci.c
+++ b/hw/vfio_pci.c
@@ -768,7 +768,7 @@ static int vfio_dma_map(VFIOContainer *container, target_phys_addr_t iova,
     struct vfio_iommu_type1_dma_map map = {
         .argsz = sizeof(map),
         .flags = VFIO_DMA_MAP_FLAG_READ,
-        .vaddr = (__u64)vaddr,
+        .vaddr = (__u64)(intptr_t)vaddr,
         .iova = iova,
         .size = size,
     };
-- 
cgit v1.2.3


From 4be8eeacb9f2b01ad98747ad88e54a7feabc664c Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Mon, 24 Sep 2012 17:28:35 +0100
Subject: fpu/softfloat.c: Remove pointless shift of always-zero value

In float16_to_float32, when returning an infinity, just pass zero
as the mantissa argument to packFloat32(), rather than shifting
a value which we know must be zero.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
---
 fpu/softfloat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index b29256a8e..01a28cab1 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -3007,7 +3007,7 @@ float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
         if (aSig) {
             return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
         }
-        return packFloat32(aSign, 0xff, aSig << 13);
+        return packFloat32(aSign, 0xff, 0);
     }
     if (aExp == 0) {
         int8 shiftCount;
-- 
cgit v1.2.3


From e744c06fca438dc08271e626034e632a270c91c8 Mon Sep 17 00:00:00 2001
From: Peter Maydell <peter.maydell@linaro.org>
Date: Fri, 28 Sep 2012 16:17:03 +0100
Subject: fpu/softfloat.c: Return correctly signed values from
 uint64_to_float32

The uint64_to_float32() conversion function was incorrectly always
returning numbers with the sign bit set (ie negative numbers). Correct
this so we return positive numbers instead.

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Aurelien Jarno <aurelien@aurel32.net>
---
 fpu/softfloat.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index 01a28cab1..841314686 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -1238,7 +1238,7 @@ float32 uint64_to_float32( uint64 a STATUS_PARAM )
     if ( a == 0 ) return float32_zero;
     shiftCount = countLeadingZeros64( a ) - 40;
     if ( 0 <= shiftCount ) {
-        return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
+        return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
     }
     else {
         shiftCount += 7;
@@ -1248,7 +1248,7 @@ float32 uint64_to_float32( uint64 a STATUS_PARAM )
         else {
             a <<= shiftCount;
         }
-        return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
+        return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
     }
 }
 
-- 
cgit v1.2.3


From bd4982a6c6f1b133aff38873bc3b580af15cd334 Mon Sep 17 00:00:00 2001
From: Anthony PERARD <anthony.perard@citrix.com>
Date: Wed, 3 Oct 2012 13:45:24 +0000
Subject: xen: Fix, no unplug of pt device by platform device.

The Xen platform device will unplug any NICs if requested by the guest (PVonHVM)
including a NIC that would have been passthrough. This patch makes sure that a
passthrough device will not be unplugged.

Reported-by: "Zhang, Yang Z" <yang.z.zhang@intel.com>
Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
---
 hw/xen_platform.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/hw/xen_platform.c b/hw/xen_platform.c
index 0d6c2ff8c..956dbfeae 100644
--- a/hw/xen_platform.c
+++ b/hw/xen_platform.c
@@ -85,8 +85,10 @@ static void log_writeb(PCIXenPlatformState *s, char val)
 
 static void unplug_nic(PCIBus *b, PCIDevice *d, void *o)
 {
+    /* We have to ignore passthrough devices */
     if (pci_get_word(d->config + PCI_CLASS_DEVICE) ==
-            PCI_CLASS_NETWORK_ETHERNET) {
+            PCI_CLASS_NETWORK_ETHERNET
+            && strcmp(d->name, "xen-pci-passthrough") != 0) {
         qdev_free(&d->qdev);
     }
 }
@@ -98,8 +100,10 @@ static void pci_unplug_nics(PCIBus *bus)
 
 static void unplug_disks(PCIBus *b, PCIDevice *d, void *o)
 {
+    /* We have to ignore passthrough devices */
     if (pci_get_word(d->config + PCI_CLASS_DEVICE) ==
-            PCI_CLASS_STORAGE_IDE) {
+            PCI_CLASS_STORAGE_IDE
+            && strcmp(d->name, "xen-pci-passthrough") != 0) {
         qdev_unplug(&(d->qdev), NULL);
     }
 }
-- 
cgit v1.2.3


From aabc8530c7ba2be89e21463f051056ad7c255e6e Mon Sep 17 00:00:00 2001
From: Xudong Hao <xudong.hao@intel.com>
Date: Wed, 3 Oct 2012 13:46:23 +0000
Subject: qemu/xen: Add 64 bits big bar support on qemu

Currently it is assumed that PCI device BAR accesses go to memory below 4G. If
a device has a BAR larger than 4G, it must access memory addresses above 4G.
This patch enables 64-bit big BAR support in qemu.

Signed-off-by: Xudong Hao <xudong.hao@intel.com>
Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
---
 hw/xen_pt.c             |  7 +++++--
 hw/xen_pt_config_init.c | 39 ++++++++++++++++++++++++++-------------
 2 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/hw/xen_pt.c b/hw/xen_pt.c
index 307119a12..838bcea4d 100644
--- a/hw/xen_pt.c
+++ b/hw/xen_pt.c
@@ -410,14 +410,17 @@ static int xen_pt_register_regions(XenPCIPassthroughState *s)
             if (r->type & XEN_HOST_PCI_REGION_TYPE_PREFETCH) {
                 type |= PCI_BASE_ADDRESS_MEM_PREFETCH;
             }
+            if (r->type & XEN_HOST_PCI_REGION_TYPE_MEM_64) {
+                type |= PCI_BASE_ADDRESS_MEM_TYPE_64;
+            }
         }
 
         memory_region_init_io(&s->bar[i], &ops, &s->dev,
                               "xen-pci-pt-bar", r->size);
         pci_register_bar(&s->dev, i, type, &s->bar[i]);
 
-        XEN_PT_LOG(&s->dev, "IO region %i registered (size=0x%08"PRIx64
-                   " base_addr=0x%08"PRIx64" type: %#x)\n",
+        XEN_PT_LOG(&s->dev, "IO region %i registered (size=0x%"PRIx64
+                   " base_addr=0x%"PRIx64" type: %#x)\n",
                    i, r->size, r->base_addr, type);
     }
 
diff --git a/hw/xen_pt_config_init.c b/hw/xen_pt_config_init.c
index e524a4094..0a5f82cb8 100644
--- a/hw/xen_pt_config_init.c
+++ b/hw/xen_pt_config_init.c
@@ -342,6 +342,23 @@ static int xen_pt_cmd_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
 #define XEN_PT_BAR_IO_RO_MASK     0x00000003  /* BAR ReadOnly mask(I/O) */
 #define XEN_PT_BAR_IO_EMU_MASK    0xFFFFFFFC  /* BAR emul mask(I/O) */
 
+static bool is_64bit_bar(PCIIORegion *r)
+{
+    return !!(r->type & PCI_BASE_ADDRESS_MEM_TYPE_64);
+}
+
+static uint64_t xen_pt_get_bar_size(PCIIORegion *r)
+{
+    if (is_64bit_bar(r)) {
+        uint64_t size64;
+        size64 = (r + 1)->size;
+        size64 <<= 32;
+        size64 += r->size;
+        return size64;
+    }
+    return r->size;
+}
+
 static XenPTBarFlag xen_pt_bar_reg_parse(XenPCIPassthroughState *s,
                                          XenPTRegInfo *reg)
 {
@@ -366,7 +383,7 @@ static XenPTBarFlag xen_pt_bar_reg_parse(XenPCIPassthroughState *s,
 
     /* check unused BAR */
     r = &d->io_regions[index];
-    if (r->size == 0) {
+    if (!xen_pt_get_bar_size(r)) {
         return XEN_PT_BAR_FLAG_UNUSED;
     }
 
@@ -481,7 +498,12 @@ static int xen_pt_bar_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
     switch (s->bases[index].bar_flag) {
     case XEN_PT_BAR_FLAG_MEM:
         bar_emu_mask = XEN_PT_BAR_MEM_EMU_MASK;
-        bar_ro_mask = XEN_PT_BAR_MEM_RO_MASK | (r_size - 1);
+        if (!r_size) {
+            /* low 32 bits mask for 64 bit bars */
+            bar_ro_mask = XEN_PT_BAR_ALLF;
+        } else {
+            bar_ro_mask = XEN_PT_BAR_MEM_RO_MASK | (r_size - 1);
+        }
         break;
     case XEN_PT_BAR_FLAG_IO:
         bar_emu_mask = XEN_PT_BAR_IO_EMU_MASK;
@@ -489,7 +511,7 @@ static int xen_pt_bar_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
         break;
     case XEN_PT_BAR_FLAG_UPPER:
         bar_emu_mask = XEN_PT_BAR_ALLF;
-        bar_ro_mask = 0;    /* all upper 32bit are R/W */
+        bar_ro_mask = r_size ? r_size - 1 : 0;
         break;
     default:
         break;
@@ -501,22 +523,13 @@ static int xen_pt_bar_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
 
     /* check whether we need to update the virtual region address or not */
     switch (s->bases[index].bar_flag) {
+    case XEN_PT_BAR_FLAG_UPPER:
     case XEN_PT_BAR_FLAG_MEM:
         /* nothing to do */
         break;
     case XEN_PT_BAR_FLAG_IO:
         /* nothing to do */
         break;
-    case XEN_PT_BAR_FLAG_UPPER:
-        if (cfg_entry->data) {
-            if (cfg_entry->data != (XEN_PT_BAR_ALLF & ~bar_ro_mask)) {
-                XEN_PT_WARN(d, "Guest attempt to set high MMIO Base Address. "
-                            "Ignore mapping. "
-                            "(offset: 0x%02x, high address: 0x%08x)\n",
-                            reg->offset, cfg_entry->data);
-            }
-        }
-        break;
     default:
         break;
     }
-- 
cgit v1.2.3


From 39f42439d0629d3921629dc4b38e68df8f2f7b83 Mon Sep 17 00:00:00 2001
From: Anthony PERARD <anthony.perard@citrix.com>
Date: Wed, 3 Oct 2012 13:48:19 +0000
Subject: QMP, Introduce xen-set-global-dirty-log command.

This command is used during a migration of a guest under Xen. It calls
memory_global_dirty_log_start or memory_global_dirty_log_stop according to the
argument passed to the command.

Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
Reviewed-by: Luiz Capitulino <lcapitulino@redhat.com>
---
 qapi-schema.json | 13 +++++++++++++
 qmp-commands.hx  | 24 ++++++++++++++++++++++++
 xen-all.c        | 15 +++++++++++++++
 xen-stub.c       |  5 +++++
 4 files changed, 57 insertions(+)

diff --git a/qapi-schema.json b/qapi-schema.json
index 14e44199b..4a4a850f5 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -1955,6 +1955,19 @@
 ##
 { 'command': 'xen-save-devices-state', 'data': {'filename': 'str'} }
 
+##
+# @xen-set-global-dirty-log
+#
+# Enable or disable the global dirty log mode.
+#
+# @enable: true to enable, false to disable.
+#
+# Returns: nothing
+#
+# Since: 1.3
+##
+{ 'command': 'xen-set-global-dirty-log', 'data': { 'enable': 'bool' } }
+
 ##
 # @device_del:
 #
diff --git a/qmp-commands.hx b/qmp-commands.hx
index 6e21ddba6..662b7cf32 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -490,6 +490,30 @@ Example:
      "arguments": { "filename": "/tmp/save" } }
 <- { "return": {} }
 
+EQMP
+
+    {
+        .name       = "xen-set-global-dirty-log",
+        .args_type  = "enable:b",
+        .mhandler.cmd_new = qmp_marshal_input_xen_set_global_dirty_log,
+    },
+
+SQMP
+xen-set-global-dirty-log
+------------------------
+
+Enable or disable the global dirty log mode.
+
+Arguments:
+
+- "enable": Enable it or disable it.
+
+Example:
+
+-> { "execute": "xen-set-global-dirty-log",
+     "arguments": { "enable": true } }
+<- { "return": {} }
+
 EQMP
 
     {
diff --git a/xen-all.c b/xen-all.c
index f76b051ee..f75ae9fd2 100644
--- a/xen-all.c
+++ b/xen-all.c
@@ -14,6 +14,7 @@
 #include "hw/pc.h"
 #include "hw/xen_common.h"
 #include "hw/xen_backend.h"
+#include "qmp-commands.h"
 
 #include "range.h"
 #include "xen-mapcache.h"
@@ -36,6 +37,7 @@
 
 static MemoryRegion ram_memory, ram_640k, ram_lo, ram_hi;
 static MemoryRegion *framebuffer;
+static bool xen_in_migration;
 
 /* Compatibility with older version */
 #if __XEN_LATEST_INTERFACE_VERSION__ < 0x0003020a
@@ -552,10 +554,14 @@ static void xen_log_sync(MemoryListener *listener, MemoryRegionSection *section)
 
 static void xen_log_global_start(MemoryListener *listener)
 {
+    if (xen_enabled()) {
+        xen_in_migration = true;
+    }
 }
 
 static void xen_log_global_stop(MemoryListener *listener)
 {
+    xen_in_migration = false;
 }
 
 static void xen_eventfd_add(MemoryListener *listener,
@@ -588,6 +594,15 @@ static MemoryListener xen_memory_listener = {
     .priority = 10,
 };
 
+void qmp_xen_set_global_dirty_log(bool enable, Error **errp)
+{
+    if (enable) {
+        memory_global_dirty_log_start();
+    } else {
+        memory_global_dirty_log_stop();
+    }
+}
+
 /* VCPU Operations, MMIO, IO ring ... */
 
 static void xen_reset_vcpu(void *opaque)
diff --git a/xen-stub.c b/xen-stub.c
index 8ff2b79ac..5e66ba8f0 100644
--- a/xen-stub.c
+++ b/xen-stub.c
@@ -11,6 +11,7 @@
 #include "qemu-common.h"
 #include "hw/xen.h"
 #include "memory.h"
+#include "qmp-commands.h"
 
 void xenstore_store_pv_console_info(int i, CharDriverState *chr)
 {
@@ -54,3 +55,7 @@ int xen_init(void)
 void xen_register_framebuffer(MemoryRegion *mr)
 {
 }
+
+void qmp_xen_set_global_dirty_log(bool enable, Error **errp)
+{
+}
-- 
cgit v1.2.3


From 910b38e4dc4c37683c8b821e75a7f4cf095e4b21 Mon Sep 17 00:00:00 2001
From: Anthony PERARD <anthony.perard@citrix.com>
Date: Wed, 3 Oct 2012 13:48:45 +0000
Subject: xen: Introduce xen_modified_memory.

This function is to be used during live migration. Every write access to the
guest memory should call this function so the Xen tools know which pages are
dirty.

Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
---
 hw/xen.h   |  1 +
 xen-all.c  | 21 +++++++++++++++++++++
 xen-stub.c |  4 ++++
 3 files changed, 26 insertions(+)

diff --git a/hw/xen.h b/hw/xen.h
index e5926b7b8..d14e92d5a 100644
--- a/hw/xen.h
+++ b/hw/xen.h
@@ -48,6 +48,7 @@ void xenstore_store_pv_console_info(int i, struct CharDriverState *chr);
 struct MemoryRegion;
 void xen_ram_alloc(ram_addr_t ram_addr, ram_addr_t size,
                    struct MemoryRegion *mr);
+void xen_modified_memory(ram_addr_t start, ram_addr_t length);
 #endif
 
 struct MemoryRegion;
diff --git a/xen-all.c b/xen-all.c
index f75ae9fd2..b11542c3e 100644
--- a/xen-all.c
+++ b/xen-all.c
@@ -1228,3 +1228,24 @@ void xen_shutdown_fatal_error(const char *fmt, ...)
     /* destroy the domain */
     qemu_system_shutdown_request();
 }
+
+void xen_modified_memory(ram_addr_t start, ram_addr_t length)
+{
+    if (unlikely(xen_in_migration)) {
+        int rc;
+        ram_addr_t start_pfn, nb_pages;
+
+        if (length == 0) {
+            length = TARGET_PAGE_SIZE;
+        }
+        start_pfn = start >> TARGET_PAGE_BITS;
+        nb_pages = ((start + length + TARGET_PAGE_SIZE - 1) >> TARGET_PAGE_BITS)
+            - start_pfn;
+        rc = xc_hvm_modified_memory(xen_xc, xen_domid, start_pfn, nb_pages);
+        if (rc) {
+            fprintf(stderr,
+                    "%s failed for "RAM_ADDR_FMT" ("RAM_ADDR_FMT"): %i, %s\n",
+                    __func__, start, nb_pages, rc, strerror(-rc));
+        }
+    }
+}
diff --git a/xen-stub.c b/xen-stub.c
index 5e66ba8f0..921439263 100644
--- a/xen-stub.c
+++ b/xen-stub.c
@@ -59,3 +59,7 @@ void xen_register_framebuffer(MemoryRegion *mr)
 void qmp_xen_set_global_dirty_log(bool enable, Error **errp)
 {
 }
+
+void xen_modified_memory(ram_addr_t start, ram_addr_t length)
+{
+}
-- 
cgit v1.2.3


From 51d7a9eb2b64e787c90bea1027308087eac22065 Mon Sep 17 00:00:00 2001
From: Anthony PERARD <anthony.perard@citrix.com>
Date: Wed, 3 Oct 2012 13:49:05 +0000
Subject: exec: Introduce helper to set dirty flags.

This new helper/hook is used in the next patch to add an extra call in a single
place.

Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
Reviewed-by: Avi Kivity <avi@redhat.com>
---
 exec.c | 52 +++++++++++++++++-----------------------------------
 1 file changed, 17 insertions(+), 35 deletions(-)

diff --git a/exec.c b/exec.c
index bb6aa4a07..366684cd7 100644
--- a/exec.c
+++ b/exec.c
@@ -3417,6 +3417,18 @@ int cpu_memory_rw_debug(CPUArchState *env, target_ulong addr,
 }
 
 #else
+
+static void invalidate_and_set_dirty(target_phys_addr_t addr,
+                                     target_phys_addr_t length)
+{
+    if (!cpu_physical_memory_is_dirty(addr)) {
+        /* invalidate code */
+        tb_invalidate_phys_page_range(addr, addr + length, 0);
+        /* set dirty bit */
+        cpu_physical_memory_set_dirty_flags(addr, (0xff & ~CODE_DIRTY_FLAG));
+    }
+}
+
 void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
                             int len, int is_write)
 {
@@ -3462,13 +3474,7 @@ void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
                 /* RAM case */
                 ptr = qemu_get_ram_ptr(addr1);
                 memcpy(ptr, buf, l);
-                if (!cpu_physical_memory_is_dirty(addr1)) {
-                    /* invalidate code */
-                    tb_invalidate_phys_page_range(addr1, addr1 + l, 0);
-                    /* set dirty bit */
-                    cpu_physical_memory_set_dirty_flags(
-                        addr1, (0xff & ~CODE_DIRTY_FLAG));
-                }
+                invalidate_and_set_dirty(addr1, l);
                 qemu_put_ram_ptr(ptr);
             }
         } else {
@@ -3534,13 +3540,7 @@ void cpu_physical_memory_write_rom(target_phys_addr_t addr,
             /* ROM/RAM case */
             ptr = qemu_get_ram_ptr(addr1);
             memcpy(ptr, buf, l);
-            if (!cpu_physical_memory_is_dirty(addr1)) {
-                /* invalidate code */
-                tb_invalidate_phys_page_range(addr1, addr1 + l, 0);
-                /* set dirty bit */
-                cpu_physical_memory_set_dirty_flags(
-                    addr1, (0xff & ~CODE_DIRTY_FLAG));
-            }
+            invalidate_and_set_dirty(addr1, l);
             qemu_put_ram_ptr(ptr);
         }
         len -= l;
@@ -3666,13 +3666,7 @@ void cpu_physical_memory_unmap(void *buffer, target_phys_addr_t len,
                 l = TARGET_PAGE_SIZE;
                 if (l > access_len)
                     l = access_len;
-                if (!cpu_physical_memory_is_dirty(addr1)) {
-                    /* invalidate code */
-                    tb_invalidate_phys_page_range(addr1, addr1 + l, 0);
-                    /* set dirty bit */
-                    cpu_physical_memory_set_dirty_flags(
-                        addr1, (0xff & ~CODE_DIRTY_FLAG));
-                }
+                invalidate_and_set_dirty(addr1, l);
                 addr1 += l;
                 access_len -= l;
             }
@@ -3978,13 +3972,7 @@ static inline void stl_phys_internal(target_phys_addr_t addr, uint32_t val,
             stl_p(ptr, val);
             break;
         }
-        if (!cpu_physical_memory_is_dirty(addr1)) {
-            /* invalidate code */
-            tb_invalidate_phys_page_range(addr1, addr1 + 4, 0);
-            /* set dirty bit */
-            cpu_physical_memory_set_dirty_flags(addr1,
-                (0xff & ~CODE_DIRTY_FLAG));
-        }
+        invalidate_and_set_dirty(addr1, 4);
     }
 }
 
@@ -4051,13 +4039,7 @@ static inline void stw_phys_internal(target_phys_addr_t addr, uint32_t val,
             stw_p(ptr, val);
             break;
         }
-        if (!cpu_physical_memory_is_dirty(addr1)) {
-            /* invalidate code */
-            tb_invalidate_phys_page_range(addr1, addr1 + 2, 0);
-            /* set dirty bit */
-            cpu_physical_memory_set_dirty_flags(addr1,
-                (0xff & ~CODE_DIRTY_FLAG));
-        }
+        invalidate_and_set_dirty(addr1, 2);
     }
 }
 
-- 
cgit v1.2.3


From e226939de5814527a21396903b08c3d0ed989558 Mon Sep 17 00:00:00 2001
From: Anthony PERARD <anthony.perard@citrix.com>
Date: Wed, 3 Oct 2012 13:49:22 +0000
Subject: exec, memory: Call to xen_modified_memory.

This patch adds some calls to xen_modified_memory to notify Xen about dirty
bits during migration.

Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
Reviewed-by: Avi Kivity <avi@redhat.com>
---
 exec-obsolete.h | 2 ++
 exec.c          | 1 +
 2 files changed, 3 insertions(+)

diff --git a/exec-obsolete.h b/exec-obsolete.h
index c09925610..286e2f75e 100644
--- a/exec-obsolete.h
+++ b/exec-obsolete.h
@@ -24,6 +24,7 @@
 #endif
 
 #ifndef CONFIG_USER_ONLY
+#include "hw/xen.h"
 
 ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
                                    MemoryRegion *mr);
@@ -111,6 +112,7 @@ static inline void cpu_physical_memory_set_dirty_range(ram_addr_t start,
     for (addr = start; addr < end; addr += TARGET_PAGE_SIZE) {
         cpu_physical_memory_set_dirty_flags(addr, dirty_flags);
     }
+    xen_modified_memory(start, length); /* addr is past the range here */
 }
 
 static inline void cpu_physical_memory_mask_dirty_range(ram_addr_t start,
diff --git a/exec.c b/exec.c
index 366684cd7..1114a0932 100644
--- a/exec.c
+++ b/exec.c
@@ -3427,6 +3427,7 @@ static void invalidate_and_set_dirty(target_phys_addr_t addr,
         /* set dirty bit */
         cpu_physical_memory_set_dirty_flags(addr, (0xff & ~CODE_DIRTY_FLAG));
     }
+    xen_modified_memory(addr, length);
 }
 
 void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
-- 
cgit v1.2.3


From 8aba7dc02d5660df7e7d8651304b3079908358be Mon Sep 17 00:00:00 2001
From: Anthony PERARD <anthony.perard@citrix.com>
Date: Wed, 3 Oct 2012 13:49:40 +0000
Subject: xen: Set the vram dirty when an error occur.

If the call to xc_hvm_track_dirty_vram() fails, then we set dirtybit on all the
video ram. This case happens during migration.

Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
---
 xen-all.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/xen-all.c b/xen-all.c
index b11542c3e..e6308be23 100644
--- a/xen-all.c
+++ b/xen-all.c
@@ -507,7 +507,8 @@ static void xen_sync_dirty_bitmap(XenIOState *state,
                                  bitmap);
     if (rc < 0) {
         if (rc != -ENODATA) {
-            fprintf(stderr, "xen: track_dirty_vram failed (0x" TARGET_FMT_plx
+            memory_region_set_dirty(framebuffer, 0, size);
+            DPRINTF("xen: track_dirty_vram failed (0x" TARGET_FMT_plx
                     ", 0x" TARGET_FMT_plx "): %s\n",
                     start_addr, start_addr + size, strerror(-rc));
         }
-- 
cgit v1.2.3


From 4be403c8158e1b6be743f0fef004310cea4e3975 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 4 Oct 2012 12:36:04 +0200
Subject: Make target_phys_addr_t 64 bits unconditionally

The hassle and compile time overhead of maintaining both 32-bit and 64-bit
capable source isn't worth the tiny performance advantage which is seen on
a minority of configurations.  Switch to compiling libhw only once, with
target_phys_addr_t unconditionally typedefed to uint64_t.

Signed-off-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 .gitignore              |  1 +
 Makefile                |  2 +-
 Makefile.hw             |  1 -
 Makefile.target         |  3 ---
 configure               | 34 ++++------------------------------
 cpu-common.h            |  2 +-
 dma.h                   |  2 +-
 hw/hw.h                 |  2 +-
 hw/intel-hda.c          |  8 +-------
 hw/rtl8139.c            |  6 +-----
 monitor.c               |  4 ----
 target-ppc/mmu_helper.c |  4 +---
 targphys.h              | 19 +------------------
 13 files changed, 13 insertions(+), 75 deletions(-)

diff --git a/.gitignore b/.gitignore
index 824c0d24d..3ef77d062 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,7 @@ trace-dtrace.dtrace
 *-linux-user
 *-bsd-user
 libdis*
+libhw
 libhw32
 libhw64
 libuser
diff --git a/Makefile b/Makefile
index 04642975a..1cebe3a9d 100644
--- a/Makefile
+++ b/Makefile
@@ -214,7 +214,7 @@ $(qga-obj-y) qemu-ga.o: $(QGALIB_GEN)
 
 qemu-ga$(EXESUF): qemu-ga.o $(qga-obj-y) $(tools-obj-y) $(qapi-obj-y) $(qobject-obj-y) $(version-obj-y)
 
-QEMULIBS=libhw32 libhw64 libuser libdis libdis-user
+QEMULIBS=libhw libuser libdis libdis-user
 
 clean:
 # avoid old build problems by removing potentially incorrect old files
diff --git a/Makefile.hw b/Makefile.hw
index 59f5b4835..86f0bf40f 100644
--- a/Makefile.hw
+++ b/Makefile.hw
@@ -2,7 +2,6 @@
 
 include ../config-host.mak
 include ../config-all-devices.mak
-include config.mak
 include $(SRC_PATH)/rules.mak
 
 .PHONY: all
diff --git a/Makefile.target b/Makefile.target
index d9d54b8dd..4449444a0 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -4,9 +4,6 @@ include ../config-host.mak
 include config-devices.mak
 include config-target.mak
 include $(SRC_PATH)/rules.mak
-ifneq ($(HWDIR),)
-include $(HWDIR)/config.mak
-endif
 
 $(call set-vpath, $(SRC_PATH))
 ifdef CONFIG_LINUX
diff --git a/configure b/configure
index 8f99b7b94..65bd876cd 100755
--- a/configure
+++ b/configure
@@ -3694,7 +3694,6 @@ TARGET_ABI_DIR=""
 
 case "$target_arch2" in
   i386)
-    target_phys_bits=64
   ;;
   x86_64)
     TARGET_BASE_ARCH=i386
@@ -3702,7 +3701,6 @@ case "$target_arch2" in
     target_long_alignment=8
   ;;
   alpha)
-    target_phys_bits=64
     target_long_alignment=8
     target_nptl="yes"
   ;;
@@ -3711,22 +3709,18 @@ case "$target_arch2" in
     bflt="yes"
     target_nptl="yes"
     gdb_xml_files="arm-core.xml arm-vfp.xml arm-vfp3.xml arm-neon.xml"
-    target_phys_bits=64
     target_llong_alignment=4
     target_libs_softmmu="$fdt_libs"
   ;;
   cris)
     target_nptl="yes"
-    target_phys_bits=32
   ;;
   lm32)
-    target_phys_bits=32
     target_libs_softmmu="$opengl_libs"
   ;;
   m68k)
     bflt="yes"
     gdb_xml_files="cf-core.xml cf-fp.xml"
-    target_phys_bits=32
     target_int_alignment=2
     target_long_alignment=2
     target_llong_alignment=2
@@ -3735,36 +3729,30 @@ case "$target_arch2" in
     TARGET_ARCH=microblaze
     bflt="yes"
     target_nptl="yes"
-    target_phys_bits=32
     target_libs_softmmu="$fdt_libs"
   ;;
   mips|mipsel)
     TARGET_ARCH=mips
     echo "TARGET_ABI_MIPSO32=y" >> $config_target_mak
     target_nptl="yes"
-    target_phys_bits=64
   ;;
   mipsn32|mipsn32el)
     TARGET_ARCH=mipsn32
     TARGET_BASE_ARCH=mips
     echo "TARGET_ABI_MIPSN32=y" >> $config_target_mak
-    target_phys_bits=64
   ;;
   mips64|mips64el)
     TARGET_ARCH=mips64
     TARGET_BASE_ARCH=mips
     echo "TARGET_ABI_MIPSN64=y" >> $config_target_mak
-    target_phys_bits=64
     target_long_alignment=8
   ;;
   or32)
     TARGET_ARCH=openrisc
     TARGET_BASE_ARCH=openrisc
-    target_phys_bits=32
   ;;
   ppc)
     gdb_xml_files="power-core.xml power-fpu.xml power-altivec.xml power-spe.xml"
-    target_phys_bits=64
     target_nptl="yes"
     target_libs_softmmu="$fdt_libs"
   ;;
@@ -3772,7 +3760,6 @@ case "$target_arch2" in
     TARGET_BASE_ARCH=ppc
     TARGET_ABI_DIR=ppc
     gdb_xml_files="power-core.xml power-fpu.xml power-altivec.xml power-spe.xml"
-    target_phys_bits=64
     target_nptl="yes"
     target_libs_softmmu="$fdt_libs"
   ;;
@@ -3780,7 +3767,6 @@ case "$target_arch2" in
     TARGET_BASE_ARCH=ppc
     TARGET_ABI_DIR=ppc
     gdb_xml_files="power64-core.xml power-fpu.xml power-altivec.xml power-spe.xml"
-    target_phys_bits=64
     target_long_alignment=8
     target_libs_softmmu="$fdt_libs"
   ;;
@@ -3790,21 +3776,17 @@ case "$target_arch2" in
     TARGET_ABI_DIR=ppc
     echo "TARGET_ABI32=y" >> $config_target_mak
     gdb_xml_files="power64-core.xml power-fpu.xml power-altivec.xml power-spe.xml"
-    target_phys_bits=64
     target_libs_softmmu="$fdt_libs"
   ;;
   sh4|sh4eb)
     TARGET_ARCH=sh4
     bflt="yes"
     target_nptl="yes"
-    target_phys_bits=32
   ;;
   sparc)
-    target_phys_bits=64
   ;;
   sparc64)
     TARGET_BASE_ARCH=sparc
-    target_phys_bits=64
     target_long_alignment=8
   ;;
   sparc32plus)
@@ -3812,11 +3794,9 @@ case "$target_arch2" in
     TARGET_BASE_ARCH=sparc
     TARGET_ABI_DIR=sparc
     echo "TARGET_ABI32=y" >> $config_target_mak
-    target_phys_bits=64
   ;;
   s390x)
     target_nptl="yes"
-    target_phys_bits=64
     target_long_alignment=8
   ;;
   unicore32)
@@ -3824,7 +3804,6 @@ case "$target_arch2" in
   ;;
   xtensa|xtensaeb)
     TARGET_ARCH=xtensa
-    target_phys_bits=32
   ;;
   *)
     echo "Unsupported target CPU"
@@ -3859,7 +3838,6 @@ echo "TARGET_ABI_DIR=$TARGET_ABI_DIR" >> $config_target_mak
 case "$target_arch2" in
   i386|x86_64)
     if test "$xen" = "yes" -a "$target_softmmu" = "yes" ; then
-      target_phys_bits=64
       echo "CONFIG_XEN=y" >> $config_target_mak
       if test "$xen_pci_passthrough" = yes; then
         echo "CONFIG_XEN_PCI_PASSTHROUGH=y" >> "$config_target_mak"
@@ -3899,11 +3877,10 @@ if test "$target_bigendian" = "yes" ; then
   echo "TARGET_WORDS_BIGENDIAN=y" >> $config_target_mak
 fi
 if test "$target_softmmu" = "yes" ; then
-  echo "TARGET_PHYS_ADDR_BITS=$target_phys_bits" >> $config_target_mak
   echo "CONFIG_SOFTMMU=y" >> $config_target_mak
   echo "LIBS+=$libs_softmmu $target_libs_softmmu" >> $config_target_mak
-  echo "HWDIR=../libhw$target_phys_bits" >> $config_target_mak
-  echo "subdir-$target: subdir-libhw$target_phys_bits" >> $config_host_mak
+  echo "HWDIR=../libhw" >> $config_target_mak
+  echo "subdir-$target: subdir-libhw" >> $config_host_mak
   if test "$smartcard_nss" = "yes" ; then
     echo "subdir-$target: subdir-libcacard" >> $config_host_mak
   fi
@@ -4145,11 +4122,8 @@ for rom in seabios vgabios ; do
     echo "LD=$ld" >> $config_mak
 done
 
-for hwlib in 32 64; do
-  d=libhw$hwlib
-  symlink "$source_path/Makefile.hw" "$d/Makefile"
-  echo "QEMU_CFLAGS+=-DTARGET_PHYS_ADDR_BITS=$hwlib" > $d/config.mak
-done
+d=libhw
+symlink "$source_path/Makefile.hw" "$d/Makefile"
 
 d=libuser
 symlink "$source_path/Makefile.user" "$d/Makefile"
diff --git a/cpu-common.h b/cpu-common.h
index 85548de5e..c0d27afd8 100644
--- a/cpu-common.h
+++ b/cpu-common.h
@@ -21,7 +21,7 @@ enum device_endian {
 };
 
 /* address in the RAM (different from a physical address) */
-#if defined(CONFIG_XEN_BACKEND) && TARGET_PHYS_ADDR_BITS == 64
+#if defined(CONFIG_XEN_BACKEND)
 typedef uint64_t ram_addr_t;
 #  define RAM_ADDR_MAX UINT64_MAX
 #  define RAM_ADDR_FMT "%" PRIx64
diff --git a/dma.h b/dma.h
index f35c4b663..1a33603f2 100644
--- a/dma.h
+++ b/dma.h
@@ -31,7 +31,7 @@ struct QEMUSGList {
     DMAContext *dma;
 };
 
-#if defined(TARGET_PHYS_ADDR_BITS)
+#ifndef CONFIG_USER_ONLY
 
 /*
  * When an IOMMU is present, bus addresses become distinct from
diff --git a/hw/hw.h b/hw/hw.h
index e5cb9bf94..16101de3c 100644
--- a/hw/hw.h
+++ b/hw/hw.h
@@ -4,7 +4,7 @@
 
 #include "qemu-common.h"
 
-#if defined(TARGET_PHYS_ADDR_BITS) && !defined(NEED_CPU_H)
+#if !defined(CONFIG_USER_ONLY) && !defined(NEED_CPU_H)
 #include "cpu-common.h"
 #endif
 
diff --git a/hw/intel-hda.c b/hw/intel-hda.c
index 127e81888..d8e1b23a6 100644
--- a/hw/intel-hda.c
+++ b/hw/intel-hda.c
@@ -210,13 +210,7 @@ static target_phys_addr_t intel_hda_addr(uint32_t lbase, uint32_t ubase)
 {
     target_phys_addr_t addr;
 
-#if TARGET_PHYS_ADDR_BITS == 32
-    addr = lbase;
-#else
-    addr = ubase;
-    addr <<= 32;
-    addr |= lbase;
-#endif
+    addr = ((uint64_t)ubase << 32) | lbase;
     return addr;
 }
 
diff --git a/hw/rtl8139.c b/hw/rtl8139.c
index 844f1b8c3..b7c82ee02 100644
--- a/hw/rtl8139.c
+++ b/hw/rtl8139.c
@@ -774,11 +774,7 @@ static void rtl8139_write_buffer(RTL8139State *s, const void *buf, int size)
 #define MIN_BUF_SIZE 60
 static inline dma_addr_t rtl8139_addr64(uint32_t low, uint32_t high)
 {
-#if TARGET_PHYS_ADDR_BITS > 32
-    return low | ((target_phys_addr_t)high << 32);
-#else
-    return low;
-#endif
+    return low | ((uint64_t)high << 32);
 }
 
 /* Workaround for buggy guest driver such as linux who allocates rx
diff --git a/monitor.c b/monitor.c
index 67064e270..7beac9a80 100644
--- a/monitor.c
+++ b/monitor.c
@@ -3259,11 +3259,7 @@ static int64_t expr_unary(Monitor *mon)
         break;
     default:
         errno = 0;
-#if TARGET_PHYS_ADDR_BITS > 32
         n = strtoull(pch, &p, 0);
-#else
-        n = strtoul(pch, &p, 0);
-#endif
         if (errno == ERANGE) {
             expr_error(mon, "number too large");
         }
diff --git a/target-ppc/mmu_helper.c b/target-ppc/mmu_helper.c
index d2664acef..532b114ae 100644
--- a/target-ppc/mmu_helper.c
+++ b/target-ppc/mmu_helper.c
@@ -1032,12 +1032,10 @@ static int ppcemb_tlb_check(CPUPPCState *env, ppcemb_tlb_t *tlb,
         return -1;
     }
     *raddrp = (tlb->RPN & mask) | (address & ~mask);
-#if (TARGET_PHYS_ADDR_BITS >= 36)
     if (ext) {
         /* Extend the physical address to 36 bits */
-        *raddrp |= (target_phys_addr_t)(tlb->RPN & 0xF) << 32;
+        *raddrp |= (uint64_t)(tlb->RPN & 0xF) << 32;
     }
-#endif
 
     return 0;
 }
diff --git a/targphys.h b/targphys.h
index bd4938fc0..08cade909 100644
--- a/targphys.h
+++ b/targphys.h
@@ -3,25 +3,10 @@
 #ifndef TARGPHYS_H
 #define TARGPHYS_H
 
-#ifdef TARGET_PHYS_ADDR_BITS
+#define TARGET_PHYS_ADDR_BITS 64
 /* target_phys_addr_t is the type of a physical address (its size can
    be different from 'target_ulong').  */
 
-#if TARGET_PHYS_ADDR_BITS == 32
-typedef uint32_t target_phys_addr_t;
-#define TARGET_PHYS_ADDR_MAX UINT32_MAX
-#define TARGET_FMT_plx "%08x"
-/* Format strings for printing target_phys_addr_t types.
- * These are recommended over the less flexible TARGET_FMT_plx,
- * which is retained for the benefit of existing code.
- */
-#define TARGET_PRIdPHYS PRId32
-#define TARGET_PRIiPHYS PRIi32
-#define TARGET_PRIoPHYS PRIo32
-#define TARGET_PRIuPHYS PRIu32
-#define TARGET_PRIxPHYS PRIx32
-#define TARGET_PRIXPHYS PRIX32
-#elif TARGET_PHYS_ADDR_BITS == 64
 typedef uint64_t target_phys_addr_t;
 #define TARGET_PHYS_ADDR_MAX UINT64_MAX
 #define TARGET_FMT_plx "%016" PRIx64
@@ -31,7 +16,5 @@ typedef uint64_t target_phys_addr_t;
 #define TARGET_PRIuPHYS PRIu64
 #define TARGET_PRIxPHYS PRIx64
 #define TARGET_PRIXPHYS PRIX64
-#endif
-#endif
 
 #endif
-- 
cgit v1.2.3


From a5cf8262e4eb9c4646434e2c6211ef8608db3233 Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@redhat.com>
Date: Thu, 4 Oct 2012 13:09:44 +0200
Subject: scsi, pci, qdev, isa-bus, sysbus: don't let *_get_fw_dev_path return
 NULL

Use g_strdup rather than strdup, because the sole caller
(qdev_get_fw_dev_path_helper) assumes it gets non-NULL, and dereferences
it.  Besides, in that caller, the allocated buffer is already freed with
g_free, so it's better to allocate with a matching g_strdup.

In one case, (scsi-bus.c) it was trivial, so I replaced an snprintf+
g_strdup combination with an equivalent g_strdup_printf use.

Signed-off-by: Jim Meyering <meyering@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 hw/ide/qdev.c | 2 +-
 hw/isa-bus.c  | 2 +-
 hw/pci.c      | 2 +-
 hw/qdev.c     | 2 +-
 hw/scsi-bus.c | 8 ++------
 hw/sysbus.c   | 2 +-
 6 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/hw/ide/qdev.c b/hw/ide/qdev.c
index 5ea9b8f4b..f2e4ea420 100644
--- a/hw/ide/qdev.c
+++ b/hw/ide/qdev.c
@@ -60,7 +60,7 @@ static char *idebus_get_fw_dev_path(DeviceState *dev)
     snprintf(path, sizeof(path), "%s@%d", qdev_fw_name(dev),
              ((IDEBus*)dev->parent_bus)->bus_id);
 
-    return strdup(path);
+    return g_strdup(path);
 }
 
 static int ide_qdev_init(DeviceState *qdev)
diff --git a/hw/isa-bus.c b/hw/isa-bus.c
index f9b237387..47c93d37b 100644
--- a/hw/isa-bus.c
+++ b/hw/isa-bus.c
@@ -236,7 +236,7 @@ static char *isabus_get_fw_dev_path(DeviceState *dev)
         snprintf(path + off, sizeof(path) - off, "@%04x", d->ioport_id);
     }
 
-    return strdup(path);
+    return g_strdup(path);
 }
 
 MemoryRegion *isa_address_space(ISADevice *dev)
diff --git a/hw/pci.c b/hw/pci.c
index f855cf3f3..de4b4485e 100644
--- a/hw/pci.c
+++ b/hw/pci.c
@@ -1962,7 +1962,7 @@ static char *pcibus_get_fw_dev_path(DeviceState *dev)
                    PCI_SLOT(d->devfn));
     if (PCI_FUNC(d->devfn))
         snprintf(path + off, sizeof(path) + off, ",%x", PCI_FUNC(d->devfn));
-    return strdup(path);
+    return g_strdup(path);
 }
 
 static char *pcibus_get_dev_path(DeviceState *dev)
diff --git a/hw/qdev.c b/hw/qdev.c
index b5a52ac50..3b5ce3312 100644
--- a/hw/qdev.c
+++ b/hw/qdev.c
@@ -520,7 +520,7 @@ char* qdev_get_fw_dev_path(DeviceState *dev)
 
     path[l-1] = '\0';
 
-    return strdup(path);
+    return g_strdup(path);
 }
 
 char *qdev_get_dev_path(DeviceState *dev)
diff --git a/hw/scsi-bus.c b/hw/scsi-bus.c
index 058d3b237..dfb263121 100644
--- a/hw/scsi-bus.c
+++ b/hw/scsi-bus.c
@@ -1723,12 +1723,8 @@ static char *scsibus_get_dev_path(DeviceState *dev)
 static char *scsibus_get_fw_dev_path(DeviceState *dev)
 {
     SCSIDevice *d = SCSI_DEVICE(dev);
-    char path[100];
-
-    snprintf(path, sizeof(path), "channel@%x/%s@%x,%x", d->channel,
-             qdev_fw_name(dev), d->id, d->lun);
-
-    return strdup(path);
+    return g_strdup_printf("channel@%x/%s@%x,%x", d->channel,
+                           qdev_fw_name(dev), d->id, d->lun);
 }
 
 SCSIDevice *scsi_device_find(SCSIBus *bus, int channel, int id, int lun)
diff --git a/hw/sysbus.c b/hw/sysbus.c
index 9d8b1eaf7..c1738403d 100644
--- a/hw/sysbus.c
+++ b/hw/sysbus.c
@@ -211,7 +211,7 @@ static char *sysbus_get_fw_dev_path(DeviceState *dev)
         snprintf(path + off, sizeof(path) - off, "@i%04x", s->pio[0]);
     }
 
-    return strdup(path);
+    return g_strdup(path);
 }
 
 void sysbus_add_memory(SysBusDevice *dev, target_phys_addr_t addr,
-- 
cgit v1.2.3


From bfad67399bcca8c1afbbc93593d365044d92f7c6 Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@redhat.com>
Date: Thu, 4 Oct 2012 13:09:45 +0200
Subject: sparc: use g_strdup in place of unchecked strdup

This avoids a NULL-deref upon strdup failure.
Also update matching free to g_free.

Signed-off-by: Jim Meyering <meyering@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 target-sparc/cpu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/target-sparc/cpu.c b/target-sparc/cpu.c
index f7c004c7d..eb9f0e725 100644
--- a/target-sparc/cpu.c
+++ b/target-sparc/cpu.c
@@ -643,7 +643,7 @@ static int cpu_sparc_find_by_name(sparc_def_t *cpu_def, const char *cpu_model)
 {
     unsigned int i;
     const sparc_def_t *def = NULL;
-    char *s = strdup(cpu_model);
+    char *s = g_strdup(cpu_model);
     char *featurestr, *name = strtok(s, ",");
     uint32_t plus_features = 0;
     uint32_t minus_features = 0;
@@ -735,7 +735,7 @@ static int cpu_sparc_find_by_name(sparc_def_t *cpu_def, const char *cpu_model)
 #ifdef DEBUG_FEATURES
     print_features(stderr, fprintf, cpu_def->features, NULL);
 #endif
-    free(s);
+    g_free(s);
     return 0;
 
  error:
-- 
cgit v1.2.3


From c2cba3d9314f972dfaf724d0ec2d018eb54c95f1 Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@redhat.com>
Date: Thu, 4 Oct 2012 13:09:46 +0200
Subject: block: avoid buffer overrun by using pstrcpy, not strncpy

Also, use PATH_MAX, rather than the arbitrary 1024.
Using PATH_MAX is more consistent with other filename-related
variables in this file, like backing_filename and tmp_filename.

Acked-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Jim Meyering <meyering@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 block.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/block.c b/block.c
index c108a7695..e95f613aa 100644
--- a/block.c
+++ b/block.c
@@ -1506,7 +1506,7 @@ int bdrv_commit(BlockDriverState *bs)
     int n, ro, open_flags;
     int ret = 0;
     uint8_t *buf;
-    char filename[1024];
+    char filename[PATH_MAX];
 
     if (!drv)
         return -ENOMEDIUM;
@@ -1520,7 +1520,8 @@ int bdrv_commit(BlockDriverState *bs)
     }
 
     ro = bs->backing_hd->read_only;
-    strncpy(filename, bs->backing_hd->filename, sizeof(filename));
+    /* Use pstrcpy (not strncpy): filename must be NUL-terminated. */
+    pstrcpy(filename, sizeof(filename), bs->backing_hd->filename);
     open_flags =  bs->backing_hd->open_flags;
 
     if (ro) {
-- 
cgit v1.2.3


From 3178e2755ec5a7fb1afe583fb6ac2622c2c42184 Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@redhat.com>
Date: Thu, 4 Oct 2012 13:09:47 +0200
Subject: sheepdog: avoid a few buffer overruns

* parse_vdiname: Use pstrcpy, not strncpy, when the destination
buffer must be NUL-terminated.
* sd_open: Likewise, avoid buffer overrun.
* do_sd_create: Likewise.  Leave the preceding memset, since
pstrcpy does not NUL-fill, and filename needs that.
* sd_snapshot_create: Add a comment/question.
* find_vdi_name: Remove a useless memset.
* sd_snapshot_goto: Remove a useless memset.
Use pstrcpy to NUL-terminate, because find_vdi_name requires
that its vdi arg (filename parameter) be NUL-terminated.
It seems ok not to NUL-fill the buffer.
Do the same for snapid: remove useless memset-0 (instead,
zero tag[0]).  Use pstrcpy, not strncpy.
* sd_snapshot_list: Use pstrcpy, not strncpy to write
into the ->name member.  Each must be NUL-terminated.

Acked-by: Kevin Wolf <kwolf@redhat.com>
Acked-by: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
Signed-off-by: Jim Meyering <meyering@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 block/sheepdog.c | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 4742f8ae6..f35ff5bbe 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -866,14 +866,14 @@ static int parse_vdiname(BDRVSheepdogState *s, const char *filename,
         s->port = 0;
     }
 
-    strncpy(vdi, p, SD_MAX_VDI_LEN);
+    pstrcpy(vdi, SD_MAX_VDI_LEN, p);
 
     p = strchr(vdi, ':');
     if (p) {
         *p++ = '\0';
         *snapid = strtoul(p, NULL, 10);
         if (*snapid == 0) {
-            strncpy(tag, p, SD_MAX_VDI_TAG_LEN);
+            pstrcpy(tag, SD_MAX_VDI_TAG_LEN, p);
         }
     } else {
         *snapid = CURRENT_VDI_ID; /* search current vdi */
@@ -900,7 +900,10 @@ static int find_vdi_name(BDRVSheepdogState *s, char *filename, uint32_t snapid,
         return fd;
     }
 
-    memset(buf, 0, sizeof(buf));
+    /* This pair of strncpy calls ensures that the buffer is zero-filled,
+     * which is desirable since we'll soon be sending those bytes, and
+     * don't want the send_req to read uninitialized data.
+     */
     strncpy(buf, filename, SD_MAX_VDI_LEN);
     strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN);
 
@@ -1149,7 +1152,7 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
     s->max_dirty_data_idx = 0;
 
     bs->total_sectors = s->inode.vdi_size / SECTOR_SIZE;
-    strncpy(s->name, vdi, sizeof(s->name));
+    pstrcpy(s->name, sizeof(s->name), vdi);
     qemu_co_mutex_init(&s->lock);
     g_free(buf);
     return 0;
@@ -1177,8 +1180,11 @@ static int do_sd_create(char *filename, int64_t vdi_size,
         return fd;
     }
 
+    /* FIXME: would it be better to fail (e.g., return -EIO) when filename
+     * does not fit in buf?  For now, just truncate and avoid buffer overrun.
+     */
     memset(buf, 0, sizeof(buf));
-    strncpy(buf, filename, SD_MAX_VDI_LEN);
+    pstrcpy(buf, sizeof(buf), filename);
 
     memset(&hdr, 0, sizeof(hdr));
     hdr.opcode = SD_OP_NEW_VDI;
@@ -1752,6 +1758,9 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
 
     s->inode.vm_state_size = sn_info->vm_state_size;
     s->inode.vm_clock_nsec = sn_info->vm_clock_nsec;
+    /* It appears that inode.tag does not require a NUL terminator,
+     * which means this use of strncpy is ok.
+     */
     strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag));
     /* we don't need to update entire object */
     datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
@@ -1811,13 +1820,13 @@ static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
 
     memcpy(old_s, s, sizeof(BDRVSheepdogState));
 
-    memset(vdi, 0, sizeof(vdi));
-    strncpy(vdi, s->name, sizeof(vdi));
+    pstrcpy(vdi, sizeof(vdi), s->name);
 
-    memset(tag, 0, sizeof(tag));
     snapid = strtoul(snapshot_id, NULL, 10);
-    if (!snapid) {
-        strncpy(tag, s->name, sizeof(tag));
+    if (snapid) {
+        tag[0] = 0;
+    } else {
+        pstrcpy(tag, sizeof(tag), s->name);
     }
 
     ret = find_vdi_name(s, vdi, snapid, tag, &vid, 1);
@@ -1946,8 +1955,9 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
 
             snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), "%u",
                      inode.snap_id);
-            strncpy(sn_tab[found].name, inode.tag,
-                    MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)));
+            pstrcpy(sn_tab[found].name,
+                    MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)),
+                    inode.tag);
             found++;
         }
     }
-- 
cgit v1.2.3


From d66f8e7bd3de4a2ecf0680c635f870f2138425b8 Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@redhat.com>
Date: Thu, 4 Oct 2012 13:09:48 +0200
Subject: vmdk: relative_path: use pstrcpy in place of strncpy

Avoid strncpy+manual-NUL-terminate.  Use pstrcpy instead.

Acked-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Jim Meyering <meyering@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 block/vmdk.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/block/vmdk.c b/block/vmdk.c
index f2e861b07..1a80e5a24 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -1408,8 +1408,7 @@ static int relative_path(char *dest, int dest_size,
         return -1;
     }
     if (path_is_absolute(target)) {
-        dest[dest_size - 1] = '\0';
-        strncpy(dest, target, dest_size - 1);
+        pstrcpy(dest, dest_size, target);
         return 0;
     }
     while (base[i] == target[i]) {
-- 
cgit v1.2.3


From a79b5f8b80890b402fdb0733b0a073695a7875b5 Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@redhat.com>
Date: Thu, 4 Oct 2012 13:09:49 +0200
Subject: hw/9pfs: avoid buffer overrun

v9fs_add_dir_node and qemu_v9fs_synth_add_file used strncpy
to form node->name, which requires NUL-termination, but
strncpy does not ensure NUL-termination.
Use pstrcpy, which does.

Acked-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Jim Meyering <meyering@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 hw/9pfs/virtio-9p-synth.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/9pfs/virtio-9p-synth.c b/hw/9pfs/virtio-9p-synth.c
index 92e0b09d3..e95a856d2 100644
--- a/hw/9pfs/virtio-9p-synth.c
+++ b/hw/9pfs/virtio-9p-synth.c
@@ -58,7 +58,7 @@ static V9fsSynthNode *v9fs_add_dir_node(V9fsSynthNode *parent, int mode,
         node->attr->read  = NULL;
     }
     node->private = node;
-    strncpy(node->name, name, sizeof(node->name));
+    pstrcpy(node->name, sizeof(node->name), name);
     QLIST_INSERT_HEAD_RCU(&parent->child, node, sibling);
     return node;
 }
@@ -132,7 +132,7 @@ int qemu_v9fs_synth_add_file(V9fsSynthNode *parent, int mode,
     node->attr->write  = write;
     node->attr->mode   = mode;
     node->private      = arg;
-    strncpy(node->name, name, sizeof(node->name));
+    pstrcpy(node->name, sizeof(node->name), name);
     QLIST_INSERT_HEAD_RCU(&parent->child, node, sibling);
     ret = 0;
 err_out:
-- 
cgit v1.2.3


From 1044dc1118d9a90e2aa324047bea9c91c889e28f Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@redhat.com>
Date: Thu, 4 Oct 2012 13:09:50 +0200
Subject: lm32: avoid buffer overrun

Actually do what the comment says: use pstrcpy to NUL-terminate the
copy, since strncpy does not always do that.

Signed-off-by: Jim Meyering <meyering@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 hw/lm32_hwsetup.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/lm32_hwsetup.h b/hw/lm32_hwsetup.h
index 8fc285efc..70dc61f9e 100644
--- a/hw/lm32_hwsetup.h
+++ b/hw/lm32_hwsetup.h
@@ -96,7 +96,7 @@ static inline void hwsetup_add_tag(HWSetup *hw, enum hwsetup_tag t)
 
 static inline void hwsetup_add_str(HWSetup *hw, const char *str)
 {
-    strncpy(hw->ptr, str, 31); /* make sure last byte is zero */
+    pstrcpy(hw->ptr, 32, str);
     hw->ptr += 32;
 }
 
-- 
cgit v1.2.3


From 3eadc68ebd174f5bad51fe6e0bbcf6d6651c784c Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@redhat.com>
Date: Thu, 4 Oct 2012 13:09:51 +0200
Subject: os-posix: avoid buffer overrun

os_set_proc_name: Use pstrcpy, in place of strncpy and the
ineffectual preceding assignment: name[sizeof(name) - 1] = 0;

Signed-off-by: Jim Meyering <meyering@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 os-posix.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/os-posix.c b/os-posix.c
index eabccb8fe..f855abb09 100644
--- a/os-posix.c
+++ b/os-posix.c
@@ -148,8 +148,7 @@ void os_set_proc_name(const char *s)
     char name[16];
     if (!s)
         return;
-    name[sizeof(name) - 1] = 0;
-    strncpy(name, s, sizeof(name));
+    pstrcpy(name, sizeof(name), s);
     /* Could rewrite argv[0] too, but that's a bit more complicated.
        This simple way is enough for `top'. */
     if (prctl(PR_SET_NAME, name)) {
-- 
cgit v1.2.3


From ae2150680190e510dcbcdfdbfb3a54369c75367f Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@redhat.com>
Date: Thu, 4 Oct 2012 13:09:52 +0200
Subject: ppc: avoid buffer overrun: use pstrcpy, not strncpy

A terminal NUL is required by caller's use of strchr.
It's better not to use strncpy at all, since there is no need
to zero out hundreds of trailing bytes for each iteration.

Signed-off-by: Jim Meyering <meyering@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 target-ppc/kvm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
index a31d278a5..7f6e4e0b8 100644
--- a/target-ppc/kvm.c
+++ b/target-ppc/kvm.c
@@ -795,7 +795,7 @@ static int read_cpuinfo(const char *field, char *value, int len)
             break;
         }
         if (!strncmp(line, field, field_len)) {
-            strncpy(value, line, len);
+            pstrcpy(value, len, line);
             ret = 0;
             break;
         }
-- 
cgit v1.2.3


From 900cfbcac6fa689b5fc8d53b60c3ed39047b8a33 Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@redhat.com>
Date: Thu, 4 Oct 2012 13:09:53 +0200
Subject: linux-user: remove two unchecked uses of strdup

Remove two uses of strdup (use g_path_get_basename instead),
and add a comment that this strncpy use is ok.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Jim Meyering <meyering@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 linux-user/elfload.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/linux-user/elfload.c b/linux-user/elfload.c
index 819fdd515..1d8bcb4e7 100644
--- a/linux-user/elfload.c
+++ b/linux-user/elfload.c
@@ -2442,7 +2442,7 @@ static void fill_prstatus(struct target_elf_prstatus *prstatus,
 
 static int fill_psinfo(struct target_elf_prpsinfo *psinfo, const TaskState *ts)
 {
-    char *filename, *base_filename;
+    char *base_filename;
     unsigned int i, len;
 
     (void) memset(psinfo, 0, sizeof (*psinfo));
@@ -2464,13 +2464,15 @@ static int fill_psinfo(struct target_elf_prpsinfo *psinfo, const TaskState *ts)
     psinfo->pr_uid = getuid();
     psinfo->pr_gid = getgid();
 
-    filename = strdup(ts->bprm->filename);
-    base_filename = strdup(basename(filename));
+    base_filename = g_path_get_basename(ts->bprm->filename);
+    /*
+     * Using strncpy here is fine: at max-length,
+     * this field is not NUL-terminated.
+     */
     (void) strncpy(psinfo->pr_fname, base_filename,
                    sizeof(psinfo->pr_fname));
-    free(base_filename);
-    free(filename);
 
+    g_free(base_filename);
     bswap_psinfo(psinfo);
     return (0);
 }
-- 
cgit v1.2.3


From 5847d9e1399d3497be8eeca6f3a20a18a40b114b Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@redhat.com>
Date: Thu, 4 Oct 2012 13:09:54 +0200
Subject: ui/vnc: simplify and avoid strncpy

Don't bother with strncpy.  There's no need for its zero-fill.
Use g_strndup in place of g_malloc+strncpy+NUL-terminate.

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Jim Meyering <meyering@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 ui/vnc-auth-sasl.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/ui/vnc-auth-sasl.c b/ui/vnc-auth-sasl.c
index 8fba7702c..bfdcb46ef 100644
--- a/ui/vnc-auth-sasl.c
+++ b/ui/vnc-auth-sasl.c
@@ -432,9 +432,7 @@ static int protocol_client_auth_sasl_start_len(VncState *vs, uint8_t *data, size
 
 static int protocol_client_auth_sasl_mechname(VncState *vs, uint8_t *data, size_t len)
 {
-    char *mechname = g_malloc(len + 1);
-    strncpy(mechname, (char*)data, len);
-    mechname[len] = '\0';
+    char *mechname = g_strndup((const char *) data, len);
     VNC_DEBUG("Got client mechname '%s' check against '%s'\n",
               mechname, vs->sasl.mechlist);
 
-- 
cgit v1.2.3


From e5fda03839e3c61b01d6c60de5625501d01c69d0 Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@redhat.com>
Date: Thu, 4 Oct 2012 13:09:55 +0200
Subject: bt: replace fragile snprintf use and unwarranted strncpy

In bt_hci_name_req a failed snprintf could return len larger than
sizeof(params.name), which means the following memset call would
have a "length" value of (size_t)-1, -2, etc...  Sounds scary.
But currently, one can deduce that there is no problem:
strlen(slave->lmp_name) is guaranteed to be smaller than
CHANGE_LOCAL_NAME_CP_SIZE, which is the same as sizeof(params.name),
so this cannot happen.  Regardless, there is no justification for
using snprintf+memset.  Use pstrcpy instead.

Also, in bt_hci_event_complete_read_local_name, use pstrcpy in place
of unwarranted strncpy.

Signed-off-by: Jim Meyering <meyering@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 hw/bt-hci.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/hw/bt-hci.c b/hw/bt-hci.c
index a3a7fb49e..47f9a4e49 100644
--- a/hw/bt-hci.c
+++ b/hw/bt-hci.c
@@ -943,7 +943,6 @@ static int bt_hci_name_req(struct bt_hci_s *hci, bdaddr_t *bdaddr)
 {
     struct bt_device_s *slave;
     evt_remote_name_req_complete params;
-    int len;
 
     for (slave = hci->device.net->slave; slave; slave = slave->next)
         if (slave->page_scan && !bacmp(&slave->bd_addr, bdaddr))
@@ -955,9 +954,7 @@ static int bt_hci_name_req(struct bt_hci_s *hci, bdaddr_t *bdaddr)
 
     params.status       = HCI_SUCCESS;
     bacpy(&params.bdaddr, &slave->bd_addr);
-    len = snprintf(params.name, sizeof(params.name),
-                    "%s", slave->lmp_name ?: "");
-    memset(params.name + len, 0, sizeof(params.name) - len);
+    pstrcpy(params.name, sizeof(params.name), slave->lmp_name ?: "");
     bt_hci_event(hci, EVT_REMOTE_NAME_REQ_COMPLETE,
                     &params, EVT_REMOTE_NAME_REQ_COMPLETE_SIZE);
 
@@ -1388,7 +1385,7 @@ static inline void bt_hci_event_complete_read_local_name(struct bt_hci_s *hci)
     params.status = HCI_SUCCESS;
     memset(params.name, 0, sizeof(params.name));
     if (hci->device.lmp_name)
-        strncpy(params.name, hci->device.lmp_name, sizeof(params.name));
+        pstrcpy(params.name, sizeof(params.name), hci->device.lmp_name);
 
     bt_hci_event_complete(hci, &params, READ_LOCAL_NAME_RP_SIZE);
 }
-- 
cgit v1.2.3


From 9238c2099d37748a4e2cbbe709ed1ebffa6f3c8b Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@redhat.com>
Date: Thu, 4 Oct 2012 13:09:56 +0200
Subject: virtio-9p: avoid unwarranted uses of strncpy

In all of these cases, the uses of strncpy were unnecessary, since
at each point of use we know that the NUL-terminated source bytes
fit in the destination buffer.  Use memcpy in place of strncpy.

Acked-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Jim Meyering <meyering@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 hw/9pfs/virtio-9p-posix-acl.c  | 6 ++++--
 hw/9pfs/virtio-9p-xattr-user.c | 3 ++-
 hw/9pfs/virtio-9p-xattr.c      | 3 ++-
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/hw/9pfs/virtio-9p-posix-acl.c b/hw/9pfs/virtio-9p-posix-acl.c
index a1948e3af..c064017b1 100644
--- a/hw/9pfs/virtio-9p-posix-acl.c
+++ b/hw/9pfs/virtio-9p-posix-acl.c
@@ -44,7 +44,8 @@ static ssize_t mp_pacl_listxattr(FsContext *ctx, const char *path,
         return -1;
     }
 
-    strncpy(value, ACL_ACCESS, len);
+    /* len includes the trailing NUL */
+    memcpy(value, ACL_ACCESS, len);
     return 0;
 }
 
@@ -95,7 +96,8 @@ static ssize_t mp_dacl_listxattr(FsContext *ctx, const char *path,
         return -1;
     }
 
-    strncpy(value, ACL_DEFAULT, len);
+    /* len includes the trailing NUL */
+    memcpy(value, ACL_DEFAULT, len);
     return 0;
 }
 
diff --git a/hw/9pfs/virtio-9p-xattr-user.c b/hw/9pfs/virtio-9p-xattr-user.c
index 5044a3e5a..5bb602007 100644
--- a/hw/9pfs/virtio-9p-xattr-user.c
+++ b/hw/9pfs/virtio-9p-xattr-user.c
@@ -61,7 +61,8 @@ static ssize_t mp_user_listxattr(FsContext *ctx, const char *path,
         return -1;
     }
 
-    strncpy(value, name, name_size);
+    /* name_size includes the trailing NUL. */
+    memcpy(value, name, name_size);
     return name_size;
 }
 
diff --git a/hw/9pfs/virtio-9p-xattr.c b/hw/9pfs/virtio-9p-xattr.c
index 7f08f6e17..a83960676 100644
--- a/hw/9pfs/virtio-9p-xattr.c
+++ b/hw/9pfs/virtio-9p-xattr.c
@@ -53,7 +53,8 @@ ssize_t pt_listxattr(FsContext *ctx, const char *path,
         return -1;
     }
 
-    strncpy(value, name, name_size);
+    /* no need for strncpy: name_size is strlen(name)+1 */
+    memcpy(value, name, name_size);
     return name_size;
 }
 
-- 
cgit v1.2.3


From 9d055d8ac83cfd590263e8862ff683f705dfdf56 Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@redhat.com>
Date: Thu, 4 Oct 2012 13:09:57 +0200
Subject: vscsi: avoid unwarranted strncpy

Don't use strncpy when the source string is known to fit
in the destination buffer.  Use equivalent memcpy.
We could even use strcpy, here, but some static analyzers
warn about that, so don't add new uses.

Acked-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Jim Meyering <meyering@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 hw/spapr_vscsi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/spapr_vscsi.c b/hw/spapr_vscsi.c
index 3cf5844e0..e3d4b237f 100644
--- a/hw/spapr_vscsi.c
+++ b/hw/spapr_vscsi.c
@@ -737,7 +737,7 @@ static int vscsi_send_adapter_info(VSCSIState *s, vscsi_req *req)
 #endif
     memset(&info, 0, sizeof(info));
     strcpy(info.srp_version, SRP_VERSION);
-    strncpy(info.partition_name, "qemu", sizeof("qemu"));
+    memcpy(info.partition_name, "qemu", sizeof("qemu"));
     info.partition_number = cpu_to_be32(0);
     info.mad_version = cpu_to_be32(1);
     info.os_type = cpu_to_be32(2);
-- 
cgit v1.2.3


From 1ab516ed9b6ba00bafc5ca37604f8af4680323ca Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@redhat.com>
Date: Thu, 4 Oct 2012 13:09:58 +0200
Subject: qemu-ga: prefer pstrcpy: consistently NUL-terminate ifreq.ifr_name

NUL-termination of the .ifr_name field is not required, but is fine
(and preferable to using strncpy and leaving the reader to wonder),
since the first thing the linux kernel does is to clear the last byte.
Besides, using pstrcpy here makes this setting of ifr_name consistent
with the other code (e.g., net/tap-linux.c) that does the same thing.

Reviewed-by: Luiz Capitulino <lcapitulino@redhat.com>
Signed-off-by: Jim Meyering <meyering@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 qga/commands-posix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index ce9042123..b9f357cb9 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -828,7 +828,7 @@ GuestNetworkInterfaceList *qmp_guest_network_get_interfaces(Error **errp)
             }
 
             memset(&ifr, 0, sizeof(ifr));
-            strncpy(ifr.ifr_name,  info->value->name, IF_NAMESIZE);
+            pstrcpy(ifr.ifr_name, IF_NAMESIZE, info->value->name);
             if (ioctl(sock, SIOCGIFHWADDR, &ifr) == -1) {
                 snprintf(err_msg, sizeof(err_msg),
                          "failed to get MAC address of %s: %s",
-- 
cgit v1.2.3


From 2e679780ae86c6ca8bc81efe0a376a0b99b09b8f Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@redhat.com>
Date: Thu, 4 Oct 2012 13:09:59 +0200
Subject: libcacard/vcard_emul_nss: use pstrcpy in place of strncpy

Replace strncpy+NUL-terminate use with use of pstrcpy.
This requires linking with cutils.o (or else vscclient doesn't link),
so add that in the Makefile.

Acked-by: Alon Levy <alevy@redhat.com>
Signed-off-by: Jim Meyering <meyering@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 libcacard/Makefile         | 3 +++
 libcacard/vcard_emul_nss.c | 3 +--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/libcacard/Makefile b/libcacard/Makefile
index 63990b700..487f43489 100644
--- a/libcacard/Makefile
+++ b/libcacard/Makefile
@@ -14,6 +14,9 @@ QEMU_CFLAGS+=-I../
 
 libcacard.lib-y=$(patsubst %.o,%.lo,$(libcacard-y))
 
+vscclient: $(libcacard-y) $(QEMU_OBJS) vscclient.o cutils.o
+	$(call quiet-command,$(CC) -o $@ $^ $(libcacard_libs) $(LIBS),"  LINK  $@")
+
 clean:
 	rm -f *.o */*.o *.d */*.d *.a */*.a *~ */*~ vscclient *.lo */*.lo .libs/* */.libs/* *.la */*.la *.pc
 	rm -Rf .libs */.libs
diff --git a/libcacard/vcard_emul_nss.c b/libcacard/vcard_emul_nss.c
index 802cae3a2..e1cae5bc5 100644
--- a/libcacard/vcard_emul_nss.c
+++ b/libcacard/vcard_emul_nss.c
@@ -1169,8 +1169,7 @@ vcard_emul_options(const char *args)
             NEXT_TOKEN(vname)
             NEXT_TOKEN(type_params)
             type_params_length = MIN(type_params_length, sizeof(type_str)-1);
-            strncpy(type_str, type_params, type_params_length);
-            type_str[type_params_length] = 0;
+            pstrcpy(type_str, type_params_length + 1, type_params);
             type = vcard_emul_type_from_string(type_str);
 
             NEXT_TOKEN(type_params)
-- 
cgit v1.2.3


From 3cda346269784c234c7a296ff6851f36a1a9189d Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@redhat.com>
Date: Thu, 4 Oct 2012 13:10:00 +0200
Subject: acpi: remove strzcpy (strncpy-identical) function; just use strncpy

Adjust all uses s/strzcpy/strncpy/ and mark these uses
of strncpy as "ok".

Signed-off-by: Jim Meyering <meyering@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 hw/acpi.c | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/hw/acpi.c b/hw/acpi.c
index f7950be26..f4aca493f 100644
--- a/hw/acpi.c
+++ b/hw/acpi.c
@@ -61,18 +61,6 @@ static int acpi_checksum(const uint8_t *data, int len)
     return (-sum) & 0xff;
 }
 
-/* like strncpy() but zero-fills the tail of destination */
-static void strzcpy(char *dst, const char *src, size_t size)
-{
-    size_t len = strlen(src);
-    if (len >= size) {
-        len = size;
-    } else {
-      memset(dst + len, 0, size - len);
-    }
-    memcpy(dst, src, len);
-}
-
 /* XXX fixme: this function uses obsolete argument parsing interface */
 int acpi_table_add(const char *t)
 {
@@ -157,7 +145,8 @@ int acpi_table_add(const char *t)
     hdr._length = cpu_to_le16(len);
 
     if (get_param_value(buf, sizeof(buf), "sig", t)) {
-        strzcpy(hdr.sig, buf, sizeof(hdr.sig));
+        /* strncpy is justified: the field need not be NUL-terminated. */
+        strncpy(hdr.sig, buf, sizeof(hdr.sig));
         ++changed;
     }
 
@@ -187,12 +176,14 @@ int acpi_table_add(const char *t)
     }
 
     if (get_param_value(buf, sizeof(buf), "oem_id", t)) {
-        strzcpy(hdr.oem_id, buf, sizeof(hdr.oem_id));
+        /* strncpy is justified: the field need not be NUL-terminated. */
+        strncpy(hdr.oem_id, buf, sizeof(hdr.oem_id));
         ++changed;
     }
 
     if (get_param_value(buf, sizeof(buf), "oem_table_id", t)) {
-        strzcpy(hdr.oem_table_id, buf, sizeof(hdr.oem_table_id));
+        /* strncpy is justified: the field need not be NUL-terminated. */
+        strncpy(hdr.oem_table_id, buf, sizeof(hdr.oem_table_id));
         ++changed;
     }
 
@@ -207,7 +198,8 @@ int acpi_table_add(const char *t)
     }
 
     if (get_param_value(buf, sizeof(buf), "asl_compiler_id", t)) {
-        strzcpy(hdr.asl_compiler_id, buf, sizeof(hdr.asl_compiler_id));
+        /* strncpy is justified: the field need not be NUL-terminated. */
+        strncpy(hdr.asl_compiler_id, buf, sizeof(hdr.asl_compiler_id));
         ++changed;
     }
 
-- 
cgit v1.2.3


From 00ea188125f6ee33e6beaff5da878fa9478e6a0d Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@redhat.com>
Date: Thu, 4 Oct 2012 13:10:01 +0200
Subject: qcow2: mark this file's sole strncpy use as justified

Acked-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Jim Meyering <meyering@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 block/qcow2.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/qcow2.c b/block/qcow2.c
index aa5e603cd..c1ff31f48 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1096,6 +1096,7 @@ int qcow2_update_header(BlockDriverState *bs)
             goto fail;
         }
 
+        /* Using strncpy is ok here, since buf is not NUL-terminated. */
         strncpy(buf, bs->backing_file, buflen);
 
         header->backing_file_offset = cpu_to_be64(buf - ((char*) header));
-- 
cgit v1.2.3


From 9310b9be14f73d4c1e98bfa315fe84326ad9e8e7 Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@redhat.com>
Date: Thu, 4 Oct 2012 13:10:02 +0200
Subject: hw/r2d: add comment: this strncpy use is ok

Signed-off-by: Jim Meyering <meyering@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 hw/r2d.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hw/r2d.c b/hw/r2d.c
index 0f16e81af..1bc191ff3 100644
--- a/hw/r2d.c
+++ b/hw/r2d.c
@@ -332,6 +332,8 @@ static void r2d_init(ram_addr_t ram_size,
     }
 
     if (kernel_cmdline) {
+        /* I see no evidence that this .kernel_cmdline buffer requires
+           NUL-termination, so using strncpy should be ok. */
         strncpy(boot_params.kernel_cmdline, kernel_cmdline,
                 sizeof(boot_params.kernel_cmdline));
     }
-- 
cgit v1.2.3


From 9b9e3ec1b47f615f635055924e24705cc3f45b20 Mon Sep 17 00:00:00 2001
From: Jim Meyering <meyering@redhat.com>
Date: Thu, 4 Oct 2012 13:10:03 +0200
Subject: doc: update HACKING wrt strncpy/pstrcpy

Reword the section on strncpy: its NUL-filling is important
in some cases.  Mention that pstrcpy's signature is different.

Signed-off-by: Jim Meyering <meyering@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 HACKING | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/HACKING b/HACKING
index 471cf1d19..dddd617a6 100644
--- a/HACKING
+++ b/HACKING
@@ -91,10 +91,11 @@ emulators.
 
 4. String manipulation
 
-Do not use the strncpy function.  According to the man page, it does
-*not* guarantee a NULL-terminated buffer, which makes it extremely dangerous
-to use.  Instead, use functionally equivalent function:
-void pstrcpy(char *buf, int buf_size, const char *str)
+Do not use the strncpy function.  As mentioned in the man page, it does *not*
+guarantee a NUL-terminated buffer, which makes it extremely dangerous to use.
+It also zeros trailing destination bytes out to the specified length.  Instead,
+use this similar function when possible, but note its different signature:
+void pstrcpy(char *dest, int dest_buf_size, const char *src)
 
 Don't use strcat because it can't check for buffer overflows, but:
 char *pstrcat(char *buf, int buf_size, const char *s)
-- 
cgit v1.2.3


From 610b823ef66b993660f1ab1447a769f190e4f3b3 Mon Sep 17 00:00:00 2001
From: Stefan Weil <sw@weilnetz.de>
Date: Wed, 3 Oct 2012 23:11:02 +0200
Subject: qemu-barrier: Fix compiler version check for future gcc versions

The current check will give a wrong result for gcc-5.x with x < 4.
Using QEMU_GNUC_PREREQ is simpler and fixes that issue.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 qemu-barrier.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qemu-barrier.h b/qemu-barrier.h
index 7e1119781..16f09429c 100644
--- a/qemu-barrier.h
+++ b/qemu-barrier.h
@@ -19,7 +19,7 @@
  * mfence on 32 bit as well, e.g. if built with -march=pentium-m.
  * However, on i386, there seem to be known bugs as recently as 4.3.
  * */
-#if defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 4
+#if QEMU_GNUC_PREREQ(4, 4)
 #define smp_mb() __sync_synchronize()
 #else
 #define smp_mb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
-- 
cgit v1.2.3


From e0fea6b1e4df2067a51e08e67a17cb98a547287c Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 1 Oct 2012 14:18:07 +0200
Subject: qtest: implement QTEST_STOP

It is quite difficult to debug qtest test cases without extra wrapper
scripts for QEMU or similar.  This patch adds a simple environment
variable-based trigger that sends a STOP signal to the QEMU instance
under test, before attempting to connect to its QMP session.

This will block execution of the testcase and give time to attach a
debugger to the stopped QEMU process.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 tests/libqtest.c | 38 +++++++++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/tests/libqtest.c b/tests/libqtest.c
index 02d039218..71b84c12d 100644
--- a/tests/libqtest.c
+++ b/tests/libqtest.c
@@ -85,6 +85,22 @@ static int socket_accept(int sock)
     return ret;
 }
 
+static pid_t qtest_qemu_pid(QTestState *s)
+{
+    FILE *f;
+    char buffer[1024];
+    pid_t pid = -1;
+    /* Read the pid stored in s->pid_file; pid stays -1 if that fails. */
+    f = fopen(s->pid_file, "r");
+    if (f) {
+        if (fgets(buffer, sizeof(buffer), f)) {
+            pid = atoi(buffer);
+        }
+        fclose(f); /* f is non-NULL here; fclose(NULL) is undefined */
+    }
+    return pid;
+}
+
 QTestState *qtest_init(const char *extra_args)
 {
     QTestState *s;
@@ -136,25 +152,21 @@ QTestState *qtest_init(const char *extra_args)
     qtest_qmp(s, "");
     qtest_qmp(s, "{ 'execute': 'qmp_capabilities' }");
 
+    if (getenv("QTEST_STOP")) {
+        kill(qtest_qemu_pid(s), SIGSTOP);
+    }
+
     return s;
 }
 
 void qtest_quit(QTestState *s)
 {
-    FILE *f;
-    char buffer[1024];
-
-    f = fopen(s->pid_file, "r");
-    if (f) {
-        if (fgets(buffer, sizeof(buffer), f)) {
-            pid_t pid = atoi(buffer);
-            int status = 0;
-
-            kill(pid, SIGTERM);
-            waitpid(pid, &status, 0);
-        }
+    int status;
 
-        fclose(f);
+    pid_t pid = qtest_qemu_pid(s);
+    if (pid != -1) {
+        kill(pid, SIGTERM);
+        waitpid(pid, &status, 0);
     }
 
     unlink(s->pid_file);
-- 
cgit v1.2.3


From b6db4aca20e9af4f62c9c9e08b9b9672a6ed3390 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 1 Oct 2012 14:22:06 +0200
Subject: rtc: fix overflow in mktimegm

When setting a date in 1980, Linux is actually disregarding the century
byte and setting the year to 2080.  This causes a year-2038 overflow
in mktimegm.  Fix this by doing the days-to-seconds computation in
64-bit math.

Reported-by: Lucas Meneghel Rodrigues <lookkas@gmail.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 cutils.c         |  2 +-
 tests/rtc-test.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/cutils.c b/cutils.c
index 8ef648f4b..8edd8fa13 100644
--- a/cutils.c
+++ b/cutils.c
@@ -115,7 +115,7 @@ time_t mktimegm(struct tm *tm)
         m += 12;
         y--;
     }
-    t = 86400 * (d + (153 * m - 457) / 5 + 365 * y + y / 4 - y / 100 + 
+    t = 86400ULL * (d + (153 * m - 457) / 5 + 365 * y + y / 4 - y / 100 + 
                  y / 400 - 719469);
     t += 3600 * tm->tm_hour + 60 * tm->tm_min + tm->tm_sec;
     return t;
diff --git a/tests/rtc-test.c b/tests/rtc-test.c
index f23ac3a6c..2b9aa63c1 100644
--- a/tests/rtc-test.c
+++ b/tests/rtc-test.c
@@ -179,6 +179,50 @@ static void check_time(int wiggle)
 
 static int wiggle = 2;
 
+static void set_year(void)
+{
+    /* Set BCD mode */
+    cmos_write(RTC_REG_B, cmos_read(RTC_REG_B) & ~REG_B_DM);
+    cmos_write(RTC_REG_A, 0x76);
+    cmos_write(RTC_YEAR, 0x11);
+    cmos_write(RTC_MONTH, 0x02);
+    cmos_write(RTC_DAY_OF_MONTH, 0x02);
+    cmos_write(RTC_HOURS, 0x02);
+    cmos_write(RTC_MINUTES, 0x04);
+    cmos_write(RTC_SECONDS, 0x58);
+    cmos_write(RTC_REG_A, 0x26);
+
+    g_assert_cmpint(cmos_read(RTC_HOURS), ==, 0x02);
+    g_assert_cmpint(cmos_read(RTC_MINUTES), ==, 0x04);
+    g_assert_cmpint(cmos_read(RTC_SECONDS), >=, 0x58);
+    g_assert_cmpint(cmos_read(RTC_DAY_OF_MONTH), ==, 0x02);
+    g_assert_cmpint(cmos_read(RTC_MONTH), ==, 0x02);
+    g_assert_cmpint(cmos_read(RTC_YEAR), ==, 0x11);
+
+    /* Set a date in 2080 to ensure there is no year-2038 overflow.  */
+    cmos_write(RTC_REG_A, 0x76);
+    cmos_write(RTC_YEAR, 0x80);
+    cmos_write(RTC_REG_A, 0x26);
+
+    g_assert_cmpint(cmos_read(RTC_HOURS), ==, 0x02);
+    g_assert_cmpint(cmos_read(RTC_MINUTES), ==, 0x04);
+    g_assert_cmpint(cmos_read(RTC_SECONDS), >=, 0x58);
+    g_assert_cmpint(cmos_read(RTC_DAY_OF_MONTH), ==, 0x02);
+    g_assert_cmpint(cmos_read(RTC_MONTH), ==, 0x02);
+    g_assert_cmpint(cmos_read(RTC_YEAR), ==, 0x80);
+
+    cmos_write(RTC_REG_A, 0x76);
+    cmos_write(RTC_YEAR, 0x11);
+    cmos_write(RTC_REG_A, 0x26);
+
+    g_assert_cmpint(cmos_read(RTC_HOURS), ==, 0x02);
+    g_assert_cmpint(cmos_read(RTC_MINUTES), ==, 0x04);
+    g_assert_cmpint(cmos_read(RTC_SECONDS), >=, 0x58);
+    g_assert_cmpint(cmos_read(RTC_DAY_OF_MONTH), ==, 0x02);
+    g_assert_cmpint(cmos_read(RTC_MONTH), ==, 0x02);
+    g_assert_cmpint(cmos_read(RTC_YEAR), ==, 0x11);
+}
+
 static void bcd_check_time(void)
 {
     /* Set BCD mode */
@@ -269,6 +313,7 @@ int main(int argc, char **argv)
     qtest_add_func("/rtc/bcd/check-time", bcd_check_time);
     qtest_add_func("/rtc/dec/check-time", dec_check_time);
     qtest_add_func("/rtc/alarm-time", alarm_time);
+    qtest_add_func("/rtc/set-year", set_year);
     qtest_add_func("/rtc/fuzz-registers", fuzz_registers);
     ret = g_test_run();
 
-- 
cgit v1.2.3


From e67edb943f0c812530aaae2491da56f9542f928b Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 1 Oct 2012 14:22:07 +0200
Subject: rtc: map CMOS index 0x37 to 0x32 on reads and writes

QEMU's attempt to implement the century byte covers two possible places
for the byte.  A common one on modern chipsets is 0x32, but QEMU also
stores the value in 0x37 (apparently for IBM PS/2 compatibility---it's
only been 25 years).  To simplify the implementation of the century
byte, store it only at 0x32 but remap transparently 0x37 to 0x32 when
reading and writing from CMOS.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 hw/mc146818rtc.c      | 15 +++++++++------
 hw/mc146818rtc_regs.h |  4 ++++
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/hw/mc146818rtc.c b/hw/mc146818rtc.c
index d63554f89..a7d20d5c2 100644
--- a/hw/mc146818rtc.c
+++ b/hw/mc146818rtc.c
@@ -399,6 +399,10 @@ static void cmos_ioport_write(void *opaque, uint32_t addr, uint32_t data)
             s->cmos_data[s->cmos_index] = data;
             check_update_timer(s);
             break;
+	case RTC_IBM_PS2_CENTURY_BYTE:
+            s->cmos_index = RTC_CENTURY;
+            /* fall through */
+        case RTC_CENTURY:
         case RTC_SECONDS:
         case RTC_MINUTES:
         case RTC_HOURS:
@@ -598,6 +602,10 @@ static uint32_t cmos_ioport_read(void *opaque, uint32_t addr)
         return 0xff;
     } else {
         switch(s->cmos_index) {
+	case RTC_IBM_PS2_CENTURY_BYTE:
+            s->cmos_index = RTC_CENTURY;
+            /* fall through */
+        case RTC_CENTURY:
         case RTC_SECONDS:
         case RTC_MINUTES:
         case RTC_HOURS:
@@ -661,10 +669,6 @@ void rtc_set_memory(ISADevice *dev, int addr, int val)
         s->cmos_data[addr] = val;
 }
 
-/* PC cmos mappings */
-#define REG_IBM_CENTURY_BYTE        0x32
-#define REG_IBM_PS2_CENTURY_BYTE    0x37
-
 static void rtc_set_date_from_host(ISADevice *dev)
 {
     RTCState *s = DO_UPCAST(RTCState, dev, dev);
@@ -681,8 +685,7 @@ static void rtc_set_date_from_host(ISADevice *dev)
     rtc_set_cmos(s, &tm);
 
     val = rtc_to_bcd(s, (tm.tm_year / 100) + 19);
-    rtc_set_memory(dev, REG_IBM_CENTURY_BYTE, val);
-    rtc_set_memory(dev, REG_IBM_PS2_CENTURY_BYTE, val);
+    rtc_set_memory(dev, RTC_CENTURY, val);
 }
 
 static int rtc_post_load(void *opaque, int version_id)
diff --git a/hw/mc146818rtc_regs.h b/hw/mc146818rtc_regs.h
index fc10076ec..ccdee42b3 100644
--- a/hw/mc146818rtc_regs.h
+++ b/hw/mc146818rtc_regs.h
@@ -44,6 +44,10 @@
 #define RTC_REG_C               12
 #define RTC_REG_D               13
 
+/* PC cmos mappings */
+#define RTC_CENTURY              0x32
+#define RTC_IBM_PS2_CENTURY_BYTE 0x37
+
 #define REG_A_UIP 0x80
 
 #define REG_B_SET  0x80
-- 
cgit v1.2.3


From b8994faf2a8d6fc791669bb432bdb3a7a1711013 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini <pbonzini@redhat.com>
Date: Mon, 1 Oct 2012 14:22:08 +0200
Subject: rtc: implement century byte

Implement the century byte in the RTC emulation, and test that it works.
This leads to some annoying compatibility code because we need to treat
a value of 2000 for the base_year property as "use the century byte
properly" (which would be a value of 0).

The century byte will now be always-zero, rather than always-20,
for the MIPS Magnum machine whose base_year is 1980.  Commit 42fc73a
(Support epoch of 1980 in RTC emulation for MIPS Magnum, 2009-01-24)
correctly said:

    With an epoch of 1980 and a year of 2009, one could argue that [the
    century byte] should hold either 0, 1, 19 or 20.  NT 3.50 on MIPS
    does not read the century byte.

so I picked the simplest and most sensible implementation which is to
return 0 for 1980-2079, 1 for 2080-2179 and so on.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 hw/mc146818rtc.c | 27 ++++++++++++++++++---------
 tests/rtc-test.c | 32 ++++++++++++++++++++++++++++++--
 2 files changed, 48 insertions(+), 11 deletions(-)

diff --git a/hw/mc146818rtc.c b/hw/mc146818rtc.c
index a7d20d5c2..332a77d4d 100644
--- a/hw/mc146818rtc.c
+++ b/hw/mc146818rtc.c
@@ -519,7 +519,9 @@ static void rtc_get_time(RTCState *s, struct tm *tm)
     tm->tm_wday = rtc_from_bcd(s, s->cmos_data[RTC_DAY_OF_WEEK]) - 1;
     tm->tm_mday = rtc_from_bcd(s, s->cmos_data[RTC_DAY_OF_MONTH]);
     tm->tm_mon = rtc_from_bcd(s, s->cmos_data[RTC_MONTH]) - 1;
-    tm->tm_year = rtc_from_bcd(s, s->cmos_data[RTC_YEAR]) + s->base_year - 1900;
+    tm->tm_year =
+        rtc_from_bcd(s, s->cmos_data[RTC_YEAR]) + s->base_year +
+        rtc_from_bcd(s, s->cmos_data[RTC_CENTURY]) * 100 - 1900;
 }
 
 static void rtc_set_time(RTCState *s)
@@ -552,10 +554,9 @@ static void rtc_set_cmos(RTCState *s, const struct tm *tm)
     s->cmos_data[RTC_DAY_OF_WEEK] = rtc_to_bcd(s, tm->tm_wday + 1);
     s->cmos_data[RTC_DAY_OF_MONTH] = rtc_to_bcd(s, tm->tm_mday);
     s->cmos_data[RTC_MONTH] = rtc_to_bcd(s, tm->tm_mon + 1);
-    year = (tm->tm_year - s->base_year) % 100;
-    if (year < 0)
-        year += 100;
-    s->cmos_data[RTC_YEAR] = rtc_to_bcd(s, year);
+    year = tm->tm_year + 1900 - s->base_year;
+    s->cmos_data[RTC_YEAR] = rtc_to_bcd(s, year % 100);
+    s->cmos_data[RTC_CENTURY] = rtc_to_bcd(s, year / 100);
 }
 
 static void rtc_update_time(RTCState *s)
@@ -673,7 +674,6 @@ static void rtc_set_date_from_host(ISADevice *dev)
 {
     RTCState *s = DO_UPCAST(RTCState, dev, dev);
     struct tm tm;
-    int val;
 
     qemu_get_timedate(&tm, 0);
 
@@ -683,9 +683,6 @@ static void rtc_set_date_from_host(ISADevice *dev)
 
     /* set the CMOS date */
     rtc_set_cmos(s, &tm);
-
-    val = rtc_to_bcd(s, (tm.tm_year / 100) + 19);
-    rtc_set_memory(dev, RTC_CENTURY, val);
 }
 
 static int rtc_post_load(void *opaque, int version_id)
@@ -810,6 +807,18 @@ static int rtc_initfn(ISADevice *dev)
     s->cmos_data[RTC_REG_C] = 0x00;
     s->cmos_data[RTC_REG_D] = 0x80;
 
+    /* This is for historical reasons.  The default base year qdev property
+     * was set to 2000 for most machine types before the century byte was
+     * implemented.
+     *
+     * This if statement means that the century byte will be always 0
+     * (at least until 2079...) for base_year = 1980, but will be set
+     * correctly for base_year = 2000.
+     */
+    if (s->base_year == 2000) {
+        s->base_year = 0;
+    }
+
     rtc_set_date_from_host(dev);
 
 #ifdef TARGET_I386
diff --git a/tests/rtc-test.c b/tests/rtc-test.c
index 2b9aa63c1..7fdc94a3d 100644
--- a/tests/rtc-test.c
+++ b/tests/rtc-test.c
@@ -179,12 +179,13 @@ static void check_time(int wiggle)
 
 static int wiggle = 2;
 
-static void set_year(void)
+static void set_year_20xx(void)
 {
     /* Set BCD mode */
     cmos_write(RTC_REG_B, cmos_read(RTC_REG_B) & ~REG_B_DM);
     cmos_write(RTC_REG_A, 0x76);
     cmos_write(RTC_YEAR, 0x11);
+    cmos_write(RTC_CENTURY, 0x20);
     cmos_write(RTC_MONTH, 0x02);
     cmos_write(RTC_DAY_OF_MONTH, 0x02);
     cmos_write(RTC_HOURS, 0x02);
@@ -198,6 +199,7 @@ static void set_year(void)
     g_assert_cmpint(cmos_read(RTC_DAY_OF_MONTH), ==, 0x02);
     g_assert_cmpint(cmos_read(RTC_MONTH), ==, 0x02);
     g_assert_cmpint(cmos_read(RTC_YEAR), ==, 0x11);
+    g_assert_cmpint(cmos_read(RTC_CENTURY), ==, 0x20);
 
     /* Set a date in 2080 to ensure there is no year-2038 overflow.  */
     cmos_write(RTC_REG_A, 0x76);
@@ -210,6 +212,7 @@ static void set_year(void)
     g_assert_cmpint(cmos_read(RTC_DAY_OF_MONTH), ==, 0x02);
     g_assert_cmpint(cmos_read(RTC_MONTH), ==, 0x02);
     g_assert_cmpint(cmos_read(RTC_YEAR), ==, 0x80);
+    g_assert_cmpint(cmos_read(RTC_CENTURY), ==, 0x20);
 
     cmos_write(RTC_REG_A, 0x76);
     cmos_write(RTC_YEAR, 0x11);
@@ -221,6 +224,30 @@ static void set_year(void)
     g_assert_cmpint(cmos_read(RTC_DAY_OF_MONTH), ==, 0x02);
     g_assert_cmpint(cmos_read(RTC_MONTH), ==, 0x02);
     g_assert_cmpint(cmos_read(RTC_YEAR), ==, 0x11);
+    g_assert_cmpint(cmos_read(RTC_CENTURY), ==, 0x20);
+}
+
+static void set_year_1980(void)
+{
+    /* Set BCD mode */
+    cmos_write(RTC_REG_B, cmos_read(RTC_REG_B) & ~REG_B_DM);
+    cmos_write(RTC_REG_A, 0x76);
+    cmos_write(RTC_YEAR, 0x80);
+    cmos_write(RTC_CENTURY, 0x19);
+    cmos_write(RTC_MONTH, 0x02);
+    cmos_write(RTC_DAY_OF_MONTH, 0x02);
+    cmos_write(RTC_HOURS, 0x02);
+    cmos_write(RTC_MINUTES, 0x04);
+    cmos_write(RTC_SECONDS, 0x58);
+    cmos_write(RTC_REG_A, 0x26);
+
+    g_assert_cmpint(cmos_read(RTC_HOURS), ==, 0x02);
+    g_assert_cmpint(cmos_read(RTC_MINUTES), ==, 0x04);
+    g_assert_cmpint(cmos_read(RTC_SECONDS), >=, 0x58);
+    g_assert_cmpint(cmos_read(RTC_DAY_OF_MONTH), ==, 0x02);
+    g_assert_cmpint(cmos_read(RTC_MONTH), ==, 0x02);
+    g_assert_cmpint(cmos_read(RTC_YEAR), ==, 0x80);
+    g_assert_cmpint(cmos_read(RTC_CENTURY), ==, 0x19);
 }
 
 static void bcd_check_time(void)
@@ -313,7 +340,8 @@ int main(int argc, char **argv)
     qtest_add_func("/rtc/bcd/check-time", bcd_check_time);
     qtest_add_func("/rtc/dec/check-time", dec_check_time);
     qtest_add_func("/rtc/alarm-time", alarm_time);
-    qtest_add_func("/rtc/set-year", set_year);
+    qtest_add_func("/rtc/set-year/20xx", set_year_20xx);
+    qtest_add_func("/rtc/set-year/1980", set_year_1980);
     qtest_add_func("/rtc/fuzz-registers", fuzz_registers);
     ret = g_test_run();
 
-- 
cgit v1.2.3


From c9159fe9aa9abe24115ea4d16127179e9cb07e22 Mon Sep 17 00:00:00 2001
From: Stefan Weil <sw@weilnetz.de>
Date: Fri, 5 Oct 2012 19:39:33 +0200
Subject: Remove libhw

The entries for libhw* are no longer needed in .gitignore.

There is also no longer a difference between common-obj-y and
hw-obj-y, so one of those two macros is sufficient.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 .gitignore            |   3 -
 Makefile              |   2 +-
 Makefile.hw           |  22 -----
 Makefile.objs         |   9 +-
 Makefile.target       |   1 -
 configure             |   5 --
 hw/9pfs/Makefile.objs |  14 ++--
 hw/Makefile.objs      | 226 +++++++++++++++++++++++++-------------------------
 hw/ide/Makefile.objs  |  20 ++---
 hw/usb/Makefile.objs  |  14 ++--
 10 files changed, 141 insertions(+), 175 deletions(-)
 delete mode 100644 Makefile.hw

diff --git a/.gitignore b/.gitignore
index 3ef77d062..bd6ba1c71 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,9 +12,6 @@ trace-dtrace.dtrace
 *-linux-user
 *-bsd-user
 libdis*
-libhw
-libhw32
-libhw64
 libuser
 linux-headers/asm
 qapi-generated
diff --git a/Makefile b/Makefile
index 1cebe3a9d..a9c22bf1d 100644
--- a/Makefile
+++ b/Makefile
@@ -214,7 +214,7 @@ $(qga-obj-y) qemu-ga.o: $(QGALIB_GEN)
 
 qemu-ga$(EXESUF): qemu-ga.o $(qga-obj-y) $(tools-obj-y) $(qapi-obj-y) $(qobject-obj-y) $(version-obj-y)
 
-QEMULIBS=libhw libuser libdis libdis-user
+QEMULIBS=libuser libdis libdis-user
 
 clean:
 # avoid old build problems by removing potentially incorrect old files
diff --git a/Makefile.hw b/Makefile.hw
deleted file mode 100644
index 86f0bf40f..000000000
--- a/Makefile.hw
+++ /dev/null
@@ -1,22 +0,0 @@
-# Makefile for qemu target independent devices.
-
-include ../config-host.mak
-include ../config-all-devices.mak
-include $(SRC_PATH)/rules.mak
-
-.PHONY: all
-
-$(call set-vpath, $(SRC_PATH))
-
-QEMU_CFLAGS+=-I..
-QEMU_CFLAGS += -I$(SRC_PATH)/include
-
-include $(SRC_PATH)/Makefile.objs
-
-all: $(hw-obj-y)
-# Dummy command so that make thinks it has done something
-	@true
-
-clean:
-	rm -f $(addsuffix *.o, $(sort $(dir $(hw-obj-y))))
-	rm -f $(addsuffix *.d, $(sort $(dir $(hw-obj-y))))
diff --git a/Makefile.objs b/Makefile.objs
index b1f3e2254..74b35422c 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -90,10 +90,13 @@ common-obj-y += hw/
 common-obj-y += ui/
 common-obj-y += bt-host.o bt-vhci.o
 
+common-obj-y += dma-helpers.o
 common-obj-y += iov.o acl.o
 common-obj-$(CONFIG_POSIX) += compatfd.o
 common-obj-y += notify.o event_notifier.o
 common-obj-y += qemu-timer.o qemu-timer-common.o
+common-obj-y += qtest.o
+common-obj-y += vl.o
 
 common-obj-$(CONFIG_SLIRP) += slirp/
 
@@ -115,11 +118,6 @@ user-obj-y += qemu-user.o
 user-obj-y += $(trace-obj-y)
 user-obj-y += qom/
 
-######################################################################
-# libhw
-
-hw-obj-y = vl.o dma-helpers.o qtest.o hw/
-
 ######################################################################
 # libdis
 # NOTE: the disassembler code is only needed for debugging
@@ -240,7 +238,6 @@ vl.o: QEMU_CFLAGS+=$(SDL_CFLAGS)
 QEMU_CFLAGS+=$(GLIB_CFLAGS)
 
 nested-vars += \
-	hw-obj-y \
 	qga-obj-y \
 	block-obj-y \
 	qom-obj-y \
diff --git a/Makefile.target b/Makefile.target
index 4449444a0..3822bc5ac 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -154,7 +154,6 @@ all-obj-y += $(addprefix ../, $(universal-obj-y))
 ifdef CONFIG_SOFTMMU
 all-obj-y += $(addprefix ../, $(common-obj-y))
 all-obj-y += $(addprefix ../libdis/, $(libdis-y))
-all-obj-y += $(addprefix $(HWDIR)/, $(hw-obj-y))
 all-obj-y += $(addprefix ../, $(trace-obj-y))
 else
 all-obj-y += $(addprefix ../libuser/, $(user-obj-y))
diff --git a/configure b/configure
index e58846d5e..73a5f3dc7 100755
--- a/configure
+++ b/configure
@@ -3914,8 +3914,6 @@ fi
 if test "$target_softmmu" = "yes" ; then
   echo "CONFIG_SOFTMMU=y" >> $config_target_mak
   echo "LIBS+=$libs_softmmu $target_libs_softmmu" >> $config_target_mak
-  echo "HWDIR=../libhw" >> $config_target_mak
-  echo "subdir-$target: subdir-libhw" >> $config_host_mak
   if test "$smartcard_nss" = "yes" ; then
     echo "subdir-$target: subdir-libcacard" >> $config_host_mak
   fi
@@ -4157,9 +4155,6 @@ for rom in seabios vgabios ; do
     echo "LD=$ld" >> $config_mak
 done
 
-d=libhw
-symlink "$source_path/Makefile.hw" "$d/Makefile"
-
 d=libuser
 symlink "$source_path/Makefile.user" "$d/Makefile"
 
diff --git a/hw/9pfs/Makefile.objs b/hw/9pfs/Makefile.objs
index 972df2405..1e9b595cb 100644
--- a/hw/9pfs/Makefile.objs
+++ b/hw/9pfs/Makefile.objs
@@ -1,9 +1,9 @@
-hw-obj-y  = virtio-9p.o
-hw-obj-y += virtio-9p-local.o virtio-9p-xattr.o
-hw-obj-y += virtio-9p-xattr-user.o virtio-9p-posix-acl.o
-hw-obj-y += virtio-9p-coth.o cofs.o codir.o cofile.o
-hw-obj-y += coxattr.o virtio-9p-synth.o
-hw-obj-$(CONFIG_OPEN_BY_HANDLE) +=  virtio-9p-handle.o
-hw-obj-y += virtio-9p-proxy.o
+common-obj-y  = virtio-9p.o
+common-obj-y += virtio-9p-local.o virtio-9p-xattr.o
+common-obj-y += virtio-9p-xattr-user.o virtio-9p-posix-acl.o
+common-obj-y += virtio-9p-coth.o cofs.o codir.o cofile.o
+common-obj-y += coxattr.o virtio-9p-synth.o
+common-obj-$(CONFIG_OPEN_BY_HANDLE) +=  virtio-9p-handle.o
+common-obj-y += virtio-9p-proxy.o
 
 obj-y += virtio-9p-device.o
diff --git a/hw/Makefile.objs b/hw/Makefile.objs
index ecdbe44dd..b59c74953 100644
--- a/hw/Makefile.objs
+++ b/hw/Makefile.objs
@@ -1,140 +1,140 @@
-hw-obj-y = usb/ ide/
-hw-obj-y += loader.o
-hw-obj-$(CONFIG_VIRTIO) += virtio-console.o
-hw-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o
-hw-obj-y += fw_cfg.o
-hw-obj-$(CONFIG_PCI) += pci.o pci_bridge.o pci_bridge_dev.o
-hw-obj-$(CONFIG_PCI) += msix.o msi.o
-hw-obj-$(CONFIG_PCI) += shpc.o
-hw-obj-$(CONFIG_PCI) += slotid_cap.o
-hw-obj-$(CONFIG_PCI) += pci_host.o pcie_host.o
-hw-obj-$(CONFIG_PCI) += ioh3420.o xio3130_upstream.o xio3130_downstream.o
-hw-obj-y += watchdog.o
-hw-obj-$(CONFIG_ISA_MMIO) += isa_mmio.o
-hw-obj-$(CONFIG_ECC) += ecc.o
-hw-obj-$(CONFIG_NAND) += nand.o
-hw-obj-$(CONFIG_PFLASH_CFI01) += pflash_cfi01.o
-hw-obj-$(CONFIG_PFLASH_CFI02) += pflash_cfi02.o
-
-hw-obj-$(CONFIG_M48T59) += m48t59.o
-hw-obj-$(CONFIG_ESCC) += escc.o
-hw-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o
-
-hw-obj-$(CONFIG_SERIAL) += serial.o
-hw-obj-$(CONFIG_PARALLEL) += parallel.o
-hw-obj-$(CONFIG_I8254) += i8254_common.o i8254.o
-hw-obj-$(CONFIG_PCSPK) += pcspk.o
-hw-obj-$(CONFIG_PCKBD) += pckbd.o
-hw-obj-$(CONFIG_FDC) += fdc.o
-hw-obj-$(CONFIG_ACPI) += acpi.o acpi_piix4.o
-hw-obj-$(CONFIG_APM) += pm_smbus.o apm.o
-hw-obj-$(CONFIG_DMA) += dma.o
-hw-obj-$(CONFIG_I82374) += i82374.o
-hw-obj-$(CONFIG_HPET) += hpet.o
-hw-obj-$(CONFIG_APPLESMC) += applesmc.o
-hw-obj-$(CONFIG_SMARTCARD) += ccid-card-passthru.o
-hw-obj-$(CONFIG_SMARTCARD_NSS) += ccid-card-emulated.o
-hw-obj-$(CONFIG_I8259) += i8259_common.o i8259.o
+common-obj-y = usb/ ide/
+common-obj-y += loader.o
+common-obj-$(CONFIG_VIRTIO) += virtio-console.o
+common-obj-$(CONFIG_VIRTIO_PCI) += virtio-pci.o
+common-obj-y += fw_cfg.o
+common-obj-$(CONFIG_PCI) += pci.o pci_bridge.o pci_bridge_dev.o
+common-obj-$(CONFIG_PCI) += msix.o msi.o
+common-obj-$(CONFIG_PCI) += shpc.o
+common-obj-$(CONFIG_PCI) += slotid_cap.o
+common-obj-$(CONFIG_PCI) += pci_host.o pcie_host.o
+common-obj-$(CONFIG_PCI) += ioh3420.o xio3130_upstream.o xio3130_downstream.o
+common-obj-y += watchdog.o
+common-obj-$(CONFIG_ISA_MMIO) += isa_mmio.o
+common-obj-$(CONFIG_ECC) += ecc.o
+common-obj-$(CONFIG_NAND) += nand.o
+common-obj-$(CONFIG_PFLASH_CFI01) += pflash_cfi01.o
+common-obj-$(CONFIG_PFLASH_CFI02) += pflash_cfi02.o
+
+common-obj-$(CONFIG_M48T59) += m48t59.o
+common-obj-$(CONFIG_ESCC) += escc.o
+common-obj-$(CONFIG_EMPTY_SLOT) += empty_slot.o
+
+common-obj-$(CONFIG_SERIAL) += serial.o
+common-obj-$(CONFIG_PARALLEL) += parallel.o
+common-obj-$(CONFIG_I8254) += i8254_common.o i8254.o
+common-obj-$(CONFIG_PCSPK) += pcspk.o
+common-obj-$(CONFIG_PCKBD) += pckbd.o
+common-obj-$(CONFIG_FDC) += fdc.o
+common-obj-$(CONFIG_ACPI) += acpi.o acpi_piix4.o
+common-obj-$(CONFIG_APM) += pm_smbus.o apm.o
+common-obj-$(CONFIG_DMA) += dma.o
+common-obj-$(CONFIG_I82374) += i82374.o
+common-obj-$(CONFIG_HPET) += hpet.o
+common-obj-$(CONFIG_APPLESMC) += applesmc.o
+common-obj-$(CONFIG_SMARTCARD) += ccid-card-passthru.o
+common-obj-$(CONFIG_SMARTCARD_NSS) += ccid-card-emulated.o
+common-obj-$(CONFIG_I8259) += i8259_common.o i8259.o
 
 # PPC devices
-hw-obj-$(CONFIG_PREP_PCI) += prep_pci.o
-hw-obj-$(CONFIG_I82378) += i82378.o
+common-obj-$(CONFIG_PREP_PCI) += prep_pci.o
+common-obj-$(CONFIG_I82378) += i82378.o
 # Mac shared devices
-hw-obj-$(CONFIG_MACIO) += macio.o
-hw-obj-$(CONFIG_CUDA) += cuda.o
-hw-obj-$(CONFIG_ADB) += adb.o
-hw-obj-$(CONFIG_MAC_NVRAM) += mac_nvram.o
-hw-obj-$(CONFIG_MAC_DBDMA) += mac_dbdma.o
+common-obj-$(CONFIG_MACIO) += macio.o
+common-obj-$(CONFIG_CUDA) += cuda.o
+common-obj-$(CONFIG_ADB) += adb.o
+common-obj-$(CONFIG_MAC_NVRAM) += mac_nvram.o
+common-obj-$(CONFIG_MAC_DBDMA) += mac_dbdma.o
 # OldWorld PowerMac
-hw-obj-$(CONFIG_HEATHROW_PIC) += heathrow_pic.o
-hw-obj-$(CONFIG_GRACKLE_PCI) += grackle_pci.o
+common-obj-$(CONFIG_HEATHROW_PIC) += heathrow_pic.o
+common-obj-$(CONFIG_GRACKLE_PCI) += grackle_pci.o
 # NewWorld PowerMac
-hw-obj-$(CONFIG_UNIN_PCI) += unin_pci.o
-hw-obj-$(CONFIG_DEC_PCI) += dec_pci.o
+common-obj-$(CONFIG_UNIN_PCI) += unin_pci.o
+common-obj-$(CONFIG_DEC_PCI) += dec_pci.o
 # PowerPC E500 boards
-hw-obj-$(CONFIG_PPCE500_PCI) += ppce500_pci.o
+common-obj-$(CONFIG_PPCE500_PCI) += ppce500_pci.o
 
 # MIPS devices
-hw-obj-$(CONFIG_PIIX4) += piix4.o
-hw-obj-$(CONFIG_G364FB) += g364fb.o
-hw-obj-$(CONFIG_JAZZ_LED) += jazz_led.o
+common-obj-$(CONFIG_PIIX4) += piix4.o
+common-obj-$(CONFIG_G364FB) += g364fb.o
+common-obj-$(CONFIG_JAZZ_LED) += jazz_led.o
 
 # Xilinx devices
-hw-obj-$(CONFIG_XILINX) += xilinx_intc.o
-hw-obj-$(CONFIG_XILINX) += xilinx_timer.o
-hw-obj-$(CONFIG_XILINX) += xilinx_uartlite.o
-hw-obj-$(CONFIG_XILINX_AXI) += xilinx_axidma.o
-hw-obj-$(CONFIG_XILINX_AXI) += xilinx_axienet.o
-hw-obj-$(CONFIG_XILINX_AXI) += stream.o
+common-obj-$(CONFIG_XILINX) += xilinx_intc.o
+common-obj-$(CONFIG_XILINX) += xilinx_timer.o
+common-obj-$(CONFIG_XILINX) += xilinx_uartlite.o
+common-obj-$(CONFIG_XILINX_AXI) += xilinx_axidma.o
+common-obj-$(CONFIG_XILINX_AXI) += xilinx_axienet.o
+common-obj-$(CONFIG_XILINX_AXI) += stream.o
 
 # PKUnity SoC devices
-hw-obj-$(CONFIG_PUV3) += puv3_intc.o
-hw-obj-$(CONFIG_PUV3) += puv3_ost.o
-hw-obj-$(CONFIG_PUV3) += puv3_gpio.o
-hw-obj-$(CONFIG_PUV3) += puv3_pm.o
-hw-obj-$(CONFIG_PUV3) += puv3_dma.o
+common-obj-$(CONFIG_PUV3) += puv3_intc.o
+common-obj-$(CONFIG_PUV3) += puv3_ost.o
+common-obj-$(CONFIG_PUV3) += puv3_gpio.o
+common-obj-$(CONFIG_PUV3) += puv3_pm.o
+common-obj-$(CONFIG_PUV3) += puv3_dma.o
 
 # ARM devices
-hw-obj-$(CONFIG_ARM_TIMER) += arm_timer.o
-hw-obj-$(CONFIG_PL011) += pl011.o
-hw-obj-$(CONFIG_PL022) += pl022.o
-hw-obj-$(CONFIG_PL031) += pl031.o
-hw-obj-$(CONFIG_PL041) += pl041.o lm4549.o
-hw-obj-$(CONFIG_PL050) += pl050.o
-hw-obj-$(CONFIG_PL061) += pl061.o
-hw-obj-$(CONFIG_PL080) += pl080.o
-hw-obj-$(CONFIG_PL110) += pl110.o
-hw-obj-$(CONFIG_PL181) += pl181.o
-hw-obj-$(CONFIG_PL190) += pl190.o
-hw-obj-$(CONFIG_PL310) += arm_l2x0.o
-hw-obj-$(CONFIG_VERSATILE_PCI) += versatile_pci.o
-hw-obj-$(CONFIG_VERSATILE_I2C) += versatile_i2c.o
-hw-obj-$(CONFIG_CADENCE) += cadence_uart.o
-hw-obj-$(CONFIG_CADENCE) += cadence_ttc.o
-hw-obj-$(CONFIG_CADENCE) += cadence_gem.o
-hw-obj-$(CONFIG_XGMAC) += xgmac.o
+common-obj-$(CONFIG_ARM_TIMER) += arm_timer.o
+common-obj-$(CONFIG_PL011) += pl011.o
+common-obj-$(CONFIG_PL022) += pl022.o
+common-obj-$(CONFIG_PL031) += pl031.o
+common-obj-$(CONFIG_PL041) += pl041.o lm4549.o
+common-obj-$(CONFIG_PL050) += pl050.o
+common-obj-$(CONFIG_PL061) += pl061.o
+common-obj-$(CONFIG_PL080) += pl080.o
+common-obj-$(CONFIG_PL110) += pl110.o
+common-obj-$(CONFIG_PL181) += pl181.o
+common-obj-$(CONFIG_PL190) += pl190.o
+common-obj-$(CONFIG_PL310) += arm_l2x0.o
+common-obj-$(CONFIG_VERSATILE_PCI) += versatile_pci.o
+common-obj-$(CONFIG_VERSATILE_I2C) += versatile_i2c.o
+common-obj-$(CONFIG_CADENCE) += cadence_uart.o
+common-obj-$(CONFIG_CADENCE) += cadence_ttc.o
+common-obj-$(CONFIG_CADENCE) += cadence_gem.o
+common-obj-$(CONFIG_XGMAC) += xgmac.o
 
 # PCI watchdog devices
-hw-obj-$(CONFIG_PCI) += wdt_i6300esb.o
+common-obj-$(CONFIG_PCI) += wdt_i6300esb.o
 
-hw-obj-$(CONFIG_PCI) += pcie.o pcie_aer.o pcie_port.o
+common-obj-$(CONFIG_PCI) += pcie.o pcie_aer.o pcie_port.o
 
 # PCI network cards
-hw-obj-$(CONFIG_NE2000_PCI) += ne2000.o
-hw-obj-$(CONFIG_EEPRO100_PCI) += eepro100.o
-hw-obj-$(CONFIG_PCNET_PCI) += pcnet-pci.o
-hw-obj-$(CONFIG_PCNET_COMMON) += pcnet.o
-hw-obj-$(CONFIG_E1000_PCI) += e1000.o
-hw-obj-$(CONFIG_RTL8139_PCI) += rtl8139.o
-
-hw-obj-$(CONFIG_SMC91C111) += smc91c111.o
-hw-obj-$(CONFIG_LAN9118) += lan9118.o
-hw-obj-$(CONFIG_NE2000_ISA) += ne2000-isa.o
-hw-obj-$(CONFIG_OPENCORES_ETH) += opencores_eth.o
+common-obj-$(CONFIG_NE2000_PCI) += ne2000.o
+common-obj-$(CONFIG_EEPRO100_PCI) += eepro100.o
+common-obj-$(CONFIG_PCNET_PCI) += pcnet-pci.o
+common-obj-$(CONFIG_PCNET_COMMON) += pcnet.o
+common-obj-$(CONFIG_E1000_PCI) += e1000.o
+common-obj-$(CONFIG_RTL8139_PCI) += rtl8139.o
+
+common-obj-$(CONFIG_SMC91C111) += smc91c111.o
+common-obj-$(CONFIG_LAN9118) += lan9118.o
+common-obj-$(CONFIG_NE2000_ISA) += ne2000-isa.o
+common-obj-$(CONFIG_OPENCORES_ETH) += opencores_eth.o
 
 # SCSI layer
-hw-obj-$(CONFIG_LSI_SCSI_PCI) += lsi53c895a.o
-hw-obj-$(CONFIG_MEGASAS_SCSI_PCI) += megasas.o
-hw-obj-$(CONFIG_ESP) += esp.o
-hw-obj-$(CONFIG_ESP_PCI) += esp-pci.o
+common-obj-$(CONFIG_LSI_SCSI_PCI) += lsi53c895a.o
+common-obj-$(CONFIG_MEGASAS_SCSI_PCI) += megasas.o
+common-obj-$(CONFIG_ESP) += esp.o
+common-obj-$(CONFIG_ESP_PCI) += esp-pci.o
 
-hw-obj-y += sysbus.o isa-bus.o
-hw-obj-y += qdev-addr.o
+common-obj-y += sysbus.o isa-bus.o
+common-obj-y += qdev-addr.o
 
 # VGA
-hw-obj-$(CONFIG_VGA_PCI) += vga-pci.o
-hw-obj-$(CONFIG_VGA_ISA) += vga-isa.o
-hw-obj-$(CONFIG_VGA_ISA_MM) += vga-isa-mm.o
-hw-obj-$(CONFIG_VMWARE_VGA) += vmware_vga.o
-hw-obj-$(CONFIG_VMMOUSE) += vmmouse.o
-hw-obj-$(CONFIG_VGA_CIRRUS) += cirrus_vga.o
+common-obj-$(CONFIG_VGA_PCI) += vga-pci.o
+common-obj-$(CONFIG_VGA_ISA) += vga-isa.o
+common-obj-$(CONFIG_VGA_ISA_MM) += vga-isa-mm.o
+common-obj-$(CONFIG_VMWARE_VGA) += vmware_vga.o
+common-obj-$(CONFIG_VMMOUSE) += vmmouse.o
+common-obj-$(CONFIG_VGA_CIRRUS) += cirrus_vga.o
 
-hw-obj-$(CONFIG_RC4030) += rc4030.o
-hw-obj-$(CONFIG_DP8393X) += dp8393x.o
-hw-obj-$(CONFIG_DS1225Y) += ds1225y.o
-hw-obj-$(CONFIG_MIPSNET) += mipsnet.o
+common-obj-$(CONFIG_RC4030) += rc4030.o
+common-obj-$(CONFIG_DP8393X) += dp8393x.o
+common-obj-$(CONFIG_DS1225Y) += ds1225y.o
+common-obj-$(CONFIG_MIPSNET) += mipsnet.o
 
-hw-obj-y += null-machine.o
+common-obj-y += null-machine.o
 
 # Sound
 sound-obj-y =
@@ -148,9 +148,9 @@ sound-obj-$(CONFIG_HDA) += intel-hda.o hda-audio.o
 
 $(obj)/adlib.o $(obj)/fmopl.o: QEMU_CFLAGS += -DBUILD_Y8950=0
 
-hw-obj-$(CONFIG_SOUND) += $(sound-obj-y)
+common-obj-$(CONFIG_SOUND) += $(sound-obj-y)
 
-hw-obj-$(CONFIG_REALLY_VIRTFS) += 9pfs/
+common-obj-$(CONFIG_REALLY_VIRTFS) += 9pfs/
 
 common-obj-y += usb/
 common-obj-y += irq.o
diff --git a/hw/ide/Makefile.objs b/hw/ide/Makefile.objs
index cf718dd01..5c8c22aad 100644
--- a/hw/ide/Makefile.objs
+++ b/hw/ide/Makefile.objs
@@ -1,10 +1,10 @@
-hw-obj-$(CONFIG_IDE_CORE) += core.o atapi.o
-hw-obj-$(CONFIG_IDE_QDEV) += qdev.o
-hw-obj-$(CONFIG_IDE_PCI) += pci.o
-hw-obj-$(CONFIG_IDE_ISA) += isa.o
-hw-obj-$(CONFIG_IDE_PIIX) += piix.o
-hw-obj-$(CONFIG_IDE_CMD646) += cmd646.o
-hw-obj-$(CONFIG_IDE_MACIO) += macio.o
-hw-obj-$(CONFIG_IDE_VIA) += via.o
-hw-obj-$(CONFIG_AHCI) += ahci.o
-hw-obj-$(CONFIG_AHCI) += ich.o
+common-obj-$(CONFIG_IDE_CORE) += core.o atapi.o
+common-obj-$(CONFIG_IDE_QDEV) += qdev.o
+common-obj-$(CONFIG_IDE_PCI) += pci.o
+common-obj-$(CONFIG_IDE_ISA) += isa.o
+common-obj-$(CONFIG_IDE_PIIX) += piix.o
+common-obj-$(CONFIG_IDE_CMD646) += cmd646.o
+common-obj-$(CONFIG_IDE_MACIO) += macio.o
+common-obj-$(CONFIG_IDE_VIA) += via.o
+common-obj-$(CONFIG_AHCI) += ahci.o
+common-obj-$(CONFIG_AHCI) += ich.o
diff --git a/hw/usb/Makefile.objs b/hw/usb/Makefile.objs
index 4225136d0..6425c1ff7 100644
--- a/hw/usb/Makefile.objs
+++ b/hw/usb/Makefile.objs
@@ -1,11 +1,11 @@
-hw-obj-$(CONFIG_USB_UHCI) += hcd-uhci.o
-hw-obj-$(CONFIG_USB_OHCI) += hcd-ohci.o
-hw-obj-$(CONFIG_USB_EHCI) += hcd-ehci.o
-hw-obj-$(CONFIG_USB_XHCI) += hcd-xhci.o
-hw-obj-y += libhw.o
+common-obj-$(CONFIG_USB_UHCI) += hcd-uhci.o
+common-obj-$(CONFIG_USB_OHCI) += hcd-ohci.o
+common-obj-$(CONFIG_USB_EHCI) += hcd-ehci.o
+common-obj-$(CONFIG_USB_XHCI) += hcd-xhci.o
+common-obj-y += libhw.o
 
-hw-obj-$(CONFIG_SMARTCARD) += dev-smartcard-reader.o
-hw-obj-$(CONFIG_USB_REDIR) += redirect.o
+common-obj-$(CONFIG_SMARTCARD) += dev-smartcard-reader.o
+common-obj-$(CONFIG_USB_REDIR) += redirect.o
 
 common-obj-y += core.o bus.o desc.o dev-hub.o
 common-obj-y += host-$(HOST_USB).o dev-bluetooth.o
-- 
cgit v1.2.3