diff options
-rw-r--r-- | Makefile.target | 5 | ||||
-rw-r--r-- | hw/hypercall.c | 92 | ||||
-rw-r--r-- | hw/vga.c | 8 | ||||
-rw-r--r-- | kvm/Makefile | 2 | ||||
-rw-r--r-- | kvm/drivers/Makefile | 3 | ||||
-rw-r--r-- | kvm/drivers/hypercall.c | 304 | ||||
-rw-r--r-- | kvm/kernel/Makefile | 3 | ||||
-rw-r--r-- | kvm/kernel/external-module-compat.h | 22 | ||||
-rw-r--r-- | kvm/kernel/include/linux/kvm_para.h | 73 | ||||
-rw-r--r-- | kvm/kernel/include/linux/mutex.h | 5 | ||||
-rw-r--r-- | kvm/kernel/kvm.h | 10 | ||||
-rw-r--r-- | kvm/kernel/kvm_main.c | 140 | ||||
-rw-r--r-- | kvm/kernel/kvm_svm.h | 3 | ||||
-rw-r--r-- | kvm/kernel/mmu.c | 36 | ||||
-rw-r--r-- | kvm/kernel/paging_tmpl.h | 18 | ||||
-rw-r--r-- | kvm/kernel/svm.c | 36 | ||||
-rw-r--r-- | kvm/kernel/vmx.c | 32 | ||||
-rwxr-xr-x | kvm/kvm_stat | 10 | ||||
-rw-r--r-- | kvm/user/kvmctl.c | 69 | ||||
-rw-r--r-- | kvm/user/kvmctl.h | 3 | ||||
-rw-r--r-- | qemu-kvm.c | 31 | ||||
-rw-r--r-- | vl.c | 4 |
22 files changed, 770 insertions, 139 deletions
diff --git a/Makefile.target b/Makefile.target index 3e1d07333..5c81a0b54 100644 --- a/Makefile.target +++ b/Makefile.target @@ -25,6 +25,8 @@ CFLAGS+=-Wall -O2 -g -fno-strict-aliasing #CFLAGS+=-Werror LDFLAGS+=-g LIBS= +# libraries we depend on +DEPLIBS= HELPER_CFLAGS=$(CFLAGS) DYNGEN=../dyngen$(EXESUF) # user emulator name @@ -331,6 +333,7 @@ AUDIODRV+= wavcapture.o ifdef CONFIG_KVM_KERNEL_INC DEFINES += -I $(CONFIG_KVM_KERNEL_INC) LIBS += -lkvm +DEPLIBS += ../user/libkvm.a endif # SCSI layer @@ -437,7 +440,7 @@ ifdef CONFIG_WIN32 SDL_LIBS := $(filter-out -mwindows, $(SDL_LIBS)) -mconsole endif -$(QEMU_SYSTEM): $(VL_OBJS) libqemu.a +$(QEMU_SYSTEM): $(VL_OBJS) libqemu.a $(DEPLIBS) $(CC) $(VL_LDFLAGS) -o $@ $^ $(LIBS) $(SDL_LIBS) $(COCOA_LIBS) $(VL_LIBS) cocoa.o: cocoa.m diff --git a/hw/hypercall.c b/hw/hypercall.c index a1f629ca4..e531cb583 100644 --- a/hw/hypercall.c +++ b/hw/hypercall.c @@ -25,21 +25,22 @@ #include "vl.h" #include <stddef.h> -#define HP_CMD 0x00 // The command register WR -#define HP_ISRSTATUS 0x04 // Interrupt status reg RD +#define HCR_REGISTER 0x00 // Hypercall Command Register WR +#define HSR_REGISTER 0x04 // Hypercall Status Register RD #define HP_TXSIZE 0x08 #define HP_TXBUFF 0x0c #define HP_RXSIZE 0x10 #define HP_RXBUFF 0x14 -// HP_CMD register commands -#define HP_CMD_DI 1 // disable interrupts -#define HP_CMD_EI 2 // enable interrupts -#define HP_CMD_RESET 4 // enable interrupts +// HCR_REGISTER commands +#define HCR_DI 1 // disable interrupts +#define HCR_EI 2 // enable interrupts +#define HCR_GRS 4 // Global reset +#define HCR_RESET (HCR_GRS|HCR_DI) -/* Bits in HP_ISR - Interrupt status register */ -#define HPISR_RX 0x01 // Data is ready to be read +// Bits in HSR_REGISTER +#define HSR_VDR 0x01 // vmchannel Data is ready to be read int use_hypercall_dev = 0; static CharDriverState *vmchannel_hd; @@ -47,8 +48,8 @@ static CharDriverState *vmchannel_hd; #define HP_MEM_SIZE 0xE0 typedef struct HypercallState { - uint32_t cmd; - uint32_t 
isr; + uint32_t hcr; + uint32_t hsr; uint32_t txsize; uint32_t txbuff; uint32_t rxsize; @@ -61,32 +62,40 @@ typedef struct HypercallState { HypercallState *pHypercallState = NULL; + +#define HYPERCALL_DEBUG 1 + static void hp_reset(HypercallState *s) { - s->cmd = 0; - s->isr = 0; + s->hcr = 0; + s->hsr = 0; s->txsize = 0; s->txbuff = 0; s->rxsize= 0; s->txbufferaccu_offset = 0; } +static void hypercall_update_irq(HypercallState *s); + + static void hp_ioport_write(void *opaque, uint32_t addr, uint32_t val) { HypercallState *s = opaque; - //printf("hp_ioport_write,addr=0x%x, val=0x%x\n",addr, val); - +#ifdef HYPERCALL_DEBUG + printf("%s: addr=0x%x, val=0x%x\n", __FUNCTION__, addr, val); +#endif addr &= 0xff; switch(addr) { - case HP_CMD: + case HCR_REGISTER: { - s->cmd = val; - if (val == HP_CMD_RESET){ + s->hcr = val; + if (s->hcr & HCR_DI) + hypercall_update_irq(s); + if (val & HCR_GRS){ hp_reset(s); - return; } break; } @@ -115,7 +124,6 @@ static void hp_ioport_write(void *opaque, uint32_t addr, uint32_t val) s->txbufferaccu[s->txbufferaccu_offset] = val; s->txbufferaccu_offset++; if (s->txbufferaccu_offset >= s->txsize) { - printf("tranmit txbuf, Len:0x%x\n", s->txbufferaccu_offset); qemu_chr_write(vmchannel_hd, s->txbufferaccu, s->txsize); s->txbufferaccu_offset = 0; s->txsize = 0; @@ -134,10 +142,9 @@ static uint32_t hp_ioport_read(void *opaque, uint32_t addr) HypercallState *s = opaque; int ret; - if (addr != 0xc204) { - //printf("hp_ioport_read addr:0x%x\n",addr); - } - +#ifdef HYPERCALL_DEBUG + printf("%s: addr=0x%x\n", __FUNCTION__, addr); +#endif addr &= 0xff; if (addr >= offsetof(HypercallState, RxBuff) ) @@ -149,13 +156,10 @@ static uint32_t hp_ioport_read(void *opaque, uint32_t addr) switch (addr) { - case HP_ISRSTATUS: - if (s->isr != 0){ - printf("hp_ioport_read s->isr=0x%x\n", s->isr); - } - ret = s->isr; - if (ret & HPISR_RX) { - s->isr &= ~HPISR_RX; + case HSR_REGISTER: + ret = s->hsr; + if (ret & HSR_VDR) { + s->hsr &= ~HSR_VDR; } break; case 
HP_RXSIZE: @@ -192,13 +196,8 @@ static void hp_map(PCIDevice *pci_dev, int region_num, static void hypercall_update_irq(HypercallState *s) { - printf("hypercall_update_irq\n"); - - if (s->cmd &= HP_CMD_DI) { - return; - } - /* PCI irq */ - pci_set_irq(s->pci_dev, 0, 1); + /* PCI irq */ + pci_set_irq(s->pci_dev, 0, !(s->hcr & HCR_DI)); } void pci_hypercall_init(PCIBus *bus) @@ -250,24 +249,21 @@ static int vmchannel_can_read(void *opaque) static void vmchannel_read(void *opaque, const uint8_t *buf, int size) { int i; - - printf("vmchannel_read buf:%p, size:%d\n", buf, size); - for(i = 0; i < size; i++) { - printf("%x,", buf[i]); - } - printf("\n"); + +#ifdef HYPERCALL_DEBUG + printf("vmchannel_read buf:%s, size:%d\n", buf, size); +#endif // if the hypercall device is in interrupts disabled state, don't accept the data - if (pHypercallState->cmd &= HP_CMD_DI) { + if (pHypercallState->hcr & HCR_DI) { return; } for(i = 0; i < size; i++) { - //printf("buf[i%d]=%x\n",i, buf[i]); pHypercallState->RxBuff[i] = buf[i]; } pHypercallState->rxsize = size; - pHypercallState->isr = HPISR_RX; + pHypercallState->hsr = HSR_VDR; hypercall_update_irq(pHypercallState); } @@ -275,7 +271,9 @@ void vmchannel_init(CharDriverState *hd) { vmchannel_hd = hd; - //printf("vmchannel_init\n"); +#ifdef HYPERCALL_DEBUG + printf("vmchannel_init\n"); +#endif use_hypercall_dev = 1; qemu_chr_add_read_handler(vmchannel_hd, vmchannel_can_read, vmchannel_read, &pHypercallState); @@ -1396,9 +1396,13 @@ static void vga_draw_graphic(VGAState *s, int full_update) /* HACK ALERT */ #define BITMAP_SIZE ((8*1024*1024) / 4096 / 8 / sizeof(long)) unsigned long bitmap[BITMAP_SIZE]; + int r; - if (kvm_allowed) - kvm_get_dirty_pages(kvm_context, 1, &bitmap); + if (kvm_allowed) { + r = kvm_get_dirty_pages(kvm_context, 1, &bitmap); + if (r < 0) + fprintf(stderr, "kvm: get_dirty_pages returned %d\n", r); + } #endif full_update |= update_basic_params(s); diff --git a/kvm/Makefile b/kvm/Makefile index 48dda0264..48549299d 
100644 --- a/kvm/Makefile +++ b/kvm/Makefile @@ -68,4 +68,4 @@ clean: for i in $(if $(WANT_MODULE), kernel) user qemu; do \ make -C $$i clean; \ done - rm -f config.make user/config.mak + rm -f config.mak user/config.mak diff --git a/kvm/drivers/Makefile b/kvm/drivers/Makefile index d0b681d43..56facbb0e 100644 --- a/kvm/drivers/Makefile +++ b/kvm/drivers/Makefile @@ -1,4 +1,5 @@ -KERNELDIR := /lib/modules/$(shell uname -r)/build +include ../config.mak +KERNELDIR ?= /lib/modules/$(shell uname -r)/build KVERREL = $(patsubst /lib/modules/%/build,%,$(KERNELDIR)) DESTDIR= diff --git a/kvm/drivers/hypercall.c b/kvm/drivers/hypercall.c index 9c9462f66..e5f4c8b56 100644 --- a/kvm/drivers/hypercall.c +++ b/kvm/drivers/hypercall.c @@ -6,11 +6,12 @@ #include <linux/init.h> #include <linux/ioport.h> #include <linux/completion.h> +#include <linux/interrupt.h> #include <asm/io.h> #include <asm/uaccess.h> #include <asm/irq.h> -#define HYPERCALL_DRIVER_NAME "Qumranet hypercall driver" +#define HYPERCALL_DRIVER_NAME "Qumranet_hypercall_driver" #define HYPERCALL_DRIVER_VERSION "1" #define PCI_VENDOR_ID_HYPERCALL 0x5002 #define PCI_DEVICE_ID_HYPERCALL 0x2258 @@ -43,26 +44,68 @@ static struct pci_device_id hypercall_pci_tbl[] = { }; MODULE_DEVICE_TABLE (pci, hypercall_pci_tbl); + + +/****** Hypercall device definitions ***************/ +/* To be moved into a shared file with user space */ +#define HP_CMD 0x00 // The command register WR +#define HP_ISRSTATUS 0x04 // Interrupt status reg RD +#define HP_TXSIZE 0x08 +#define HP_TXBUFF 0x0c +#define HP_RXSIZE 0x10 +#define HP_RXBUFF 0x14 + +// HP_CMD register commands +#define HP_CMD_DI 1 // disable interrupts +#define HP_CMD_EI 2 // enable interrupts +#define HP_CMD_INIT 4 // reset device +#define HP_CMD_RESET (HP_CMD_INIT|HP_CMD_DI) + +/* Bits in HP_ISR - Interrupt status register */ +#define HPISR_RX 0x01 // Data is ready to be read + +#define HP_MEM_SIZE 0xE0 +/******* End of Hypercall device definitions */ + +/* read PIO/MMIO register 
*/ +#define HIO_READ8(reg, ioaddr) ioread8(ioaddr + (reg)) +#define HIO_READ16(reg, ioaddr) ioread16(ioaddr + (reg)) +#define HIO_READ32(reg, ioaddr) ioread32(ioaddr + (reg)) + +/* write PIO/MMIO register */ +#define HIO_WRITE8(reg, val8, ioaddr) iowrite8((val8), ioaddr + (reg)) +#define HIO_WRITE16(reg, val16, ioaddr) iowrite16((val16), ioaddr + (reg)) +#define HIO_WRITE32(reg, val32, ioaddr) iowrite32((val32), ioaddr + (reg)) + + struct hypercall_dev { struct pci_dev *pci_dev; + struct kobject kobject; u32 state; spinlock_t lock; u8 name[128]; u16 irq; u32 regs_len; - void __iomem *mmio_addr; + void __iomem *io_addr; unsigned long base_addr; /* device I/O address */ + unsigned long cmd; }; - +static int hypercall_close(struct hypercall_dev* dev); +static int hypercall_open(struct hypercall_dev *dev); static void hypercall_cleanup_dev(struct hypercall_dev *dev); +static irqreturn_t hypercall_interrupt(int irq, void *dev_instance, + struct pt_regs *regs); + +static void __exit hypercall_sysfs_remove(struct hypercall_dev *dev); +static int hypercall_sysfs_add(struct hypercall_dev *dev); static int __devinit hypercall_init_board(struct pci_dev *pdev, struct hypercall_dev **dev_out) { - unsigned long *ioaddr; + unsigned long ioaddr; struct hypercall_dev *dev; int rc; u32 disable_dev_on_err = 0; @@ -101,17 +144,17 @@ static int __devinit hypercall_init_board(struct pci_dev *pdev, if (rc) goto err_out; - pci_set_master (pdev); - #define USE_IO_OPS 1 #ifdef USE_IO_OPS - ioaddr = pci_iomap(pdev, 0, 0); + ioaddr = (unsigned long)pci_iomap(pdev, 0, 0); + //ioaddr = ioport_map(pio_start, pio_len); if (!ioaddr) { printk(KERN_ERR "%s: cannot map PIO, aborting\n", pci_name(pdev)); rc = -EIO; goto err_out; } - dev->base_addr = (unsigned long)ioaddr; + dev->base_addr = (unsigned long)pio_start; + dev->io_addr = (void*)ioaddr; dev->regs_len = pio_len; #else ioaddr = pci_iomap(pdev, 1, 0); @@ -121,6 +164,7 @@ static int __devinit hypercall_init_board(struct pci_dev *pdev, goto 
err_out; } dev->base_addr = ioaddr; + dev->io_addr = (void*)ioaddr; dev->regs_len = mmio_len; #endif /* USE_IO_OPS */ @@ -161,7 +205,13 @@ static int __devinit hypercall_init_one(struct pci_dev *pdev, spin_lock_init(&dev->lock); pci_set_drvdata(pdev, dev); - printk (KERN_INFO "%s: 0x%lx, IRQ %d\n", dev->name, dev->base_addr, dev->irq); + printk (KERN_INFO "name=%s: base_addr=0x%lx, io_addr=0x%lx, IRQ=%d\n", + dev->name, dev->base_addr, (unsigned long)dev->io_addr, dev->irq); + hypercall_open(dev); + + if (hypercall_sysfs_add(dev) != 0) + return -1; + return 0; } @@ -171,10 +221,111 @@ static void __devexit hypercall_remove_one(struct pci_dev *pdev) assert(dev != NULL); + hypercall_close(dev); + hypercall_sysfs_remove(dev); hypercall_cleanup_dev(dev); pci_disable_device(pdev); } +static int hypercall_tx(struct hypercall_dev *dev, unsigned char *buf, size_t len) +{ + void __iomem *ioaddr = (void __iomem*)dev->io_addr; + int i; + + if (len > HP_MEM_SIZE) + return -EINVAL; + + spin_lock(&dev->lock); + HIO_WRITE8(HP_TXSIZE, len, ioaddr); + for (i=0; i< len; i++) + HIO_WRITE8(HP_TXBUFF, buf[i], ioaddr); + spin_unlock(&dev->lock); + + return 0; +} + +/* + * The interrupt handler does all of the rx work and cleans up + * after the tx + */ +static irqreturn_t hypercall_interrupt(int irq, void *dev_instance, + struct pt_regs *regs) +{ + struct hypercall_dev *dev = (struct hypercall_dev *)dev_instance; + void __iomem *ioaddr = (void __iomem*)dev->io_addr; + u32 status; + int irq_handled = IRQ_NONE; + int rx_buf_size; + int i; + u8 buffer[HP_MEM_SIZE]; + u8 *pbuf; + + DPRINTK("base addr is 0x%lx, io_addr=0x%lx\n", dev->base_addr, (long)dev->io_addr); + + spin_lock(&dev->lock); + status = HIO_READ8(HP_ISRSTATUS, ioaddr); + DPRINTK("irq status is 0x%x\n", status); + + /* shared irq? 
*/ + if (unlikely((status & HPISR_RX) == 0)) { + DPRINTK("not handeling irq, not ours\n"); + goto out; + } + + /* Disable device interrupts */ + HIO_WRITE8(HP_CMD, HP_CMD_DI, ioaddr); + DPRINTK("disable device interrupts\n"); + + rx_buf_size = HIO_READ8(HP_RXSIZE, ioaddr); + DPRINTK("Rx buffer size is %d\n", rx_buf_size); + + if (rx_buf_size > HP_MEM_SIZE) + rx_buf_size = HP_MEM_SIZE; + + for (i=0, pbuf=buffer; i<rx_buf_size; i++, pbuf++) { + *pbuf = HIO_READ8(HP_RXBUFF, ioaddr + i); + DPRINTK("Read 0x%x as dword %d\n", *pbuf, i); + } + *pbuf = '\0'; + DPRINTK("Read buffer %s", (char*)buffer); + + HIO_WRITE8(HP_CMD, HP_CMD_EI, ioaddr); + DPRINTK("Enable interrupt\n"); + irq_handled = IRQ_HANDLED; + out: + spin_unlock(&dev->lock); + + + hypercall_tx(dev, "hello host", sizeof("hello host")); + return irq_handled; +} + + +static int hypercall_open(struct hypercall_dev *dev) +{ + int rc; + + rc = request_irq(dev->irq, &hypercall_interrupt, + SA_SHIRQ, dev->name, dev); + if (rc) { + printk(KERN_ERR "%s failed to request an irq\n", __FUNCTION__); + return rc; + } + + //hypercall_thread_start(dev); + + return 0; +} + +static int hypercall_close(struct hypercall_dev* dev) +{ + //hypercall_thread_stop(dev); + synchronize_irq(dev->irq); + free_irq(dev->irq, dev); + + return 0; +} + #ifdef CONFIG_PM static int hypercall_suspend(struct pci_dev *pdev, pm_message_t state) @@ -201,7 +352,8 @@ static void hypercall_cleanup_dev(struct hypercall_dev *dev) { DPRINTK("cleaning up\n"); pci_release_regions(dev->pci_dev); - pci_iounmap(dev->pci_dev, (void*)dev->base_addr); + pci_iounmap(dev->pci_dev, (void*)dev->io_addr); + pci_set_drvdata (dev->pci_dev, NULL); kfree(dev); } @@ -227,5 +379,137 @@ static void __exit hypercall_cleanup_module(void) pci_unregister_driver(&hypercall_pci_driver); } +/* + * sysfs support + */ + +struct hypercall_attribute { + struct attribute attr; + ssize_t (*show)(struct hypercall_dev*, char *buf); + ssize_t (*store)(struct hypercall_dev*, unsigned long val); 
+}; + +static ssize_t hypercall_attribute_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct hypercall_attribute *hypercall_attr; + struct hypercall_dev *hdev; + + hypercall_attr = container_of(attr, struct hypercall_attribute, attr); + hdev = container_of(kobj, struct hypercall_dev, kobject); + + if (!hypercall_attr->show) + return -EIO; + + return hypercall_attr->show(hdev, buf); +} + +static ssize_t hypercall_attribute_store(struct kobject *kobj, + struct attribute *attr, const char *buf, size_t count) +{ + struct hypercall_attribute *hypercall_attr; + struct hypercall_dev *hdev; + char *endp; + unsigned long val; + int rc; + + val = simple_strtoul(buf, &endp, 0); + + hypercall_attr = container_of(attr, struct hypercall_attribute, attr); + hdev = container_of(kobj, struct hypercall_dev, kobject); + + if (!hypercall_attr->store) + return -EIO; + + rc = hypercall_attr->store(hdev, val); + if (!rc) + rc = count; + return rc; +} + +#define MAKE_HYPERCALL_R_ATTR(_name) \ +static ssize_t _name##_show(struct hypercall_dev *hdev, char *buf) \ +{ \ + return sprintf(buf, "%lu\n", (unsigned long)hdev->_name); \ +} \ +struct hypercall_attribute hypercall_attr_##_name = __ATTR_RO(_name) + +#define MAKE_HYPERCALL_WR_ATTR(_name) \ +static int _name##_store(struct hypercall_dev *hdev, unsigned long val) \ +{ \ + hdev->_name = (typeof(hdev->_name))val; \ + return 0; \ +} \ +static ssize_t _name##_show(struct hypercall_dev *hdev, char *buf) \ +{ \ + return sprintf(buf, "%lu\n", (unsigned long)hdev->_name); \ +} \ +struct hypercall_attribute hypercall_attr_##_name = \ + __ATTR(_name,S_IRUGO|S_IWUGO,_name##_show,_name##_store) + +MAKE_HYPERCALL_R_ATTR(base_addr); +MAKE_HYPERCALL_R_ATTR(irq); +MAKE_HYPERCALL_WR_ATTR(cmd); + +#define GET_HYPERCALL_ATTR(_name) (&hypercall_attr_##_name.attr) + +static struct attribute *hypercall_default_attrs[] = { + GET_HYPERCALL_ATTR(base_addr), + GET_HYPERCALL_ATTR(irq), + GET_HYPERCALL_ATTR(cmd), + NULL +}; + +static struct 
sysfs_ops hypercall_sysfs_ops = { + .show = hypercall_attribute_show, + .store = hypercall_attribute_store, +}; + +static void hypercall_sysfs_release(struct kobject *kobj) +{ + DPRINTK(" called for obj name %s\n", kobj->name); +} + +static struct kobj_type hypercall_ktype = { + .release = hypercall_sysfs_release, + .sysfs_ops = &hypercall_sysfs_ops, + .default_attrs = hypercall_default_attrs +}; + + +static int hypercall_sysfs_add(struct hypercall_dev *dev) +{ + int rc; + + kobject_init(&dev->kobject); + dev->kobject.ktype = &hypercall_ktype; + rc = kobject_set_name(&dev->kobject, "%s", HYPERCALL_DRIVER_NAME); + if (rc != 0) { + printk("%s: kobject_set_name failed, err=%d\n", __FUNCTION__, rc); + return rc; + } + + rc = kobject_add(&dev->kobject); + if (rc != 0) { + printk("%s: kobject_add failed, err=%d\n", __FUNCTION__, rc); + return rc; + } + + rc = sysfs_create_link(&dev->pci_dev->dev.kobj, &dev->kobject, + HYPERCALL_DRIVER_NAME); + if (rc != 0) { + printk("%s: sysfs_create_link failed, err=%d\n", __FUNCTION__, rc); + kobject_del(&dev->kobject); + } + + return rc; +} + +static void hypercall_sysfs_remove(struct hypercall_dev *dev) +{ + sysfs_remove_link(&dev->pci_dev->dev.kobj, HYPERCALL_DRIVER_NAME); + kobject_del(&dev->kobject); +} + module_init(hypercall_init_module); module_exit(hypercall_cleanup_module); diff --git a/kvm/kernel/Makefile b/kvm/kernel/Makefile index 454f5f1bd..78e31a4c7 100644 --- a/kvm/kernel/Makefile +++ b/kvm/kernel/Makefile @@ -1,4 +1,5 @@ -KERNELDIR := /lib/modules/$(shell uname -r)/build +include ../config.mak +KERNELDIR ?= /lib/modules/$(shell uname -r)/build KVERREL = $(patsubst /lib/modules/%/build,%,$(KERNELDIR)) DESTDIR= diff --git a/kvm/kernel/external-module-compat.h b/kvm/kernel/external-module-compat.h index 830c46436..79608730f 100644 --- a/kvm/kernel/external-module-compat.h +++ b/kvm/kernel/external-module-compat.h @@ -72,6 +72,28 @@ static inline int smp_call_function_single1(int cpu, void (*func)(void *info), * The cpu 
hotplug stubs are broken if !CONFIG_CPU_HOTPLUG */ +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,15) +#define DEFINE_MUTEX(a) DECLARE_MUTEX(a) +#define mutex_lock_interruptible(a) down_interruptible(a) +#define mutex_unlock(a) up(a) +#define mutex_lock(a) down(a) +#define mutex_init(a) init_MUTEX(a) +#define mutex_trylock(a) down_trylock(a) +#define mutex semaphore +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14) +#ifndef kzalloc +#define kzalloc(size,flags) \ +({ \ + void *__ret = kmalloc(size, flags); \ + if (__ret) + memset(__ret, 0, size); + __ret; +}) +#endif +#endif + #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,21) #ifndef CONFIG_HOTPLUG_CPU diff --git a/kvm/kernel/include/linux/kvm_para.h b/kvm/kernel/include/linux/kvm_para.h new file mode 100644 index 000000000..3b292565a --- /dev/null +++ b/kvm/kernel/include/linux/kvm_para.h @@ -0,0 +1,73 @@ +#ifndef __LINUX_KVM_PARA_H +#define __LINUX_KVM_PARA_H + +/* + * Guest OS interface for KVM paravirtualization + * + * Note: this interface is totally experimental, and is certain to change + * as we make progress. + */ + +/* + * Per-VCPU descriptor area shared between guest and host. Writable to + * both guest and host. Registered with the host by the guest when + * a guest acknowledges paravirtual mode. + * + * NOTE: all addresses are guest-physical addresses (gpa), to make it + * easier for the hypervisor to map between the various addresses. + */ +struct kvm_vcpu_para_state { + /* + * API version information for compatibility. If there's any support + * mismatch (too old host trying to execute too new guest) then + * the host will deny entry into paravirtual mode. Any other + * combination (new host + old guest and new host + new guest) + * is supposed to work - new host versions will support all old + * guest API versions. 
+ */ + u32 guest_version; + u32 host_version; + u32 size; + u32 ret; + + /* + * The address of the vm exit instruction (VMCALL or VMMCALL), + * which the host will patch according to the CPU model the + * VM runs on: + */ + u64 hypercall_gpa; + +} __attribute__ ((aligned(PAGE_SIZE))); + +#define KVM_PARA_API_VERSION 1 + +/* + * This is used for an RDMSR's ECX parameter to probe for a KVM host. + * Hopefully no CPU vendor will use up this number. This is placed well + * out of way of the typical space occupied by CPU vendors' MSR indices, + * and we think (or at least hope) it wont be occupied in the future + * either. + */ +#define MSR_KVM_API_MAGIC 0x87655678 + +#define KVM_EINVAL 1 + +/* + * Hypercall calling convention: + * + * Each hypercall may have 0-6 parameters. + * + * 64-bit hypercall index is in RAX, goes from 0 to __NR_hypercalls-1 + * + * 64-bit parameters 1-6 are in the standard gcc x86_64 calling convention + * order: RDI, RSI, RDX, RCX, R8, R9. + * + * 32-bit index is EBX, parameters are: EAX, ECX, EDX, ESI, EDI, EBP. + * (the first 3 are according to the gcc regparm calling convention) + * + * No registers are clobbered by the hypercall, except that the + * return value is in RAX. + */ +#define __NR_hypercalls 0 + +#endif diff --git a/kvm/kernel/include/linux/mutex.h b/kvm/kernel/include/linux/mutex.h new file mode 100644 index 000000000..71b2ae109 --- /dev/null +++ b/kvm/kernel/include/linux/mutex.h @@ -0,0 +1,5 @@ +/* + * Empty file to satisfy #include <linux/mutex.h> for older kernels. 
+ */ + + diff --git a/kvm/kernel/kvm.h b/kvm/kernel/kvm.h index 04574a9d4..41cc27de4 100644 --- a/kvm/kernel/kvm.h +++ b/kvm/kernel/kvm.h @@ -14,6 +14,7 @@ #include "vmx.h" #include <linux/kvm.h> +#include <linux/kvm_para.h> #define CR0_PE_MASK (1ULL << 0) #define CR0_TS_MASK (1ULL << 3) @@ -237,6 +238,9 @@ struct kvm_vcpu { unsigned long cr0; unsigned long cr2; unsigned long cr3; + gpa_t para_state_gpa; + struct page *para_state_page; + gpa_t hypercall_gpa; unsigned long cr4; unsigned long cr8; u64 pdptrs[4]; /* pae */ @@ -382,6 +386,8 @@ struct kvm_arch_ops { int (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); int (*vcpu_setup)(struct kvm_vcpu *vcpu); void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); + void (*patch_hypercall)(struct kvm_vcpu *vcpu, + unsigned char *hypercall_addr); }; extern struct kvm_stat kvm_stat; @@ -476,6 +482,8 @@ void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes); int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); +int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); + static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code) { @@ -523,7 +531,7 @@ static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) { struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); - return (struct kvm_mmu_page *)page->private; + return (struct kvm_mmu_page *)page_private(page); } static inline u16 read_fs(void) diff --git a/kvm/kernel/kvm_main.c b/kvm/kernel/kvm_main.c index f8b70bbce..a8b3691af 100644 --- a/kvm/kernel/kvm_main.c +++ b/kvm/kernel/kvm_main.c @@ -126,10 +126,8 @@ static inline int valid_vcpu(int n) return likely(n >= 0 && n < KVM_MAX_VCPUS); } -int kvm_read_guest(struct kvm_vcpu *vcpu, - gva_t addr, - unsigned long size, - void *dest) +int kvm_read_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size, + void *dest) { unsigned char *host_buf = dest; unsigned long req_size = size; @@ -161,10 
+159,8 @@ int kvm_read_guest(struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(kvm_read_guest); -int kvm_write_guest(struct kvm_vcpu *vcpu, - gva_t addr, - unsigned long size, - void *data) +int kvm_write_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size, + void *data) { unsigned char *host_buf = data; unsigned long req_size = size; @@ -457,7 +453,7 @@ EXPORT_SYMBOL_GPL(set_cr4); void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { if (is_long_mode(vcpu)) { - if ( cr3 & CR3_L_MODE_RESEVED_BITS) { + if (cr3 & CR3_L_MODE_RESEVED_BITS) { printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); inject_gp(vcpu); return; @@ -674,7 +670,7 @@ raced: | __GFP_ZERO); if (!new.phys_mem[i]) goto out_free; - new.phys_mem[i]->private = 0; + set_page_private(new.phys_mem[i],0); } } @@ -774,7 +770,6 @@ static int kvm_dev_ioctl_get_dirty_log(struct kvm *kvm, if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) goto out; - if (any) { cleared = 0; for (i = 0; i < KVM_MAX_VCPUS; ++i) { @@ -903,8 +898,9 @@ static int emulator_read_emulated(unsigned long addr, return X86EMUL_CONTINUE; else { gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr); + if (gpa == UNMAPPED_GVA) - return vcpu_printf(vcpu, "not present\n"), X86EMUL_PROPAGATE_FAULT; + return X86EMUL_PROPAGATE_FAULT; vcpu->mmio_needed = 1; vcpu->mmio_phys_addr = gpa; vcpu->mmio_size = bytes; @@ -1142,6 +1138,42 @@ int emulate_instruction(struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(emulate_instruction); +int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + unsigned long nr, a0, a1, a2, a3, a4, a5, ret; + + kvm_arch_ops->decache_regs(vcpu); + ret = -KVM_EINVAL; +#ifdef CONFIG_X86_64 + if (is_long_mode(vcpu)) { + nr = vcpu->regs[VCPU_REGS_RAX]; + a0 = vcpu->regs[VCPU_REGS_RDI]; + a1 = vcpu->regs[VCPU_REGS_RSI]; + a2 = vcpu->regs[VCPU_REGS_RDX]; + a3 = vcpu->regs[VCPU_REGS_RCX]; + a4 = vcpu->regs[VCPU_REGS_R8]; + a5 = vcpu->regs[VCPU_REGS_R9]; + } else +#endif + { + nr = vcpu->regs[VCPU_REGS_RBX] & -1u; + a0 = 
vcpu->regs[VCPU_REGS_RAX] & -1u; + a1 = vcpu->regs[VCPU_REGS_RCX] & -1u; + a2 = vcpu->regs[VCPU_REGS_RDX] & -1u; + a3 = vcpu->regs[VCPU_REGS_RSI] & -1u; + a4 = vcpu->regs[VCPU_REGS_RDI] & -1u; + a5 = vcpu->regs[VCPU_REGS_RBP] & -1u; + } + switch (nr) { + default: + ; + } + vcpu->regs[VCPU_REGS_RAX] = ret; + kvm_arch_ops->cache_regs(vcpu); + return 1; +} +EXPORT_SYMBOL_GPL(kvm_hypercall); + static u64 mk_cr_64(u64 curr_cr, u32 new_val) { return (curr_cr & ~((1ULL << 32) - 1)) | new_val; @@ -1208,6 +1240,73 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, } } +/* + * Register the para guest with the host: + */ +static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa) +{ + struct kvm_vcpu_para_state *para_state; + hpa_t para_state_hpa, hypercall_hpa; + struct page *para_state_page; + unsigned char *hypercall; + gpa_t hypercall_gpa; + + printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n"); + printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa); + + /* + * Needs to be page aligned: + */ + if (para_state_gpa != PAGE_ALIGN(para_state_gpa)) + goto err_gp; + + para_state_hpa = gpa_to_hpa(vcpu, para_state_gpa); + printk(KERN_DEBUG ".... para_state_hpa: %08Lx\n", para_state_hpa); + if (is_error_hpa(para_state_hpa)) + goto err_gp; + + para_state_page = pfn_to_page(para_state_hpa >> PAGE_SHIFT); + para_state = kmap_atomic(para_state_page, KM_USER0); + + printk(KERN_DEBUG ".... guest version: %d\n", para_state->guest_version); + printk(KERN_DEBUG ".... size: %d\n", para_state->size); + + para_state->host_version = KVM_PARA_API_VERSION; + /* + * We cannot support guests that try to register themselves + * with a newer API version than the host supports: + */ + if (para_state->guest_version > KVM_PARA_API_VERSION) { + para_state->ret = -KVM_EINVAL; + goto err_kunmap_skip; + } + + hypercall_gpa = para_state->hypercall_gpa; + hypercall_hpa = gpa_to_hpa(vcpu, hypercall_gpa); + printk(KERN_DEBUG ".... 
hypercall_hpa: %08Lx\n", hypercall_hpa); + if (is_error_hpa(hypercall_hpa)) { + para_state->ret = -KVM_EINVAL; + goto err_kunmap_skip; + } + + printk(KERN_DEBUG "kvm: para guest successfully registered.\n"); + vcpu->para_state_page = para_state_page; + vcpu->para_state_gpa = para_state_gpa; + vcpu->hypercall_gpa = hypercall_gpa; + + hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT), + KM_USER1) + (hypercall_hpa & ~PAGE_MASK); + kvm_arch_ops->patch_hypercall(vcpu, hypercall); + kunmap_atomic(hypercall, KM_USER1); + + para_state->ret = 0; +err_kunmap_skip: + kunmap_atomic(para_state, KM_USER0); + return 0; +err_gp: + return 1; +} + int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { u64 data; @@ -1316,6 +1415,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_IA32_MISC_ENABLE: vcpu->ia32_misc_enable_msr = data; break; + /* + * This is the 'probe whether the host is KVM' logic: + */ + case MSR_KVM_API_MAGIC: + return vcpu_register_para(vcpu, data); + default: printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr); return 1; @@ -1800,12 +1905,11 @@ static long kvm_dev_ioctl(struct file *filp, case KVM_GET_API_VERSION: r = KVM_API_VERSION; break; - case KVM_CREATE_VCPU: { + case KVM_CREATE_VCPU: r = kvm_dev_ioctl_create_vcpu(kvm, arg); if (r) goto out; break; - } case KVM_RUN: { struct kvm_run kvm_run; @@ -2079,13 +2183,17 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, int cpu = (long)v; switch (val) { - case CPU_DEAD: + case CPU_DOWN_PREPARE: case CPU_UP_CANCELED: + printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", + cpu); decache_vcpus_on_cpu(cpu); smp_call_function_single(cpu, kvm_arch_ops->hardware_disable, NULL, 0, 1); break; - case CPU_UP_PREPARE: + case CPU_ONLINE: + printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", + cpu); smp_call_function_single(cpu, kvm_arch_ops->hardware_enable, NULL, 0, 1); break; diff --git a/kvm/kernel/kvm_svm.h 
b/kvm/kernel/kvm_svm.h index 74cc862f4..624f1ca48 100644 --- a/kvm/kernel/kvm_svm.h +++ b/kvm/kernel/kvm_svm.h @@ -1,6 +1,7 @@ #ifndef __KVM_SVM_H #define __KVM_SVM_H +#include <linux/kernel.h> #include <linux/types.h> #include <linux/list.h> #include <asm/msr.h> @@ -18,7 +19,7 @@ static const u32 host_save_msrs[] = { MSR_IA32_LASTBRANCHTOIP, MSR_IA32_LASTINTFROMIP,MSR_IA32_LASTINTTOIP,*/ }; -#define NR_HOST_SAVE_MSRS (sizeof(host_save_msrs) / sizeof(*host_save_msrs)) +#define NR_HOST_SAVE_MSRS ARRAY_SIZE(host_save_msrs) #define NUM_DB_REGS 4 struct vcpu_svm { diff --git a/kvm/kernel/mmu.c b/kvm/kernel/mmu.c index 22c426cd8..573867a50 100644 --- a/kvm/kernel/mmu.c +++ b/kvm/kernel/mmu.c @@ -298,18 +298,18 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte) if (!is_rmap_pte(*spte)) return; page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); - if (!page->private) { + if (!page_private(page)) { rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); - page->private = (unsigned long)spte; - } else if (!(page->private & 1)) { + set_page_private(page,(unsigned long)spte); + } else if (!(page_private(page) & 1)) { rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte); desc = mmu_alloc_rmap_desc(vcpu); - desc->shadow_ptes[0] = (u64 *)page->private; + desc->shadow_ptes[0] = (u64 *)page_private(page); desc->shadow_ptes[1] = spte; - page->private = (unsigned long)desc | 1; + set_page_private(page,(unsigned long)desc | 1); } else { rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); - desc = (struct kvm_rmap_desc *)(page->private & ~1ul); + desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul); while (desc->shadow_ptes[RMAP_EXT-1] && desc->more) desc = desc->more; if (desc->shadow_ptes[RMAP_EXT-1]) { @@ -337,12 +337,12 @@ static void rmap_desc_remove_entry(struct kvm_vcpu *vcpu, if (j != 0) return; if (!prev_desc && !desc->more) - page->private = (unsigned long)desc->shadow_ptes[0]; + set_page_private(page,(unsigned long)desc->shadow_ptes[0]); 
else if (prev_desc) prev_desc->more = desc->more; else - page->private = (unsigned long)desc->more | 1; + set_page_private(page,(unsigned long)desc->more | 1); mmu_free_rmap_desc(vcpu, desc); } @@ -356,20 +356,20 @@ static void rmap_remove(struct kvm_vcpu *vcpu, u64 *spte) if (!is_rmap_pte(*spte)) return; page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); - if (!page->private) { + if (!page_private(page)) { printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); BUG(); - } else if (!(page->private & 1)) { + } else if (!(page_private(page) & 1)) { rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); - if ((u64 *)page->private != spte) { + if ((u64 *)page_private(page) != spte) { printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", spte, *spte); BUG(); } - page->private = 0; + set_page_private(page,0); } else { rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); - desc = (struct kvm_rmap_desc *)(page->private & ~1ul); + desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul); prev_desc = NULL; while (desc) { for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i) @@ -398,11 +398,11 @@ static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) BUG_ON(!slot); page = gfn_to_page(slot, gfn); - while (page->private) { - if (!(page->private & 1)) - spte = (u64 *)page->private; + while (page_private(page)) { + if (!(page_private(page) & 1)) + spte = (u64 *)page_private(page); else { - desc = (struct kvm_rmap_desc *)(page->private & ~1ul); + desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul); spte = desc->shadow_ptes[0]; } BUG_ON(!spte); @@ -1218,7 +1218,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu) INIT_LIST_HEAD(&page_header->link); if ((page = alloc_page(GFP_KERNEL)) == NULL) goto error_1; - page->private = (unsigned long)page_header; + set_page_private(page, (unsigned long)page_header); page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT; memset(__va(page_header->page_hpa), 0, PAGE_SIZE); 
list_add(&page_header->link, &vcpu->free_pages); diff --git a/kvm/kernel/paging_tmpl.h b/kvm/kernel/paging_tmpl.h index b6b90e9e1..f3bcee904 100644 --- a/kvm/kernel/paging_tmpl.h +++ b/kvm/kernel/paging_tmpl.h @@ -128,8 +128,10 @@ static int FNAME(walk_addr)(struct guest_walker *walker, goto access_error; #endif - if (!(*ptep & PT_ACCESSED_MASK)) - *ptep |= PT_ACCESSED_MASK; /* avoid rmw */ + if (!(*ptep & PT_ACCESSED_MASK)) { + mark_page_dirty(vcpu->kvm, table_gfn); + *ptep |= PT_ACCESSED_MASK; + } if (walker->level == PT_PAGE_TABLE_LEVEL) { walker->gfn = (*ptep & PT_BASE_ADDR_MASK) @@ -185,6 +187,12 @@ static void FNAME(release_walker)(struct guest_walker *walker) kunmap_atomic(walker->table, KM_USER0); } +static void FNAME(mark_pagetable_dirty)(struct kvm *kvm, + struct guest_walker *walker) +{ + mark_page_dirty(kvm, walker->table_gfn[walker->level - 1]); +} + static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte, u64 *shadow_pte, u64 access_bits, gfn_t gfn) { @@ -348,12 +356,15 @@ static int FNAME(fix_write_pf)(struct kvm_vcpu *vcpu, } else if (kvm_mmu_lookup_page(vcpu, gfn)) { pgprintk("%s: found shadow page for %lx, marking ro\n", __FUNCTION__, gfn); + mark_page_dirty(vcpu->kvm, gfn); + FNAME(mark_pagetable_dirty)(vcpu->kvm, walker); *guest_ent |= PT_DIRTY_MASK; *write_pt = 1; return 0; } mark_page_dirty(vcpu->kvm, gfn); *shadow_ent |= PT_WRITABLE_MASK; + FNAME(mark_pagetable_dirty)(vcpu->kvm, walker); *guest_ent |= PT_DIRTY_MASK; rmap_add(vcpu, shadow_ent); @@ -430,9 +441,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, /* * mmio: emulate if accessible, otherwise its a guest fault. 
*/ - if (is_io_pte(*shadow_pte)) { + if (is_io_pte(*shadow_pte)) return 1; - } ++kvm_stat.pf_fixed; kvm_mmu_audit(vcpu, "post page fault (fixed)"); diff --git a/kvm/kernel/svm.c b/kvm/kernel/svm.c index cf5f4979e..9f839e263 100644 --- a/kvm/kernel/svm.c +++ b/kvm/kernel/svm.c @@ -15,6 +15,7 @@ */ #include <linux/module.h> +#include <linux/kernel.h> #include <linux/vmalloc.h> #include <linux/highmem.h> #include <linux/profile.h> @@ -75,7 +76,7 @@ struct svm_init_data { static u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000}; -#define NUM_MSR_MAPS (sizeof(msrpm_ranges) / sizeof(*msrpm_ranges)) +#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges) #define MSRS_RANGE_SIZE 2048 #define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2) @@ -1042,22 +1043,22 @@ static int io_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) addr_mask = io_adress(vcpu, _in, &kvm_run->io.address); if (!addr_mask) { - printk(KERN_DEBUG "%s: get io address failed\n", __FUNCTION__); + printk(KERN_DEBUG "%s: get io address failed\n", + __FUNCTION__); return 1; } if (kvm_run->io.rep) { - kvm_run->io.count = vcpu->regs[VCPU_REGS_RCX] & addr_mask; + kvm_run->io.count + = vcpu->regs[VCPU_REGS_RCX] & addr_mask; kvm_run->io.string_down = (vcpu->svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0; } - } else { + } else kvm_run->io.value = vcpu->svm->vmcb->save.rax; - } return 0; } - static int nop_on_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { return 1; @@ -1075,6 +1076,12 @@ static int halt_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 0; } +static int vmmcall_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + vcpu->svm->vmcb->save.rip += 3; + return kvm_hypercall(vcpu, kvm_run); +} + static int invalid_op_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { inject_ud(vcpu); @@ -1275,7 +1282,7 @@ static int (*svm_exit_handlers[])(struct kvm_vcpu *vcpu, [SVM_EXIT_TASK_SWITCH] = task_switch_interception, [SVM_EXIT_SHUTDOWN] = shutdown_interception, 
[SVM_EXIT_VMRUN] = invalid_op_interception, - [SVM_EXIT_VMMCALL] = invalid_op_interception, + [SVM_EXIT_VMMCALL] = vmmcall_interception, [SVM_EXIT_VMLOAD] = invalid_op_interception, [SVM_EXIT_VMSAVE] = invalid_op_interception, [SVM_EXIT_STGI] = invalid_op_interception, @@ -1297,7 +1304,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) __FUNCTION__, vcpu->svm->vmcb->control.exit_int_info, exit_code); - if (exit_code >= sizeof(svm_exit_handlers) / sizeof(*svm_exit_handlers) + if (exit_code >= ARRAY_SIZE(svm_exit_handlers) || svm_exit_handlers[exit_code] == 0) { kvm_run->exit_reason = KVM_EXIT_UNKNOWN; printk(KERN_ERR "%s: 0x%x @ 0x%llx cr0 0x%lx rflags 0x%llx\n", @@ -1668,6 +1675,18 @@ static int is_disabled(void) return 0; } +static void +svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) +{ + /* + * Patch in the VMMCALL instruction: + */ + hypercall[0] = 0x0f; + hypercall[1] = 0x01; + hypercall[2] = 0xd9; + hypercall[3] = 0xc3; +} + static struct kvm_arch_ops svm_arch_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -1716,6 +1735,7 @@ static struct kvm_arch_ops svm_arch_ops = { .run = svm_vcpu_run, .skip_emulated_instruction = skip_emulated_instruction, .vcpu_setup = svm_vcpu_setup, + .patch_hypercall = svm_patch_hypercall, }; static int __init svm_init(void) diff --git a/kvm/kernel/vmx.c b/kvm/kernel/vmx.c index 1b8feea48..936aef68a 100644 --- a/kvm/kernel/vmx.c +++ b/kvm/kernel/vmx.c @@ -19,6 +19,7 @@ #include "vmx.h" #include "kvm_vmx.h" #include <linux/module.h> +#include <linux/kernel.h> #include <linux/mm.h> #include <linux/highmem.h> #include <linux/profile.h> @@ -27,7 +28,6 @@ #include "segment_descriptor.h" - MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -76,7 +76,7 @@ static const u32 vmx_msr_index[] = { #endif MSR_EFER, MSR_K6_STAR, }; -#define NR_VMX_MSR (sizeof(vmx_msr_index) / sizeof(*vmx_msr_index)) +#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) static inline int 
is_page_fault(u32 intr_info) { @@ -418,10 +418,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) case MSR_IA32_SYSENTER_ESP: vmcs_write32(GUEST_SYSENTER_ESP, data); break; - case MSR_IA32_TIME_STAMP_COUNTER: { + case MSR_IA32_TIME_STAMP_COUNTER: guest_write_tsc(data); break; - } default: msr = find_msr_entry(vcpu, msr_index); if (msr) { @@ -793,6 +792,9 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) */ static void vmx_set_cr0_no_modeswitch(struct kvm_vcpu *vcpu, unsigned long cr0) { + if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK)) + enter_rmode(vcpu); + vcpu->rmode.active = ((cr0 & CR0_PE_MASK) == 0); update_exception_bitmap(vcpu); vmcs_writel(CR0_READ_SHADOW, cr0); @@ -1128,6 +1130,8 @@ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) if (rdmsr_safe(index, &data_low, &data_high) < 0) continue; + if (wrmsr_safe(index, data_low, data_high) < 0) + continue; data = data_low | ((u64)data_high << 32); vcpu->host_msrs[j].index = index; vcpu->host_msrs[j].reserved = 0; @@ -1465,6 +1469,18 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 0; } +static void +vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) +{ + /* + * Patch in the VMCALL instruction: + */ + hypercall[0] = 0x0f; + hypercall[1] = 0x01; + hypercall[2] = 0xc1; + hypercall[3] = 0xc3; +} + static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { u64 exit_qualification; @@ -1641,6 +1657,12 @@ static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 0; } +static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP)+3); + return kvm_hypercall(vcpu, kvm_run); +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. 
Otherwise they set the kvm_run parameter to indicate what needs @@ -1659,6 +1681,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_MSR_WRITE] = handle_wrmsr, [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, [EXIT_REASON_HLT] = handle_halt, + [EXIT_REASON_VMCALL] = handle_vmcall, }; static const int kvm_vmx_max_exit_handlers = @@ -2060,6 +2083,7 @@ static struct kvm_arch_ops vmx_arch_ops = { .run = vmx_vcpu_run, .skip_emulated_instruction = skip_emulated_instruction, .vcpu_setup = vmx_vcpu_setup, + .patch_hypercall = vmx_patch_hypercall, }; static int __init vmx_init(void) diff --git a/kvm/kvm_stat b/kvm/kvm_stat index 80274ae00..ff6cf96f3 100755 --- a/kvm/kvm_stat +++ b/kvm/kvm_stat @@ -1,7 +1,7 @@ #!/usr/bin/python import curses -import os, time +import sys, os, time class Stats: def __init__(self): @@ -18,6 +18,14 @@ class Stats: self.values[key] = (newval, newdelta) return self.values +if not os.access('/sys/kernel/debug', os.F_OK): + print 'Please enable CONFIG_DEBUGFS in your kernel' + sys.exit(1) +if not os.access('/sys/kernel/debug/kvm', os.F_OK): + print "Please mount debugfs ('mount -t debugfs debugfs /sys/kernel/debug')" + print "and ensure the kvm modules are loaded" + sys.exit(1) + stats = Stats() def main(screen, stats): diff --git a/kvm/user/kvmctl.c b/kvm/user/kvmctl.c index 509c12b21..533d4aa0c 100644 --- a/kvm/user/kvmctl.c +++ b/kvm/user/kvmctl.c @@ -205,7 +205,7 @@ void kvm_destroy_phys_mem(kvm_context_t kvm, unsigned long phys_start, } -void kvm_get_dirty_pages(kvm_context_t kvm, int slot, void *buf) +int kvm_get_dirty_pages(kvm_context_t kvm, int slot, void *buf) { int r; struct kvm_dirty_log log = { @@ -216,7 +216,7 @@ void kvm_get_dirty_pages(kvm_context_t kvm, int slot, void *buf) r = ioctl(kvm->fd, KVM_GET_DIRTY_LOG, &log); if (r == -1) - exit(1); + return -errno; } static int more_io(struct kvm_run *run, int first_time) @@ -234,27 +234,35 @@ static int handle_io(kvm_context_t kvm, struct kvm_run *run) 
int first_time = 1; int delta; struct translation_cache tr; + int _in = (run->io.direction == KVM_EXIT_IO_IN); + int r; translation_cache_init(&tr); - regs.vcpu = run->vcpu; - ioctl(kvm->fd, KVM_GET_REGS, &regs); + if (run->io.string || _in) { + regs.vcpu = run->vcpu; + r = ioctl(kvm->fd, KVM_GET_REGS, &regs); + if (r == -1) + return -errno; + } delta = run->io.string_down ? -run->io.size : run->io.size; while (more_io(run, first_time)) { void *value_addr; - int r; - if (!run->io.string) - value_addr = &regs.rax; - else { + if (!run->io.string) { + if (_in) + value_addr = &regs.rax; + else + value_addr = &run->io.value; + } else { r = translate(kvm, run->vcpu, &tr, run->io.address, &value_addr); if (r) { fprintf(stderr, "failed translating I/O address %x\n", run->io.address); - exit(1); + return r; } } @@ -280,8 +288,8 @@ static int handle_io(kvm_context_t kvm, struct kvm_run *run) break; } default: - fprintf(stderr, "bad I/O size\n"); - exit(1); + fprintf(stderr, "bad I/O size %d\n", run->io.size); + return -EMSGSIZE; } break; } @@ -300,13 +308,13 @@ static int handle_io(kvm_context_t kvm, struct kvm_run *run) *(uint32_t *)value_addr); break; default: - fprintf(stderr, "bad I/O size\n"); - exit(1); + fprintf(stderr, "bad I/O size %d\n", run->io.size); + return -EMSGSIZE; } break; default: - fprintf(stderr, "bad I/O size\n"); - exit(1); + fprintf(stderr, "bad I/O direction %d\n", run->io.direction); + return -EPROTO; } if (run->io.string) { run->io.address += delta; @@ -321,12 +329,22 @@ static int handle_io(kvm_context_t kvm, struct kvm_run *run) } first_time = 0; if (r) { - ioctl(kvm->fd, KVM_SET_REGS, &regs); - return r; + int savedret = r; + r = ioctl(kvm->fd, KVM_SET_REGS, &regs); + if (r == -1) + return -errno; + + return savedret; } } - ioctl(kvm->fd, KVM_SET_REGS, &regs); + if (run->io.string || _in) { + r = ioctl(kvm->fd, KVM_SET_REGS, &regs); + if (r == -1) + return -errno; + + } + run->emulated = 1; return 0; } @@ -439,7 +457,7 @@ void kvm_show_regs(kvm_context_t kvm, int vcpu) 
r = ioctl(fd, KVM_GET_REGS, &regs); if (r == -1) { perror("KVM_GET_REGS"); - exit(1); + return; } fprintf(stderr, "rax %016llx rbx %016llx rcx %016llx rdx %016llx\n" @@ -522,6 +540,11 @@ static int handle_halt(kvm_context_t kvm, struct kvm_run *kvm_run) return kvm->callbacks->halt(kvm->opaque, kvm_run->vcpu); } +static int handle_shutdown(kvm_context_t kvm, struct kvm_run *kvm_run) +{ + return kvm->callbacks->shutdown(kvm->opaque, kvm_run->vcpu); +} + int try_push_interrupts(kvm_context_t kvm) { return kvm->callbacks->try_push_interrupts(kvm->opaque); @@ -556,8 +579,9 @@ again: kvm_run.emulated = 0; kvm_run.mmio_completed = 0; if (r == -1 && errno != EINTR) { + r = -errno; printf("kvm_run: %m\n"); - exit(1); + return r; } if (r == -1) { r = handle_io_window(kvm, &kvm_run); @@ -567,7 +591,7 @@ again: case KVM_EXIT_TYPE_FAIL_ENTRY: fprintf(stderr, "kvm_run: failed entry, reason %u\n", kvm_run.exit_reason & 0xffff); - exit(1); + return -ENOEXEC; break; case KVM_EXIT_TYPE_VM_EXIT: switch (kvm_run.exit_reason) { @@ -600,6 +624,9 @@ again: break; case KVM_EXIT_IRQ_WINDOW_OPEN: break; + case KVM_EXIT_SHUTDOWN: + r = handle_shutdown(kvm, &kvm_run); + break; default: fprintf(stderr, "unhandled vm exit: 0x%x\n", kvm_run.exit_reason); kvm_show_regs(kvm, vcpu); diff --git a/kvm/user/kvmctl.h b/kvm/user/kvmctl.h index aacdd28c1..936c029ae 100644 --- a/kvm/user/kvmctl.h +++ b/kvm/user/kvmctl.h @@ -59,6 +59,7 @@ struct kvm_callbacks { * on the host CPU. 
*/ int (*halt)(void *opaque, int vcpu); + int (*shutdown)(void *opaque, int vcpu); int (*io_window)(void *opaque); int (*try_push_interrupts)(void *opaque); void (*post_kvm_run)(void *opaque, struct kvm_run *kvm_run); @@ -247,6 +248,6 @@ void *kvm_create_phys_mem(kvm_context_t, unsigned long phys_start, unsigned long len, int slot, int log, int writable); void kvm_destroy_phys_mem(kvm_context_t, unsigned long phys_start, unsigned long len); -void kvm_get_dirty_pages(kvm_context_t, int slot, void *buf); +int kvm_get_dirty_pages(kvm_context_t, int slot, void *buf); #endif diff --git a/qemu-kvm.c b/qemu-kvm.c index 1a0f6e04d..401c7e12c 100644 --- a/qemu-kvm.c +++ b/qemu-kvm.c @@ -124,6 +124,15 @@ static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs) | (rhs->avl * DESC_AVL_MASK); } +/* the reset values of qemu are not compatible to SVM + * this function is used to fix the segment descriptor values */ +static void fix_realmode_dataseg(struct kvm_segment *seg) +{ + seg->type = 0x02; + seg->present = 1; + seg->s = 1; +} + static void load_regs(CPUState *env) { struct kvm_regs regs; @@ -182,6 +191,14 @@ static void load_regs(CPUState *env) (sregs.cs.selector & 3); sregs.ss.dpl = sregs.ss.selector & 3; } + + if (!(env->cr[0] & CR0_PG_MASK)) { + fix_realmode_dataseg(&sregs.ds); + fix_realmode_dataseg(&sregs.es); + fix_realmode_dataseg(&sregs.fs); + fix_realmode_dataseg(&sregs.gs); + fix_realmode_dataseg(&sregs.ss); + } } set_seg(&sregs.tr, &env->tr); @@ -408,6 +425,7 @@ void kvm_save_registers(CPUState *env) int kvm_cpu_exec(CPUState *env) { + int r; int pending = (!env->ready_for_interrupt_injection || ((env->interrupt_request & CPU_INTERRUPT_HARD) && (env->eflags & IF_MASK))); @@ -422,7 +440,11 @@ int kvm_cpu_exec(CPUState *env) if (!saved_env[0]) saved_env[0] = env; - kvm_run(kvm_context, 0); + r = kvm_run(kvm_context, 0); + if (r < 0) { + printf("kvm_run returned %d\n", r); + exit(1); + } return 0; } @@ -587,6 +609,12 @@ static int kvm_halt(void *opaque, 
int vcpu) return 1; } + +static int kvm_shutdown(void *opaque, int vcpu) +{ + qemu_system_reset_request(); + return 1; +} static struct kvm_callbacks qemu_kvm_ops = { .cpuid = kvm_cpuid, @@ -606,6 +634,7 @@ static struct kvm_callbacks qemu_kvm_ops = { .writel = kvm_writel, .writeq = kvm_writeq, .halt = kvm_halt, + .shutdown = kvm_shutdown, .io_window = kvm_io_window, .try_push_interrupts = try_push_interrupts, .post_kvm_run = post_kvm_run, @@ -5295,6 +5295,10 @@ int main_loop(void) if (reset_requested) { reset_requested = 0; qemu_system_reset(); +#ifdef USE_KVM + if (kvm_allowed) + kvm_load_registers(env); +#endif ret = EXCP_INTERRUPT; } if (powerdown_requested) { |