aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAvi Kivity <avi@qumranet.com>2007-01-30 14:44:19 +0000
committerAvi Kivity <avi@qumranet.com>2007-01-30 14:44:19 +0000
commit69aafe1a3bc7bb281b3abdc9aeefb47d8fe29059 (patch)
treeaac4bf86f2aaa9c98bcb1464f41eef22f2d239d2
parent29710616d00f292ae8b8598d15ff96b13b084c06 (diff)
kvm: release: merge from trunk
........ r4323 | avi | 2007-01-24 15:27:48 +0200 (Wed, 24 Jan 2007) | 8 lines kvm: fix gva_to_gpa() gva_to_gpa() needs to be updated to the new walk_addr() calling convention, otherwise it may oops under some circumstances. use the opportunity to remove all the code duplication in gva_to_gpa(), which essentially repeats the calculations in walk_addr(). ........ r4324 | vitalye | 2007-01-24 15:30:38 +0200 (Wed, 24 Jan 2007) | 1 line testing:#0 ........ r4325 | odedr | 2007-01-24 15:34:09 +0200 (Wed, 24 Jan 2007) | 1 line test commit #0 ........ r4326 | odedr | 2007-01-24 15:34:31 +0200 (Wed, 24 Jan 2007) | 1 line test commit #0 ........ r4327 | vitalye | 2007-01-24 15:36:29 +0200 (Wed, 24 Jan 2007) | 1 line test remove #0 ........ r4328 | avi | 2007-01-24 15:54:36 +0200 (Wed, 24 Jan 2007) | 4 lines kvm: vmx: handle triple faults by returning EXIT_REASON_SHUTDOWN to userspace just like svm. ........ r4329 | dor | 2007-01-25 11:45:36 +0200 (Thu, 25 Jan 2007) | 9 lines From Markus Rechberger <markus.rechberger@amd.com>: This patch is a sync with the latest apic code in the qemu repository, this fixes a system crash after linux tries to set up apic after the first reboot. Signed-off-by: Markus Rechberger <markus.rechberger@amd.com> I only changed the apic version_id the register_vmsave so the apic_timer will be serialized. After the change the apic timer works after vm save/load cycle. Was also sent to Qemu devel list. ........ r4330 | dor | 2007-01-25 14:58:29 +0200 (Thu, 25 Jan 2007) | 2 lines Remove forgotten printf ........ r4331 | itaish | 2007-01-25 17:07:49 +0200 (Thu, 25 Jan 2007) | 6 lines Hypercall hardware registers added. Pci interrupt added. hp_reset function added. Transmitted messages (from guest to qemu) are now first accumulated and than sent to the vmchannel as one packet. hp_ioport_read handles some status registers. ........ r4332 | itaish | 2007-01-25 17:18:00 +0200 (Thu, 25 Jan 2007) | 1 line Missed a semicolon ........ 
r4333 | avi | 2007-01-28 11:54:38 +0200 (Sun, 28 Jan 2007) | 8 lines kvm: fix asm constraint for lldt instruction From: S.Çağlar Onur <caglar@pardus.org.tr> lldt does not accept immediate operands, which "g" allows. Signed-off-by: S.Çağlar Onur <caglar@pardus.org.tr> ........ r4334 | uri | 2007-01-28 12:35:16 +0200 (Sun, 28 Jan 2007) | 6 lines qemu migration: send number-of-pages once in the header (#152) If done in migration_ram_send, when the memory it to be sent a few times (only dirty pages after the first round) the number-of-pages would be sent a few times too. ........ r4335 | uri | 2007-01-28 13:35:38 +0200 (Sun, 28 Jan 2007) | 7 lines qemu migration: add param to migration_ram_send() to send the whole ram (#152) While online, only chunks (some pages) of ram would be sent. While offline, the whole ram is to be transferred. Also removed a redundant if, covered by the for below it. ........ r4336 | avi | 2007-01-28 13:47:10 +0200 (Sun, 28 Jan 2007) | 5 lines kvm: implement smp_call_function_single() for external modules older kernels don't export smp_call_function_single(), so fake one for external module users. ........ r4337 | avi | 2007-01-28 13:50:41 +0200 (Sun, 28 Jan 2007) | 5 lines kvm: add a global list of all virtual machines this will allow us to iterate over all vcpus and see which cpus they are running on. ........ r4338 | avi | 2007-01-28 13:56:13 +0200 (Sun, 28 Jan 2007) | 8 lines kvm: vmx: add vcpu_clear() like the inline code it replaces, this function decaches the vmcs from the cpu it last executed on. in addition: - vcpu_clear() works if the last cpu is also the cpu we're running on - it is faster on larger smps by virtue of using smp_call_function_single() ........ r4339 | uri | 2007-01-28 14:03:35 +0200 (Sun, 28 Jan 2007) | 6 lines qemu migration: send ram in phase 1 only for online migration (#152) For offline migration we need to send the whole ram on phase 3 anyways. 
Also, after a chunk of pages is sent let the guest run (yield "migration cpu"). Also, for now, if kvm kernel modules are used, make it an offline migration. ........ r4340 | avi | 2007-01-28 14:09:51 +0200 (Sun, 28 Jan 2007) | 6 lines kvm: cpu hotplug support on hotplug, we execute the hardware extension enable sequence. on unplug, we decache any vcpus that last ran on the exiting cpu, and execute the hardware extension disable sequence. ........ r4341 | avi | 2007-01-28 14:17:46 +0200 (Sun, 28 Jan 2007) | 5 lines kvm: host suspend/resume support add the necessary callbacks to suspend and resume a host running kvm. this is just a repeat of the cpu hotplug/unplug work. ........ r4342 | uri | 2007-01-28 14:37:09 +0200 (Sun, 28 Jan 2007) | 4 lines qemu migration: mark all pages as dirty on phase 1, and send only dirty pages Also skip special address range (from 0xa0000 to 0xc0000) only if kvm_allowed. ........ r4343 | uri | 2007-01-28 14:51:57 +0200 (Sun, 28 Jan 2007) | 5 lines qemu migration: use special marker to indicate end of ram transfer (#152) Also, when receiving a page, get the page number only if that special marker was not received. ........ r4344 | avi | 2007-01-28 15:01:55 +0200 (Sun, 28 Jan 2007) | 6 lines kvm: fix what looks like an obvious typo in the file drivers/kvm/svm.c From: Robert P. J. Day <rpjday@mindspring.com> Signed-off-by: Robert P. J. Day <rpjday@mindspring.com> ........ r4345 | uri | 2007-01-28 15:09:22 +0200 (Sun, 28 Jan 2007) | 7 lines qemu migration: add ram_save/ram_load to QEMUFile (#152) As a part of QEMUFile, the "appropriate" ram_save/ram_load function is called: - When saving/loading to/from a file the whole ram is saved/loaded - When migrating, only send/recv dirty pages (for offline migration all the ram pages are dirty). ........ 
r4346 | uri | 2007-01-28 15:41:11 +0200 (Sun, 28 Jan 2007) | 4 lines qemu migration: let the user know if offline migration is done while online requested (#152) Also add missing curly braces (forgotten in rev 4339) ........ r4347 | itaish | 2007-01-28 19:07:56 +0200 (Sun, 28 Jan 2007) | 1 line txbuffer use a static buffer ........ r4348 | avi | 2007-01-29 13:47:30 +0200 (Mon, 29 Jan 2007) | 6 lines kvm: fix mmu going crazy of guest sets cr0.wp == 0 the kvm mmu relies on cr0.wp being set even if the guest does not set it. the vmx code correctly forces cr0.wp at all times, the svm code does not, so it can't boot solaris without this patch. ........ r4349 | avi | 2007-01-29 14:02:45 +0200 (Mon, 29 Jan 2007) | 2 lines kvm: mmu testsuite: start testsuite at the beginning ........ r4350 | avi | 2007-01-29 14:05:18 +0200 (Mon, 29 Jan 2007) | 4 lines kvm: svm: hack initial cpu csbase to be consistent with intel this allows us to run the mmu testsuite on amd. ........ r4351 | dor | 2007-01-29 14:43:53 +0200 (Mon, 29 Jan 2007) | 6 lines Apic synchonization - the right way: Instead of sync the cr8 only for 64bit mode while exiting to qemu from kvm, do it in the other direction too (qemu -> kvm). This way it can always be done even if the cr8 is unsued, thus on qemu->kvm the cr8 is copied to the tpr and on qemu->kvm the tpr is copied to cr8. Thanks for Yaniv Kamay for the idea. ........ r4352 | avi | 2007-01-29 15:57:51 +0200 (Mon, 29 Jan 2007) | 2 lines kvm: workaround for 2.6.20 and below with !CONFIG_HOTPLUG_CPU ........ r4353 | dor | 2007-01-29 16:08:27 +0200 (Mon, 29 Jan 2007) | 7 lines Add Linux hypercall driver. It's a device that will match the emulation in Qemu. The motivation is to have a communication channel between the host and the guest. The driver uses PCI in order to be loaded automatically by the OS. Soon a balloon driver will be added and will use this mechanism. ........ 
r4354 | dor | 2007-01-29 16:12:35 +0200 (Mon, 29 Jan 2007) | 2 lines Run dos2unix on the file ........ r4355 | avi | 2007-01-29 17:26:35 +0200 (Mon, 29 Jan 2007) | 4 lines kvm: vmx: reload ds and es even in 64-bit mode or 32-bit userspace will get confused. ........ r4356 | avi | 2007-01-29 17:36:06 +0200 (Mon, 29 Jan 2007) | 4 lines kvm: fix mismatch between 32-bit and 64-bit abi unfortunately requiring a version bump. ........ r4357 | dor | 2007-01-30 09:58:06 +0200 (Tue, 30 Jan 2007) | 4 lines Change the API version to 3. Thanks for Gregory Haskins for pointing it out. ........ r4358 | itaish | 2007-01-30 13:57:32 +0200 (Tue, 30 Jan 2007) | 1 line Longer hypercall messages support, from 0x50 to 0xE0 ........ r4359 | avi | 2007-01-30 14:37:51 +0200 (Tue, 30 Jan 2007) | 4 lines kvm: web: remove wiki forever. ........ r4360 | avi | 2007-01-30 14:58:40 +0200 (Tue, 30 Jan 2007) | 12 lines kvm: fix vcpu_clear() bug From: Ingo Molnar <mingo@elte.hu> if vcpu_clear() is called on a not yet run vcpu then vcpu->cpu will be -1. Check this case and dont call smp_call_function_single(-1). this patch fixes the crash i reported earlier and -trunk now works fine on a 32-bit SMP host. Signed-off-by: Ingo Molnar <mingo@elte.hu> ........ r4361 | avi | 2007-01-30 14:59:57 +0200 (Tue, 30 Jan 2007) | 11 lines kvm: fix vcpu freeing bug From: Ingo Molnar <mingo@elte.hu> vcpu_load() can return NULL and it sometimes does in failure paths (for example when the userspace ABI version is too old) - causing a preemption count underflow in the ->vcpu_free() later on. So check for NULL. Signed-off-by: Ingo Molnar <mingo@elte.hu> ........ r4362 | avi | 2007-01-30 15:01:29 +0200 (Tue, 30 Jan 2007) | 9 lines kvm: qemu: fix configure defaults From: Ingo Molnar <mingo@elte.hu> i always found it weird that the Qemu version that comes with KVM has kqemu enabled on x86 and x86_64 by default but not KVM ;-) Fix this. Signed-off-by: Ingo Molnar <mingo@elte.hu> ........ 
r4363 | avi | 2007-01-30 15:40:35 +0200 (Tue, 30 Jan 2007) | 2 lines kvm: fix compat register_cpu_notifier() stub ........ r4364 | avi | 2007-01-30 16:43:20 +0200 (Tue, 30 Jan 2007) | 28 lines kvm: vmx: Fix register constraint in launch code From: Herbert Xu <herbert@gondor.apana.org.au> Both "=r" and "=g" breaks my build on i386: $ make CC [M] drivers/kvm/vmx.o {standard input}: Assembler messages: {standard input}:3318: Error: bad register name `%sil' make[1]: *** [drivers/kvm/vmx.o] Error 1 make: *** [_module_drivers/kvm] Error 2 The reason is that setbe requires an 8-bit register but "=r" does not constrain the target register to be one that has an 8-bit version on i386. According to http://gcc.gnu.org/bugzilla/show_bug.cgi?id=10153 the correct constraint is "=q". Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> [pulled from git] ........
-rwxr-xr-xconfigure2
-rw-r--r--cpu-all.h6
-rw-r--r--hw/apic.c15
-rw-r--r--hw/hypercall.c178
-rw-r--r--kvm/drivers/Kbuild2
-rw-r--r--kvm/drivers/Makefile20
-rw-r--r--kvm/drivers/hypercall.c231
-rw-r--r--kvm/kernel/external-module-compat.h48
-rw-r--r--kvm/kernel/include/linux/kvm.h5
-rw-r--r--kvm/kernel/kvm.h4
-rw-r--r--kvm/kernel/kvm_main.c123
-rw-r--r--kvm/kernel/paging_tmpl.h28
-rw-r--r--kvm/kernel/svm.c17
-rw-r--r--kvm/kernel/vmx.c33
-rw-r--r--kvm/kernel/vmx.h1
-rw-r--r--kvm/user/kvmctl.c8
-rw-r--r--kvm/user/kvmctl.h1
-rw-r--r--kvm/user/main.c5
-rw-r--r--kvm/user/test/access.c1
-rw-r--r--migration.c130
-rw-r--r--qemu-kvm.c18
-rw-r--r--vl.c18
-rw-r--r--vl.h2
23 files changed, 786 insertions, 110 deletions
diff --git a/configure b/configure
index 431b08196..fb55702c7 100755
--- a/configure
+++ b/configure
@@ -114,6 +114,7 @@ bsd="yes"
oss="yes"
if [ "$cpu" = "i386" -o "$cpu" = "x86_64" ] ; then
kqemu="yes"
+ kvm="yes"
fi
;;
NetBSD)
@@ -137,6 +138,7 @@ linux="yes"
user="yes"
if [ "$cpu" = "i386" -o "$cpu" = "x86_64" ] ; then
kqemu="yes"
+ kvm="yes"
fi
;;
esac
diff --git a/cpu-all.h b/cpu-all.h
index 34b74a736..939c1e8b0 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -895,6 +895,12 @@ static inline void cpu_physical_memory_set_dirty(ram_addr_t addr)
phys_ram_dirty[addr >> TARGET_PAGE_BITS] = 0xff;
}
+/* OR dirty_flags into the dirty-tracking entry of the page containing addr. */
+static inline void cpu_physical_memory_set_dirty_flags(ram_addr_t addr,
+                                                       int dirty_flags)
+{
+    ram_addr_t page_index = addr >> TARGET_PAGE_BITS;
+
+    phys_ram_dirty[page_index] = phys_ram_dirty[page_index] | dirty_flags;
+}
+
void cpu_physical_memory_reset_dirty(ram_addr_t start, ram_addr_t end,
int dirty_flags);
void cpu_tlb_update_dirty(CPUState *env);
diff --git a/hw/apic.c b/hw/apic.c
index c5aaa48df..13a49ad52 100644
--- a/hw/apic.c
+++ b/hw/apic.c
@@ -382,8 +382,6 @@ static void apic_init_ipi(APICState *s)
{
int i;
- for(i = 0; i < APIC_LVT_NB; i++)
- s->lvt[i] = 1 << 16; /* mask LVT */
s->tpr = 0;
s->spurious_vec = 0xff;
s->log_dest = 0;
@@ -391,7 +389,8 @@ static void apic_init_ipi(APICState *s)
memset(s->isr, 0, sizeof(s->isr));
memset(s->tmr, 0, sizeof(s->tmr));
memset(s->irr, 0, sizeof(s->irr));
- memset(s->lvt, 0, sizeof(s->lvt));
+ for(i = 0; i < APIC_LVT_NB; i++)
+ s->lvt[i] = 1 << 16; /* mask LVT */
s->esr = 0;
memset(s->icr, 0, sizeof(s->icr));
s->divide_conf = 0;
@@ -477,9 +476,9 @@ int apic_get_interrupt(CPUState *env)
intno = get_highest_priority_int(s->irr);
if (intno < 0)
return -1;
- reset_bit(s->irr, intno);
if (s->tpr && intno <= s->tpr)
return s->spurious_vec & 0xff;
+ reset_bit(s->irr, intno);
set_bit(s->isr, intno);
apic_update_irq(s);
return intno;
@@ -754,7 +753,7 @@ static int apic_load(QEMUFile *f, void *opaque, int version_id)
APICState *s = opaque;
int i;
- if (version_id != 1)
+ if (version_id > 2)
return -EINVAL;
/* XXX: what if the base changes? (registered memory regions) */
@@ -782,7 +781,9 @@ static int apic_load(QEMUFile *f, void *opaque, int version_id)
qemu_get_be64s(f, &s->initial_count_load_time);
qemu_get_be64s(f, &s->next_time);
- qemu_get_timer(f, s->timer);
+ if (version_id >= 2)
+ qemu_get_timer(f, s->timer);
+
return 0;
}
@@ -831,7 +832,7 @@ int apic_init(CPUState *env)
}
s->timer = qemu_new_timer(vm_clock, apic_timer, s);
- register_savevm("apic", 0, 1, apic_save, apic_load, s);
+ register_savevm("apic", 0, 2, apic_save, apic_load, s);
qemu_register_reset(apic_reset, s);
local_apics[s->id] = s;
diff --git a/hw/hypercall.c b/hw/hypercall.c
index afa953e2e..a1f629ca4 100644
--- a/hw/hypercall.c
+++ b/hw/hypercall.c
@@ -23,25 +23,151 @@
* THE SOFTWARE.
*/
#include "vl.h"
+#include <stddef.h>
+
+#define HP_CMD 0x00 // Command register (guest writes an HP_CMD_* value)
+#define HP_ISRSTATUS 0x04 // Interrupt status register (guest reads; HPISR_* bits)
+#define HP_TXSIZE 0x08 // Write: length in bytes of the message the guest is about to send
+#define HP_TXBUFF 0x0c // Write: next byte of that message
+#define HP_RXSIZE 0x10 // Read: number of bytes waiting in the receive buffer
+#define HP_RXBUFF 0x14 // Read: presumably the first byte of the receive buffer (from the name; TODO confirm)
+
+// HP_CMD register commands
+#define HP_CMD_DI 1 // disable interrupts
+#define HP_CMD_EI 2 // enable interrupts
+#define HP_CMD_RESET 4 // reset the device state (handled by hp_reset())
+
+
+/* Bits in HP_ISRSTATUS - Interrupt status register */
+#define HPISR_RX 0x01 // Data is ready to be read
int use_hypercall_dev = 0;
static CharDriverState *vmchannel_hd;
+#define HP_MEM_SIZE 0xE0
+
typedef struct HypercallState {
- int irq;
+ uint32_t cmd; /* last value the guest wrote to HP_CMD */
+ uint32_t isr; /* interrupt status bits (HPISR_*), returned by HP_ISRSTATUS reads */
+ uint32_t txsize; /* length announced through HP_TXSIZE; 0 when no message is in progress */
+ uint32_t txbuff; /* cleared by hp_reset(); not otherwise used in the code shown here */
+ uint32_t rxsize; /* number of bytes stored in RxBuff by the last vmchannel_read() */
+ uint8_t RxBuff[HP_MEM_SIZE]; /* data received from the vmchannel, read by the guest */
+ uint8_t txbufferaccu[HP_MEM_SIZE]; /* accumulates bytes written to HP_TXBUFF until txsize is reached */
+ int txbufferaccu_offset; /* index of the next free byte in txbufferaccu */
+ int irq; /* set to 16 by pci_hypercall_init() */
PCIDevice *pci_dev;
} HypercallState;
+HypercallState *pHypercallState = NULL;
+
+/*
+ * Put the device registers and the transmit accumulator back to their
+ * idle values. Buffer contents, irq and pci_dev are left untouched.
+ */
+static void hp_reset(HypercallState *s)
+{
+    /* transmit side */
+    s->txbufferaccu_offset = 0;
+    s->txbuff = 0;
+    s->txsize = 0;
+
+    /* receive side */
+    s->rxsize = 0;
+
+    /* command and interrupt status registers */
+    s->isr = 0;
+    s->cmd = 0;
+}
+
+/*
+ * Guest write to one of the hypercall device's I/O ports.
+ *
+ *   HP_CMD     latch the command; HP_CMD_RESET resets the device.
+ *   HP_TXSIZE  announce the length of the next guest->host message.
+ *   HP_TXBUFF  append one byte (the low 8 bits of val) to the message; when
+ *              txsize bytes have been collected the whole message is written
+ *              to the vmchannel in one call and the transmitter goes idle.
+ *
+ * opaque is the HypercallState registered by hp_map().
+ */
static void hp_ioport_write(void *opaque, uint32_t addr, uint32_t val)
{
- //printf("hp_ioport_write, val=0x%x\n", val);
- qemu_chr_write(vmchannel_hd, (const uint8_t*)&val, 1);
+    HypercallState *s = opaque;
+
+    /* Only the low byte selects a register inside the port window. */
+    addr &= 0xff;
+
+    switch (addr) {
+    case HP_CMD:
+        s->cmd = val;
+        if (val == HP_CMD_RESET) {
+            hp_reset(s);
+        }
+        break;
+
+    case HP_TXSIZE:
+        /* A new length while a message is still being assembled is a guest error. */
+        if (s->txsize != 0) {
+            printf("txsize is being set, but txsize is not 0!!!\n");
+        }
+        if (val > HP_MEM_SIZE) {
+            /*
+             * Refuse the transfer: accepting the length would let later
+             * HP_TXBUFF writes run past the end of txbufferaccu[].
+             */
+            printf("txsize is larger than allowed by hw!!!\n");
+            val = 0;
+        }
+        s->txsize = val;
+        s->txbufferaccu_offset = 0;
+        break;
+
+    case HP_TXBUFF:
+        if (s->txsize == 0) {
+            printf("error with txbuff!!!\n");
+            break;
+        }
+
+        s->txbufferaccu[s->txbufferaccu_offset] = val;
+        s->txbufferaccu_offset++;
+        if (s->txbufferaccu_offset >= s->txsize) {
+            /* Message complete: hand it to the vmchannel as one packet. */
+            printf("tranmit txbuf, Len:0x%x\n", s->txbufferaccu_offset);
+            qemu_chr_write(vmchannel_hd, s->txbufferaccu, s->txsize);
+            s->txbufferaccu_offset = 0;
+            s->txsize = 0;
+        }
+        break;
+
+    default:
+        printf("hp_ioport_write to unhandled address!!!\n");
+        break;
+    }
}
+/*
+ * Guest read from one of the hypercall device's I/O ports.
+ *
+ *   HP_ISRSTATUS  returns the interrupt status bits and clears HPISR_RX.
+ *   HP_RXSIZE     returns the number of bytes waiting in the receive buffer.
+ *   >= HP_RXBUFF  byte-wide window onto the receive buffer.
+ *
+ * Any other port, and any port past the end of the receive buffer, reads as 0.
+ * opaque is the HypercallState registered by hp_map().
+ */
static uint32_t hp_ioport_read(void *opaque, uint32_t addr)
{
- //printf("hp_ioport_read\n");
- return 0;
+    HypercallState *s = opaque;
+    uint32_t ret;
+
+    /* Only the low byte selects a register inside the port window. */
+    addr &= 0xff;
+
+    /*
+     * The window base is the HP_RXBUFF register offset, so the register
+     * map does not depend on how HypercallState happens to be laid out.
+     */
+    if (addr >= HP_RXBUFF) {
+        uint32_t rx_offset = addr - HP_RXBUFF;
+
+        /*
+         * The port window extends past HP_RXBUFF + HP_MEM_SIZE; without
+         * this check the tail of the window would read beyond RxBuff[].
+         */
+        if (rx_offset >= sizeof(s->RxBuff)) {
+            return 0;
+        }
+        return s->RxBuff[rx_offset];
+    }
+
+    switch (addr) {
+    case HP_ISRSTATUS:
+        if (s->isr != 0) {
+            printf("hp_ioport_read s->isr=0x%x\n", s->isr);
+        }
+        ret = s->isr;
+        /* Reading the status register consumes a pending RX indication. */
+        s->isr &= ~HPISR_RX;
+        break;
+
+    case HP_RXSIZE:
+        ret = s->rxsize;
+        break;
+
+    default:
+        ret = 0x00;
+        break;
+    }
+
+    return ret;
}
/***********************************************************/
@@ -58,11 +184,23 @@ static void hp_map(PCIDevice *pci_dev, int region_num,
PCIHypercallState *d = (PCIHypercallState *)pci_dev;
HypercallState *s = &d->hp;
- register_ioport_write(addr, 16, 1, hp_ioport_write, s);
- register_ioport_read(addr, 16, 1, hp_ioport_read, s);
+ register_ioport_write(addr, 0x100, 1, hp_ioport_write, s);
+ register_ioport_read(addr, 0x100, 1, hp_ioport_read, s);
}
+
+/*
+ * Assert the device's PCI interrupt (irq number 0, level 1) unless the
+ * command register currently has the HP_CMD_DI bit set.
+ */
+static void hypercall_update_irq(HypercallState *s)
+{
+    printf("hypercall_update_irq\n");
+
+    /*
+     * This must be a pure bit test: cmd is the guest-visible command
+     * register, and a compound assignment here would overwrite it.
+     */
+    if (s->cmd & HP_CMD_DI) {
+        return;
+    }
+    /* PCI irq */
+    pci_set_irq(s->pci_dev, 0, 1);
+}
+
void pci_hypercall_init(PCIBus *bus)
{
PCIHypercallState *d;
@@ -95,8 +233,11 @@ void pci_hypercall_init(PCIBus *bus)
pci_register_io_region(&d->dev, 0, 0x100,
PCI_ADDRESS_SPACE_IO, hp_map);
s = &d->hp;
+ pHypercallState = s;
s->irq = 16; /* PCI interrupt */
s->pci_dev = (PCIDevice *)d;
+
+ hp_reset(s);
}
@@ -105,22 +246,37 @@ static int vmchannel_can_read(void *opaque)
return 128;
}
+// Input from the vmchannel char device (host -> guest direction).
+// The data is copied into the hypercall device's receive buffer, rxsize and
+// the RX status bit are set, and the guest is interrupted. Data is dropped
+// when no hypercall device exists or the device has interrupts disabled.
static void vmchannel_read(void *opaque, const uint8_t *buf, int size)
{
int i;
+
+    printf("vmchannel_read buf:%p, size:%d\n", buf, size);
+    for(i = 0; i < size; i++) {
+        printf("%x,", buf[i]);
+    }
+    printf("\n");
- //printf("vmchannel_read buf:%p, size:%d\n", buf, size);
+    // the handler can be registered before the PCI device is created
+    if (pHypercallState == NULL) {
+        return;
+    }
+    // if the hypercall device is in interrupts disabled state, don't accept the data
+    // (bit test only: the command register itself must not be modified here)
+    if (pHypercallState->cmd & HP_CMD_DI) {
+        return;
+    }
+    // never copy more than the receive buffer can hold
+    if (size > (int)sizeof(pHypercallState->RxBuff)) {
+        size = sizeof(pHypercallState->RxBuff);
+    }
for(i = 0; i < size; i++) {
- readline_handle_byte(buf[i]);
+        pHypercallState->RxBuff[i] = buf[i];
}
+    pHypercallState->rxsize = size;
+    pHypercallState->isr = HPISR_RX;
+    hypercall_update_irq(pHypercallState);
}
+/*
+ * Remember hd as the vmchannel backend, flag the hypercall device as in use,
+ * and register vmchannel_can_read/vmchannel_read as its input handlers.
+ */
void vmchannel_init(CharDriverState *hd)
{
vmchannel_hd = hd;
+ //printf("vmchannel_init\n");
use_hypercall_dev = 1;
- qemu_chr_add_read_handler(vmchannel_hd, vmchannel_can_read, vmchannel_read, NULL);
- //vmchannel_start_input();
+ /* NOTE(review): the opaque is the address of the global pointer (a HypercallState **), not the state itself — confirm this is intended */
+ qemu_chr_add_read_handler(vmchannel_hd, vmchannel_can_read, vmchannel_read, &pHypercallState);
+
}
diff --git a/kvm/drivers/Kbuild b/kvm/drivers/Kbuild
new file mode 100644
index 000000000..474c921e9
--- /dev/null
+++ b/kvm/drivers/Kbuild
@@ -0,0 +1,2 @@
+obj-m := hypercall.o
+#hypercall-objs := hypercall.o
diff --git a/kvm/drivers/Makefile b/kvm/drivers/Makefile
new file mode 100644
index 000000000..d0b681d43
--- /dev/null
+++ b/kvm/drivers/Makefile
@@ -0,0 +1,20 @@
+KERNELDIR := /lib/modules/$(shell uname -r)/build
+KVERREL = $(patsubst /lib/modules/%/build,%,$(KERNELDIR))
+
+DESTDIR=
+
+INSTALLDIR = $(patsubst %/build,%/extra,$(KERNELDIR))
+
+all::
+ $(MAKE) -C $(KERNELDIR) M=`pwd` "$$@"
+
+install:
+ mkdir -p $(DESTDIR)/$(INSTALLDIR)
+ cp *.ko $(DESTDIR)/$(INSTALLDIR)
+ /sbin/depmod -a
+
+clean:
+ $(MAKE) -C $(KERNELDIR) M=`pwd` $@
+
+svnclean:
+ svn st | grep '^\?' | awk '{print $2}' | xargs rm -rf
diff --git a/kvm/drivers/hypercall.c b/kvm/drivers/hypercall.c
new file mode 100644
index 000000000..9c9462f66
--- /dev/null
+++ b/kvm/drivers/hypercall.c
@@ -0,0 +1,231 @@
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/compiler.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/ioport.h>
+#include <linux/completion.h>
+#include <asm/io.h>
+#include <asm/uaccess.h>
+#include <asm/irq.h>
+
+#define HYPERCALL_DRIVER_NAME "Qumranet hypercall driver"
+#define HYPERCALL_DRIVER_VERSION "1"
+#define PCI_VENDOR_ID_HYPERCALL 0x5002
+#define PCI_DEVICE_ID_HYPERCALL 0x2258
+
+MODULE_AUTHOR ("Dor Laor <dor.laor@qumranet.com>");
+MODULE_DESCRIPTION (HYPERCALL_DRIVER_NAME);
+MODULE_LICENSE("GPL");
+MODULE_VERSION(HYPERCALL_DRIVER_VERSION);
+
+static int debug = 0;
+module_param(debug, int, 0);
+MODULE_PARM_DESC (debug, "toggle debug flag");
+
+/* Set to 0 to compile out DPRINTK() and assert(). */
+#define HYPERCALL_DEBUG 1
+#if HYPERCALL_DEBUG
+/* Debug trace, prefixed with the name of the calling function. */
+# define DPRINTK(fmt, args...) printk(KERN_DEBUG "%s: " fmt, __FUNCTION__ , ## args)
+/*
+ * Non-fatal assertion: logs the failed expression and its location, then
+ * execution continues. Wrapped in do { } while (0) so that it expands to a
+ * single statement and cannot capture an "else" that follows the call site.
+ */
+# define assert(expr) \
+	do { \
+		if (unlikely(!(expr))) { \
+			printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \
+			       #expr, __FILE__, __FUNCTION__, __LINE__); \
+		} \
+	} while (0)
+#else
+# define DPRINTK(fmt, args...)
+# define assert(expr) do {} while (0)
+#endif
+
+/* PCI IDs claimed by this driver; the zeroed entry terminates the table. */
+static struct pci_device_id hypercall_pci_tbl[] = {
+	{
+		.vendor		= PCI_VENDOR_ID_HYPERCALL,
+		.device		= PCI_DEVICE_ID_HYPERCALL,
+		.subvendor	= PCI_ANY_ID,
+		.subdevice	= PCI_ANY_ID,
+	},
+	{ 0, }
+};
+MODULE_DEVICE_TABLE (pci, hypercall_pci_tbl);
+
+/* Per-device state, allocated in hypercall_init_board() and stored as PCI drvdata. */
+struct hypercall_dev {
+ struct pci_dev *pci_dev; /* owning PCI device */
+ u32 state; /* not referenced by the code shown here */
+ spinlock_t lock; /* initialised in hypercall_init_one() */
+ u8 name[128]; /* printed at probe time; NOTE(review): nothing shown here fills it in */
+ u16 irq; /* copied from pdev->irq at probe time */
+ u32 regs_len; /* length of the BAR that was mapped */
+ void __iomem *mmio_addr; /* not assigned by the code shown here */
+ unsigned long base_addr; /* cookie returned by pci_iomap(), stored as an integer */
+};
+
+
+
+static void hypercall_cleanup_dev(struct hypercall_dev *dev);
+
+
+/*
+ * Allocate a hypercall_dev for pdev, enable the PCI device, claim its
+ * regions and map BAR 0 (BAR 1 when USE_IO_OPS is not defined).
+ *
+ * Returns 0 and stores the new device in *dev_out on success. On failure
+ * returns a negative errno, leaves *dev_out NULL, and undoes exactly the
+ * steps that had already succeeded.
+ */
+static int __devinit hypercall_init_board(struct pci_dev *pdev,
+					  struct hypercall_dev **dev_out)
+{
+	void __iomem *ioaddr;
+	struct hypercall_dev *dev;
+	unsigned long pio_len, mmio_len;
+	int rc;
+
+	assert(pdev != NULL);
+
+	*dev_out = NULL;
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (dev == NULL) {
+		printk (KERN_ERR "%s: Unable to alloc hypercall device\n", pci_name(pdev));
+		return -ENOMEM;
+	}
+	dev->pci_dev = pdev;
+
+	rc = pci_enable_device(pdev);
+	if (rc)
+		goto err_free;
+
+	pio_len = pci_resource_len (pdev, 0);
+	mmio_len = pci_resource_len (pdev, 1);
+
+	DPRINTK("PIO region size == 0x%02lX\n", pio_len);
+	DPRINTK("MMIO region size == 0x%02lX\n", mmio_len);
+
+	rc = pci_request_regions (pdev, "hypercall");
+	if (rc)
+		goto err_disable;
+
+	pci_set_master (pdev);
+
+#define USE_IO_OPS 1
+#ifdef USE_IO_OPS
+	ioaddr = pci_iomap(pdev, 0, 0);
+	if (!ioaddr) {
+		printk(KERN_ERR "%s: cannot map PIO, aborting\n", pci_name(pdev));
+		rc = -EIO;
+		goto err_release;
+	}
+	dev->base_addr = (unsigned long)ioaddr;
+	dev->regs_len = pio_len;
+#else
+	ioaddr = pci_iomap(pdev, 1, 0);
+	if (ioaddr == NULL) {
+		printk(KERN_ERR "%s: cannot remap MMIO, aborting\n", pci_name(pdev));
+		rc = -EIO;
+		goto err_release;
+	}
+	dev->base_addr = (unsigned long)ioaddr;
+	dev->regs_len = mmio_len;
+#endif /* USE_IO_OPS */
+
+	*dev_out = dev;
+	return 0;
+
+	/* Unwind in reverse order of acquisition. */
+err_release:
+	pci_release_regions(pdev);
+err_disable:
+	pci_disable_device(pdev);
+err_free:
+	kfree(dev);
+	return rc;
+}
+
+/*
+ * PCI probe callback: set up the board, record the irq and attach the
+ * driver state to pdev. Returns 0 on success or the negative errno
+ * reported by hypercall_init_board().
+ */
+static int __devinit hypercall_init_one(struct pci_dev *pdev,
+					const struct pci_device_id *ent)
+{
+	struct hypercall_dev *dev;
+	u8 pci_rev;
+	int rc;
+
+	assert(pdev != NULL);
+	assert(ent != NULL);
+
+	pci_read_config_byte(pdev, PCI_REVISION_ID, &pci_rev);
+
+	if (pdev->vendor == PCI_VENDOR_ID_HYPERCALL &&
+	    pdev->device == PCI_DEVICE_ID_HYPERCALL) {
+		printk(KERN_INFO "pci dev %s (id %04x:%04x rev %02x) is a guest hypercall device\n",
+		       pci_name(pdev), pdev->vendor, pdev->device, pci_rev);
+	}
+
+	/* Hand the real error code to the PCI core rather than a bare -1. */
+	rc = hypercall_init_board(pdev, &dev);
+	if (rc != 0)
+		return rc;
+
+	assert(dev != NULL);
+
+	dev->irq = pdev->irq;
+
+	spin_lock_init(&dev->lock);
+	pci_set_drvdata(pdev, dev);
+
+	/* dev->name is never filled in, so identify the device by its PCI name. */
+	printk (KERN_INFO "%s: 0x%lx, IRQ %d\n", pci_name(pdev), dev->base_addr, dev->irq);
+	return 0;
+}
+
+/*
+ * PCI remove callback: release everything hypercall_init_one() set up for
+ * pdev and disable the device.
+ */
+static void __devexit hypercall_remove_one(struct pci_dev *pdev)
+{
+	struct hypercall_dev *dev = pci_get_drvdata(pdev);
+
+	/* assert() only logs, so bail out explicitly instead of dereferencing NULL. */
+	assert(dev != NULL);
+	if (dev == NULL)
+		return;
+
+	/* Clear drvdata first so pdev never points at freed memory. */
+	pci_set_drvdata(pdev, NULL);
+	hypercall_cleanup_dev(dev);
+	pci_disable_device(pdev);
+}
+
+#ifdef CONFIG_PM
+
+/* Suspend callback: save PCI config space, then put the device into D3hot. */
+static int hypercall_suspend(struct pci_dev *pdev, pm_message_t state)
+{
+	pci_save_state(pdev);
+	pci_set_power_state(pdev, PCI_D3hot);
+	DPRINTK("Power mgmt suspend, set power state to PCI_D3hot\n");
+
+	return 0;
+}
+
+/*
+ * Resume callback: bring the device back to D0 first, then restore the
+ * saved config space, so the restored values are not lost if the device
+ * resets its configuration on the D3hot -> D0 transition.
+ */
+static int hypercall_resume(struct pci_dev *pdev)
+{
+	pci_set_power_state(pdev, PCI_D0);
+	pci_restore_state(pdev);
+	DPRINTK("Power mgmt resume, set power state to PCI_D0\n");
+
+	return 0;
+}
+
+#endif /* CONFIG_PM */
+
+/*
+ * Undo the mapping and region claim made for dev and free it. The caller
+ * remains responsible for pci_disable_device().
+ */
+static void hypercall_cleanup_dev(struct hypercall_dev *dev)
+{
+	DPRINTK("cleaning up\n");
+	/* Tear down the mapping before giving the regions back, and only if one exists. */
+	if (dev->base_addr)
+		pci_iounmap(dev->pci_dev, (void __iomem *)dev->base_addr);
+	pci_release_regions(dev->pci_dev);
+	kfree(dev);
+}
+
+static struct pci_driver hypercall_pci_driver = {
+ .name = HYPERCALL_DRIVER_NAME,
+ .id_table = hypercall_pci_tbl,
+ .probe = hypercall_init_one,
+ .remove = __devexit_p(hypercall_remove_one),
+#ifdef CONFIG_PM
+ .suspend = hypercall_suspend,
+ .resume = hypercall_resume,
+#endif /* CONFIG_PM */
+};
+
+/* Module entry point: announce the driver and register it with the PCI core. */
+static int __init hypercall_init_module(void)
+{
+	printk (KERN_INFO HYPERCALL_DRIVER_NAME "\n");
+	/* pci_module_init() is a deprecated alias; call pci_register_driver() directly. */
+	return pci_register_driver(&hypercall_pci_driver);
+}
+
+/* Module exit point: unregister the driver, which detaches all bound devices. */
+static void __exit hypercall_cleanup_module(void)
+{
+	pci_unregister_driver(&hypercall_pci_driver);
+}
+
+module_init(hypercall_init_module);
+module_exit(hypercall_cleanup_module);
diff --git a/kvm/kernel/external-module-compat.h b/kvm/kernel/external-module-compat.h
index 8c50aa85b..830c46436 100644
--- a/kvm/kernel/external-module-compat.h
+++ b/kvm/kernel/external-module-compat.h
@@ -8,7 +8,9 @@
*/
#include <linux/compiler.h>
+#include <linux/version.h>
#include "include/linux/kvm.h"
+#include <linux/cpu.h>
/*
* 2.6.16 does not have GFP_NOWAIT
@@ -31,3 +33,49 @@
#define prof_on 4321
#endif
+/*
+ * smp_call_function_single() is not exported below 2.6.20
+ */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
+
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+
+/*
+ * Emulation of smp_call_function_single() built on smp_call_function():
+ * the request is broadcast and every receiving cpu except the target
+ * ignores it. scfs_lock serialises callers because the target cpu and
+ * callback are passed through the file-scope variables below.
+ */
+static spinlock_t scfs_lock = SPIN_LOCK_UNLOCKED;
+static int scfs_cpu;
+static void (*scfs_func)(void *info);
+
+/* Runs on each cpu reached by the broadcast; only the target calls scfs_func. */
+static void scfs_thunk(void *info)
+{
+ if (raw_smp_processor_id() == scfs_cpu)
+ scfs_func(info);
+}
+
+/*
+ * Ask `cpu` to run func(info). nonatomic and wait are passed straight
+ * through to smp_call_function(), whose return value is returned.
+ *
+ * NOTE(review): with wait == 0 the lock is dropped as soon as
+ * smp_call_function() returns, so a later caller could presumably overwrite
+ * scfs_cpu/scfs_func before every thunk has read them — confirm that all
+ * callers pass wait != 0.
+ */
+static inline int smp_call_function_single1(int cpu, void (*func)(void *info),
+ void *info, int nonatomic, int wait)
+{
+ int r;
+
+ spin_lock(&scfs_lock);
+ scfs_cpu = cpu;
+ scfs_func = func;
+ r = smp_call_function(scfs_thunk, info, nonatomic, wait);
+ spin_unlock(&scfs_lock);
+ return r;
+}
+
+#define smp_call_function_single smp_call_function_single1
+
+#endif
+
+/*
+ * The cpu hotplug stubs are broken if !CONFIG_HOTPLUG_CPU
+ */
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,21)
+
+#ifndef CONFIG_HOTPLUG_CPU
+#define register_cpu_notifier(nb) (0)
+#endif
+
+#endif
diff --git a/kvm/kernel/include/linux/kvm.h b/kvm/kernel/include/linux/kvm.h
index 1be148f0f..f3604593f 100644
--- a/kvm/kernel/include/linux/kvm.h
+++ b/kvm/kernel/include/linux/kvm.h
@@ -11,7 +11,7 @@
#include <asm/types.h>
#include <linux/ioctl.h>
-#define KVM_API_VERSION 2
+#define KVM_API_VERSION 3
/*
* Architectural interrupt line count, and the size of the bitmap needed
@@ -65,6 +65,8 @@ struct kvm_run {
__u8 ready_for_interrupt_injection;
__u8 if_flag;
__u16 padding2;
+
+ /* in (pre_kvm_run), out (post_kvm_run) */
__u64 cr8;
__u64 apic_base;
@@ -185,6 +187,7 @@ struct kvm_translation {
__u8 valid;
__u8 writeable;
__u8 usermode;
+ __u8 pad[5];
};
/* for KVM_INTERRUPT */
diff --git a/kvm/kernel/kvm.h b/kvm/kernel/kvm.h
index 2db1ca4c6..04574a9d4 100644
--- a/kvm/kernel/kvm.h
+++ b/kvm/kernel/kvm.h
@@ -304,6 +304,7 @@ struct kvm {
int memory_config_version;
int busy;
unsigned long rmap_overflow;
+ struct list_head vm_list;
};
struct kvm_stat {
@@ -340,6 +341,7 @@ struct kvm_arch_ops {
struct kvm_vcpu *(*vcpu_load)(struct kvm_vcpu *vcpu);
void (*vcpu_put)(struct kvm_vcpu *vcpu);
+ void (*vcpu_decache)(struct kvm_vcpu *vcpu);
int (*set_guest_debug)(struct kvm_vcpu *vcpu,
struct kvm_debug_guest *dbg);
@@ -558,7 +560,7 @@ static inline void load_gs(u16 sel)
#ifndef load_ldt
static inline void load_ldt(u16 sel)
{
- asm ("lldt %0" : : "g"(sel));
+ asm ("lldt %0" : : "rm"(sel));
}
#endif
diff --git a/kvm/kernel/kvm_main.c b/kvm/kernel/kvm_main.c
index b10972ed0..f8b70bbce 100644
--- a/kvm/kernel/kvm_main.c
+++ b/kvm/kernel/kvm_main.c
@@ -34,6 +34,8 @@
#include <linux/highmem.h>
#include <linux/file.h>
#include <asm/desc.h>
+#include <linux/sysdev.h>
+#include <linux/cpu.h>
#include "x86_emulate.h"
#include "segment_descriptor.h"
@@ -41,6 +43,9 @@
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
+static spinlock_t kvm_lock = SPIN_LOCK_UNLOCKED;
+static struct list_head vm_list = LIST_HEAD_INIT(vm_list);
+
struct kvm_arch_ops *kvm_arch_ops;
struct kvm_stat kvm_stat;
EXPORT_SYMBOL_GPL(kvm_stat);
@@ -230,9 +235,13 @@ static int kvm_dev_open(struct inode *inode, struct file *filp)
struct kvm_vcpu *vcpu = &kvm->vcpus[i];
mutex_init(&vcpu->mutex);
+ vcpu->cpu = -1;
vcpu->kvm = kvm;
vcpu->mmu.root_hpa = INVALID_PAGE;
INIT_LIST_HEAD(&vcpu->free_pages);
+ spin_lock(&kvm_lock);
+ list_add(&kvm->vm_list, &vm_list);
+ spin_unlock(&kvm_lock);
}
filp->private_data = kvm;
return 0;
@@ -272,7 +281,9 @@ static void kvm_free_physmem(struct kvm *kvm)
static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
{
- vcpu_load(vcpu->kvm, vcpu_slot(vcpu));
+ if (!vcpu_load(vcpu->kvm, vcpu_slot(vcpu)))
+ return;
+
kvm_mmu_destroy(vcpu);
vcpu_put(vcpu);
kvm_arch_ops->vcpu_free(vcpu);
@@ -290,6 +301,9 @@ static int kvm_dev_release(struct inode *inode, struct file *filp)
{
struct kvm *kvm = filp->private_data;
+ spin_lock(&kvm_lock);
+ list_del(&kvm->vm_list);
+ spin_unlock(&kvm_lock);
kvm_free_vcpus(kvm);
kvm_free_physmem(kvm);
kfree(kvm);
@@ -544,7 +558,6 @@ static int kvm_dev_ioctl_create_vcpu(struct kvm *kvm, int n)
FX_IMAGE_ALIGN);
vcpu->guest_fx_image = vcpu->host_fx_image + FX_IMAGE_SIZE;
- vcpu->cpu = -1; /* First load will set up TR */
r = kvm_arch_ops->vcpu_create(vcpu);
if (r < 0)
goto out_free_vcpus;
@@ -1360,6 +1373,9 @@ static int kvm_dev_ioctl_run(struct kvm *kvm, struct kvm_run *kvm_run)
if (!vcpu)
return -ENOENT;
+ /* re-sync apic's tpr */
+ vcpu->cr8 = kvm_run->cr8;
+
if (kvm_run->emulated) {
kvm_arch_ops->skip_emulated_instruction(vcpu);
kvm_run->emulated = 0;
@@ -2024,6 +2040,64 @@ static struct notifier_block kvm_reboot_notifier = {
.priority = 0,
};
+/*
+ * Walk every vcpu of every VM on the global list and decache those whose
+ * last cpu is the one given (used when that cpu is going away).
+ */
+static void decache_vcpus_on_cpu(int cpu)
+{
+	struct kvm *vm;
+	int slot;
+
+	spin_lock(&kvm_lock);
+	list_for_each_entry(vm, &vm_list, vm_list) {
+		for (slot = 0; slot < KVM_MAX_VCPUS; ++slot) {
+			struct kvm_vcpu *vcpu = &vm->vcpus[slot];
+
+			/*
+			 * A vcpu whose mutex is held is running on some other
+			 * cpu, so it cannot be cached on the cpu in question;
+			 * skip it.
+			 */
+			if (!mutex_trylock(&vcpu->mutex))
+				continue;
+
+			/* Not running: look at the cpu it last executed on. */
+			if (vcpu->cpu == cpu) {
+				kvm_arch_ops->vcpu_decache(vcpu);
+				vcpu->cpu = -1;
+			}
+			mutex_unlock(&vcpu->mutex);
+		}
+	}
+	spin_unlock(&kvm_lock);
+}
+
+/*
+ * CPU hotplug notifier. v carries the number of the affected cpu.
+ * When a cpu is dead or its bring-up was cancelled, vcpus last run on it
+ * are decached and hardware_disable is requested on it; when a cpu is
+ * about to come up, hardware_enable is requested on it. All other events
+ * are ignored. Always returns NOTIFY_OK.
+ *
+ * NOTE(review): CPU_DEAD and CPU_UP_PREPARE presumably arrive while the
+ * target cpu is not online — confirm smp_call_function_single() can reach it.
+ */
+static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
+ void *v)
+{
+ int cpu = (long)v;
+
+ switch (val) {
+ case CPU_DEAD:
+ case CPU_UP_CANCELED:
+ decache_vcpus_on_cpu(cpu);
+ smp_call_function_single(cpu, kvm_arch_ops->hardware_disable,
+ NULL, 0, 1);
+ break;
+ case CPU_UP_PREPARE:
+ smp_call_function_single(cpu, kvm_arch_ops->hardware_enable,
+ NULL, 0, 1);
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block kvm_cpu_notifier = {
+ .notifier_call = kvm_cpu_hotplug,
+ .priority = 20, /* must be > scheduler priority */
+};
+
static __init void kvm_init_debug(void)
{
struct kvm_stats_debugfs_item *p;
@@ -2043,6 +2117,30 @@ static void kvm_exit_debug(void)
debugfs_remove(debugfs_dir);
}
+static int kvm_suspend(struct sys_device *dev, pm_message_t state)
+{ /*
+ * Suspend hook of kvm_sysdev_class: drop any vcpu state cached on the
+ * cpu this runs on, then call the arch hardware_disable hook through
+ * on_each_cpu().  @dev and @state are not used; always returns 0.
+ */
+ decache_vcpus_on_cpu(raw_smp_processor_id());
+ on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1);
+ return 0;
+}
+
+static int kvm_resume(struct sys_device *dev)
+{ /*
+ * Resume hook of kvm_sysdev_class: call the arch hardware_enable hook
+ * through on_each_cpu().  @dev is not used; always returns 0.
+ */
+ on_each_cpu(kvm_arch_ops->hardware_enable, 0, 0, 1);
+ return 0;
+}
+
+static struct sysdev_class kvm_sysdev_class = { /*
+ * sysdev class named "kvm"; it exists to carry the suspend/resume hooks
+ * below and is registered in kvm_init_arch().
+ */
+ set_kset_name("kvm"),
+ .suspend = kvm_suspend,
+ .resume = kvm_resume,
+};
+
+static struct sys_device kvm_sysdev = { /*
+ * Device with id 0 belonging to kvm_sysdev_class; registered with
+ * sysdev_register() in kvm_init_arch().
+ */
+ .id = 0,
+ .cls = &kvm_sysdev_class,
+};
+
hpa_t bad_page_address;
int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
@@ -2070,8 +2168,19 @@ int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
return r;
on_each_cpu(kvm_arch_ops->hardware_enable, 0, 0, 1);
+ r = register_cpu_notifier(&kvm_cpu_notifier);
+ if (r)
+ goto out_free_1;
register_reboot_notifier(&kvm_reboot_notifier);
+ r = sysdev_class_register(&kvm_sysdev_class);
+ if (r)
+ goto out_free_2;
+
+ r = sysdev_register(&kvm_sysdev);
+ if (r)
+ goto out_free_3;
+
kvm_chardev_ops.owner = module;
r = misc_register(&kvm_dev);
@@ -2083,7 +2192,13 @@ int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module)
return r;
out_free:
+ sysdev_unregister(&kvm_sysdev);
+out_free_3:
+ sysdev_class_unregister(&kvm_sysdev_class);
+out_free_2:
unregister_reboot_notifier(&kvm_reboot_notifier);
+ unregister_cpu_notifier(&kvm_cpu_notifier);
+out_free_1:
on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1);
kvm_arch_ops->hardware_unsetup();
return r;
@@ -2092,8 +2207,10 @@ out_free:
void kvm_exit_arch(void)
{
misc_deregister(&kvm_dev);
-
+ sysdev_unregister(&kvm_sysdev);
+ sysdev_class_unregister(&kvm_sysdev_class);
unregister_reboot_notifier(&kvm_reboot_notifier);
+ unregister_cpu_notifier(&kvm_cpu_notifier);
on_each_cpu(kvm_arch_ops->hardware_disable, 0, 0, 1);
kvm_arch_ops->hardware_unsetup();
kvm_arch_ops = NULL;
diff --git a/kvm/kernel/paging_tmpl.h b/kvm/kernel/paging_tmpl.h
index 149fa45fd..b6b90e9e1 100644
--- a/kvm/kernel/paging_tmpl.h
+++ b/kvm/kernel/paging_tmpl.h
@@ -443,31 +443,17 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
{
struct guest_walker walker;
- pt_element_t guest_pte;
- gpa_t gpa;
-
- FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
- guest_pte = *walker.ptep;
- FNAME(release_walker)(&walker);
-
- if (!is_present_pte(guest_pte))
- return UNMAPPED_GVA;
-
- if (walker.level == PT_DIRECTORY_LEVEL) {
- ASSERT((guest_pte & PT_PAGE_SIZE_MASK));
- ASSERT(PTTYPE == 64 || is_pse(vcpu));
+ gpa_t gpa = UNMAPPED_GVA;
+ int r;
- gpa = (guest_pte & PT_DIR_BASE_ADDR_MASK) | (vaddr &
- (PT_LEVEL_MASK(PT_PAGE_TABLE_LEVEL) | ~PAGE_MASK));
+ r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0);
- if (PTTYPE == 32 && is_cpuid_PSE36())
- gpa |= (guest_pte & PT32_DIR_PSE36_MASK) <<
- (32 - PT32_DIR_PSE36_SHIFT);
- } else {
- gpa = (guest_pte & PT_BASE_ADDR_MASK);
- gpa |= (vaddr & ~PAGE_MASK);
+ if (r) {
+ gpa = (gpa_t)walker.gfn << PAGE_SHIFT;
+ gpa |= vaddr & ~PAGE_MASK;
}
+ FNAME(release_walker)(&walker);
return gpa;
}
diff --git a/kvm/kernel/svm.c b/kvm/kernel/svm.c
index 9c70ff65e..cf5f4979e 100644
--- a/kvm/kernel/svm.c
+++ b/kvm/kernel/svm.c
@@ -528,7 +528,13 @@ static void init_vmcb(struct vmcb *vmcb)
save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
save->cs.limit = 0xffff;
- save->cs.base = 0xffff0000;
+ /*
+ * cs.base should really be 0xffff0000, but vmx can't handle that, so
+ * be consistent with it.
+ *
+ * Replace when we have real mode working for vmx.
+ */
+ save->cs.base = 0xf0000;
save->gdtr.limit = 0xffff;
save->idtr.limit = 0xffff;
@@ -603,6 +609,10 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
put_cpu();
}
+static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
+{ /* svm implementation of the vcpu_decache arch op: empty, nothing is done here */
+}
+
static void svm_cache_regs(struct kvm_vcpu *vcpu)
{
vcpu->regs[VCPU_REGS_RAX] = vcpu->svm->vmcb->save.rax;
@@ -723,7 +733,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
}
#endif
vcpu->svm->cr0 = cr0;
- vcpu->svm->vmcb->save.cr0 = cr0 | CR0_PG_MASK;
+ vcpu->svm->vmcb->save.cr0 = cr0 | CR0_PG_MASK | CR0_WP_MASK;
vcpu->cr0 = cr0;
}
@@ -1163,7 +1173,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
case MSR_K6_STAR:
vcpu->svm->vmcb->save.star = data;
break;
-#ifdef CONFIG_X86_64_
+#ifdef CONFIG_X86_64
case MSR_LSTAR:
vcpu->svm->vmcb->save.lstar = data;
break;
@@ -1671,6 +1681,7 @@ static struct kvm_arch_ops svm_arch_ops = {
.vcpu_load = svm_vcpu_load,
.vcpu_put = svm_vcpu_put,
+ .vcpu_decache = svm_vcpu_decache,
.set_guest_debug = svm_guest_debug,
.get_msr = svm_get_msr,
diff --git a/kvm/kernel/vmx.c b/kvm/kernel/vmx.c
index 20ee54678..1b8feea48 100644
--- a/kvm/kernel/vmx.c
+++ b/kvm/kernel/vmx.c
@@ -125,6 +125,15 @@ static void __vcpu_clear(void *arg)
per_cpu(current_vmcs, cpu) = NULL;
}
+static void vcpu_clear(struct kvm_vcpu *vcpu)
+{ /*
+ * Run __vcpu_clear() for @vcpu and reset its launched flag.  When
+ * vcpu->cpu names a cpu other than the current one (and is not -1) the
+ * call is sent to that cpu with smp_call_function_single(); otherwise
+ * it is made directly on the current cpu.
+ */
+ if (vcpu->cpu != raw_smp_processor_id() && vcpu->cpu != -1)
+ smp_call_function_single(vcpu->cpu, __vcpu_clear, vcpu, 0, 1);
+ else
+ __vcpu_clear(vcpu);
+ vcpu->launched = 0;
+}
+
static unsigned long vmcs_readl(unsigned long field)
{
unsigned long value;
@@ -202,10 +211,8 @@ static struct kvm_vcpu *vmx_vcpu_load(struct kvm_vcpu *vcpu)
cpu = get_cpu();
- if (vcpu->cpu != cpu) {
- smp_call_function(__vcpu_clear, vcpu, 0, 1);
- vcpu->launched = 0;
- }
+ if (vcpu->cpu != cpu)
+ vcpu_clear(vcpu);
if (per_cpu(current_vmcs, cpu) != vcpu->vmcs) {
u8 error;
@@ -243,6 +250,11 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
put_cpu();
}
+static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
+{ /* vmx implementation of the vcpu_decache arch op: delegates to vcpu_clear() */
+ vcpu_clear(vcpu);
+}
+
static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
return vmcs_readl(GUEST_RFLAGS);
@@ -502,7 +514,7 @@ static __init int vmx_disabled_by_bios(void)
return (msr & 5) == 1; /* locked but not enabled */
}
-static __init void hardware_enable(void *garbage)
+static void hardware_enable(void *garbage)
{
int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
@@ -1373,6 +1385,11 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu,
return 1;
}
+static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{ /*
+ * Exit handler installed for EXIT_REASON_TRIPLE_FAULT: records
+ * KVM_EXIT_SHUTDOWN in kvm_run->exit_reason and returns 0.  @vcpu is
+ * not used.
+ * NOTE(review): returning 0 presumably hands the exit to userspace
+ * instead of resuming the guest -- confirm against the caller of the
+ * exit-handler table.
+ */
+ kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
+ return 0;
+}
static int get_io_count(struct kvm_vcpu *vcpu, u64 *count)
{
@@ -1633,6 +1650,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
struct kvm_run *kvm_run) = {
[EXIT_REASON_EXCEPTION_NMI] = handle_exception,
[EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
+ [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
[EXIT_REASON_IO_INSTRUCTION] = handle_io,
[EXIT_REASON_CR_ACCESS] = handle_cr,
[EXIT_REASON_DR_ACCESS] = handle_dr,
@@ -1825,7 +1843,7 @@ again:
#endif
"setbe %0 \n\t"
"popf \n\t"
- : "=r" (fail)
+ : "=q" (fail)
: "r"(vcpu->launched), "d"((unsigned long)HOST_RSP),
"c"(vcpu),
[rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])),
@@ -1857,9 +1875,7 @@ again:
fx_restore(vcpu->host_fx_image);
vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
-#ifndef CONFIG_X86_64
asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
-#endif
/*
* Profile KVM exit RIPs:
@@ -2010,6 +2026,7 @@ static struct kvm_arch_ops vmx_arch_ops = {
.vcpu_load = vmx_vcpu_load,
.vcpu_put = vmx_vcpu_put,
+ .vcpu_decache = vmx_vcpu_decache,
.set_guest_debug = set_guest_debug,
.get_msr = vmx_get_msr,
diff --git a/kvm/kernel/vmx.h b/kvm/kernel/vmx.h
index 4c0ab1518..d0dc93df4 100644
--- a/kvm/kernel/vmx.h
+++ b/kvm/kernel/vmx.h
@@ -180,6 +180,7 @@ enum vmcs_field {
#define EXIT_REASON_EXCEPTION_NMI 0
#define EXIT_REASON_EXTERNAL_INTERRUPT 1
+#define EXIT_REASON_TRIPLE_FAULT 2
#define EXIT_REASON_PENDING_INTERRUPT 7
diff --git a/kvm/user/kvmctl.c b/kvm/user/kvmctl.c
index dbef7565e..509c12b21 100644
--- a/kvm/user/kvmctl.c
+++ b/kvm/user/kvmctl.c
@@ -23,7 +23,7 @@
#include <errno.h>
#include "kvmctl.h"
-#define EXPECTED_KVM_API_VERSION 2
+#define EXPECTED_KVM_API_VERSION 3
#if EXPECTED_KVM_API_VERSION != KVM_API_VERSION
#error libkvm: userspace and kernel version mismatch
@@ -532,6 +532,11 @@ static void post_kvm_run(kvm_context_t kvm, struct kvm_run *kvm_run)
kvm->callbacks->post_kvm_run(kvm->opaque, kvm_run);
}
+static void pre_kvm_run(kvm_context_t kvm, struct kvm_run *kvm_run)
+{ /*
+ * Forward to the pre_kvm_run callback supplied by the library user,
+ * passing the user's opaque pointer and @kvm_run.  The callback pointer
+ * is called unconditionally, so it must be set.
+ */
+ kvm->callbacks->pre_kvm_run(kvm->opaque, kvm_run);
+}
+
int kvm_run(kvm_context_t kvm, int vcpu)
{
int r;
@@ -544,6 +549,7 @@ int kvm_run(kvm_context_t kvm, int vcpu)
again:
kvm_run.request_interrupt_window = try_push_interrupts(kvm);
+ pre_kvm_run(kvm, &kvm_run);
r = ioctl(fd, KVM_RUN, &kvm_run);
post_kvm_run(kvm, &kvm_run);
diff --git a/kvm/user/kvmctl.h b/kvm/user/kvmctl.h
index 384b50fae..aacdd28c1 100644
--- a/kvm/user/kvmctl.h
+++ b/kvm/user/kvmctl.h
@@ -62,6 +62,7 @@ struct kvm_callbacks {
int (*io_window)(void *opaque);
int (*try_push_interrupts)(void *opaque);
void (*post_kvm_run)(void *opaque, struct kvm_run *kvm_run);
+ void (*pre_kvm_run)(void *opaque, struct kvm_run *kvm_run);
};
/*!
diff --git a/kvm/user/main.c b/kvm/user/main.c
index 0b43d32dd..ed060e9f8 100644
--- a/kvm/user/main.c
+++ b/kvm/user/main.c
@@ -107,6 +107,10 @@ static void test_post_kvm_run(void *opaque, struct kvm_run *kvm_run)
{
}
+static void test_pre_kvm_run(void *opaque, struct kvm_run *kvm_run)
+{ /* empty pre_kvm_run callback installed in test_callbacks */
+}
+
static struct kvm_callbacks test_callbacks = {
.cpuid = test_cpuid,
.inb = test_inb,
@@ -120,6 +124,7 @@ static struct kvm_callbacks test_callbacks = {
.io_window = test_io_window,
.try_push_interrupts = test_try_push_interrupts,
.post_kvm_run = test_post_kvm_run,
+ .pre_kvm_run = test_pre_kvm_run,
};
diff --git a/kvm/user/test/access.c b/kvm/user/test/access.c
index 62c13dca1..06dc8c522 100644
--- a/kvm/user/test/access.c
+++ b/kvm/user/test/access.c
@@ -164,7 +164,6 @@ void ac_test_init(ac_test_t *at)
extern char page_fault, kernel_entry;
set_idt_entry(&at->idt[14], &page_fault, 0);
set_idt_entry(&at->idt[0x20], &kernel_entry, 3);
- at->flags[AC_PTE_PRESENT] = at->flags[AC_ACCESS_WRITE] = 1;
}
int ac_test_bump(ac_test_t *at)
diff --git a/migration.c b/migration.c
index d5553585c..5a6a56e0c 100644
--- a/migration.c
+++ b/migration.c
@@ -99,7 +99,7 @@ static void migration_phase_2_dst(migration_state_t *pms);
static void migration_phase_3_dst(migration_state_t *pms);
static void migration_phase_4_dst(migration_state_t *pms);
-static void migration_ram_send(migration_state_t *pms);
+static void migration_ram_send(migration_state_t *pms, int whole_ram);
static void migration_ram_recv(migration_state_t *pms);
typedef void (*QemuMigrationPhaseCB)(migration_state_t *pms);
@@ -620,14 +620,16 @@ static void migration_phase_inc(migration_state_t *pms)
/* four phases for the migration:
* phase 0: initialization
* phase 1: online or offline
- * transfer all RAM pages
- * enable dirty pages logging
+ * transfer all RAM pages (online only)
+ * enable dirty pages logging (for offline migration the whole ram is dirty)
*
* phase 2: online only
* repeat: transfer all dirty pages
*
* phase 3: offline
+ * stop the guest.
* transfer whatever left (dirty pages + non-ram states)
+ * for offline migration dirty pages are the whole memory.
*
* phase 4: offline or online
* The grand finale: decide with host should continue
@@ -706,19 +708,35 @@ static void migration_start_dst(migration_state_t *pms)
static void migration_phase_1_src(migration_state_t *pms)
{
+ int goto_next_phase = 1;
+
if (pms->next_page == 0) {
+ ram_addr_t addr;
qemu_put_be32(pms->f, QEMU_MIGRATION_MAGIC);
qemu_put_be32(pms->f, QEMU_MIGRATION_VERSION);
qemu_put_byte(pms->f, pms->online);
+ qemu_put_be32(pms->f, phys_ram_size >> TARGET_PAGE_BITS);
qemu_set_fd_handler(pms->fd, NULL, migration_main_loop, pms);
+ for (addr=0; addr<phys_ram_size; addr+=TARGET_PAGE_SIZE)
+ cpu_physical_memory_set_dirty_flags(addr, MIG_DIRTY_FLAG);
+ }
+
+ if (pms->online) {
+ migration_ram_send(pms, 0);
+ if (pms->next_page < (phys_ram_size >> TARGET_PAGE_BITS)) {
+ goto_next_phase = 0;
+ }
}
- migration_ram_send(pms);
- if (pms->next_page >= (phys_ram_size >> TARGET_PAGE_BITS)) {
+ if (goto_next_phase) {
+ qemu_put_byte(pms->f, MIG_XFER_PAGE_TYPE_END);
migration_phase_inc(pms);
qemu_set_fd_handler(pms->fd, NULL, NULL, pms);
}
+ else
+ pms->yield = 1;
}
+
static void migration_phase_2_src(migration_state_t *pms)
{
migration_phase_inc(pms);
@@ -768,12 +786,13 @@ static void migration_phase_4_src(migration_state_t *pms)
static void migration_phase_1_dst(migration_state_t *pms)
{
- uint32_t magic, version, online;
+ uint32_t magic, version, online, npages;
if (pms->next_page == 0) {
magic = qemu_get_be32(pms->f);
version = qemu_get_be32(pms->f);
online = qemu_get_byte(pms->f);
+ npages = qemu_get_be32(pms->f);
if ((magic != QEMU_MIGRATION_MAGIC) ||
(version != QEMU_MIGRATION_VERSION)) {
@@ -784,6 +803,12 @@ static void migration_phase_1_dst(migration_state_t *pms)
return;
}
+ if (npages != (phys_ram_size >> TARGET_PAGE_BITS)) {
+ term_printf("phys_memory_mismatch: %uMB %uMB\n",
+ npages >> (20-TARGET_PAGE_BITS), phys_ram_size>>20);
+ migration_cleanup(pms, MIG_STAT_FAIL);
+ return;
+ }
pms->online = online;
term_printf("===>received online=%u\n", online);
}
@@ -880,21 +905,7 @@ static int mig_recv_ram_page(migration_state_t *pms)
uint8_t val;
unsigned buflen;
- val = qemu_get_byte(pms->f);
- page_number = qemu_get_be32(pms->f);
-
- if ((pms->phase != 1) && (page_number != pms->next_page)) {
- term_printf("WARNING: page number mismatch: received %u expected %u\n",
- page_number, pms->next_page);
- return -1;
- }
-
- if (page_number >= (phys_ram_size >> TARGET_PAGE_BITS)) {
- term_printf("mig_recv_ram_page: page_number is too large: %u (max is %u)\n",
- page_number, (phys_ram_size >> TARGET_PAGE_BITS));
- return -1;
- }
-
+ val = qemu_get_byte(pms->f);
switch(val) {
case MIG_XFER_PAGE_TYPE_END: /* go to the next phase */;
pms->next_page = phys_ram_size >> TARGET_PAGE_BITS;
@@ -911,6 +922,13 @@ static int mig_recv_ram_page(migration_state_t *pms)
return -1;
}
+ page_number = qemu_get_be32(pms->f);
+ if (page_number >= (phys_ram_size >> TARGET_PAGE_BITS)) {
+ term_printf("mig_recv_ram_page: page_number is too large: %u (max is %u)\n",
+ page_number, (phys_ram_size >> TARGET_PAGE_BITS));
+ return -1;
+ }
+
ptr += (page_number << TARGET_PAGE_BITS);
qemu_get_buffer(pms->f, ptr, buflen);
@@ -933,26 +951,27 @@ static int mig_recv_ram_page(migration_state_t *pms)
* (needs to be called multiple times).
* State is kept in pms->next_page.
*/
-static void migration_ram_send(migration_state_t *pms)
+static void migration_ram_send(migration_state_t *pms, int whole_ram)
{
unsigned num_pages = (phys_ram_size >> TARGET_PAGE_BITS);
+ unsigned chunk;
+ ram_addr_t addr;
- if (pms->next_page == 0) { /* send memory size */
- qemu_put_be32(pms->f, num_pages);
- }
-
- if (pms->next_page >= num_pages) /* finished already */
- return;
+ if (whole_ram)
+ chunk = num_pages;
+ else
+ chunk = PAGES_CHUNK;
/* send a few pages (or until network buffers full) */
- if (num_pages - pms->next_page > PAGES_CHUNK) {
- num_pages = pms->next_page + PAGES_CHUNK;
+ if (num_pages - pms->next_page > chunk) {
+ num_pages = pms->next_page + chunk;
}
for ( /*none*/ ; pms->next_page < num_pages; pms->next_page++) {
- if ((pms->next_page >= (0xa0000 >> TARGET_PAGE_BITS)) &&
- (pms->next_page < (0xc0000 >> TARGET_PAGE_BITS)))
+ addr = pms->next_page << TARGET_PAGE_BITS;
+ if ((kvm_allowed) && (addr >= 0xa0000) && (addr < 0xc0000))
continue;
- mig_send_ram_page(pms, pms->next_page);
+ if (cpu_physical_memory_get_dirty(addr, MIG_DIRTY_FLAG))
+ mig_send_ram_page(pms, pms->next_page);
}
}
@@ -961,18 +980,13 @@ static void migration_ram_recv(migration_state_t *pms)
{
unsigned num_pages;
int rc = 0;
+ ram_addr_t addr;
- num_pages = qemu_get_be32(pms->f);
- if (num_pages != phys_ram_size >> TARGET_PAGE_BITS) {
- term_printf("phys_memory_mismatch: %uMB %uMB\n",
- num_pages >> (20-TARGET_PAGE_BITS), phys_ram_size>>20);
- migration_cleanup(pms, MIG_STAT_FAIL);
- return;
- }
+ num_pages = phys_ram_size >> TARGET_PAGE_BITS;
- for (/* none */ ; rc==0 && pms->next_page < num_pages; pms->next_page++) {
- if ((pms->next_page >= (0xa0000 >> TARGET_PAGE_BITS)) &&
- (pms->next_page < (0xc0000 >> TARGET_PAGE_BITS)))
+ for (/* none */ ; rc==0 ; pms->next_page++) {
+ addr = pms->next_page << TARGET_PAGE_BITS;
+ if ((kvm_allowed) && (addr >= 0xa0000) && (addr < 0xc0000))
continue;
rc = mig_recv_ram_page(pms);
if (rc < 0) {
@@ -990,8 +1004,14 @@ static void migration_ram_recv(migration_state_t *pms)
void do_migration_getfd(int fd) { TO_BE_IMPLEMENTED; }
void do_migration_start(char *deadoralive)
{
- if (strcmp(deadoralive, "online") == 0)
+ if (strcmp(deadoralive, "online") == 0) {
ms.online = 1;
+ if (kvm_allowed) { /* online migration is not supported yet for kvm */
+ ms.online = 0;
+ term_printf("Currently online migration is not supported for kvm,"
+ " using offline migration\n");
+ }
+ }
else if (strcmp(deadoralive, "offline") == 0)
ms.online = 0;
else {
@@ -1112,6 +1132,24 @@ static int qemu_savevm_method_socket_eof(QEMUFile *f)
return (pms->fd == FD_UNUSED);
}
+
+static void qemu_savevm_method_socket_ram_save(QEMUFile *f, void *opaque)
+{ /*
+ * ram_save method of qemu_savevm_method_socket: restart at page 0, run
+ * migration_ram_send() with whole_ram set, then terminate the page
+ * stream with MIG_XFER_PAGE_TYPE_END.  The migration state is taken
+ * from f->opaque; the @opaque argument is not used.
+ */
+ migration_state_t *pms = (migration_state_t*)f->opaque;
+
+ pms->next_page = 0;
+ migration_ram_send(pms, 1);
+ qemu_put_byte(pms->f, MIG_XFER_PAGE_TYPE_END);
+}
+
+static int qemu_savevm_method_socket_ram_load(QEMUFile *f, void *opaque, int ver)
+{ /*
+ * ram_load method of qemu_savevm_method_socket: restart at page 0 and
+ * receive pages with migration_ram_recv().  The migration state is
+ * taken from f->opaque; @opaque and @ver are not used.
+ * NOTE(review): migration_ram_recv() returns void, so 0 is returned
+ * even if receiving failed.
+ */
+ migration_state_t *pms = (migration_state_t*)f->opaque;
+
+ pms->next_page = 0;
+ migration_ram_recv(pms);
+ return 0;
+}
QEMUFile qemu_savevm_method_socket = {
.opaque = NULL,
.open = qemu_savevm_method_socket_open,
@@ -1122,7 +1160,9 @@ QEMUFile qemu_savevm_method_socket = {
.get_buffer = qemu_savevm_method_socket_get_buffer,
.tell = qemu_savevm_method_socket_tell,
.seek = qemu_savevm_method_socket_seek,
- .eof = qemu_savevm_method_socket_eof
+ .eof = qemu_savevm_method_socket_eof,
+ .ram_save = qemu_savevm_method_socket_ram_save,
+ .ram_load = qemu_savevm_method_socket_ram_load,
};
diff --git a/qemu-kvm.c b/qemu-kvm.c
index eecf33249..1a0f6e04d 100644
--- a/qemu-kvm.c
+++ b/qemu-kvm.c
@@ -199,8 +199,7 @@ static void load_regs(CPUState *env)
sregs.apic_base = cpu_get_apic_base(env);
sregs.efer = env->efer;
- if (env->efer & MSR_EFER_LME)
- sregs.cr8 = cpu_get_apic_tpr(env);
+ sregs.cr8 = cpu_get_apic_tpr(env);
kvm_set_sregs(kvm_context, 0, &sregs);
@@ -284,8 +283,7 @@ static void save_regs(CPUState *env)
cpu_set_apic_base(env, sregs.apic_base);
env->efer = sregs.efer;
- if (env->efer & MSR_EFER_LME)
- cpu_set_apic_tpr(env, sregs.cr8);
+ cpu_set_apic_tpr(env, sregs.cr8);
#define HFLAG_COPY_MASK ~( \
HF_CPL_MASK | HF_PE_MASK | HF_MP_MASK | HF_EM_MASK | \
@@ -386,11 +384,18 @@ static void post_kvm_run(void *opaque, struct kvm_run *kvm_run)
env->eflags = (kvm_run->if_flag) ? env->eflags | IF_MASK:env->eflags & ~IF_MASK;
env->ready_for_interrupt_injection = kvm_run->ready_for_interrupt_injection;
- if (env->efer & MSR_EFER_LME)
- cpu_set_apic_tpr(env, kvm_run->cr8);
+ cpu_set_apic_tpr(env, kvm_run->cr8);
cpu_set_apic_base(env, kvm_run->apic_base);
}
+static void pre_kvm_run(void *opaque, struct kvm_run *kvm_run)
+{ /*
+ * pre_kvm_run callback of qemu_kvm_ops: @opaque is treated as an array
+ * of CPUState pointers, and the apic task priority of its first entry
+ * is copied into kvm_run->cr8.
+ */
+ CPUState **envs = opaque, *env;
+ env = envs[0];
+
+ kvm_run->cr8 = cpu_get_apic_tpr(env);
+}
+
void kvm_load_registers(CPUState *env)
{
load_regs(env);
@@ -604,6 +609,7 @@ static struct kvm_callbacks qemu_kvm_ops = {
.io_window = kvm_io_window,
.try_push_interrupts = try_push_interrupts,
.post_kvm_run = post_kvm_run,
+ .pre_kvm_run = pre_kvm_run,
};
int kvm_qemu_init()
diff --git a/vl.c b/vl.c
index 5f57c2ca1..50665a8ff 100644
--- a/vl.c
+++ b/vl.c
@@ -163,6 +163,9 @@ int vnc_display = -1;
int acpi_enabled = 1;
int fd_bootchk = 1;
+static void ram_save(QEMUFile *f, void *opaque);
+static int ram_load(QEMUFile *f, void *opaque, int version_id);
+
/***********************************************************/
/* x86 ISA bus support */
@@ -4370,7 +4373,9 @@ QEMUFile qemu_savevm_method_file = {
.get_buffer = qemu_savevm_method_file_get_buffer,
.tell = qemu_savevm_method_file_tell,
.seek = qemu_savevm_method_file_seek,
- .eof = qemu_savevm_method_file_eof
+ .eof = qemu_savevm_method_file_eof,
+ .ram_save = ram_save,
+ .ram_load = ram_load,
};
@@ -4961,6 +4966,15 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
return 0;
}
+static void qemu_ram_save(QEMUFile *f, void* opaque)
+{ /* "ram" save handler: dispatch to the ram_save method of the QEMUFile being written */
+ f->ram_save(f, opaque);
+}
+static int qemu_ram_load(QEMUFile *f, void* opaque, int version_id)
+{ /* "ram" load handler: dispatch to the ram_load method of the QEMUFile being read and return its result */
+ return f->ram_load(f, opaque, version_id);
+}
+
/***********************************************************/
/* machine registration */
@@ -6368,7 +6382,7 @@ int main(int argc, char **argv)
}
register_savevm("timer", 0, 1, timer_save, timer_load, NULL);
- register_savevm("ram", 0, 1, ram_save, ram_load, NULL);
+ register_savevm("ram", 0, 1, qemu_ram_save, qemu_ram_load, NULL);
init_ioports();
diff --git a/vl.h b/vl.h
index 9917afa5c..04c83525b 100644
--- a/vl.h
+++ b/vl.h
@@ -414,6 +414,8 @@ struct QEMUFile_s {
int64_t (*tell)(QEMUFile *f);
int64_t (*seek)(QEMUFile *f, int64_t pos, int whence);
int (*eof)(QEMUFile *f);
+ void (*ram_save)(QEMUFile *f, void *opaque);
+ int (*ram_load)(QEMUFile *f, void *opaque, int version_id);
};
extern QEMUFile qemu_savevm_method_file;