aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAvi Kivity <avi@qumranet.com>2007-01-23 10:40:46 +0000
committerAvi Kivity <avi@qumranet.com>2007-01-23 10:40:46 +0000
commite01db5b3ba259da39d9c75c2b5cf4042730fb3d1 (patch)
tree59f4237526bb9d7b7ae20e525430931795a4cef9
parent9ae1ddefd2d83c632ff0602a0215d41a99baf87c (diff)
kvm: release: merge from trunk
........ r4282 | avi | 2007-01-11 12:28:20 +0200 (Thu, 11 Jan 2007) | 8 lines kvm: optimize inline assembly From: Ingo Molnar <mingo@elte.hu> forms like "0(%rsp)" generate an instruction with an unnecessary one byte displacement under certain circumstances. replace with the equivalent "(%rsp)". ........ r4283 | avi | 2007-01-11 14:30:46 +0200 (Thu, 11 Jan 2007) | 2 lines kvm: testsuite: add tests for pte accessed bit ........ r4284 | uri | 2007-01-11 15:55:20 +0200 (Thu, 11 Jan 2007) | 2 lines migration: do not try to send (and print no error) if migration already failed ........ r4285 | avi | 2007-01-11 15:59:47 +0200 (Thu, 11 Jan 2007) | 2 lines kvm: testsuite: write access tests ........ r4286 | avi | 2007-01-11 16:02:22 +0200 (Thu, 11 Jan 2007) | 2 lines kvm: testsuite: pretty print ........ r4287 | uri | 2007-01-11 16:03:54 +0200 (Thu, 11 Jan 2007) | 7 lines qemu migration: reset migration buffer when migration starts Qemu writes one byte at a time. We are buffering them. When migration start we now reset our circular buffer. That way if a migration fails, the next migration may be successful (before the second migration mostly failed). ........ r4288 | avi | 2007-01-11 16:03:54 +0200 (Thu, 11 Jan 2007) | 2 lines kvm: testsuite: test ptes with dirty bit preset ........ r4289 | uri | 2007-01-11 16:43:33 +0200 (Thu, 11 Jan 2007) | 2 lines kvm script: default cdrom to None (empty cdrom) ........ r4291 | uri | 2007-01-14 14:22:12 +0200 (Sun, 14 Jan 2007) | 4 lines qemu savevm/loadvm/migration: save/load ide state Taken from qemu's cvs. ........ r4294 | uri | 2007-01-17 13:46:45 +0200 (Wed, 17 Jan 2007) | 4 lines kvm script: add '--mac' command-line option Useful for migration, when one wants the guest to keep its mac-address. ........ 
r4295 | uri | 2007-01-17 14:58:30 +0200 (Wed, 17 Jan 2007) | 6 lines kvm configure: prefer gcc for qemu_cc (over compat) if its version is 3.* For some distributions, the installed (default) gcc version is 3.4 while the compat-gcc version is 3.2 (or older). In those cases prefer the more recent gcc. ........ r4296 | uri | 2007-01-17 19:27:12 +0200 (Wed, 17 Jan 2007) | 6 lines qemu migration: split migration process to phases (#152) Offline migration was basically done in one phase (a single function). Live migration would have a few phases. Currently the "live" phases are empty. ........ r4297 | uri | 2007-01-17 19:55:37 +0200 (Wed, 17 Jan 2007) | 4 lines qemu migration: get migration start_time before printing it (#152) fix for previous change (rev 4296) ........ r4298 | uri | 2007-01-17 20:00:13 +0200 (Wed, 17 Jan 2007) | 2 lines qemu migration: send migration header (magic, version, online) (#152) ........ r4299 | uri | 2007-01-21 12:22:22 +0200 (Sun, 21 Jan 2007) | 5 lines kvm svm: fixed a ldtr/idtr typo Found by Leonard Norrgard. http://sourceforge.net/mailarchive/forum.php?thread_id=31499320&forum_id=50582 ........ r4300 | avi | 2007-01-21 14:01:52 +0200 (Sun, 21 Jan 2007) | 4 lines kvm: emulate IA32_MISC_ENABLE msr this allows netbsd 3.1 i386 to get further along installing. ........ r4301 | avi | 2007-01-21 15:51:43 +0200 (Sun, 21 Jan 2007) | 2 lines kvm: mmu testsuite: fix pte for already dirty pages ........ r4302 | avi | 2007-01-21 16:09:57 +0200 (Sun, 21 Jan 2007) | 6 lines kvm: mmu: set pte accessed less eagerly the mmu page table walker sets the accessed bit as soon as it looks at the pte, which is wrong. we need to set the accessed bit only after the access checks have passed. ........ r4303 | uri | 2007-01-21 17:03:34 +0200 (Sun, 21 Jan 2007) | 10 lines qemu migration: ram page transfer functions (#152) These functions are to be used to transfer memory pages while the guest is online. 
Pages can be retransmitted, order can be insignificant (in phase 1 the whole ram would be transferred, so an order check currently exists). In order to keep the guest alive while pages are transferred, after N pages are sent, the guest gets its chance to run. M qemu/migration.c ........ r4304 | uri | 2007-01-21 17:14:06 +0200 (Sun, 21 Jan 2007) | 4 lines qemu migration: reset (migration) dirty flag for each ram page sent (#152) Later if a page is marked dirty, it'd be retransmitted (not implemented yet). ........ r4305 | avi | 2007-01-21 17:34:08 +0200 (Sun, 21 Jan 2007) | 13 lines kvm: mmu: perform access checks in walk_addr() check pte permission bits in walk_addr(), instead of scattering the checks all over the code. this has three benefits: 1. setting the accessed bit is simplified 2. under some circumstances, we used to pretend a page fault was fixed when it would actually fail the access checks. this caused an unnecessary vmexit. 3. due to the above problem, the accessed bit would be set incorrectly in some places. ........ r4306 | uri | 2007-01-21 17:34:10 +0200 (Sun, 21 Jan 2007) | 5 lines qemu migration: send/recv the whole ram during migration phase 1 (#152) The whole ram is being sent twice now. ram_save/ram_load should be modified to send only dirty pages. ........ r4307 | avi | 2007-01-22 10:52:14 +0200 (Mon, 22 Jan 2007) | 2 lines kvm: 64-bit startup: map memory as user accessible ........ r4308 | avi | 2007-01-22 14:15:56 +0200 (Mon, 22 Jan 2007) | 2 lines kvm: testsuite: add usermode segments ........ r4309 | avi | 2007-01-22 14:54:27 +0200 (Mon, 22 Jan 2007) | 2 lines kvm: testsuite: add a tss so we can have userspace ........ r4310 | avi | 2007-01-22 14:55:03 +0200 (Mon, 22 Jan 2007) | 2 lines kvm: testsuite: test usermode accesses and the pte user bit ........ r4311 | avi | 2007-01-22 15:21:05 +0200 (Mon, 22 Jan 2007) | 2 lines kvm: mmu testsuite: test cr0.wp ........ 
r4312 | dor | 2007-01-22 15:28:34 +0200 (Mon, 22 Jan 2007) | 7 lines Fix windows acpi/apic installation. Previous code used cr8 as the TPR even if not running in long mode. Now the efer is tested and iff the LME is set we update the TPR from/to cr8. This solves the issues of installing windows over KVM only with -no-acpi flag set. Enjoy. ........ r4313 | avi | 2007-01-22 15:38:10 +0200 (Mon, 22 Jan 2007) | 2 lines kvm: testsuite: fix cr0.wp location ........ r4314 | avi | 2007-01-22 15:39:34 +0200 (Mon, 22 Jan 2007) | 2 lines kvm: testsuite: use set_cr0_wp() instead of raw asm ........ r4315 | avi | 2007-01-22 15:40:44 +0200 (Mon, 22 Jan 2007) | 2 lines kvm: testsuite: move cr0.wp testing to end, so that test output is nicer ........ r4316 | uri | 2007-01-22 17:52:14 +0200 (Mon, 22 Jan 2007) | 2 lines kvm web: migration problems (removed usbdevice + windows crushes, added NFS) ........ r4317 | avi | 2007-01-23 12:35:51 +0200 (Tue, 23 Jan 2007) | 2 lines kvm: qemu: mention kvm in window title if active ........
-rw-r--r--cpu-all.h1
-rw-r--r--hw/ide.c114
-rwxr-xr-xkvm/configure8
-rw-r--r--kvm/kernel/kvm.h1
-rw-r--r--kvm/kernel/kvm_main.c17
-rw-r--r--kvm/kernel/mmu.c10
-rw-r--r--kvm/kernel/paging_tmpl.h68
-rw-r--r--kvm/kernel/svm.c8
-rw-r--r--kvm/kernel/vmx.c12
-rwxr-xr-xkvm/kvm32
-rw-r--r--kvm/user/test/access.c165
-rw-r--r--kvm/user/test/cstart64.S37
-rw-r--r--migration.c448
-rw-r--r--qemu-kvm.c10
-rw-r--r--sdl.c2
15 files changed, 802 insertions, 131 deletions
diff --git a/cpu-all.h b/cpu-all.h
index b469031bb..34b74a736 100644
--- a/cpu-all.h
+++ b/cpu-all.h
@@ -876,6 +876,7 @@ int cpu_memory_rw_debug(CPUState *env, target_ulong addr,
#define VGA_DIRTY_FLAG 0x01
#define CODE_DIRTY_FLAG 0x02
+#define MIG_DIRTY_FLAG 0x04
/* read dirty bit (return 0 or 1) */
static inline int cpu_physical_memory_is_dirty(ram_addr_t addr)
diff --git a/hw/ide.c b/hw/ide.c
index debbc0f41..c8c81c192 100644
--- a/hw/ide.c
+++ b/hw/ide.c
@@ -1970,6 +1970,118 @@ static int guess_disk_lchs(IDEState *s,
return -1;
}
+
+static void pci_ide_save(QEMUFile* f, void *opaque)
+{
+ PCIIDEState *d = opaque;
+ int i;
+
+ generic_pci_save(f, &d->dev);
+
+ for(i = 0; i < 2; i++) {
+ BMDMAState *bm = &d->bmdma[i];
+ qemu_put_8s(f, &bm->cmd);
+ qemu_put_8s(f, &bm->status);
+ qemu_put_be32s(f, &bm->addr);
+ /* XXX: if a transfer is pending, we do not save it yet */
+ }
+
+ /* per IDE interface data */
+ for(i = 0; i < 2; i++) {
+ IDEState *s = &d->ide_if[i * 2];
+ uint8_t drive1_selected;
+ qemu_put_8s(f, &s->cmd);
+ drive1_selected = (s->cur_drive != s);
+ qemu_put_8s(f, &drive1_selected);
+ }
+
+ /* per IDE drive data */
+ for(i = 0; i < 4; i++) {
+ IDEState *s = &d->ide_if[i];
+ qemu_put_be32s(f, &s->mult_sectors);
+ qemu_put_be32s(f, &s->identify_set);
+ if (s->identify_set) {
+ qemu_put_buffer(f, (const uint8_t *)s->identify_data, 512);
+ }
+ qemu_put_8s(f, &s->feature);
+ qemu_put_8s(f, &s->error);
+ qemu_put_be32s(f, &s->nsector);
+ qemu_put_8s(f, &s->sector);
+ qemu_put_8s(f, &s->lcyl);
+ qemu_put_8s(f, &s->hcyl);
+ qemu_put_8s(f, &s->hob_feature);
+ qemu_put_8s(f, &s->hob_nsector);
+ qemu_put_8s(f, &s->hob_sector);
+ qemu_put_8s(f, &s->hob_lcyl);
+ qemu_put_8s(f, &s->hob_hcyl);
+ qemu_put_8s(f, &s->select);
+ qemu_put_8s(f, &s->status);
+ qemu_put_8s(f, &s->lba48);
+
+ qemu_put_8s(f, &s->sense_key);
+ qemu_put_8s(f, &s->asc);
+ /* XXX: if a transfer is pending, we do not save it yet */
+ }
+}
+
+static int pci_ide_load(QEMUFile* f, void *opaque, int version_id)
+{
+ PCIIDEState *d = opaque;
+ int ret, i;
+
+ if (version_id != 1)
+ return -EINVAL;
+ ret = generic_pci_load(f, &d->dev, 1);
+ if (ret < 0)
+ return ret;
+
+ for(i = 0; i < 2; i++) {
+ BMDMAState *bm = &d->bmdma[i];
+ qemu_get_8s(f, &bm->cmd);
+ qemu_get_8s(f, &bm->status);
+ qemu_get_be32s(f, &bm->addr);
+ /* XXX: if a transfer is pending, we do not save it yet */
+ }
+
+ /* per IDE interface data */
+ for(i = 0; i < 2; i++) {
+ IDEState *s = &d->ide_if[i * 2];
+ uint8_t drive1_selected;
+ qemu_get_8s(f, &s->cmd);
+ qemu_get_8s(f, &drive1_selected);
+ s->cur_drive = &d->ide_if[i * 2 + (drive1_selected != 0)];
+ }
+
+ /* per IDE drive data */
+ for(i = 0; i < 4; i++) {
+ IDEState *s = &d->ide_if[i];
+ qemu_get_be32s(f, &s->mult_sectors);
+ qemu_get_be32s(f, &s->identify_set);
+ if (s->identify_set) {
+ qemu_get_buffer(f, (uint8_t *)s->identify_data, 512);
+ }
+ qemu_get_8s(f, &s->feature);
+ qemu_get_8s(f, &s->error);
+ qemu_get_be32s(f, &s->nsector);
+ qemu_get_8s(f, &s->sector);
+ qemu_get_8s(f, &s->lcyl);
+ qemu_get_8s(f, &s->hcyl);
+ qemu_get_8s(f, &s->hob_feature);
+ qemu_get_8s(f, &s->hob_nsector);
+ qemu_get_8s(f, &s->hob_sector);
+ qemu_get_8s(f, &s->hob_lcyl);
+ qemu_get_8s(f, &s->hob_hcyl);
+ qemu_get_8s(f, &s->select);
+ qemu_get_8s(f, &s->status);
+ qemu_get_8s(f, &s->lba48);
+
+ qemu_get_8s(f, &s->sense_key);
+ qemu_get_8s(f, &s->asc);
+ /* XXX: if a transfer is pending, we do not save it yet */
+ }
+ return 0;
+}
+
static void ide_init2(IDEState *ide_state,
BlockDriverState *hd0, BlockDriverState *hd1,
SetIRQFunc *set_irq, void *irq_opaque, int irq)
@@ -2386,6 +2498,8 @@ void pci_piix3_ide_init(PCIBus *bus, BlockDriverState **hd_table, int devfn)
NULL, NULL);
d->type = IDE_TYPE_PIIX3;
+ register_savevm("ide", 0, 1, pci_ide_save, pci_ide_load, d);
+
pci_conf = d->dev.config;
pci_conf[0x00] = 0x86; // Intel
pci_conf[0x01] = 0x80;
diff --git a/kvm/configure b/kvm/configure
index 80117c8da..e2bbb6ee3 100755
--- a/kvm/configure
+++ b/kvm/configure
@@ -22,6 +22,14 @@ EOF
exit 1
}
+
+# prefer gcc if its version is 3.* ( over a compat-gcc )
+# do it before parsing command line arguments to enable the user
+# to specify a specific gcc he/she likes.
+if gcc -v 2>&1 | grep -q 'gcc *version *3\.[2-4]\.[0-9]'; then
+ qemu_cc=gcc
+fi
+
while [[ "$1" = -* ]]; do
opt="$1"; shift
arg=
diff --git a/kvm/kernel/kvm.h b/kvm/kernel/kvm.h
index 91e0c75ac..2db1ca4c6 100644
--- a/kvm/kernel/kvm.h
+++ b/kvm/kernel/kvm.h
@@ -242,6 +242,7 @@ struct kvm_vcpu {
u64 pdptrs[4]; /* pae */
u64 shadow_efer;
u64 apic_base;
+ u64 ia32_misc_enable_msr;
int nmsrs;
struct vmx_msr_entry *guest_msrs;
struct vmx_msr_entry *host_msrs;
diff --git a/kvm/kernel/kvm_main.c b/kvm/kernel/kvm_main.c
index be4651abe..b10972ed0 100644
--- a/kvm/kernel/kvm_main.c
+++ b/kvm/kernel/kvm_main.c
@@ -1226,6 +1226,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_APICBASE:
data = vcpu->apic_base;
break;
+ case MSR_IA32_MISC_ENABLE:
+ data = vcpu->ia32_misc_enable_msr;
+ break;
#ifdef CONFIG_X86_64
case MSR_EFER:
data = vcpu->shadow_efer;
@@ -1297,6 +1300,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
case MSR_IA32_APICBASE:
vcpu->apic_base = data;
break;
+ case MSR_IA32_MISC_ENABLE:
+ vcpu->ia32_misc_enable_msr = data;
+ break;
default:
printk(KERN_ERR "kvm: unhandled wrmsr: 0x%x\n", msr);
return 1;
@@ -1600,6 +1606,10 @@ static u32 msrs_to_save[] = {
static unsigned num_msrs_to_save;
+static u32 emulated_msrs[] = {
+ MSR_IA32_MISC_ENABLE,
+};
+
static __init void kvm_init_msr_list(void)
{
u32 dummy[2];
@@ -1925,7 +1935,7 @@ static long kvm_dev_ioctl(struct file *filp,
if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
goto out;
n = msr_list.nmsrs;
- msr_list.nmsrs = num_msrs_to_save;
+ msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
goto out;
r = -E2BIG;
@@ -1935,6 +1945,11 @@ static long kvm_dev_ioctl(struct file *filp,
if (copy_to_user(user_msr_list->indices, &msrs_to_save,
num_msrs_to_save * sizeof(u32)))
goto out;
+ if (copy_to_user(user_msr_list->indices
+ + num_msrs_to_save * sizeof(u32),
+ &emulated_msrs,
+ ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
+ goto out;
r = 0;
break;
}
diff --git a/kvm/kernel/mmu.c b/kvm/kernel/mmu.c
index c6f972914..a05d0609d 100644
--- a/kvm/kernel/mmu.c
+++ b/kvm/kernel/mmu.c
@@ -992,16 +992,6 @@ static inline int fix_read_pf(u64 *shadow_ent)
return 0;
}
-static int may_access(u64 pte, int write, int user)
-{
-
- if (user && !(pte & PT_USER_MASK))
- return 0;
- if (write && !(pte & PT_WRITABLE_MASK))
- return 0;
- return 1;
-}
-
static void paging_free(struct kvm_vcpu *vcpu)
{
nonpaging_free(vcpu);
diff --git a/kvm/kernel/paging_tmpl.h b/kvm/kernel/paging_tmpl.h
index 6bc41950f..afcd2a8f4 100644
--- a/kvm/kernel/paging_tmpl.h
+++ b/kvm/kernel/paging_tmpl.h
@@ -63,13 +63,15 @@ struct guest_walker {
pt_element_t *ptep;
pt_element_t inherited_ar;
gfn_t gfn;
+ u32 error_code;
};
/*
* Fetch a guest pte for a guest virtual address
*/
-static void FNAME(walk_addr)(struct guest_walker *walker,
- struct kvm_vcpu *vcpu, gva_t addr)
+static int FNAME(walk_addr)(struct guest_walker *walker,
+ struct kvm_vcpu *vcpu, gva_t addr,
+ int write_fault, int user_fault)
{
hpa_t hpa;
struct kvm_memory_slot *slot;
@@ -86,7 +88,7 @@ static void FNAME(walk_addr)(struct guest_walker *walker,
walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3];
root = *walker->ptep;
if (!(root & PT_PRESENT_MASK))
- return;
+ goto not_present;
--walker->level;
}
#endif
@@ -111,11 +113,18 @@ static void FNAME(walk_addr)(struct guest_walker *walker,
ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
((unsigned long)ptep & PAGE_MASK));
- if (is_present_pte(*ptep) && !(*ptep & PT_ACCESSED_MASK))
- *ptep |= PT_ACCESSED_MASK;
-
if (!is_present_pte(*ptep))
- break;
+ goto not_present;
+
+ if (write_fault && !is_writeble_pte(*ptep))
+ if (user_fault || is_write_protection(vcpu))
+ goto access_error;
+
+ if (user_fault && !(*ptep & PT_USER_MASK))
+ goto access_error;
+
+ if (!(*ptep & PT_ACCESSED_MASK))
+ *ptep |= PT_ACCESSED_MASK; /* avoid rmw */
if (walker->level == PT_PAGE_TABLE_LEVEL) {
walker->gfn = (*ptep & PT_BASE_ADDR_MASK)
@@ -146,6 +155,21 @@ static void FNAME(walk_addr)(struct guest_walker *walker,
}
walker->ptep = ptep;
pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep);
+ return 1;
+
+not_present:
+ walker->error_code = 0;
+ goto err;
+
+access_error:
+ walker->error_code = PFERR_PRESENT_MASK;
+
+err:
+ if (write_fault)
+ walker->error_code |= PFERR_WRITE_MASK;
+ if (user_fault)
+ walker->error_code |= PFERR_USER_MASK;
+ return 0;
}
static void FNAME(release_walker)(struct guest_walker *walker)
@@ -347,7 +371,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
u32 error_code)
{
int write_fault = error_code & PFERR_WRITE_MASK;
- int pte_present = error_code & PFERR_PRESENT_MASK;
int user_fault = error_code & PFERR_USER_MASK;
struct guest_walker walker;
u64 *shadow_pte;
@@ -365,19 +388,19 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
/*
* Look up the shadow pte for the faulting address.
*/
- FNAME(walk_addr)(&walker, vcpu, addr);
- shadow_pte = FNAME(fetch)(vcpu, addr, &walker);
+ r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault);
/*
* The page is not mapped by the guest. Let the guest handle it.
*/
- if (!shadow_pte) {
- pgprintk("%s: not mapped\n", __FUNCTION__);
- inject_page_fault(vcpu, addr, error_code);
+ if (!r) {
+ pgprintk("%s: guest page fault\n", __FUNCTION__);
+ inject_page_fault(vcpu, addr, walker.error_code);
FNAME(release_walker)(&walker);
return 0;
}
+ shadow_pte = FNAME(fetch)(vcpu, addr, &walker);
pgprintk("%s: shadow pte %p %llx\n", __FUNCTION__,
shadow_pte, *shadow_pte);
@@ -399,22 +422,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
* mmio: emulate if accessible, otherwise its a guest fault.
*/
if (is_io_pte(*shadow_pte)) {
- if (may_access(*shadow_pte, write_fault, user_fault))
- return 1;
- pgprintk("%s: io work, no access\n", __FUNCTION__);
- inject_page_fault(vcpu, addr,
- error_code | PFERR_PRESENT_MASK);
- kvm_mmu_audit(vcpu, "post page fault (io)");
- return 0;
- }
-
- /*
- * pte not present, guest page fault.
- */
- if (pte_present && !fixed && !write_pt) {
- inject_page_fault(vcpu, addr, error_code);
- kvm_mmu_audit(vcpu, "post page fault (guest)");
- return 0;
+ return 1;
}
++kvm_stat.pf_fixed;
@@ -429,7 +437,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
pt_element_t guest_pte;
gpa_t gpa;
- FNAME(walk_addr)(&walker, vcpu, vaddr);
+ FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0);
guest_pte = *walker.ptep;
FNAME(release_walker)(&walker);
diff --git a/kvm/kernel/svm.c b/kvm/kernel/svm.c
index 7397bfbbc..717aabb01 100644
--- a/kvm/kernel/svm.c
+++ b/kvm/kernel/svm.c
@@ -680,14 +680,14 @@ static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
{
- dt->limit = vcpu->svm->vmcb->save.ldtr.limit;
- dt->base = vcpu->svm->vmcb->save.ldtr.base;
+ dt->limit = vcpu->svm->vmcb->save.idtr.limit;
+ dt->base = vcpu->svm->vmcb->save.idtr.base;
}
static void svm_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
{
- vcpu->svm->vmcb->save.ldtr.limit = dt->limit;
- vcpu->svm->vmcb->save.ldtr.base = dt->base ;
+ vcpu->svm->vmcb->save.idtr.limit = dt->limit;
+ vcpu->svm->vmcb->save.idtr.base = dt->base ;
}
static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
diff --git a/kvm/kernel/vmx.c b/kvm/kernel/vmx.c
index ed1a1460d..20ee54678 100644
--- a/kvm/kernel/vmx.c
+++ b/kvm/kernel/vmx.c
@@ -1784,10 +1784,10 @@ again:
"kvm_vmx_return: "
/* Save guest registers, load host registers, keep flags */
#ifdef CONFIG_X86_64
- "xchg %3, 0(%%rsp) \n\t"
+ "xchg %3, (%%rsp) \n\t"
"mov %%rax, %c[rax](%3) \n\t"
"mov %%rbx, %c[rbx](%3) \n\t"
- "pushq 0(%%rsp); popq %c[rcx](%3) \n\t"
+ "pushq (%%rsp); popq %c[rcx](%3) \n\t"
"mov %%rdx, %c[rdx](%3) \n\t"
"mov %%rsi, %c[rsi](%3) \n\t"
"mov %%rdi, %c[rdi](%3) \n\t"
@@ -1802,24 +1802,24 @@ again:
"mov %%r15, %c[r15](%3) \n\t"
"mov %%cr2, %%rax \n\t"
"mov %%rax, %c[cr2](%3) \n\t"
- "mov 0(%%rsp), %3 \n\t"
+ "mov (%%rsp), %3 \n\t"
"pop %%rcx; pop %%r15; pop %%r14; pop %%r13; pop %%r12;"
"pop %%r11; pop %%r10; pop %%r9; pop %%r8;"
"pop %%rbp; pop %%rdi; pop %%rsi;"
"pop %%rdx; pop %%rbx; pop %%rax \n\t"
#else
- "xchg %3, 0(%%esp) \n\t"
+ "xchg %3, (%%esp) \n\t"
"mov %%eax, %c[rax](%3) \n\t"
"mov %%ebx, %c[rbx](%3) \n\t"
- "pushl 0(%%esp); popl %c[rcx](%3) \n\t"
+ "pushl (%%esp); popl %c[rcx](%3) \n\t"
"mov %%edx, %c[rdx](%3) \n\t"
"mov %%esi, %c[rsi](%3) \n\t"
"mov %%edi, %c[rdi](%3) \n\t"
"mov %%ebp, %c[rbp](%3) \n\t"
"mov %%cr2, %%eax \n\t"
"mov %%eax, %c[cr2](%3) \n\t"
- "mov 0(%%esp), %3 \n\t"
+ "mov (%%esp), %3 \n\t"
"pop %%ecx; popa \n\t"
#endif
diff --git a/kvm/kvm b/kvm/kvm
index bb354e4ee..9fc4771d8 100755
--- a/kvm/kvm
+++ b/kvm/kvm
@@ -31,6 +31,12 @@ optparser.add_option('--no-tap',
default = False,
)
+optparser.add_option('--mac',
+ help = 'use this specific mac addr',
+ dest = 'mac',
+ default = None,
+ )
+
optparser.add_option('--vnc',
help = 'use VNC rather than SDL',
dest = 'vnc',
@@ -51,7 +57,7 @@ optparser.add_option('--image',
optparser.add_option('--cdrom',
help = 'select cdrom image',
dest = 'cdrom',
- default = '/data/mirror/fedora/core/5/x86_64/os/images/boot.iso',
+ default = None,
)
optparser.add_option('--loadvm',
help = 'select saved vm-image',
@@ -154,12 +160,15 @@ if os.access(local_cmd, os.F_OK):
else:
cmd = '/usr/bin/kvm'
-qemu_args = (cmd, '-cdrom', options.cdrom, '-boot', bootdisk,
+qemu_args = (cmd, '-boot', bootdisk,
'-L', '/usr/share/qemu', '-hda', disk, '-m', '384',
'-serial', 'file:/tmp/serial.log',
'-usbdevice', 'tablet'
)
+if options.cdrom:
+ qemu_args += ('-cdrom', options.cdrom,)
+
if not options.kvm:
qemu_args += ('-no-kvm',)
@@ -167,16 +176,17 @@ if options.debugger:
qemu_args += ('-s',)
if not options.notap:
- mac = None
- for line in commands.getoutput('ip link show eth0').splitlines():
- m = re.match(r'.*link/ether (..:..:..:..:..:..).*', line)
- if m:
- mac = m.group(1)
+ mac = options.mac
if not mac:
- raise Exception, 'Unable to determine eth0 mac address'
- mac_components = mac.split(':')
- mac_components[0] = 'a0'
- mac = ':'.join(mac_components)
+ for line in commands.getoutput('ip link show eth0').splitlines():
+ m = re.match(r'.*link/ether (..:..:..:..:..:..).*', line)
+ if m:
+ mac = m.group(1)
+ if not mac:
+ raise Exception, 'Unable to determine eth0 mac address'
+ mac_components = mac.split(':')
+ mac_components[0] = 'a0'
+ mac = ':'.join(mac_components)
qemu_args += ('-net', 'nic,macaddr=%s,model=rtl8139' % (mac,),
'-net', 'tap,script=/etc/kvm/qemu-ifup',)
diff --git a/kvm/user/test/access.c b/kvm/user/test/access.c
index 21bb0da5a..62c13dca1 100644
--- a/kvm/user/test/access.c
+++ b/kvm/user/test/access.c
@@ -11,34 +11,47 @@ typedef unsigned long pt_element_t;
#define PT_PRESENT_MASK ((pt_element_t)1 << 0)
#define PT_WRITABLE_MASK ((pt_element_t)1 << 1)
#define PT_USER_MASK ((pt_element_t)1 << 2)
+#define PT_ACCESSED_MASK ((pt_element_t)1 << 5)
+#define PT_DIRTY_MASK ((pt_element_t)1 << 6)
#define CR0_WP_MASK (1UL << 16)
+#define PFERR_PRESENT_MASK (1U << 0)
+#define PFERR_WRITE_MASK (1U << 1)
+#define PFERR_USER_MASK (1U << 2)
+
/*
* page table access check tests
*/
enum {
AC_PTE_PRESENT,
- // AC_PTE_WRITABLE,
- // AC_PTE_USER,
- // AC_PTE_ACCESSED,
- // AC_PTE_DIRTY,
+ AC_PTE_WRITABLE,
+ AC_PTE_USER,
+ AC_PTE_ACCESSED,
+ AC_PTE_DIRTY,
// AC_PTE_NX,
- // AC_CPU_CR0_WP,
- // AC_CPU_EFER_NX,
-
- // AC_ACCESS_USER,
- // AC_ACCESS_WRITE,
+ AC_ACCESS_USER,
+ AC_ACCESS_WRITE,
// AC_ACCESS_FETCH,
// AC_ACCESS_PTE,
+ // AC_CPU_EFER_NX,
+ AC_CPU_CR0_WP,
+
NR_AC_FLAGS
};
const char *ac_names[] = {
[AC_PTE_PRESENT] = "pte.p",
+ [AC_PTE_ACCESSED] = "pte.a",
+ [AC_PTE_WRITABLE] = "pte.rw",
+ [AC_PTE_USER] = "pte.user",
+ [AC_PTE_DIRTY] = "pte.d",
+ [AC_ACCESS_WRITE] = "write",
+ [AC_ACCESS_USER] = "user",
+ [AC_CPU_CR0_WP] = "cr0.wp",
};
static inline void *va(pt_element_t phys)
@@ -79,6 +92,10 @@ typedef struct {
void *virt;
pt_element_t phys;
pt_element_t pt_pool;
+ pt_element_t *ptep;
+ pt_element_t expected_pte;
+ int expected_fault;
+ unsigned expected_error;
idt_entry_t idt[256];
} ac_test_t;
@@ -111,14 +128,14 @@ unsigned short read_cs()
asm volatile ("mov %%cs, %0" : "=r"(r));
}
-void set_idt_entry(idt_entry_t *e, void *addr)
+void set_idt_entry(idt_entry_t *e, void *addr, int dpl)
{
memset(e, 0, sizeof *e);
e->offset0 = (unsigned long)addr;
e->selector = read_cs();
e->ist = 0;
e->type = 14;
- e->dpl = 0;
+ e->dpl = dpl;
e->p = 1;
e->offset1 = (unsigned long)addr >> 16;
e->offset2 = (unsigned long)addr >> 32;
@@ -136,7 +153,6 @@ void set_cr0_wp(int wp)
void ac_test_init(ac_test_t *at)
{
- printf("init\n");
set_cr0_wp(1);
for (int i = 0; i < NR_AC_FLAGS; ++i)
at->flags[i] = 0;
@@ -144,11 +160,11 @@ void ac_test_init(ac_test_t *at)
at->phys = 32 * 1024 * 1024;
at->pt_pool = 33 * 1024 * 1024;
memset(at->idt, 0, sizeof at->idt);
- printf("lidt\n");
lidt(at->idt, 256);
- extern char page_fault;
- set_idt_entry(&at->idt[14], &page_fault);
- printf("ok\n");
+ extern char page_fault, kernel_entry;
+ set_idt_entry(&at->idt[14], &page_fault, 0);
+ set_idt_entry(&at->idt[0x20], &kernel_entry, 3);
+ at->flags[AC_PTE_PRESENT] = at->flags[AC_ACCESS_WRITE] = 1;
}
int ac_test_bump(ac_test_t *at)
@@ -186,7 +202,6 @@ void ac_test_setup_pte(ac_test_t *at)
{
unsigned long root = read_cr3();
- printf("setting up pte\n");
for (int i = 4; i >= 1; --i) {
pt_element_t *vroot = va(root & PT_BASE_ADDR_MASK);
unsigned index = ((unsigned long)at->virt >> (12 + (i-1) * 9)) & 511;
@@ -200,58 +215,136 @@ void ac_test_setup_pte(ac_test_t *at)
pte = at->phys & PT_BASE_ADDR_MASK;
if (at->flags[AC_PTE_PRESENT])
pte |= PT_PRESENT_MASK;
-
+ if (at->flags[AC_PTE_WRITABLE])
+ pte |= PT_WRITABLE_MASK;
+ if (at->flags[AC_PTE_USER])
+ pte |= PT_USER_MASK;
+ if (at->flags[AC_PTE_ACCESSED])
+ pte |= PT_ACCESSED_MASK;
+ if (at->flags[AC_PTE_DIRTY])
+ pte |= PT_DIRTY_MASK;
+ at->ptep = &vroot[index];
}
vroot[index] = pte;
root = vroot[index];
}
invlpg(at->virt);
+ at->expected_pte = *at->ptep;
+ at->expected_fault = 0;
+ at->expected_error = 0;
+ if (!at->flags[AC_PTE_PRESENT])
+ at->expected_fault = 1;
+ else
+ at->expected_error |= PFERR_PRESENT_MASK;
+
+ if (at->flags[AC_ACCESS_USER]) {
+ at->expected_error |= PFERR_USER_MASK;
+ if (!at->flags[AC_PTE_USER])
+ at->expected_fault = 1;
+ }
+
+ if (at->flags[AC_ACCESS_WRITE]) {
+ at->expected_error |= PFERR_WRITE_MASK;
+ if (!at->flags[AC_PTE_WRITABLE]
+ && (at->flags[AC_CPU_CR0_WP] || at->flags[AC_ACCESS_USER])) {
+ at->expected_fault = 1;
+ } else if (!at->expected_fault) {
+ at->expected_pte |= PT_DIRTY_MASK;
+ }
+ }
+
+ if (!at->expected_fault) {
+ at->expected_pte |= PT_ACCESSED_MASK;
+ }
}
-int ac_test_do_access(ac_test_t *at, unsigned *error_code)
+int ac_test_do_access(ac_test_t *at)
{
static unsigned unique = 42;
- int ret = 1;
+ int fault = 0;
unsigned e;
+ static unsigned char user_stack[4096];
+ unsigned long rsp;
++unique;
- printf("attempting access\n");
unsigned r = unique;
- asm volatile ("fault1: mov (%[addr]), %[reg] \n\t"
- "fixed1:"
- : [reg]"+r"(r), "=m"(*error_code), "+a"(ret), "=b"(e)
- : [addr]"r"(at->virt));
+ set_cr0_wp(at->flags[AC_CPU_CR0_WP]);
+ asm volatile ("mov %%rsp, %%rdx \n\t"
+ "cmp $0, %[user] \n\t"
+ "jz do_access \n\t"
+ "push %%rax; mov %[user_ds], %%ax; mov %%ax, %%ds; pop %%rax \n\t"
+ "pushq %[user_ds] \n\t"
+ "pushq %[user_stack_top] \n\t"
+ "pushfq \n\t"
+ "pushq %[user_cs] \n\t"
+ "pushq $do_access \n\t"
+ "iretq \n"
+ "do_access: \n\t"
+ "cmp $0, %[write] \n\t"
+ "jnz 1f \n\t"
+ "mov (%[addr]), %[reg] \n\t"
+ "jmp done \n\t"
+ "1: mov %[reg], (%[addr]) \n\t"
+ "done: \n"
+ "fixed1: \n"
+ "int %[kernel_entry_vector] \n\t"
+ "back_to_kernel:"
+ : [reg]"+r"(r), "+a"(fault), "=b"(e), "=&d"(rsp)
+ : [addr]"r"(at->virt),
+ [write]"r"(at->flags[AC_ACCESS_WRITE]),
+ [user]"r"(at->flags[AC_ACCESS_USER]),
+ [user_ds]"i"(32+3),
+ [user_cs]"i"(24+3),
+ [user_stack_top]"r"(user_stack + sizeof user_stack),
+ [kernel_entry_vector]"i"(0x20));
asm volatile (".section .text.pf \n\t"
"page_fault: \n\t"
"pop %rbx \n\t"
"movq $fixed1, (%rsp) \n\t"
- "movl $0, %eax \n\t"
+ "movl $1, %eax \n\t"
"iretq \n\t"
".section .text");
- *error_code = e;
- return ret;
+ asm volatile (".section .text.entry \n\t"
+ "kernel_entry: \n\t"
+ "mov %rdx, %rsp \n\t"
+ "jmp back_to_kernel \n\t"
+ ".section .text");
+
+ if (fault && !at->expected_fault) {
+ printf("unexpected fault\n");
+ return 0;
+ }
+ if (!fault && at->expected_fault) {
+ printf("unexpected access\n");
+ return 0;
+ }
+ if (fault && e != at->expected_error) {
+ printf("error code %x expected %x\n", e, at->expected_fault);
+ return 0;
+ }
+ if (*at->ptep != at->expected_pte) {
+ printf("pte %x expected %x\n", *at->ptep, at->expected_pte);
+ return 0;
+ }
+
+ printf("OK\n");
+ return 1;
}
void ac_test_exec(ac_test_t *at)
{
int r;
- unsigned error_code;
printf("test");
for (int i = 0; i < NR_AC_FLAGS; ++i)
if (at->flags[i])
printf(" %s", ac_names[i]);
- printf(" - ");
+ printf(": ");
ac_test_setup_pte(at);
- r = ac_test_do_access(at, &error_code);
- if (r)
- printf("accessed");
- else
- printf("faulted %x", error_code);
- printf("\n");
+ r = ac_test_do_access(at);
}
void ac_test_run()
diff --git a/kvm/user/test/cstart64.S b/kvm/user/test/cstart64.S
index f6a2b71dc..6f9bcad70 100644
--- a/kvm/user/test/cstart64.S
+++ b/kvm/user/test/cstart64.S
@@ -6,23 +6,27 @@
.align 16
stacktop:
+ . = . + 4096
+ .align 16
+ring0stacktop:
+
.data
.align 4096
ptl2:
i = 0
.rept 512
- .quad 0x1e3 | (i << 21)
+ .quad 0x1e7 | (i << 21)
i = i + 1
.endr
.align 4096
ptl3:
- .quad ptl2 + 3
+ .quad ptl2 + 7
.align 4096
ptl4:
- .quad ptl3 + 3
+ .quad ptl3 + 7
.align 4096
@@ -33,8 +37,21 @@ gdt64_desc:
gdt64:
.quad 0
.quad 0x00af9b000000ffff # 64-bit code segment
+ .quad 0x00cf93000000ffff # 64-bit data segment
+ .quad 0x00affb000000ffff # 64-bit code segment (user)
+ .quad 0x00cff3000000ffff # 64-bit data segment (user)
+tss_descr:
+ .quad 0x000089000000ffff # 64-bit avail tss
+ .quad 0 # tss high addr
gdt64_end:
-
+
+tss:
+ .long 0
+ .quad ring0stacktop
+ .quad 0, 0, 0
+ .quad 0, 0, 0, 0, 0, 0, 0, 0
+ .long 0, 0, 0
+
.section .init
.code32
@@ -65,6 +82,18 @@ start64:
lea stacktop, %rsp
mov $0, %eax
mov %ax, %ss
+
+ mov $tss, %rax
+ mov %ax, tss_descr+2
+ shr $16, %rax
+ mov %al, tss_descr+4
+ shr $8, %rax
+ mov %al, tss_descr+7
+ shr $8, %rax
+ mov %eax, tss_descr+8
+ mov $(tss_descr-gdt64), %rax
+ ltr %ax
+
call main
1: hlt
jmp 1b
diff --git a/migration.c b/migration.c
index e28c79e40..d5553585c 100644
--- a/migration.c
+++ b/migration.c
@@ -19,6 +19,8 @@ void socket_set_block(int fd) /* should be in vl.c ? */
#endif
#define FD_UNUSED -1
+#define QEMU_MIGRATION_MAGIC 0x5145564d /* FIXME: our own magic ??? */
+#define QEMU_MIGRATION_VERSION 0x00000001
typedef enum {
NONE = 0,
@@ -36,6 +38,14 @@ typedef enum {
MIG_STAT_CANCEL = 6 /* migration canceled */
} migration_status_t;
+
+/* page types to be used when migrating ram pages */
+enum {
+ MIG_XFER_PAGE_TYPE_REGULAR = 0, /* regular page */
+ MIG_XFER_PAGE_TYPE_HOMOGENEOUS = 1, /* all bytes are the same */
+ MIG_XFER_PAGE_TYPE_END = 15, /* go to the next phase */
+};
+
typedef struct migration_bandwith_params {
int min, max, offline, seconds;
} migration_bandwith_params_t;
@@ -50,6 +60,11 @@ typedef struct migration_state {
migration_role_t role;
int64_t head_counter, tail_counter;
migration_bandwith_params_t bw;
+ int phase;
+ int online;
+ int yield;
+ QEMUFile *f;
+ unsigned next_page;
} migration_state_t;
static migration_state_t ms = {
@@ -61,16 +76,62 @@ static migration_state_t ms = {
.tail = 0,
.head_counter = 0,
.tail_counter = 0,
- .bw = {0, 0, 0, 0}
+ .bw = {0, 0, 0, 0},
+ .phase = 0,
+ .online = 0,
+ .yield = 0,
+ .f = NULL,
};
static const char *reader_default_addr="localhost:4455";
static const char *writer_default_addr="localhost:4456";
/* forward declarations */
-static void migration_start_dst(int online);
static void migration_cleanup(migration_state_t *pms, migration_status_t stat);
+static void migration_start_src(migration_state_t *pms);
+static void migration_phase_1_src(migration_state_t *pms);
+static void migration_phase_2_src(migration_state_t *pms);
+static void migration_phase_3_src(migration_state_t *pms);
+static void migration_phase_4_src(migration_state_t *pms);
+static void migration_start_dst(migration_state_t *pms);
+static void migration_phase_1_dst(migration_state_t *pms);
+static void migration_phase_2_dst(migration_state_t *pms);
+static void migration_phase_3_dst(migration_state_t *pms);
+static void migration_phase_4_dst(migration_state_t *pms);
+
+static void migration_ram_send(migration_state_t *pms);
+static void migration_ram_recv(migration_state_t *pms);
+
+typedef void (*QemuMigrationPhaseCB)(migration_state_t *pms);
+#define MIGRATION_NUM_PHASES 5
+QemuMigrationPhaseCB migration_phase_funcs[2][MIGRATION_NUM_PHASES] = {
+ {
+ migration_start_src,
+ migration_phase_1_src,
+ migration_phase_2_src,
+ migration_phase_3_src,
+ migration_phase_4_src },
+ {
+ migration_start_dst,
+ migration_phase_1_dst,
+ migration_phase_2_dst,
+ migration_phase_3_dst,
+ migration_phase_4_dst }
+};
+/* MIG_ASSERT
+ * assuming pms is defined in the function calling MIG_ASSERT
+ * returns non-0 if the condition is false, 0 if all is OK
+ */
+#define MIG_ASSERT(p) mig_assert(pms, !!(p), __FUNCTION__, __LINE__)
+int mig_assert(migration_state_t *pms, int cond, const char *fname, int line)
+{
+ if (!cond) {
+ term_printf("assertion failed at %s():%d\n", fname, line);
+ migration_cleanup(&ms, MIG_STAT_FAIL);
+ }
+ return !cond;
+}
static const char *mig_stat_str(migration_status_t mig_stat)
{
@@ -151,6 +212,15 @@ static int parse_host_port_and_message(struct sockaddr_in *saddr,
return 0;
}
+static void migration_reset_buffer(migration_state_t *pms)
+{
+ memset(pms->buff, 0, pms->buffsize);
+ pms->head = 0;
+ pms->tail = 0;
+ pms->head_counter = 0;
+ pms->tail_counter = 0;
+}
+
static void migration_cleanup(migration_state_t *pms, migration_status_t stat)
{
if (pms->fd != FD_UNUSED) {
@@ -242,7 +312,9 @@ static int migration_write_into_socket(void *opaque, int len)
static void migration_start_now(void *opaque)
{
- migration_start_dst(0);
+ migration_state_t *pms = (migration_state_t *)opaque;
+
+ migration_start_dst(pms);
}
static void migration_accept(void *opaque)
@@ -401,6 +473,9 @@ static int migration_write_some(int force)
{
int size, threshold = 1024;
+ if (ms.status != MIG_STAT_START)
+ return -1;
+
if (threshold >= ms.buffsize) /* if buffsize is small */
threshold = ms.buffsize / 2;
size = migration_buffer_bytes_filled(&ms);
@@ -529,14 +604,57 @@ static void migration_disconnect(void *opaque)
migration_cleanup(pms, pms->status);
}
-static void migration_start_common(int online,
- int (*migrate)(const char*, QEMUFile*),
- int cont_on_success)
+static void migration_phase_set(migration_state_t *pms, int phase)
{
- int rc;
- int64_t start_time, end_time;
- const char *dummy = "online_migration";
- migration_state_t *pms = &ms;
+ int64_t t = qemu_get_clock(rt_clock);
+
+ term_printf("migration: starting phase %d at %" PRId64 "\n",
+ phase, t);
+ pms->phase = phase;
+}
+static void migration_phase_inc(migration_state_t *pms)
+{
+ migration_phase_set(pms, pms->phase + 1);
+}
+
+/* four phases for the migration:
+ * phase 0: initialization
+ * phase 1: online or offline
+ * transfer all RAM pages
+ * enable dirty pages logging
+ *
+ * phase 2: online only
+ * repeat: transfer all dirty pages
+ *
+ * phase 3: offline
+ * transfer whatever left (dirty pages + non-ram states)
+ *
+ * phase 4: offline or online
+ * The grand finale: decide which host should continue
+ * send a "to whom it may concern..."
+ *
+ *
+ * The function migration_main_loop just runs the appropriate function
+ * according to phase.
+ */
+
+void migration_main_loop(void *opaque)
+{
+ migration_state_t *pms = (migration_state_t *)opaque;
+
+ pms->yield = 0;
+ while (! pms->yield) {
+ if (pms->status != MIG_STAT_START)
+ pms->phase = MIGRATION_NUM_PHASES-1; /* last phase -- report */
+ if (MIG_ASSERT(pms->phase < MIGRATION_NUM_PHASES))
+ break;
+ migration_phase_funcs[pms->role-1][pms->phase](pms);
+ }
+}
+
+static void migration_start_common(migration_state_t *pms)
+{
+ int64_t start_time;
if (pms->status != MIG_STAT_CONN) {
switch (pms->status) {
@@ -560,47 +678,327 @@ static void migration_start_common(int online,
socket_set_block(pms->fd); /* read as fast as you can */
#endif
- pms->status = MIG_STAT_START;
start_time = qemu_get_clock(rt_clock);
- term_printf("\nstarting migration (at %" PRIx64 ")\n", start_time);
+ term_printf("\nstarting migration (at %" PRId64 ")\n", start_time);
+ migration_phase_set(pms, 0);
+ migration_reset_buffer(pms);
+ pms->status = MIG_STAT_START;
+ pms->next_page = 0;
+ pms->f = &qemu_savevm_method_socket;
+ pms->f->open(pms->f, NULL, NULL);
+
+ migration_phase_inc(pms);
+ migration_main_loop(pms);
+}
+
+static void migration_start_src(migration_state_t *pms)
+{
+ pms->role = WRITER;
+ migration_start_common(pms);
+}
+
+static void migration_start_dst(migration_state_t *pms)
+{
+ pms->role = READER;
+ migration_start_common(pms);
+}
+
+
+static void migration_phase_1_src(migration_state_t *pms)
+{
+ if (pms->next_page == 0) {
+ qemu_put_be32(pms->f, QEMU_MIGRATION_MAGIC);
+ qemu_put_be32(pms->f, QEMU_MIGRATION_VERSION);
+ qemu_put_byte(pms->f, pms->online);
+ qemu_set_fd_handler(pms->fd, NULL, migration_main_loop, pms);
+ }
+
+ migration_ram_send(pms);
+ if (pms->next_page >= (phys_ram_size >> TARGET_PAGE_BITS)) {
+ migration_phase_inc(pms);
+ qemu_set_fd_handler(pms->fd, NULL, NULL, pms);
+ }
+}
+static void migration_phase_2_src(migration_state_t *pms)
+{
+ migration_phase_inc(pms);
+}
+
+static void migration_phase_3_common(migration_state_t *pms,
+ int (*migrate)(const char*, QEMUFile*))
+{
+ const char *dummy = "migrating";
+ int rc;
+
vm_stop(EXCP_INTERRUPT); /* FIXME: use EXCP_MIGRATION ? */
rc = migrate(dummy, &qemu_savevm_method_socket);
- end_time = qemu_get_clock(rt_clock);
- term_printf("migration %s (at %" PRIx64 " (%" PRIx64 "))\n",
- (rc)?"failed":"completed", end_time, end_time - start_time);
if ((rc==0) && (pms->status == MIG_STAT_START))
pms->status = MIG_STAT_SUCC;
else
if (pms->status == MIG_STAT_START)
pms->status = MIG_STAT_FAIL;
- if (((pms->status == MIG_STAT_SUCC) && cont_on_success) ||
- ((pms->status != MIG_STAT_SUCC) && !cont_on_success)) {
+
+ migration_phase_inc(pms);
+}
+static void migration_phase_3_src(migration_state_t *pms)
+{
+ migration_phase_3_common(pms, qemu_savevm);
+}
+static void migration_phase_4_common(migration_state_t *pms, int cont)
+{
+ int64_t end_time = qemu_get_clock(rt_clock);
+ term_printf("migration %s at %" PRId64"\n",
+ (pms->status!=MIG_STAT_SUCC)?"failed":"completed successfully",
+ end_time);
+ if (cont) {
migration_cleanup(pms, pms->status);
vm_start();
}
- else
+ else
if (pms->fd != FD_UNUSED)
qemu_set_fd_handler(pms->fd, migration_disconnect, NULL, pms);
+
+ pms->yield = 1;
}
-static void migration_start_src(int online)
+static void migration_phase_4_src(migration_state_t *pms)
{
- ms.role = WRITER;
+ migration_phase_4_common(pms, pms->status != MIG_STAT_SUCC);
+}
+
+static void migration_phase_1_dst(migration_state_t *pms)
+{
+ uint32_t magic, version, online;
+
+ if (pms->next_page == 0) {
+ magic = qemu_get_be32(pms->f);
+ version = qemu_get_be32(pms->f);
+ online = qemu_get_byte(pms->f);
+
+ if ((magic != QEMU_MIGRATION_MAGIC) ||
+ (version != QEMU_MIGRATION_VERSION)) {
+ term_printf("migration header: recv 0x%x 0x%x expecting 0x%x 0x%x\n",
+ magic, version,
+ QEMU_MIGRATION_MAGIC, QEMU_MIGRATION_VERSION);
+ migration_cleanup(pms, MIG_STAT_FAIL);
+ return;
+ }
+
+ pms->online = online;
+ term_printf("===>received online=%u\n", online);
+ }
- migration_start_common(online, qemu_savevm, 0);
+ migration_ram_recv(pms);
+
+ if (pms->next_page >= (phys_ram_size >> TARGET_PAGE_BITS)) {
+ migration_phase_inc(pms);
+ }
+}
+static void migration_phase_2_dst(migration_state_t *pms)
+{
+ migration_phase_inc(pms);
+}
+static void migration_phase_3_dst(migration_state_t *pms)
+{
+ migration_phase_3_common(pms, qemu_loadvm);
+}
+static void migration_phase_4_dst(migration_state_t *pms)
+{
+ migration_phase_4_common(pms, pms->status == MIG_STAT_SUCC);
+}
+
+
+/*
+ * FIXME: make it share code in vl.c
+ */
+static int ram_page_homogeneous(const uint8_t *buf, const int len)
+{
+ int i, v;
+
+ v = buf[0];
+ for (i=1; i<len; i++)
+ if (buf[i] != v)
+ return 0;
+ return 1;
+}
+
+static void mig_ram_dirty_reset_page(unsigned page_number)
+{
+ ram_addr_t start, end;
+ start = page_number << TARGET_PAGE_BITS;
+ end = start + TARGET_PAGE_SIZE;
+ cpu_physical_memory_reset_dirty(start, end, MIG_DIRTY_FLAG);
+}
+
+/*
+ * Sends a single ram page
+ * As in vl.c a single byte is being sent as data if page is "homogeneous"
+ * Layout:
+ * header:
+ * byte -- migration transfer page type
+ * uint32 -- page number
+ * data
+ * a single byte or the whole page (TARGET_PAGE_SIZE bytes).
+ */
+static void mig_send_ram_page(migration_state_t *pms, unsigned page_number)
+{
+ const uint8_t* ptr = (const uint8_t *)(unsigned long)phys_ram_base;
+ uint8_t val;
+ unsigned buflen;
+
+ if (page_number >= (phys_ram_size >> TARGET_PAGE_BITS)) {
+ term_printf("mig_send_ram_page: page_number is too large: %u (max is %u)\n",
+ page_number, (phys_ram_size >> TARGET_PAGE_BITS));
+ migration_cleanup(pms, MIG_STAT_FAIL);
+ return;
+ }
+
+ ptr += (page_number << TARGET_PAGE_BITS);
+ if (ram_page_homogeneous(ptr, TARGET_PAGE_SIZE)) {
+ val = MIG_XFER_PAGE_TYPE_HOMOGENEOUS;
+ buflen = 1;
+ }
+ else {
+ val = MIG_XFER_PAGE_TYPE_REGULAR;
+ buflen = TARGET_PAGE_SIZE;
+ }
+ qemu_put_byte(pms->f, val);
+ qemu_put_be32(pms->f, page_number);
+ qemu_put_buffer(pms->f, ptr, buflen);
+
+ mig_ram_dirty_reset_page(page_number);
+}
+
+/* returns 0 on success,
+ * 1 if this phase is over
+ * -1 on failure
+ */
+static int mig_recv_ram_page(migration_state_t *pms)
+{
+ uint8_t *ptr = (uint8_t *)(unsigned long)phys_ram_base;
+ unsigned page_number;
+ uint8_t val;
+ unsigned buflen;
+
+ val = qemu_get_byte(pms->f);
+ page_number = qemu_get_be32(pms->f);
+
+ if ((pms->phase != 1) && (page_number != pms->next_page)) {
+ term_printf("WARNING: page number mismatch: received %u expected %u\n",
+ page_number, pms->next_page);
+ return -1;
+ }
+
+ if (page_number >= (phys_ram_size >> TARGET_PAGE_BITS)) {
+ term_printf("mig_recv_ram_page: page_number is too large: %u (max is %u)\n",
+ page_number, (phys_ram_size >> TARGET_PAGE_BITS));
+ return -1;
+ }
+
+ switch(val) {
+ case MIG_XFER_PAGE_TYPE_END: /* go to the next phase */;
+ pms->next_page = phys_ram_size >> TARGET_PAGE_BITS;
+ return 1;
+ case MIG_XFER_PAGE_TYPE_REGULAR:
+ buflen = TARGET_PAGE_SIZE;
+ break;
+ case MIG_XFER_PAGE_TYPE_HOMOGENEOUS:
+ buflen = 1;
+ break;
+ default:
+ term_printf("mig_recv_ram_page: illegal val received %d\n", val);
+ migration_cleanup(pms, MIG_STAT_FAIL);
+ return -1;
+ }
+
+ ptr += (page_number << TARGET_PAGE_BITS);
+ qemu_get_buffer(pms->f, ptr, buflen);
+
+ if (val == MIG_XFER_PAGE_TYPE_HOMOGENEOUS)
+ memset(ptr, ptr[0], TARGET_PAGE_SIZE);
+
+ return 0;
}
-static void migration_start_dst(int online)
+
+/* In order to enable the guest to run while memory is transferred,
+ * the number of pages sent continuously is limited by this constant.
+ * When the limit is reached we take a break and continue to send pages
+ * upon another call to migration_ram_send (which would be when data can
+ * be sent over the socket (using qemu_set_fd_handler()).
+ */
+#define PAGES_CHUNK ((phys_ram_size >> TARGET_PAGE_BITS) /16 )
+
+/* Sends the whole ram in chunks, each call a few pages are being sent
+ * (needs to be called multiple times).
+ * State is kept in pms->next_page.
+ */
+static void migration_ram_send(migration_state_t *pms)
{
- ms.role = READER;
+ unsigned num_pages = (phys_ram_size >> TARGET_PAGE_BITS);
- migration_start_common(online, qemu_loadvm, 1);
+ if (pms->next_page == 0) { /* send memory size */
+ qemu_put_be32(pms->f, num_pages);
+ }
+
+ if (pms->next_page >= num_pages) /* finished already */
+ return;
+
+ /* send a few pages (or until network buffers full) */
+ if (num_pages - pms->next_page > PAGES_CHUNK) {
+ num_pages = pms->next_page + PAGES_CHUNK;
+ }
+ for ( /*none*/ ; pms->next_page < num_pages; pms->next_page++) {
+ if ((pms->next_page >= (0xa0000 >> TARGET_PAGE_BITS)) &&
+ (pms->next_page < (0xc0000 >> TARGET_PAGE_BITS)))
+ continue;
+ mig_send_ram_page(pms, pms->next_page);
+ }
+}
+
+/* recv the whole ram (first phase) */
+static void migration_ram_recv(migration_state_t *pms)
+{
+ unsigned num_pages;
+ int rc = 0;
+
+ num_pages = qemu_get_be32(pms->f);
+ if (num_pages != phys_ram_size >> TARGET_PAGE_BITS) {
+ term_printf("phys_memory_mismatch: %uMB %uMB\n",
+ num_pages >> (20-TARGET_PAGE_BITS), phys_ram_size>>20);
+ migration_cleanup(pms, MIG_STAT_FAIL);
+ return;
+ }
+
+ for (/* none */ ; rc==0 && pms->next_page < num_pages; pms->next_page++) {
+ if ((pms->next_page >= (0xa0000 >> TARGET_PAGE_BITS)) &&
+ (pms->next_page < (0xc0000 >> TARGET_PAGE_BITS)))
+ continue;
+ rc = mig_recv_ram_page(pms);
+ if (rc < 0) {
+ term_printf("mig_recv_ram_page FAILED after %u pages\n", pms->next_page);
+ migration_cleanup(pms, MIG_STAT_FAIL);
+ return;
+ }
+ }
+
+ if (pms->next_page < num_pages)
+ term_printf("migration_ram_recv: WARNING goto next phase after %u pages (of %u)\n",
+ pms->next_page, num_pages);
}
void do_migration_getfd(int fd) { TO_BE_IMPLEMENTED; }
void do_migration_start(char *deadoralive)
{
- migration_start_src(0);
+ if (strcmp(deadoralive, "online") == 0)
+ ms.online = 1;
+ else if (strcmp(deadoralive, "offline") == 0)
+ ms.online = 0;
+ else {
+ term_printf("migration start: please specify 'online' or 'offline'\n");
+ return;
+ }
+ migration_start_src(&ms);
}
void do_migration_cancel(void)
diff --git a/qemu-kvm.c b/qemu-kvm.c
index b5eee9361..eecf33249 100644
--- a/qemu-kvm.c
+++ b/qemu-kvm.c
@@ -196,9 +196,11 @@ static void load_regs(CPUState *env)
sregs.cr2 = env->cr[2];
sregs.cr3 = env->cr[3];
sregs.cr4 = env->cr[4];
- sregs.cr8 = cpu_get_apic_tpr(env);
+
sregs.apic_base = cpu_get_apic_base(env);
sregs.efer = env->efer;
+ if (env->efer & MSR_EFER_LME)
+ sregs.cr8 = cpu_get_apic_tpr(env);
kvm_set_sregs(kvm_context, 0, &sregs);
@@ -279,10 +281,11 @@ static void save_regs(CPUState *env)
env->cr[3] = sregs.cr3;
env->cr[4] = sregs.cr4;
- cpu_set_apic_tpr(env, sregs.cr8);
cpu_set_apic_base(env, sregs.apic_base);
env->efer = sregs.efer;
+ if (env->efer & MSR_EFER_LME)
+ cpu_set_apic_tpr(env, sregs.cr8);
#define HFLAG_COPY_MASK ~( \
HF_CPL_MASK | HF_PE_MASK | HF_MP_MASK | HF_EM_MASK | \
@@ -383,7 +386,8 @@ static void post_kvm_run(void *opaque, struct kvm_run *kvm_run)
env->eflags = (kvm_run->if_flag) ? env->eflags | IF_MASK:env->eflags & ~IF_MASK;
env->ready_for_interrupt_injection = kvm_run->ready_for_interrupt_injection;
- cpu_set_apic_tpr(env, kvm_run->cr8);
+ if (env->efer & MSR_EFER_LME)
+ cpu_set_apic_tpr(env, kvm_run->cr8);
cpu_set_apic_base(env, kvm_run->apic_base);
}
diff --git a/sdl.c b/sdl.c
index bd357354f..e13331c66 100644
--- a/sdl.c
+++ b/sdl.c
@@ -276,7 +276,7 @@ static void sdl_update_caption(void)
strcpy(buf, "QEMU");
#if USE_KVM
if (kvm_allowed) {
- strcat(buf, "/KVM");
+ strcat(buf, "/KVM");
}
#endif
if (!vm_running) {