[Qemu-devel] [PATCH 3/3] Add KVM support to QEMU


From: Anthony Liguori
Subject: [Qemu-devel] [PATCH 3/3] Add KVM support to QEMU
Date: Tue, 28 Oct 2008 15:13:34 -0500

This patch adds very basic KVM support.  KVM is a kernel module for Linux that
allows userspace programs to make use of hardware virtualization support.  It
currently supports x86 hardware virtualization using Intel VT-x or AMD-V.  It
also supports IA64 VT-i, PPC 440, and S390.
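
As a quick sanity check on the host (illustrative commands; this assumes a
Linux host with the upstream kvm modules loaded), the device node that this
code opens at startup and the kernel headers that configure probes for should
both be present:

  lsmod | grep kvm        # expect kvm plus kvm_intel or kvm_amd
  ls -l /dev/kvm          # device node opened by kvm_init()
  ls /lib/modules/`uname -r`/build/include/linux/kvm.h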

This patch only implements the bare minimum support needed to get a guest
booting.  It has very little impact on the rest of QEMU and attempts to
integrate nicely with it.

Even though this implementation is basic, it is significantly faster than TCG.
Booting and shutting down a Linux guest:

w/TCG:  1:32.36 elapsed  84% CPU

w/KVM:  0:31.14 elapsed  59% CPU

Right now, KVM is disabled by default and must be explicitly enabled with
-enable-kvm.  We can enable it by default later once it has had more testing.
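
For example, a typical invocation might look like the following (the binary
name and disk image are illustrative and depend on the configured target):

  qemu-system-x86_64 -enable-kvm -m 512 -hda disk.img

The new "info kvm" monitor command added by this patch reports whether KVM is
actually in use:

  (qemu) info kvm
  kvm support: enabled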

Signed-off-by: Anthony Liguori <address@hidden>

diff --git a/KVM_TODO b/KVM_TODO
new file mode 100644
index 0000000..9529049
--- /dev/null
+++ b/KVM_TODO
@@ -0,0 +1,9 @@
+1) Add hooks for load/save of register state
+  o Fixes gdbstub, save/restore, and vmport
+2) Add VGA optimization
+3) Add IO thread
+4) Add guest SMP support
+5) Add TPR optimization
+6) Add support for in-kernel APIC
+7) Add support for in-kernel PIT
+8) Merge in additional changes in kvm-userspace tree
diff --git a/Makefile.target b/Makefile.target
index e2edf9d..903d66d 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -183,6 +183,9 @@ CFLAGS+=-I/opt/SUNWspro/prod/include/cc
 endif
 endif
 
+kvm.o: CFLAGS+=$(KVM_CFLAGS)
+kvm-all.o: CFLAGS+=$(KVM_CFLAGS)
+
 all: $(PROGS)
 
 #########################################################
@@ -475,6 +478,9 @@ ifndef CONFIG_USER_ONLY
 
 OBJS=vl.o osdep.o monitor.o pci.o loader.o isa_mmio.o machine.o net-checksum.o
 OBJS+=fw_cfg.o aio.o buffered_file.o migration.o migration-tcp.o
+ifdef CONFIG_KVM
+OBJS+=kvm.o kvm-all.o
+endif
 ifdef CONFIG_WIN32
 OBJS+=block-raw-win32.o
 else
diff --git a/configure b/configure
index aefa69b..7aed99d 100755
--- a/configure
+++ b/configure
@@ -113,6 +113,7 @@ aio="yes"
 nptl="yes"
 mixemu="no"
 bluez="yes"
+kvm="yes"
 
 # OS specific
 targetos=`uname -s`
@@ -300,6 +301,8 @@ for opt do
   ;;
   --disable-bluez) bluez="no"
   ;;
+  --disable-kvm) kvm="no"
+  ;;
   --enable-profiler) profiler="yes"
   ;;
   --enable-cocoa)
@@ -439,6 +442,7 @@ echo "  --disable-brlapi         disable BrlAPI"
 echo "  --disable-vnc-tls        disable TLS encryption for VNC server"
 echo "  --disable-curses         disable curses output"
 echo "  --disable-bluez          disable bluez stack connectivity"
+echo "  --disable-kvm            disable KVM acceleration support"
 echo "  --disable-nptl           disable usermode NPTL support"
 echo "  --enable-system          enable all system emulation targets"
 echo "  --disable-system         disable all system emulation targets"
@@ -933,6 +937,30 @@ EOF
 fi
 
 ##########################################
+# kvm probe
+if test "$kvm" = "yes" ; then
+    cat > $TMPC <<EOF
+#include <linux/kvm.h>
+#if !defined(KVM_API_VERSION) || \
+    KVM_API_VERSION < 12 || \
+    KVM_API_VERSION > 12 || \
+    !defined(KVM_CAP_USER_MEMORY) || \
+    !defined(KVM_CAP_SET_TSS_ADDR)
+#error Invalid KVM version
+#endif
+int main(void) { return 0; }
+EOF
+  # FIXME make this configurable
+  kvm_cflags=-I/lib/modules/`uname -r`/build/include
+  if $cc $ARCH_CFLAGS -o $TMPE ${OS_CFLAGS} $kvm_cflags $TMPC \
+      2>/dev/null ; then
+    :
+  else
+    kvm="no"
+  fi
+fi
+
+##########################################
 # AIO probe
 if test "$aio" = "yes" ; then
   aio=no
@@ -1018,6 +1046,7 @@ echo "uname -r          $uname_release"
 echo "NPTL support      $nptl"
 echo "vde support       $vde"
 echo "AIO support       $aio"
+echo "KVM support       $kvm"
 
 if test $sdl_too_old = "yes"; then
 echo "-> Your SDL version is too old - please upgrade to have SDL support"
@@ -1388,6 +1417,15 @@ interp_prefix1=`echo "$interp_prefix" | sed "s/%M/$target_cpu/g"`
 echo "#define CONFIG_QEMU_PREFIX \"$interp_prefix1\"" >> $config_h
 gdb_xml_files=""
 
+# FIXME allow i386 to build on x86_64 and vice versa
+if test "$kvm" = "yes" -a "$target_cpu" != "$cpu" ; then
+  kvm="no"
+fi
+# Disable KVM for linux-user
+if test "$kvm" = "yes" -a "$target_softmmu" = "no" ; then
+  kvm="no"
+fi
+
 case "$target_cpu" in
   i386)
     echo "TARGET_ARCH=i386" >> $config_mak
@@ -1397,6 +1435,11 @@ case "$target_cpu" in
     then
       echo "#define USE_KQEMU 1" >> $config_h
     fi
+    if test "$kvm" = "yes" ; then
+      echo "CONFIG_KVM=yes" >> $config_mak
+      echo "KVM_CFLAGS=$kvm_cflags" >> $config_mak
+      echo "#define CONFIG_KVM" >> $config_h
+    fi
     gcc3minver=`$cc --version 2> /dev/null| fgrep "(GCC) 3." | awk '{ print $3 }' | cut -f2 -d.`
     if test -n "$gcc3minver" && test $gcc3minver -gt 3
     then
@@ -1414,6 +1457,11 @@ case "$target_cpu" in
     then
       echo "#define USE_KQEMU 1" >> $config_h
     fi
+    if test "$kvm" = "yes" ; then
+      echo "CONFIG_KVM=yes" >> $config_mak
+      echo "KVM_CFLAGS=$kvm_cflags" >> $config_mak
+      echo "#define CONFIG_KVM 1" >> $config_h
+    fi
   ;;
   alpha)
     echo "TARGET_ARCH=alpha" >> $config_mak
diff --git a/cpu-defs.h b/cpu-defs.h
index 5dcac74..46d4487 100644
--- a/cpu-defs.h
+++ b/cpu-defs.h
@@ -142,6 +142,9 @@ typedef struct icount_decr_u16 {
 } icount_decr_u16;
 #endif
 
+struct kvm_run;
+struct KVMState;
+
 #define CPU_TEMP_BUF_NLONGS 128
 #define CPU_COMMON                                                      \
     struct TranslationBlock *current_tb; /* currently executing TB  */  \
@@ -199,6 +202,9 @@ typedef struct icount_decr_u16 {
     /* user data */                                                     \
     void *opaque;                                                       \
                                                                         \
-    const char *cpu_model_str;
+    const char *cpu_model_str;                                          \
+    struct KVMState *kvm_state;                                         \
+    struct kvm_run *kvm_run;                                            \
+    int kvm_fd;
 
 #endif
diff --git a/cpu-exec.c b/cpu-exec.c
index 6d4dcdd..04b3021 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -22,6 +22,7 @@
 #include "exec.h"
 #include "disas.h"
 #include "tcg.h"
+#include "kvm.h"
 
 #if !defined(CONFIG_SOFTMMU)
 #undef EAX
@@ -361,6 +362,19 @@ int cpu_exec(CPUState *env1)
             }
 #endif
 
+            if (kvm_enabled()) {
+                int ret;
+                ret = kvm_cpu_exec(env);
+                if ((env->interrupt_request & CPU_INTERRUPT_EXIT)) {
+                    env->interrupt_request &= ~CPU_INTERRUPT_EXIT;
+                    env->exception_index = EXCP_INTERRUPT;
+                    cpu_loop_exit();
+                } else if (env->halted) {
+                    cpu_loop_exit();
+                } else
+                    longjmp(env->jmp_env, 1);
+            }
+
             next_tb = 0; /* force lookup of first TB */
             for(;;) {
                 interrupt_request = env->interrupt_request;
diff --git a/exec.c b/exec.c
index f1fcec8..2623ac6 100644
--- a/exec.c
+++ b/exec.c
@@ -39,6 +39,7 @@
 #include "tcg.h"
 #include "hw/hw.h"
 #include "osdep.h"
+#include "kvm.h"
 #if defined(CONFIG_USER_ONLY)
 #include <qemu.h>
 #endif
@@ -2211,6 +2212,9 @@ void cpu_register_physical_memory(target_phys_addr_t start_addr,
         kqemu_set_phys_mem(start_addr, size, phys_offset);
     }
 #endif
+    if (kvm_enabled())
+        kvm_set_phys_mem(start_addr, size, phys_offset);
+
     size = (size + TARGET_PAGE_SIZE - 1) & TARGET_PAGE_MASK;
     end_addr = start_addr + (target_phys_addr_t)size;
     for(addr = start_addr; addr != end_addr; addr += TARGET_PAGE_SIZE) {
diff --git a/hw/acpi.c b/hw/acpi.c
index 45963d3..66a5faa 100644
--- a/hw/acpi.c
+++ b/hw/acpi.c
@@ -23,6 +23,7 @@
 #include "sysemu.h"
 #include "i2c.h"
 #include "smbus.h"
+#include "kvm.h"
 
 //#define DEBUG
 
@@ -501,6 +502,12 @@ i2c_bus *piix4_pm_init(PCIBus *bus, int devfn, uint32_t smb_io_base,
 
     register_ioport_write(ACPI_DBG_IO_ADDR, 4, 4, acpi_dbg_writel, s);
 
+    if (kvm_enabled()) {
+        /* Mark SMM as already inited to prevent SMM from running.  KVM does not
+         * support SMM mode. */
+        pci_conf[0x5B] = 0x02;
+    }
+
     /* XXX: which specification is used ? The i82731AB has different
        mappings */
     pci_conf[0x5f] = (parallel_hds[0] != NULL ? 0x80 : 0) | 0x10;
diff --git a/kvm-all.c b/kvm-all.c
new file mode 100644
index 0000000..4379071
--- /dev/null
+++ b/kvm-all.c
@@ -0,0 +1,377 @@
+/*
+ * QEMU KVM support
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <address@hidden>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include <linux/kvm.h>
+
+#include "qemu-common.h"
+#include "sysemu.h"
+#include "kvm.h"
+
+//#define DEBUG_KVM
+
+#ifdef DEBUG_KVM
+#define dprintf(fmt, ...) \
+    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
+#else
+#define dprintf(fmt, ...) \
+    do { } while (0)
+#endif
+
+typedef struct kvm_userspace_memory_region KVMSlot;
+
+int kvm_allowed = 0;
+
+struct KVMState
+{
+    KVMSlot slots[32];
+    int fd;
+    int vmfd;
+};
+
+static KVMState *kvm_state;
+
+static KVMSlot *kvm_alloc_slot(KVMState *s)
+{
+    int i;
+
+    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
+        if (s->slots[i].memory_size == 0)
+            return &s->slots[i];
+    }
+
+    return NULL;
+}
+
+static KVMSlot *kvm_lookup_slot(KVMState *s, target_phys_addr_t start_addr)
+{
+    int i;
+
+    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
+        KVMSlot *mem = &s->slots[i];
+
+        if (start_addr >= mem->guest_phys_addr &&
+            start_addr < (mem->guest_phys_addr + mem->memory_size))
+            return mem;
+    }
+
+    return NULL;
+}
+
+int kvm_init_vcpu(CPUState *env)
+{
+    KVMState *s = kvm_state;
+    long mmap_size;
+    int ret;
+
+    dprintf("kvm_init_vcpu\n");
+
+    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU,
+                       (void *)(unsigned long)env->cpu_index);
+    if (ret < 0) {
+        dprintf("kvm_create_vcpu failed\n");
+        goto err;
+    }
+
+    env->kvm_fd = ret;
+    env->kvm_state = s;
+
+    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
+    if (mmap_size < 0) {
+        dprintf("KVM_GET_VCPU_MMAP_SIZE failed\n");
+        goto err;
+    }
+
+    env->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
+                        env->kvm_fd, 0);
+    if (env->kvm_run == MAP_FAILED) {
+        ret = -errno;
+        dprintf("mmap'ing vcpu state failed\n");
+        goto err;
+    }
+
+    ret = kvm_arch_init_vcpu(env);
+
+err:
+    return ret;
+}
+
+int kvm_init(int smp_cpus)
+{
+    KVMState *s;
+    int ret;
+    int i;
+
+    if (smp_cpus > 1)
+        return -EINVAL;
+
+    s = qemu_mallocz(sizeof(KVMState));
+    if (s == NULL)
+        return -ENOMEM;
+
+    for (i = 0; i < ARRAY_SIZE(s->slots); i++)
+        s->slots[i].slot = i;
+
+    s->vmfd = -1;
+    s->fd = open("/dev/kvm", O_RDWR);
+    if (s->fd == -1) {
+        fprintf(stderr, "Could not access KVM kernel module: %m\n");
+        ret = -errno;
+        goto err;
+    }
+
+    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
+    if (ret < KVM_API_VERSION) {
+        if (ret > 0)
+            ret = -EINVAL;
+        fprintf(stderr, "kvm version too old\n");
+        goto err;
+    }
+
+    if (ret > KVM_API_VERSION) {
+        ret = -EINVAL;
+        fprintf(stderr, "kvm version not supported\n");
+        goto err;
+    }
+
+    s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0);
+    if (s->vmfd < 0)
+        goto err;
+
+    /* initially, KVM allocated its own memory and we had to jump through
+     * hoops to make phys_ram_base point to this.  Modern versions of KVM
+     * just use a user allocated buffer so we can use phys_ram_base
+     * unmodified.  Make sure we have a sufficiently modern version of KVM.
+     */
+    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, (void *)KVM_CAP_USER_MEMORY);
+    if (ret <= 0) {
+        if (ret == 0)
+            ret = -EINVAL;
+        fprintf(stderr, "kvm does not support KVM_CAP_USER_MEMORY\n");
+        goto err;
+    }
+
+    ret = kvm_arch_init(s, smp_cpus);
+    if (ret < 0)
+        goto err;
+
+    kvm_state = s;
+
+    return 0;
+
+err:
+    if (s) {
+        if (s->vmfd != -1)
+            close(s->vmfd);
+        if (s->fd != -1)
+            close(s->fd);
+    }
+    qemu_free(s);
+
+    return ret;
+}
+
+static int kvm_handle_io(CPUState *env, uint16_t port, void *data,
+                         int direction, int size, uint32_t count)
+{
+    int i;
+    uint8_t *ptr = data;
+
+    for (i = 0; i < count; i++) {
+        if (direction == KVM_EXIT_IO_IN) {
+            switch (size) {
+            case 1:
+                stb_p(ptr, cpu_inb(env, port));
+                break;
+            case 2:
+                stw_p(ptr, cpu_inw(env, port));
+                break;
+            case 4:
+                stl_p(ptr, cpu_inl(env, port));
+                break;
+            }
+        } else {
+            switch (size) {
+            case 1:
+                cpu_outb(env, port, ldub_p(ptr));
+                break;
+            case 2:
+                cpu_outw(env, port, lduw_p(ptr));
+                break;
+            case 4:
+                cpu_outl(env, port, ldl_p(ptr));
+                break;
+            }
+        }
+
+        ptr += size;
+    }
+
+    return 1;
+}
+
+int kvm_cpu_exec(CPUState *env)
+{
+    struct kvm_run *run = env->kvm_run;
+    int ret;
+
+    dprintf("kvm_cpu_exec()\n");
+
+    do {
+        kvm_arch_pre_run(env, run);
+
+        if ((env->interrupt_request & CPU_INTERRUPT_EXIT)) {
+            dprintf("interrupt exit requested\n");
+            ret = 0;
+            break;
+        }
+
+        dprintf("setting tpr\n");
+        run->cr8 = cpu_get_apic_tpr(env);
+
+        ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
+        kvm_arch_post_run(env, run);
+
+        if (ret == -EINTR || ret == -EAGAIN) {
+            dprintf("io window exit\n");
+            ret = 0;
+            break;
+        }
+
+        if (ret < 0) {
+            dprintf("kvm run failed %s\n", strerror(-ret));
+            abort();
+        }
+
+        ret = 0; /* exit loop */
+        switch (run->exit_reason) {
+        case KVM_EXIT_IO:
+            dprintf("handle_io\n");
+            ret = kvm_handle_io(env, run->io.port,
+                                (uint8_t *)run + run->io.data_offset,
+                                run->io.direction,
+                                run->io.size,
+                                run->io.count);
+            break;
+        case KVM_EXIT_MMIO:
+            dprintf("handle_mmio\n");
+            cpu_physical_memory_rw(run->mmio.phys_addr,
+                                   run->mmio.data,
+                                   run->mmio.len,
+                                   run->mmio.is_write);
+            ret = 1;
+            break;
+        case KVM_EXIT_IRQ_WINDOW_OPEN:
+            dprintf("irq_window_open\n");
+            break;
+        case KVM_EXIT_SHUTDOWN:
+            dprintf("shutdown\n");
+            qemu_system_reset_request();
+            ret = 1;
+            break;
+        case KVM_EXIT_UNKNOWN:
+            dprintf("kvm_exit_unknown\n");
+            break;
+        case KVM_EXIT_FAIL_ENTRY:
+            dprintf("kvm_exit_fail_entry\n");
+            break;
+        case KVM_EXIT_EXCEPTION:
+            dprintf("kvm_exit_exception\n");
+            break;
+        case KVM_EXIT_DEBUG:
+            dprintf("kvm_exit_debug\n");
+            break;
+        default:
+            dprintf("kvm_arch_handle_exit\n");
+            ret = kvm_arch_handle_exit(env, run);
+            break;
+        }
+    } while (ret > 0);
+
+    return ret;
+}
+
+void kvm_set_phys_mem(target_phys_addr_t start_addr,
+                      ram_addr_t size,
+                      ram_addr_t phys_offset)
+{
+    KVMState *s = kvm_state;
+    ram_addr_t flags = phys_offset & ~TARGET_PAGE_MASK;
+    KVMSlot *mem;
+
+    /* KVM does not support read-only slots */
+    phys_offset &= ~IO_MEM_ROM;
+
+    mem = kvm_lookup_slot(s, start_addr);
+    if (mem) {
+        if (flags == IO_MEM_UNASSIGNED) {
+            mem->memory_size = 0;
+            mem->guest_phys_addr = start_addr;
+            mem->userspace_addr = 0;
+            mem->flags = 0;
+
+            kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, mem);
+        } else if (start_addr >= mem->guest_phys_addr &&
+                   (start_addr + size) <= (mem->guest_phys_addr + mem->memory_size))
+            return;
+    }
+
+    /* KVM does not need to know about this memory */
+    if (flags >= IO_MEM_UNASSIGNED)
+        return;
+
+    mem = kvm_alloc_slot(s);
+    mem->memory_size = size;
+    mem->guest_phys_addr = start_addr;
+    mem->userspace_addr = (unsigned long)(phys_ram_base + phys_offset);
+    mem->flags = 0;
+
+    kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, mem);
+    /* FIXME deal with errors */
+}
+
+int kvm_ioctl(KVMState *s, int type, void *data)
+{
+    int ret;
+
+    ret = ioctl(s->fd, type, data);
+    if (ret == -1)
+        ret = -errno;
+
+    return ret;
+}
+
+int kvm_vm_ioctl(KVMState *s, int type, void *data)
+{
+    int ret;
+
+    ret = ioctl(s->vmfd, type, data);
+    if (ret == -1)
+        ret = -errno;
+
+    return ret;
+}
+
+int kvm_vcpu_ioctl(CPUState *env, int type, void *data)
+{
+    int ret;
+
+    ret = ioctl(env->kvm_fd, type, data);
+    if (ret == -1)
+        ret = -errno;
+
+    return ret;
+}
diff --git a/kvm.h b/kvm.h
new file mode 100644
index 0000000..37102b4
--- /dev/null
+++ b/kvm.h
@@ -0,0 +1,68 @@
+/*
+ * QEMU KVM support
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <address@hidden>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_KVM_H
+#define QEMU_KVM_H
+
+#include "config.h"
+
+#ifdef CONFIG_KVM
+extern int kvm_allowed;
+
+#define kvm_enabled() (kvm_allowed)
+#else
+#define kvm_enabled() (0)
+#endif
+
+struct kvm_run;
+
+/* external API */
+
+int kvm_init(int smp_cpus);
+
+int kvm_init_vcpu(CPUState *env);
+
+int kvm_cpu_exec(CPUState *env);
+
+void kvm_set_phys_mem(target_phys_addr_t start_addr,
+                      ram_addr_t size,
+                      ram_addr_t phys_offset);
+
+/* internal API */
+
+struct KVMState;
+typedef struct KVMState KVMState;
+
+int kvm_ioctl(KVMState *s, int type, void *data);
+
+int kvm_vm_ioctl(KVMState *s, int type, void *data);
+
+int kvm_vcpu_ioctl(CPUState *env, int type, void *data);
+
+/* Arch specific hooks */
+
+int kvm_arch_post_run(CPUState *env, struct kvm_run *run);
+
+int kvm_arch_handle_exit(CPUState *env, struct kvm_run *run);
+
+int kvm_arch_pre_run(CPUState *env, struct kvm_run *run);
+
+int kvm_arch_get_registers(CPUState *env);
+
+int kvm_arch_put_registers(CPUState *env);
+
+int kvm_arch_init(KVMState *s, int smp_cpus);
+
+int kvm_arch_init_vcpu(CPUState *env);
+
+#endif
diff --git a/monitor.c b/monitor.c
index f0a0bc3..dc90a2b 100644
--- a/monitor.c
+++ b/monitor.c
@@ -37,6 +37,7 @@
 #include <dirent.h>
 #include "qemu-timer.h"
 #include "migration.h"
+#include "kvm.h"
 
 //#define DEBUG
 //#define DEBUG_COMPLETION
@@ -1263,6 +1264,19 @@ static void do_info_kqemu(void)
 #endif
 }
 
+static void do_info_kvm(void)
+{
+#ifdef CONFIG_KVM
+    term_printf("kvm support: ");
+    if (kvm_enabled())
+       term_printf("enabled\n");
+    else
+       term_printf("disabled\n");
+#else
+    term_printf("kvm support: not compiled\n");
+#endif
+}
+
 #ifdef CONFIG_PROFILER
 
 int64_t kqemu_time;
@@ -1495,6 +1509,8 @@ static const term_cmd_t info_cmds[] = {
       "", "show dynamic compiler info", },
     { "kqemu", "", do_info_kqemu,
       "", "show kqemu information", },
+    { "kvm", "", do_info_kvm,
+      "", "show kvm information", },
     { "usb", "", usb_info,
       "", "show guest USB devices", },
     { "usbhost", "", usb_host_info,
diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 263a477..167bae2 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -587,6 +587,8 @@ typedef struct CPUX86State {
     target_ulong kernelgsbase;
 #endif
 
+    uint64_t tsc;
+
     uint64_t pat;
 
     /* exception/interrupt handling */
@@ -617,6 +619,10 @@ typedef struct CPUX86State {
     int kqemu_enabled;
     int last_io_time;
 #endif
+
+    /* For KVM */
+    uint64_t interrupt_bitmap[256 / 64];
+
     /* in order to simplify APIC support, we leave this pointer to the
        user */
     struct APICState *apic_state;
diff --git a/target-i386/helper.c b/target-i386/helper.c
index 905ae9b..e550f74 100644
--- a/target-i386/helper.c
+++ b/target-i386/helper.c
@@ -29,6 +29,7 @@
 #include "exec-all.h"
 #include "svm.h"
 #include "qemu-common.h"
+#include "kvm.h"
 
 //#define DEBUG_MMU
 
@@ -115,6 +116,8 @@ CPUX86State *cpu_x86_init(const char *cpu_model)
 #ifdef USE_KQEMU
     kqemu_init(env);
 #endif
+    if (kvm_enabled())
+        kvm_init_vcpu(env);
     return env;
 }
 
@@ -1288,6 +1291,40 @@ target_phys_addr_t cpu_get_phys_page_debug(CPUState *env, target_ulong addr)
 }
 #endif /* !CONFIG_USER_ONLY */
 
+#if defined(CONFIG_KVM)
+static void host_cpuid(uint32_t function, uint32_t *eax, uint32_t *ebx,
+                       uint32_t *ecx, uint32_t *edx)
+{
+    uint32_t vec[4];
+
+#ifdef __x86_64__
+    asm volatile("cpuid"
+                : "=a"(vec[0]), "=b"(vec[1]),
+                  "=c"(vec[2]), "=d"(vec[3])
+                : "0"(function) : "cc");
+#else
+    asm volatile("pusha \n\t"
+                "cpuid \n\t"
+                "mov %%eax, 0(%1) \n\t"
+                "mov %%ebx, 4(%1) \n\t"
+                "mov %%ecx, 8(%1) \n\t"
+                "mov %%edx, 12(%1) \n\t"
+                "popa"
+                : : "a"(function), "S"(vec)
+                : "memory", "cc");
+#endif
+
+    if (eax)
+       *eax = vec[0];
+    if (ebx)
+       *ebx = vec[1];
+    if (ecx)
+       *ecx = vec[2];
+    if (edx)
+       *edx = vec[3];
+}
+#endif
+
 void cpu_x86_cpuid(CPUX86State *env, uint32_t index,
                    uint32_t *eax, uint32_t *ebx,
                    uint32_t *ecx, uint32_t *edx)
@@ -1307,12 +1344,23 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index,
         *ebx = env->cpuid_vendor1;
         *edx = env->cpuid_vendor2;
         *ecx = env->cpuid_vendor3;
+
+        /* sysenter isn't supported in compatibility mode on AMD, and syscall
+         * isn't supported in compatibility mode on Intel.  So advertise the
+         * actual CPU, and say goodbye to migration between different vendors
+         * if you use compatibility mode. */
+        if (kvm_enabled())
+            host_cpuid(0, NULL, ebx, ecx, edx);
         break;
     case 1:
         *eax = env->cpuid_version;
         *ebx = (env->cpuid_apic_id << 24) | 8 << 8; /* CLFLUSH size in quad words, Linux wants it. */
         *ecx = env->cpuid_ext_features;
         *edx = env->cpuid_features;
+
+        /* "Hypervisor present" bit required for Microsoft SVVP */
+        if (kvm_enabled())
+            *ecx |= (1 << 31);
         break;
     case 2:
         /* cache info: needed for Pentium Pro compatibility */
@@ -1390,6 +1438,31 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index,
         *ebx = 0;
         *ecx = env->cpuid_ext3_features;
         *edx = env->cpuid_ext2_features;
+
+        if (kvm_enabled()) {
+            uint32_t h_eax, h_edx;
+
+            host_cpuid(0x80000001, &h_eax, NULL, NULL, &h_edx);
+
+            /* disable CPU features that the host does not support */
+
+            /* long mode */
+            if ((h_edx & 0x20000000) == 0 /* || !lm_capable_kernel */)
+                *edx &= ~0x20000000;
+            /* syscall */
+            if ((h_edx & 0x00000800) == 0)
+                *edx &= ~0x00000800;
+            /* nx */
+            if ((h_edx & 0x00100000) == 0)
+                *edx &= ~0x00100000;
+
+            /* disable CPU features that KVM cannot support */
+
+            /* svm */
+            *ecx &= ~4UL;
+            /* 3dnow */
+            *edx &= ~0xc0000000;
+        }
         break;
     case 0x80000002:
     case 0x80000003:
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
new file mode 100644
index 0000000..ff372af
--- /dev/null
+++ b/target-i386/kvm.c
@@ -0,0 +1,635 @@
+/*
+ * QEMU KVM support
+ *
+ * Copyright (C) 2006-2008 Qumranet Technologies
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori   <address@hidden>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+
+#include <linux/kvm.h>
+
+#include "qemu-common.h"
+#include "sysemu.h"
+#include "kvm.h"
+#include "cpu.h"
+
+//#define DEBUG_KVM
+
+#ifdef DEBUG_KVM
+#define dprintf(fmt, ...) \
+    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
+#else
+#define dprintf(fmt, ...) \
+    do { } while (0)
+#endif
+
+int kvm_arch_init_vcpu(CPUState *env)
+{
+    struct {
+        struct kvm_cpuid cpuid;
+        struct kvm_cpuid_entry entries[100];
+    } __attribute__((packed)) cpuid_data;
+    int limit, i, cpuid_i;
+    uint32_t eax, ebx, ecx, edx;
+
+    cpuid_i = 0;
+
+    cpu_x86_cpuid(env, 0, &eax, &ebx, &ecx, &edx);
+    limit = eax;
+
+    for (i = 0; i < limit; i++) {
+        struct kvm_cpuid_entry *c = &cpuid_data.entries[cpuid_i++];
+
+        cpu_x86_cpuid(env, i, &eax, &ebx, &ecx, &edx);
+        c->function = i;
+        c->eax = eax;
+        c->ebx = ebx;
+        c->ecx = ecx;
+        c->edx = edx;
+    }
+
+    cpu_x86_cpuid(env, 0x80000000, &eax, &ebx, &ecx, &edx);
+    limit = eax;
+
+    for (i = 0x80000000; i < limit; i++) {
+        struct kvm_cpuid_entry *c = &cpuid_data.entries[cpuid_i++];
+
+        cpu_x86_cpuid(env, i, &eax, &ebx, &ecx, &edx);
+        c->function = i;
+        c->eax = eax;
+        c->ebx = ebx;
+        c->ecx = ecx;
+        c->edx = edx;
+    }
+
+    cpuid_data.cpuid.nent = cpuid_i;
+
+    return kvm_vcpu_ioctl(env, KVM_SET_CPUID, &cpuid_data);
+}
+
+static int kvm_has_msr_star(CPUState *env)
+{
+    static int has_msr_star;
+    int ret;
+
+    /* first time */
+    if (has_msr_star == 0) {        
+        struct kvm_msr_list msr_list, *kvm_msr_list;
+
+        has_msr_star = -1;
+
+        /* Obtain MSR list from KVM.  These are the MSRs that we must
+         * save/restore */
+        ret = kvm_ioctl(env->kvm_state, KVM_GET_MSR_INDEX_LIST, &msr_list);
+        if (ret < 0)
+            return 0;
+
+        msr_list.nmsrs = 0;
+        kvm_msr_list = qemu_mallocz(sizeof(msr_list) +
+                                    msr_list.nmsrs * sizeof(msr_list.indices[0]));
+        if (kvm_msr_list == NULL)
+            return 0;
+
+        ret = kvm_ioctl(env->kvm_state, KVM_GET_MSR_INDEX_LIST, kvm_msr_list);
+        if (ret >= 0) {
+            int i;
+
+            for (i = 0; i < kvm_msr_list->nmsrs; i++) {
+                if (kvm_msr_list->indices[i] == MSR_STAR) {
+                    has_msr_star = 1;
+                    break;
+                }
+            }
+        }
+
+        free(kvm_msr_list);
+    }
+
+    if (has_msr_star == 1)
+        return 1;
+    return 0;
+}
+
+int kvm_arch_init(KVMState *s, int smp_cpus)
+{
+    int ret;
+
+    /* create vm86 tss.  KVM uses vm86 mode to emulate 16-bit code
+     * directly.  In order to use vm86 mode, a TSS is needed.  Since this
+     * must be part of guest physical memory, we need to allocate it.  Older
+     * versions of KVM just assumed that it would be at the end of physical
+     * memory but that doesn't work with more than 4GB of memory.  We simply
+     * refuse to work with those older versions of KVM. */
+    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, (void *)KVM_CAP_SET_TSS_ADDR);
+    if (ret <= 0) {
+        fprintf(stderr, "kvm does not support KVM_CAP_SET_TSS_ADDR\n");
+        return ret;
+    }
+
+    /* this address is 3 pages before the bios, and the bios should present
+     * as unavailable memory.  FIXME, need to ensure the e820 map deals with
+     * this?
+     */
+    return kvm_vm_ioctl(s, KVM_SET_TSS_ADDR, (void *)0xfffbd000);
+}
+                    
+static void set_v8086_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
+{
+    lhs->selector = rhs->selector;
+    lhs->base = rhs->base;
+    lhs->limit = rhs->limit;
+    lhs->type = 3;
+    lhs->present = 1;
+    lhs->dpl = 3;
+    lhs->db = 0;
+    lhs->s = 1;
+    lhs->l = 0;
+    lhs->g = 0;
+    lhs->avl = 0;
+    lhs->unusable = 0;
+}
+
+static void set_seg(struct kvm_segment *lhs, const SegmentCache *rhs)
+{
+    unsigned flags = rhs->flags;
+    lhs->selector = rhs->selector;
+    lhs->base = rhs->base;
+    lhs->limit = rhs->limit;
+    lhs->type = (flags >> DESC_TYPE_SHIFT) & 15;
+    lhs->present = (flags & DESC_P_MASK) != 0;
+    lhs->dpl = rhs->selector & 3;
+    lhs->db = (flags >> DESC_B_SHIFT) & 1;
+    lhs->s = (flags & DESC_S_MASK) != 0;
+    lhs->l = (flags >> DESC_L_SHIFT) & 1;
+    lhs->g = (flags & DESC_G_MASK) != 0;
+    lhs->avl = (flags & DESC_AVL_MASK) != 0;
+    lhs->unusable = 0;
+}
+
+static void get_seg(SegmentCache *lhs, const struct kvm_segment *rhs)
+{
+    lhs->selector = rhs->selector;
+    lhs->base = rhs->base;
+    lhs->limit = rhs->limit;
+    lhs->flags =
+       (rhs->type << DESC_TYPE_SHIFT)
+       | (rhs->present * DESC_P_MASK)
+       | (rhs->dpl << DESC_DPL_SHIFT)
+       | (rhs->db << DESC_B_SHIFT)
+       | (rhs->s * DESC_S_MASK)
+       | (rhs->l << DESC_L_SHIFT)
+       | (rhs->g * DESC_G_MASK)
+       | (rhs->avl * DESC_AVL_MASK);
+}
+
+static void kvm_getput_reg(__u64 *kvm_reg, target_ulong *qemu_reg, int set)
+{
+    if (set)
+        *kvm_reg = *qemu_reg;
+    else
+        *qemu_reg = *kvm_reg;
+}
+
+static int kvm_getput_regs(CPUState *env, int set)
+{
+    struct kvm_regs regs;
+    int ret = 0;
+
+    if (!set) {
+        ret = kvm_vcpu_ioctl(env, KVM_GET_REGS, &regs);
+        if (ret < 0)
+            return ret;
+    }
+
+    kvm_getput_reg(&regs.rax, &env->regs[R_EAX], set);
+    kvm_getput_reg(&regs.rbx, &env->regs[R_EBX], set);
+    kvm_getput_reg(&regs.rcx, &env->regs[R_ECX], set);
+    kvm_getput_reg(&regs.rdx, &env->regs[R_EDX], set);
+    kvm_getput_reg(&regs.rsi, &env->regs[R_ESI], set);
+    kvm_getput_reg(&regs.rdi, &env->regs[R_EDI], set);
+    kvm_getput_reg(&regs.rsp, &env->regs[R_ESP], set);
+    kvm_getput_reg(&regs.rbp, &env->regs[R_EBP], set);
+#ifdef TARGET_X86_64
+    kvm_getput_reg(&regs.r8, &env->regs[8], set);
+    kvm_getput_reg(&regs.r9, &env->regs[9], set);
+    kvm_getput_reg(&regs.r10, &env->regs[10], set);
+    kvm_getput_reg(&regs.r11, &env->regs[11], set);
+    kvm_getput_reg(&regs.r12, &env->regs[12], set);
+    kvm_getput_reg(&regs.r13, &env->regs[13], set);
+    kvm_getput_reg(&regs.r14, &env->regs[14], set);
+    kvm_getput_reg(&regs.r15, &env->regs[15], set);
+#endif
+
+    kvm_getput_reg(&regs.rflags, &env->eflags, set);
+    kvm_getput_reg(&regs.rip, &env->eip, set);
+
+    if (set)
+        ret = kvm_vcpu_ioctl(env, KVM_SET_REGS, &regs);
+
+    return ret;
+}
+
+static int kvm_put_fpu(CPUState *env)
+{
+    struct kvm_fpu fpu;
+    int i;
+
+    memset(&fpu, 0, sizeof fpu);
+    fpu.fsw = env->fpus & ~(7 << 11);
+    fpu.fsw |= (env->fpstt & 7) << 11;
+    fpu.fcw = env->fpuc;
+    for (i = 0; i < 8; ++i)
+       fpu.ftwx |= (!env->fptags[i]) << i;
+    memcpy(fpu.fpr, env->fpregs, sizeof env->fpregs);
+    memcpy(fpu.xmm, env->xmm_regs, sizeof env->xmm_regs);
+    fpu.mxcsr = env->mxcsr;
+
+    return kvm_vcpu_ioctl(env, KVM_SET_FPU, &fpu);
+}
+
+static int kvm_put_sregs(CPUState *env)
+{
+    struct kvm_sregs sregs;
+
+    memcpy(sregs.interrupt_bitmap,
+           env->interrupt_bitmap,
+           sizeof(sregs.interrupt_bitmap));
+
+    if ((env->eflags & VM_MASK)) {
+           set_v8086_seg(&sregs.cs, &env->segs[R_CS]);
+           set_v8086_seg(&sregs.ds, &env->segs[R_DS]);
+           set_v8086_seg(&sregs.es, &env->segs[R_ES]);
+           set_v8086_seg(&sregs.fs, &env->segs[R_FS]);
+           set_v8086_seg(&sregs.gs, &env->segs[R_GS]);
+           set_v8086_seg(&sregs.ss, &env->segs[R_SS]);
+    } else {
+           set_seg(&sregs.cs, &env->segs[R_CS]);
+           set_seg(&sregs.ds, &env->segs[R_DS]);
+           set_seg(&sregs.es, &env->segs[R_ES]);
+           set_seg(&sregs.fs, &env->segs[R_FS]);
+           set_seg(&sregs.gs, &env->segs[R_GS]);
+           set_seg(&sregs.ss, &env->segs[R_SS]);
+
+           if (env->cr[0] & CR0_PE_MASK) {
+               /* force ss cpl to cs cpl */
+               sregs.ss.selector = (sregs.ss.selector & ~3) |
+                       (sregs.cs.selector & 3);
+               sregs.ss.dpl = sregs.ss.selector & 3;
+           }
+    }
+
+    set_seg(&sregs.tr, &env->tr);
+    set_seg(&sregs.ldt, &env->ldt);
+
+    sregs.idt.limit = env->idt.limit;
+    sregs.idt.base = env->idt.base;
+    sregs.gdt.limit = env->gdt.limit;
+    sregs.gdt.base = env->gdt.base;
+
+    sregs.cr0 = env->cr[0];
+    sregs.cr2 = env->cr[2];
+    sregs.cr3 = env->cr[3];
+    sregs.cr4 = env->cr[4];
+
+    sregs.cr8 = cpu_get_apic_tpr(env);
+    sregs.apic_base = cpu_get_apic_base(env);
+
+    sregs.efer = env->efer;
+
+    return kvm_vcpu_ioctl(env, KVM_SET_SREGS, &sregs);
+}
+
+static void kvm_msr_entry_set(struct kvm_msr_entry *entry,
+                              uint32_t index, uint64_t value)
+{
+    entry->index = index;
+    entry->data = value;
+}
+
+static int kvm_put_msrs(CPUState *env)
+{
+    struct {
+        struct kvm_msrs info;
+        struct kvm_msr_entry entries[100];
+    } msr_data;
+    struct kvm_msr_entry *msrs = msr_data.entries;
+    int n = 0;
+
+    kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_CS, env->sysenter_cs);
+    kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_ESP, env->sysenter_esp);
+    kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_EIP, env->sysenter_eip);
+    if (kvm_has_msr_star(env))
+       kvm_msr_entry_set(&msrs[n++], MSR_STAR, env->star);
+    kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc);
+#ifdef TARGET_X86_64
+    /* FIXME if lm capable */
+    kvm_msr_entry_set(&msrs[n++], MSR_CSTAR, env->cstar);
+    kvm_msr_entry_set(&msrs[n++], MSR_KERNELGSBASE, env->kernelgsbase);
+    kvm_msr_entry_set(&msrs[n++], MSR_FMASK, env->fmask);
+    kvm_msr_entry_set(&msrs[n++], MSR_LSTAR, env->lstar);
+#endif
+    msr_data.info.nmsrs = n;
+
+    return kvm_vcpu_ioctl(env, KVM_SET_MSRS, &msr_data);
+
+}
+
+
+static int kvm_get_fpu(CPUState *env)
+{
+    struct kvm_fpu fpu;
+    int i, ret;
+
+    ret = kvm_vcpu_ioctl(env, KVM_GET_FPU, &fpu);
+    if (ret < 0)
+        return ret;
+
+    env->fpstt = (fpu.fsw >> 11) & 7;
+    env->fpus = fpu.fsw;
+    env->fpuc = fpu.fcw;
+    for (i = 0; i < 8; ++i)
+       env->fptags[i] = !((fpu.ftwx >> i) & 1);
+    memcpy(env->fpregs, fpu.fpr, sizeof env->fpregs);
+    memcpy(env->xmm_regs, fpu.xmm, sizeof env->xmm_regs);
+    env->mxcsr = fpu.mxcsr;
+
+    return 0;
+}
+
+static int kvm_get_sregs(CPUState *env)
+{
+    struct kvm_sregs sregs;
+    uint32_t hflags;
+    int ret;
+
+    ret = kvm_vcpu_ioctl(env, KVM_GET_SREGS, &sregs);
+    if (ret < 0)
+        return ret;
+
+    memcpy(env->interrupt_bitmap, 
+           sregs.interrupt_bitmap,
+           sizeof(sregs.interrupt_bitmap));
+
+    get_seg(&env->segs[R_CS], &sregs.cs);
+    get_seg(&env->segs[R_DS], &sregs.ds);
+    get_seg(&env->segs[R_ES], &sregs.es);
+    get_seg(&env->segs[R_FS], &sregs.fs);
+    get_seg(&env->segs[R_GS], &sregs.gs);
+    get_seg(&env->segs[R_SS], &sregs.ss);
+
+    get_seg(&env->tr, &sregs.tr);
+    get_seg(&env->ldt, &sregs.ldt);
+
+    env->idt.limit = sregs.idt.limit;
+    env->idt.base = sregs.idt.base;
+    env->gdt.limit = sregs.gdt.limit;
+    env->gdt.base = sregs.gdt.base;
+
+    env->cr[0] = sregs.cr0;
+    env->cr[2] = sregs.cr2;
+    env->cr[3] = sregs.cr3;
+    env->cr[4] = sregs.cr4;
+
+    cpu_set_apic_base(env, sregs.apic_base);
+
+    env->efer = sregs.efer;
+    //cpu_set_apic_tpr(env, sregs.cr8);
+
+#define HFLAG_COPY_MASK ~( \
+                       HF_CPL_MASK | HF_PE_MASK | HF_MP_MASK | HF_EM_MASK | \
+                       HF_TS_MASK | HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK | \
+                       HF_OSFXSR_MASK | HF_LMA_MASK | HF_CS32_MASK | \
+                       HF_SS32_MASK | HF_CS64_MASK | HF_ADDSEG_MASK)
+
+
+
+    hflags = (env->segs[R_CS].flags >> DESC_DPL_SHIFT) & HF_CPL_MASK;
+    hflags |= (env->cr[0] & CR0_PE_MASK) << (HF_PE_SHIFT - CR0_PE_SHIFT);
+    hflags |= (env->cr[0] << (HF_MP_SHIFT - CR0_MP_SHIFT)) &
+           (HF_MP_MASK | HF_EM_MASK | HF_TS_MASK);
+    hflags |= (env->eflags & (HF_TF_MASK | HF_VM_MASK | HF_IOPL_MASK));
+    hflags |= (env->cr[4] & CR4_OSFXSR_MASK) <<
+           (HF_OSFXSR_SHIFT - CR4_OSFXSR_SHIFT);
+
+    if (env->efer & MSR_EFER_LMA) {
+        hflags |= HF_LMA_MASK;
+    }
+
+    if ((hflags & HF_LMA_MASK) && (env->segs[R_CS].flags & DESC_L_MASK)) {
+        hflags |= HF_CS32_MASK | HF_SS32_MASK | HF_CS64_MASK;
+    } else {
+        hflags |= (env->segs[R_CS].flags & DESC_B_MASK) >>
+               (DESC_B_SHIFT - HF_CS32_SHIFT);
+        hflags |= (env->segs[R_SS].flags & DESC_B_MASK) >>
+               (DESC_B_SHIFT - HF_SS32_SHIFT);
+        if (!(env->cr[0] & CR0_PE_MASK) ||
+                   (env->eflags & VM_MASK) ||
+                   !(hflags & HF_CS32_MASK)) {
+                hflags |= HF_ADDSEG_MASK;
+            } else {
+                hflags |= ((env->segs[R_DS].base |
+                                env->segs[R_ES].base |
+                                env->segs[R_SS].base) != 0) <<
+                    HF_ADDSEG_SHIFT;
+            }
+    }
+    env->hflags = (env->hflags & HFLAG_COPY_MASK) | hflags;
+    env->cc_src = env->eflags & (CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C);
+    env->df = 1 - (2 * ((env->eflags >> 10) & 1));
+    env->cc_op = CC_OP_EFLAGS;
+    env->eflags &= ~(DF_MASK | CC_O | CC_S | CC_Z | CC_A | CC_P | CC_C);
+
+    return 0;
+}
+
+static int kvm_get_msrs(CPUState *env)
+{
+    struct {
+        struct kvm_msrs info;
+        struct kvm_msr_entry entries[100];
+    } msr_data;
+    struct kvm_msr_entry *msrs = msr_data.entries;
+    int ret, i, n;
+
+    n = 0;
+    msrs[n++].index = MSR_IA32_SYSENTER_CS;
+    msrs[n++].index = MSR_IA32_SYSENTER_ESP;
+    msrs[n++].index = MSR_IA32_SYSENTER_EIP;
+    if (kvm_has_msr_star(env))
+       msrs[n++].index = MSR_STAR;
+    msrs[n++].index = MSR_IA32_TSC;
+#ifdef TARGET_X86_64
+    /* FIXME lm_capable_kernel */
+    msrs[n++].index = MSR_CSTAR;
+    msrs[n++].index = MSR_KERNELGSBASE;
+    msrs[n++].index = MSR_FMASK;
+    msrs[n++].index = MSR_LSTAR;
+#endif
+    msr_data.info.nmsrs = n;
+    ret = kvm_vcpu_ioctl(env, KVM_GET_MSRS, &msr_data);
+    if (ret < 0)
+        return ret;
+
+    for (i = 0; i < ret; i++) {
+        switch (msrs[i].index) {
+        case MSR_IA32_SYSENTER_CS:
+            env->sysenter_cs = msrs[i].data;
+            break;
+        case MSR_IA32_SYSENTER_ESP:
+            env->sysenter_esp = msrs[i].data;
+            break;
+        case MSR_IA32_SYSENTER_EIP:
+            env->sysenter_eip = msrs[i].data;
+            break;
+        case MSR_STAR:
+            env->star = msrs[i].data;
+            break;
+#ifdef TARGET_X86_64
+        case MSR_CSTAR:
+            env->cstar = msrs[i].data;
+            break;
+        case MSR_KERNELGSBASE:
+            env->kernelgsbase = msrs[i].data;
+            break;
+        case MSR_FMASK:
+            env->fmask = msrs[i].data;
+            break;
+        case MSR_LSTAR:
+            env->lstar = msrs[i].data;
+            break;
+#endif
+        case MSR_IA32_TSC:
+            env->tsc = msrs[i].data;
+            break;
+        }
+    }
+
+    return 0;
+}
+
+int kvm_arch_put_registers(CPUState *env)
+{
+    int ret;
+
+    ret = kvm_getput_regs(env, 1);
+    if (ret < 0)
+        return ret;
+
+    ret = kvm_put_fpu(env);
+    if (ret < 0)
+        return ret;
+
+    ret = kvm_put_sregs(env);
+    if (ret < 0)
+        return ret;
+
+    ret = kvm_put_msrs(env);
+    if (ret < 0)
+        return ret;
+
+    return 0;
+}
+
+int kvm_arch_get_registers(CPUState *env)
+{
+    int ret;
+
+    ret = kvm_getput_regs(env, 0);
+    if (ret < 0)
+        return ret;
+
+    ret = kvm_get_fpu(env);
+    if (ret < 0)
+        return ret;
+
+    ret = kvm_get_sregs(env);
+    if (ret < 0)
+        return ret;
+
+    ret = kvm_get_msrs(env);
+    if (ret < 0)
+        return ret;
+
+    return 0;
+}
+
+int kvm_arch_pre_run(CPUState *env, struct kvm_run *run)
+{
+    /* Try to inject an interrupt if the guest can accept it */
+    if (run->ready_for_interrupt_injection &&
+        (env->interrupt_request & CPU_INTERRUPT_HARD) &&
+        (env->eflags & IF_MASK)) {
+        int irq;
+
+        env->interrupt_request &= ~CPU_INTERRUPT_HARD;
+        irq = cpu_get_pic_interrupt(env);
+        if (irq >= 0) {
+            struct kvm_interrupt intr;
+            intr.irq = irq;
+            /* FIXME: errors */
+            dprintf("injected interrupt %d\n", irq);
+            kvm_vcpu_ioctl(env, KVM_INTERRUPT, &intr);
+        }
+    }
+
+    /* If we have an interrupt but the guest is not ready to receive an
+     * interrupt, request an interrupt window exit.  This will
+     * cause a return to userspace as soon as the guest is ready to
+     * receive interrupts. */
+    if ((env->interrupt_request & CPU_INTERRUPT_HARD))
+        run->request_interrupt_window = 1;
+    else
+        run->request_interrupt_window = 0;
+
+    return 0;
+}
+
+int kvm_arch_post_run(CPUState *env, struct kvm_run *run)
+{
+    if (run->if_flag)
+        env->eflags |= IF_MASK;
+    else
+        env->eflags &= ~IF_MASK;
+    
+    cpu_set_apic_tpr(env, run->cr8);
+    cpu_set_apic_base(env, run->apic_base);
+
+    return 0;
+}
+
+static int kvm_handle_halt(CPUState *env)
+{
+    if (!((env->interrupt_request & CPU_INTERRUPT_HARD) &&
+          (env->eflags & IF_MASK)) &&
+        !(env->interrupt_request & CPU_INTERRUPT_NMI)) {
+        env->halted = 1;
+        env->exception_index = EXCP_HLT;
+        return 0;
+    }
+
+    return 1;
+}
+
+int kvm_arch_handle_exit(CPUState *env, struct kvm_run *run)
+{
+    int ret = 0;
+
+    switch (run->exit_reason) {
+    case KVM_EXIT_HLT:
+        dprintf("handle_hlt\n");
+        ret = kvm_handle_halt(env);
+        break;
+    }
+
+    return ret;
+}
diff --git a/vl.c b/vl.c
index 74ae652..ecda8d5 100644
--- a/vl.c
+++ b/vl.c
@@ -39,6 +39,7 @@
 #include "block.h"
 #include "audio/audio.h"
 #include "migration.h"
+#include "kvm.h"
 
 #include <unistd.h>
 #include <fcntl.h>
@@ -8258,6 +8259,9 @@ static void help(int exitcode)
            "-kernel-kqemu   enable KQEMU full virtualization (default is user mode only)\n"
            "-no-kqemu       disable KQEMU kernel module usage\n"
 #endif
+#ifdef CONFIG_KVM
+           "-enable-kvm     enable KVM full virtualization support\n"
+#endif
 #ifdef TARGET_I386
            "-no-acpi        disable ACPI\n"
 #endif
@@ -8363,6 +8367,7 @@ enum {
     QEMU_OPTION_pidfile,
     QEMU_OPTION_no_kqemu,
     QEMU_OPTION_kernel_kqemu,
+    QEMU_OPTION_enable_kvm,
     QEMU_OPTION_win2k_hack,
     QEMU_OPTION_usb,
     QEMU_OPTION_usbdevice,
@@ -8449,6 +8454,9 @@ static const QEMUOption qemu_options[] = {
     { "no-kqemu", 0, QEMU_OPTION_no_kqemu },
     { "kernel-kqemu", 0, QEMU_OPTION_kernel_kqemu },
 #endif
+#ifdef CONFIG_KVM
+    { "enable-kvm", 0, QEMU_OPTION_enable_kvm },
+#endif
 #if defined(TARGET_PPC) || defined(TARGET_SPARC)
     { "g", 1, QEMU_OPTION_g },
 #endif
@@ -9271,6 +9279,14 @@ int main(int argc, char **argv)
                 kqemu_allowed = 2;
                 break;
 #endif
+#ifdef CONFIG_KVM
+            case QEMU_OPTION_enable_kvm:
+                kvm_allowed = 1;
+#ifdef USE_KQEMU
+                kqemu_allowed = 0;
+#endif
+                break;
+#endif
             case QEMU_OPTION_usb:
                 usb_enabled = 1;
                 break;
@@ -9405,6 +9421,14 @@ int main(int argc, char **argv)
         }
     }
 
+#if defined(CONFIG_KVM) && defined(USE_KQEMU)
+    if (kvm_allowed && kqemu_allowed) {
+        fprintf(stderr,
+                "You can not enable both KVM and kqemu at the same time\n");
+        exit(1);
+    }
+#endif
+
     if (smp_cpus > machine->max_cpus) {
         fprintf(stderr, "Number of SMP cpus requested (%d), exceeds max cpus "
                 "supported by machine `%s' (%d)\n", smp_cpus,  machine->name,
@@ -9710,6 +9734,16 @@ int main(int argc, char **argv)
         }
     }
 
+    if (kvm_enabled()) {
+        int ret;
+
+        ret = kvm_init(smp_cpus);
+        if (ret < 0) {
+            fprintf(stderr, "failed to initialize KVM\n");
+            exit(1);
+        }
+    }
+
     machine->init(ram_size, vga_ram_size, boot_devices, ds,
                   kernel_filename, kernel_cmdline, initrd_filename, cpu_model);
 



