Commit 6cd8e300 authored by Linus Torvalds

Merge branch 'kvm-updates/2.6.31' of git://git.kernel.org/pub/scm/virt/kvm/kvm

* 'kvm-updates/2.6.31' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (138 commits)
  KVM: Prevent overflow in largepages calculation
  KVM: Disable large pages on misaligned memory slots
  KVM: Add VT-x machine check support
  KVM: VMX: Rename rmode.active to rmode.vm86_active
  KVM: Move "exit due to NMI" handling into vmx_complete_interrupts()
  KVM: Disable CR8 intercept if tpr patching is active
  KVM: Do not migrate pending software interrupts.
  KVM: inject NMI after IRET from a previous NMI, not before.
  KVM: Always request IRQ/NMI window if an interrupt is pending
  KVM: Do not re-execute INTn instruction.
  KVM: skip_emulated_instruction() decode instruction if size is not known
  KVM: Remove irq_pending bitmap
  KVM: Do not allow interrupt injection from userspace if there is a pending event.
  KVM: Unprotect a page if #PF happens during NMI injection.
  KVM: s390: Verify memory in kvm run
  KVM: s390: Sanity check on validity intercept
  KVM: s390: Unlink vcpu on destroy - v2
  KVM: s390: optimize float int lock: spin_lock_bh --> spin_lock
  KVM: s390: use hrtimer for clock wakeup from idle - v2
  KVM: s390: Fix memory slot versus run - v3
  ...
parents ddbb8684 09f8ca74
@@ -371,6 +371,7 @@ struct kvm_vcpu_arch {
 	int last_run_cpu;
 	int vmm_tr_slot;
 	int vm_tr_slot;
+	int sn_rtc_tr_slot;
 #define KVM_MP_STATE_RUNNABLE 0
 #define KVM_MP_STATE_UNINITIALIZED 1
@@ -465,6 +466,7 @@ struct kvm_arch {
 	unsigned long vmm_init_rr;
 	int online_vcpus;
+	int is_sn2;
 	struct kvm_ioapic *vioapic;
 	struct kvm_vm_stat stat;
@@ -472,6 +474,7 @@ struct kvm_arch {
 	struct list_head assigned_dev_head;
 	struct iommu_domain *iommu_domain;
+	int iommu_flags;
 	struct hlist_head irq_ack_notifier_list;
 	unsigned long irq_sources_bitmap;
@@ -578,6 +581,8 @@ struct kvm_vmm_info{
 	kvm_vmm_entry *vmm_entry;
 	kvm_tramp_entry *tramp_entry;
 	unsigned long vmm_ivt;
+	unsigned long patch_mov_ar;
+	unsigned long patch_mov_ar_sn2;
 };
 int kvm_highest_pending_irq(struct kvm_vcpu *vcpu);
@@ -585,7 +590,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu);
 int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
 void kvm_sal_emul(struct kvm_vcpu *vcpu);
-static inline void kvm_inject_nmi(struct kvm_vcpu *vcpu) {}
 #endif /* __ASSEMBLY__*/
 #endif
@@ -146,6 +146,8 @@
 #define PAGE_GATE __pgprot(__ACCESS_BITS | _PAGE_PL_0 | _PAGE_AR_X_RX)
 #define PAGE_KERNEL __pgprot(__DIRTY_BITS | _PAGE_PL_0 | _PAGE_AR_RWX)
 #define PAGE_KERNELRX __pgprot(__ACCESS_BITS | _PAGE_PL_0 | _PAGE_AR_RX)
+#define PAGE_KERNEL_UC __pgprot(__DIRTY_BITS | _PAGE_PL_0 | _PAGE_AR_RWX | \
+				 _PAGE_MA_UC)
 # ifndef __ASSEMBLY__
...
@@ -610,6 +610,9 @@ static struct irqaction ipi_irqaction = {
 	.name = "IPI"
 };
+/*
+ * KVM uses this interrupt to force a cpu out of guest mode
+ */
 static struct irqaction resched_irqaction = {
 	.handler = dummy_handler,
 	.flags = IRQF_DISABLED,
...
@@ -23,7 +23,7 @@ if VIRTUALIZATION
 config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support"
-	depends on HAVE_KVM && EXPERIMENTAL
+	depends on HAVE_KVM && MODULES && EXPERIMENTAL
 	# for device assignment:
 	depends on PCI
 	select PREEMPT_NOTIFIERS
...
@@ -21,6 +21,9 @@
 #include <linux/kvm_host.h>
 #include <linux/smp.h>
+#include <asm/sn/addrs.h>
+#include <asm/sn/clksupport.h>
+#include <asm/sn/shub_mmr.h>
 #include "vti.h"
 #include "misc.h"
@@ -188,12 +191,35 @@ static struct ia64_pal_retval pal_freq_base(struct kvm_vcpu *vcpu)
 	return result;
 }
-static struct ia64_pal_retval pal_freq_ratios(struct kvm_vcpu *vcpu)
+/*
+ * On the SGI SN2, the ITC isn't stable. Emulation backed by the SN2
+ * RTC is used instead. This function patches the ratios from SAL
+ * to match the RTC before providing them to the guest.
+ */
+static void sn2_patch_itc_freq_ratios(struct ia64_pal_retval *result)
 {
+	struct pal_freq_ratio *ratio;
+	unsigned long sal_freq, sal_drift, factor;
+
+	result->status = ia64_sal_freq_base(SAL_FREQ_BASE_PLATFORM,
+					    &sal_freq, &sal_drift);
+	ratio = (struct pal_freq_ratio *)&result->v2;
+	factor = ((sal_freq * 3) + (sn_rtc_cycles_per_second / 2)) /
+		sn_rtc_cycles_per_second;
+
+	ratio->num = 3;
+	ratio->den = factor;
+}
+
+static struct ia64_pal_retval pal_freq_ratios(struct kvm_vcpu *vcpu)
+{
 	struct ia64_pal_retval result;
 	PAL_CALL(result, PAL_FREQ_RATIOS, 0, 0, 0);
+	if (vcpu->kvm->arch.is_sn2)
+		sn2_patch_itc_freq_ratios(&result);
 	return result;
 }
...
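A side note on the arithmetic in sn2_patch_itc_freq_ratios() above: the factor is plain round-to-nearest integer division (adding half the divisor before dividing). A minimal standalone sketch of the same computation, using made-up frequencies rather than real SN2 values:

    #include <stdio.h>

    int main(void)
    {
    	/* Illustrative numbers only: pretend SAL reports a 200 MHz
    	 * platform base clock and the SN2 RTC ticks at 75 MHz. */
    	unsigned long sal_freq = 200000000UL;
    	unsigned long rtc_freq = 75000000UL;

    	/* Round-to-nearest division, as in the patched ratio code. */
    	unsigned long factor = ((sal_freq * 3) + (rtc_freq / 2)) / rtc_freq;

    	/* The guest's ITC rate becomes base * num/den = base * 3/factor,
    	 * which lands close to the RTC frequency. */
    	printf("num=3 den=%lu -> guest ITC ~ %lu Hz\n",
    	       factor, sal_freq * 3 / factor);
    	return 0;
    }

With these numbers factor comes out to 8, so the guest sees roughly 75 MHz, i.e. the RTC rate.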
@@ -20,6 +20,10 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu);
 int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
 int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig);
+int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+		int short_hand, int dest, int dest_mode);
+int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
+#define kvm_apic_present(x) (true)
 #endif
@@ -11,6 +11,7 @@
 #include <asm/asmmacro.h>
 #include <asm/processor.h>
+#include <asm/kvm_host.h>
 #include "vti.h"
 #include "asm-offsets.h"
@@ -140,6 +141,35 @@ GLOBAL_ENTRY(kvm_asm_mov_from_ar)
 	;;
 END(kvm_asm_mov_from_ar)
+/*
+ * Special SGI SN2 optimized version of mov_from_ar using the SN2 RTC
+ * clock as it's source for emulating the ITC. This version will be
+ * copied on top of the original version if the host is determined to
+ * be an SN2.
+ */
+GLOBAL_ENTRY(kvm_asm_mov_from_ar_sn2)
+	add r18=VMM_VCPU_ITC_OFS_OFFSET, r21
+	movl r19 = (KVM_VMM_BASE+(1<<KVM_VMM_SHIFT))
+	add r16=VMM_VCPU_LAST_ITC_OFFSET,r21
+	extr.u r17=r25,6,7
+	mov r24=b0
+	;;
+	ld8 r18=[r18]
+	ld8 r19=[r19]
+	addl r20=@gprel(asm_mov_to_reg),gp
+	;;
+	add r19=r19,r18
+	shladd r17=r17,4,r20
+	;;
+	adds r30=kvm_resume_to_guest-asm_mov_to_reg,r20
+	st8 [r16] = r19
+	mov b0=r17
+	br.sptk.few b0
+	;;
+END(kvm_asm_mov_from_ar_sn2)
 // mov r1=rr[r3]
 GLOBAL_ENTRY(kvm_asm_mov_from_rr)
...
@@ -652,20 +652,25 @@ void kvm_ia64_handle_break(unsigned long ifa, struct kvm_pt_regs *regs,
 		unsigned long isr, unsigned long iim)
 {
 	struct kvm_vcpu *v = current_vcpu;
+	long psr;
 	if (ia64_psr(regs)->cpl == 0) {
 		/* Allow hypercalls only when cpl = 0. */
 		if (iim == DOMN_PAL_REQUEST) {
+			local_irq_save(psr);
 			set_pal_call_data(v);
 			vmm_transition(v);
 			get_pal_call_result(v);
 			vcpu_increment_iip(v);
+			local_irq_restore(psr);
 			return;
 		} else if (iim == DOMN_SAL_REQUEST) {
+			local_irq_save(psr);
 			set_sal_call_data(v);
 			vmm_transition(v);
 			get_sal_call_result(v);
 			vcpu_increment_iip(v);
+			local_irq_restore(psr);
 			return;
 		}
 	}
...
@@ -788,13 +788,29 @@ void vcpu_set_fpreg(struct kvm_vcpu *vcpu, unsigned long reg,
 	setfpreg(reg, val, regs); /* FIXME: handle NATs later*/
 }
+/*
+ * The Altix RTC is mapped specially here for the vmm module
+ */
+#define SN_RTC_BASE	(u64 *)(KVM_VMM_BASE+(1UL<<KVM_VMM_SHIFT))
+static long kvm_get_itc(struct kvm_vcpu *vcpu)
+{
+#if defined(CONFIG_IA64_SGI_SN2) || defined(CONFIG_IA64_GENERIC)
+	struct kvm *kvm = (struct kvm *)KVM_VM_BASE;
+	if (kvm->arch.is_sn2)
+		return (*SN_RTC_BASE);
+	else
+#endif
+		return ia64_getreg(_IA64_REG_AR_ITC);
+}
 /************************************************************************
  * lsapic timer
  ***********************************************************************/
 u64 vcpu_get_itc(struct kvm_vcpu *vcpu)
 {
 	unsigned long guest_itc;
-	guest_itc = VMX(vcpu, itc_offset) + ia64_getreg(_IA64_REG_AR_ITC);
+	guest_itc = VMX(vcpu, itc_offset) + kvm_get_itc(vcpu);
 	if (guest_itc >= VMX(vcpu, last_itc)) {
 		VMX(vcpu, last_itc) = guest_itc;
@@ -809,7 +825,7 @@ static void vcpu_set_itc(struct kvm_vcpu *vcpu, u64 val)
 	struct kvm_vcpu *v;
 	struct kvm *kvm;
 	int i;
-	long itc_offset = val - ia64_getreg(_IA64_REG_AR_ITC);
+	long itc_offset = val - kvm_get_itc(vcpu);
 	unsigned long vitv = VCPU(vcpu, itv);
 	kvm = (struct kvm *)KVM_VM_BASE;
...
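One observation on vcpu_get_itc() above: the guest clock is a per-vcpu offset added to whatever host clock kvm_get_itc() picks (AR.ITC or the memory-mapped SN2 RTC), and last_itc tracks the highest value handed out so the guest never sees time run backwards. A rough sketch of that pattern with hypothetical names (this is not the kernel code):

    /* Sketch: monotonic guest-clock read on top of a host clock source. */
    struct guest_clock {
    	unsigned long offset;	/* guest time = host time + offset */
    	unsigned long last;	/* highest value handed to the guest */
    };

    unsigned long guest_clock_read(struct guest_clock *gc,
    			       unsigned long (*host_read)(void))
    {
    	unsigned long now = gc->offset + host_read();

    	/* Clamp so a slower or rewound backing clock is never visible. */
    	if (now < gc->last)
    		now = gc->last;
    	else
    		gc->last = now;
    	return now;
    }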
@@ -30,15 +30,19 @@ MODULE_AUTHOR("Intel");
 MODULE_LICENSE("GPL");
 extern char kvm_ia64_ivt;
+extern char kvm_asm_mov_from_ar;
+extern char kvm_asm_mov_from_ar_sn2;
 extern fpswa_interface_t *vmm_fpswa_interface;
 long vmm_sanity = 1;
 struct kvm_vmm_info vmm_info = {
 	.module = THIS_MODULE,
 	.vmm_entry = vmm_entry,
 	.tramp_entry = vmm_trampoline,
 	.vmm_ivt = (unsigned long)&kvm_ia64_ivt,
+	.patch_mov_ar = (unsigned long)&kvm_asm_mov_from_ar,
+	.patch_mov_ar_sn2 = (unsigned long)&kvm_asm_mov_from_ar_sn2,
 };
 static int __init kvm_vmm_init(void)
...
@@ -95,7 +95,7 @@ GLOBAL_ENTRY(kvm_vmm_panic)
 	;;
 	srlz.i // guarantee that interruption collection is on
 	;;
-	//(p15) ssm psr.i // restore psr.i
+	(p15) ssm psr.i // restore psr.i
 	addl r14=@gprel(ia64_leave_hypervisor),gp
 	;;
 	KVM_SAVE_REST
@@ -249,7 +249,7 @@ ENTRY(kvm_break_fault)
 	;;
 	srlz.i // guarantee that interruption collection is on
 	;;
-	//(p15)ssm psr.i // restore psr.i
+	(p15)ssm psr.i // restore psr.i
 	addl r14=@gprel(ia64_leave_hypervisor),gp
 	;;
 	KVM_SAVE_REST
@@ -439,7 +439,7 @@ kvm_dispatch_vexirq:
 	;;
 	srlz.i // guarantee that interruption collection is on
 	;;
-	//(p15) ssm psr.i // restore psr.i
+	(p15) ssm psr.i // restore psr.i
 	adds r3=8,r2 // set up second base pointer
 	;;
 	KVM_SAVE_REST
@@ -819,7 +819,7 @@ ENTRY(kvm_dtlb_miss_dispatch)
 	;;
 	srlz.i // guarantee that interruption collection is on
 	;;
-	//(p15) ssm psr.i // restore psr.i
+	(p15) ssm psr.i // restore psr.i
 	addl r14=@gprel(ia64_leave_hypervisor_prepare),gp
 	;;
 	KVM_SAVE_REST
@@ -842,7 +842,7 @@ ENTRY(kvm_itlb_miss_dispatch)
 	;;
 	srlz.i // guarantee that interruption collection is on
 	;;
-	//(p15) ssm psr.i // restore psr.i
+	(p15) ssm psr.i // restore psr.i
 	addl r14=@gprel(ia64_leave_hypervisor),gp
 	;;
 	KVM_SAVE_REST
@@ -871,7 +871,7 @@ ENTRY(kvm_dispatch_reflection)
 	;;
 	srlz.i // guarantee that interruption collection is on
 	;;
-	//(p15) ssm psr.i // restore psr.i
+	(p15) ssm psr.i // restore psr.i
 	addl r14=@gprel(ia64_leave_hypervisor),gp
 	;;
 	KVM_SAVE_REST
@@ -898,7 +898,7 @@ ENTRY(kvm_dispatch_virtualization_fault)
 	;;
 	srlz.i // guarantee that interruption collection is on
 	;;
-	//(p15) ssm psr.i // restore psr.i
+	(p15) ssm psr.i // restore psr.i
 	addl r14=@gprel(ia64_leave_hypervisor_prepare),gp
 	;;
 	KVM_SAVE_REST
@@ -920,7 +920,7 @@ ENTRY(kvm_dispatch_interrupt)
 	;;
 	srlz.i
 	;;
-	//(p15) ssm psr.i
+	(p15) ssm psr.i
 	addl r14=@gprel(ia64_leave_hypervisor),gp
 	;;
 	KVM_SAVE_REST
@@ -1333,7 +1333,7 @@ hostret = r24
 	;;
 (p7)	srlz.i
 	;;
-	//(p6) ssm psr.i
+	(p6) ssm psr.i
 	;;
 	mov rp=rpsave
 	mov ar.pfs=pfssave
...
@@ -254,7 +254,8 @@ u64 guest_vhpt_lookup(u64 iha, u64 *pte)
 			"(p7) st8 [%2]=r9;;"
 			"ssm psr.ic;;"
 			"srlz.d;;"
-			/* "ssm psr.i;;" Once interrupts in vmm open, need fix*/
+			"ssm psr.i;;"
+			"srlz.d;;"
 			: "=r"(ret) : "r"(iha), "r"(pte):"memory");
 	return ret;
...
@@ -41,6 +41,12 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
 	return !!(v->arch.pending_exceptions);
 }
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+	/* do real check here */
+	return 1;
+}
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
 	return !(v->arch.msr & MSR_WE);
...
@@ -13,6 +13,8 @@
 #ifndef ASM_KVM_HOST_H
 #define ASM_KVM_HOST_H
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
 #include <linux/kvm_host.h>
 #include <asm/debug.h>
 #include <asm/cpuid.h>
@@ -210,7 +212,8 @@ struct kvm_vcpu_arch {
 	s390_fp_regs guest_fpregs;
 	unsigned int guest_acrs[NUM_ACRS];
 	struct kvm_s390_local_interrupt local_int;
-	struct timer_list ckc_timer;
+	struct hrtimer ckc_timer;
+	struct tasklet_struct tasklet;
 	union {
 		cpuid_t cpu_id;
 		u64 stidp_data;
...
@@ -154,17 +154,25 @@ static int handle_stop(struct kvm_vcpu *vcpu)
 static int handle_validity(struct kvm_vcpu *vcpu)
 {
 	int viwhy = vcpu->arch.sie_block->ipb >> 16;
+	int rc;
 	vcpu->stat.exit_validity++;
-	if (viwhy == 0x37) {
-		fault_in_pages_writeable((char __user *)
-			vcpu->kvm->arch.guest_origin +
-			vcpu->arch.sie_block->prefix,
-			PAGE_SIZE);
-		return 0;
-	}
-	VCPU_EVENT(vcpu, 2, "unhandled validity intercept code %d",
-		viwhy);
-	return -ENOTSUPP;
+	if ((viwhy == 0x37) && (vcpu->arch.sie_block->prefix
+				<= vcpu->kvm->arch.guest_memsize - 2*PAGE_SIZE)){
+		rc = fault_in_pages_writeable((char __user *)
+			vcpu->kvm->arch.guest_origin +
+			vcpu->arch.sie_block->prefix,
+			2*PAGE_SIZE);
+		if (rc)
+			/* user will receive sigsegv, exit to user */
+			rc = -ENOTSUPP;
+	} else
+		rc = -ENOTSUPP;
+
+	if (rc)
+		VCPU_EVENT(vcpu, 2, "unhandled validity intercept code %d",
+			viwhy);
+	return rc;
 }
 static int handle_instruction(struct kvm_vcpu *vcpu)
...
@@ -12,6 +12,8 @@
 #include <asm/lowcore.h>
 #include <asm/uaccess.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
 #include <linux/kvm_host.h>
 #include <linux/signal.h>
 #include "kvm-s390.h"
@@ -299,13 +301,13 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
 	}
 	if ((!rc) && atomic_read(&fi->active)) {
-		spin_lock_bh(&fi->lock);
+		spin_lock(&fi->lock);
 		list_for_each_entry(inti, &fi->list, list)
 			if (__interrupt_is_deliverable(vcpu, inti)) {
 				rc = 1;
 				break;
 			}
-		spin_unlock_bh(&fi->lock);
+		spin_unlock(&fi->lock);
 	}
 	if ((!rc) && (vcpu->arch.sie_block->ckc <
@@ -318,6 +320,12 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
 	return rc;
 }
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+	/* do real check here */
+	return 1;
+}
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 {
 	return 0;
@@ -355,14 +363,12 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
 		return 0;
 	}
-	sltime = (vcpu->arch.sie_block->ckc - now) / (0xf4240000ul / HZ) + 1;
-	vcpu->arch.ckc_timer.expires = jiffies + sltime;
-	add_timer(&vcpu->arch.ckc_timer);
-	VCPU_EVENT(vcpu, 5, "enabled wait timer:%llx jiffies", sltime);
+	sltime = ((vcpu->arch.sie_block->ckc - now)*125)>>9;
+	hrtimer_start(&vcpu->arch.ckc_timer, ktime_set (0, sltime) , HRTIMER_MODE_REL);
+	VCPU_EVENT(vcpu, 5, "enabled wait via clock comparator: %llx ns", sltime);
 no_timer:
-	spin_lock_bh(&vcpu->arch.local_int.float_int->lock);
+	spin_lock(&vcpu->arch.local_int.float_int->lock);
 	spin_lock_bh(&vcpu->arch.local_int.lock);
 	add_wait_queue(&vcpu->arch.local_int.wq, &wait);
 	while (list_empty(&vcpu->arch.local_int.list) &&
@@ -371,33 +377,46 @@ no_timer:
 		!signal_pending(current)) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		spin_unlock_bh(&vcpu->arch.local_int.lock);
-		spin_unlock_bh(&vcpu->arch.local_int.float_int->lock);
+		spin_unlock(&vcpu->arch.local_int.float_int->lock);
 		vcpu_put(vcpu);
 		schedule();
 		vcpu_load(vcpu);
-		spin_lock_bh(&vcpu->arch.local_int.float_int->lock);
+		spin_lock(&vcpu->arch.local_int.float_int->lock);
 		spin_lock_bh(&vcpu->arch.local_int.lock);
 	}
 	__unset_cpu_idle(vcpu);
 	__set_current_state(TASK_RUNNING);
 	remove_wait_queue(&vcpu->wq, &wait);
 	spin_unlock_bh(&vcpu->arch.local_int.lock);
-	spin_unlock_bh(&vcpu->arch.local_int.float_int->lock);
-	del_timer(&vcpu->arch.ckc_timer);
+	spin_unlock(&vcpu->arch.local_int.float_int->lock);
+	hrtimer_try_to_cancel(&vcpu->arch.ckc_timer);
 	return 0;
 }
-void kvm_s390_idle_wakeup(unsigned long data)
+void kvm_s390_tasklet(unsigned long parm)
 {
-	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
-	spin_lock_bh(&vcpu->arch.local_int.lock);
+	struct kvm_vcpu *vcpu = (struct kvm_vcpu *) parm;
+	spin_lock(&vcpu->arch.local_int.lock);
 	vcpu->arch.local_int.timer_due = 1;
 	if (waitqueue_active(&vcpu->arch.local_int.wq))
 		wake_up_interruptible(&vcpu->arch.local_int.wq);
-	spin_unlock_bh(&vcpu->arch.local_int.lock);
+	spin_unlock(&vcpu->arch.local_int.lock);
 }
+
+/*
+ * low level hrtimer wake routine. Because this runs in hardirq context
+ * we schedule a tasklet to do the real work.
+ */
+enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer)
+{
+	struct kvm_vcpu *vcpu;
+	vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer);
+	tasklet_schedule(&vcpu->arch.tasklet);
+	return HRTIMER_NORESTART;
+}
 void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
 {
@@ -436,7 +455,7 @@ void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
 	if (atomic_read(&fi->active)) {
 		do {
 			deliver = 0;
-			spin_lock_bh(&fi->lock);
+			spin_lock(&fi->lock);
 			list_for_each_entry_safe(inti, n, &fi->list, list) {
 				if (__interrupt_is_deliverable(vcpu, inti)) {
 					list_del(&inti->list);
@@ -447,7 +466,7 @@ void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
 			}
 			if (list_empty(&fi->list))
 				atomic_set(&fi->active, 0);
-			spin_unlock_bh(&fi->lock);
+			spin_unlock(&fi->lock);
 			if (deliver) {
 				__do_deliver_interrupt(vcpu, inti);
 				kfree(inti);
@@ -512,7 +531,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
 	mutex_lock(&kvm->lock);
 	fi = &kvm->arch.float_int;
-	spin_lock_bh(&fi->lock);
+	spin_lock(&fi->lock);
 	list_add_tail(&inti->list, &fi->list);
 	atomic_set(&fi->active, 1);
 	sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
@@ -529,7 +548,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
 	if (waitqueue_active(&li->wq))
 		wake_up_interruptible(&li->wq);
 	spin_unlock_bh(&li->lock);
-	spin_unlock_bh(&fi->lock);
+	spin_unlock(&fi->lock);
 	mutex_unlock(&kvm->lock);
 	return 0;
 }
...
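For the sltime computation above: the s390 TOD clock advances in units of 2^-12 microseconds, so a TOD delta converts to nanoseconds as delta * 1000 / 4096, and since 1000/4096 reduces to 125/512 this is exactly (delta * 125) >> 9. A quick standalone check of that identity:

    #include <assert.h>
    #include <stdio.h>

    int main(void)
    {
    	unsigned long long tod_delta = 4096;	/* one microsecond of TOD ticks */

    	/* (delta * 1000) / 4096 == (delta * 125) >> 9 */
    	unsigned long long ns = (tod_delta * 125) >> 9;

    	assert(ns == 1000);	/* 1 us == 1000 ns */
    	printf("%llu TOD units -> %llu ns\n", tod_delta, ns);
    	return 0;
    }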
@@ -15,6 +15,7 @@
 #include <linux/compiler.h>
 #include <linux/err.h>
 #include <linux/fs.h>
+#include <linux/hrtimer.h>
 #include <linux/init.h>
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
@@ -195,6 +196,10 @@ out_nokvm:
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
 	VCPU_EVENT(vcpu, 3, "%s", "free cpu");
+	if (vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda ==
+		(__u64) vcpu->arch.sie_block)
+		vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda = 0;
+	smp_mb();
 	free_page((unsigned long)(vcpu->arch.sie_block));
 	kvm_vcpu_uninit(vcpu);
 	kfree(vcpu);
@@ -283,8 +288,10 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	vcpu->arch.sie_block->gmsor = vcpu->kvm->arch.guest_origin;
 	vcpu->arch.sie_block->ecb = 2;
 	vcpu->arch.sie_block->eca = 0xC1002001U;
-	setup_timer(&vcpu->arch.ckc_timer, kvm_s390_idle_wakeup,
-		 (unsigned long) vcpu);
+	hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+	tasklet_init(&vcpu->arch.tasklet, kvm_s390_tasklet,
+		     (unsigned long) vcpu);
+	vcpu->arch.ckc_timer.function = kvm_s390_idle_wakeup;
 	get_cpu_id(&vcpu->arch.cpu_id);
 	vcpu->arch.cpu_id.version = 0xff;
 	return 0;
@@ -307,19 +314,21 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 	vcpu->arch.sie_block->icpua = id;
 	BUG_ON(!kvm->arch.sca);
-	BUG_ON(kvm->arch.sca->cpu[id].sda);
-	kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block;
+	if (!kvm->arch.sca->cpu[id].sda)
+		kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block;
+	else
+		BUG_ON(!kvm->vcpus[id]); /* vcpu does already exist */
 	vcpu->arch.sie_block->scaoh = (__u32)(((__u64)kvm->arch.sca) >> 32);
 	vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca;
 	spin_lock_init(&vcpu->arch.local_int.lock);
 	INIT_LIST_HEAD(&vcpu->arch.local_int.list);
 	vcpu->arch.local_int.float_int = &kvm->arch.float_int;
-	spin_lock_bh(&kvm->arch.float_int.lock);
+	spin_lock(&kvm->arch.float_int.lock);
 	kvm->arch.float_int.local_int[id] = &vcpu->arch.local_int;
 	init_waitqueue_head(&vcpu->arch.local_int.wq);
 	vcpu->arch.local_int.cpuflags = &vcpu->arch.sie_block->cpuflags;
-	spin_unlock_bh(&kvm->arch.float_int.lock);
+	spin_unlock(&kvm->arch.float_int.lock);
 	rc = kvm_vcpu_init(vcpu, kvm, id);
 	if (rc)
@@ -478,6 +487,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	vcpu_load(vcpu);
+	/* verify, that memory has been registered */
+	if (!vcpu->kvm->arch.guest_memsize) {
+		vcpu_put(vcpu);
+		return -EINVAL;
+	}
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
@@ -657,6 +672,8 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot old,
 				int user_alloc)
 {
+	int i;
 	/* A few sanity checks. We can have exactly one memory slot which has
 	   to start at guest virtual zero and which has to be located at a
 	   page boundary in userland and which has to end at a page boundary.
@@ -664,7 +681,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 	   vmas. It is okay to mmap() and munmap() stuff in this slot after
 	   doing this call at any time */
-	if (mem->slot)
+	if (mem->slot || kvm->arch.guest_memsize)
 		return -EINVAL;
 	if (mem->guest_phys_addr)
@@ -676,15 +693,39 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 	if (mem->memory_size & (PAGE_SIZE - 1))
 		return -EINVAL;
+	if (!user_alloc)
+		return -EINVAL;
+
+	/* lock all vcpus */
+	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+		if (!kvm->vcpus[i])
+			continue;
+		if (!mutex_trylock(&kvm->vcpus[i]->mutex))
+			goto fail_out;
+	}
 	kvm->arch.guest_origin = mem->userspace_addr;
 	kvm->arch.guest_memsize = mem->memory_size;
-	/* FIXME: we do want to interrupt running CPUs and update their memory
-	   configuration now to avoid race conditions. But hey, changing the
-	   memory layout while virtual CPUs are running is usually bad
-	   programming practice. */
+	/* update sie control blocks, and unlock all vcpus */
+	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+		if (kvm->vcpus[i]) {
+			kvm->vcpus[i]->arch.sie_block->gmsor =
+				kvm->arch.guest_origin;
+			kvm->vcpus[i]->arch.sie_block->gmslm =
+				kvm->arch.guest_memsize +
+				kvm->arch.guest_origin +
+				VIRTIODESCSPACE - 1ul;
+			mutex_unlock(&kvm->vcpus[i]->mutex);
+		}
+	}
 	return 0;
+
+fail_out:
+	for (; i >= 0; i--)
+		mutex_unlock(&kvm->vcpus[i]->mutex);
+	return -EINVAL;
 }
 void kvm_arch_flush_shadow(struct kvm *kvm)
...
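The memory-slot update above uses an all-or-nothing locking pass: try-lock every vcpu mutex, and on the first failure unwind whatever was already taken before returning -EINVAL. A generic userspace sketch of the same pattern (pthreads standing in for kernel mutexes; illustrative only):

    #include <pthread.h>
    #include <stdbool.h>

    #define NLOCKS 4

    /* Take all locks or none: on a failed trylock, release the ones
     * already held, mirroring the fail_out path above. */
    bool trylock_all(pthread_mutex_t locks[NLOCKS])
    {
    	int i;

    	for (i = 0; i < NLOCKS; i++) {
    		if (pthread_mutex_trylock(&locks[i]) != 0) {
    			while (--i >= 0)
    				pthread_mutex_unlock(&locks[i]);
    			return false;
    		}
    	}
    	return true;	/* caller updates shared state, then unlocks all */
    }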
@@ -14,6 +14,7 @@
 #ifndef ARCH_S390_KVM_S390_H
 #define ARCH_S390_KVM_S390_H
+#include <linux/hrtimer.h>
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
@@ -41,7 +42,8 @@ static inline int __cpu_is_stopped(struct kvm_vcpu *vcpu)
 }
 int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
-void kvm_s390_idle_wakeup(unsigned long data);
+enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer);
+void kvm_s390_tasklet(unsigned long parm);
 void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu);
 int kvm_s390_inject_vm(struct kvm *kvm,
 		struct kvm_s390_interrupt *s390int);
...
@@ -204,11 +204,11 @@ static void handle_stsi_3_2_2(struct kvm_vcpu *vcpu, struct sysinfo_3_2_2 *mem)
 	int cpus = 0;
 	int n;
-	spin_lock_bh(&fi->lock);
+	spin_lock(&fi->lock);
 	for (n = 0; n < KVM_MAX_VCPUS; n++)
 		if (fi->local_int[n])
 			cpus++;
-	spin_unlock_bh(&fi->lock);
+	spin_unlock(&fi->lock);
 	/* deal with other level 3 hypervisors */
 	if (stsi(mem, 3, 2, 2) == -ENOSYS)
...
@@ -52,7 +52,7 @@ static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr,
 	if (cpu_addr >= KVM_MAX_VCPUS)
 		return 3; /* not operational */
-	spin_lock_bh(&fi->lock);
+	spin_lock(&fi->lock);
 	if (fi->local_int[cpu_addr] == NULL)
 		rc = 3; /* not operational */
 	else if (atomic_read(fi->local_int[cpu_addr]->cpuflags)
@@ -64,7 +64,7 @@ static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr,
 		*reg |= SIGP_STAT_STOPPED;
 		rc = 1; /* status stored */
 	}
-	spin_unlock_bh(&fi->lock);
+	spin_unlock(&fi->lock);
 	VCPU_EVENT(vcpu, 4, "sensed status of cpu %x rc %x", cpu_addr, rc);
 	return rc;
@@ -86,7 +86,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
 	inti->type = KVM_S390_INT_EMERGENCY;
-	spin_lock_bh(&fi->lock);
+	spin_lock(&fi->lock);
 	li = fi->local_int[cpu_addr];
 	if (li == NULL) {
 		rc = 3; /* not operational */
@@ -102,7 +102,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
 	spin_unlock_bh(&li->lock);
 	rc = 0; /* order accepted */
 unlock:
-	spin_unlock_bh(&fi->lock);
+	spin_unlock(&fi->lock);
 	VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr);
 	return rc;
 }
@@ -123,7 +123,7 @@ static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int store)
 	inti->type = KVM_S390_SIGP_STOP;
-	spin_lock_bh(&fi->lock);
+	spin_lock(&fi->lock);
 	li = fi->local_int[cpu_addr];
 	if (li == NULL) {
 		rc = 3; /* not operational */
@@ -142,7 +142,7 @@ static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int store)
 	spin_unlock_bh(&li->lock);
 	rc = 0; /* order accepted */
 unlock:
-	spin_unlock_bh(&fi->lock);
+	spin_unlock(&fi->lock);
 	VCPU_EVENT(vcpu, 4, "sent sigp stop to cpu %x", cpu_addr);
 	return rc;
 }
@@ -188,7 +188,7 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
 	if (!inti)
 		return 2; /* busy */
-	spin_lock_bh(&fi->lock);
+	spin_lock(&fi->lock);
 	li = fi->local_int[cpu_addr];
 	if ((cpu_addr >= KVM_MAX_VCPUS) || (li == NULL)) {
@@ -220,7 +220,7 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
 out_li:
 	spin_unlock_bh(&li->lock);
 out_fi:
-	spin_unlock_bh(&fi->lock);
+	spin_unlock(&fi->lock);
 	return rc;
 }
...
@@ -116,6 +116,8 @@
 #define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */
 #define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */
 #define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */
+#define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */
+#define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */
 #define X86_FEATURE_AES (4*32+25) /* AES instructions */
 #define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
 #define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
...
@@ -16,6 +16,7 @@
 #define __KVM_HAVE_MSI
 #define __KVM_HAVE_USER_NMI
 #define __KVM_HAVE_GUEST_DEBUG
+#define __KVM_HAVE_MSIX
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
...
@@ -185,6 +185,7 @@ union kvm_mmu_page_role {
 		unsigned access:3;
 		unsigned invalid:1;
 		unsigned cr4_pge:1;
+		unsigned nxe:1;
 	};
 };
@@ -212,7 +213,6 @@ struct kvm_mmu_page {
 	int multimapped; /* More than one parent_pte? */
 	int root_count; /* Currently serving as active root */
 	bool unsync;
-	bool global;
 	unsigned int unsync_children;
 	union {
 		u64 *parent_pte; /* !multimapped */
@@ -261,13 +261,11 @@ struct kvm_mmu {
 	union kvm_mmu_page_role base_role;
 	u64 *pae_root;
+	u64 rsvd_bits_mask[2][4];
 };
 struct kvm_vcpu_arch {
 	u64 host_tsc;
-	int interrupt_window_open;
-	unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
-	DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
 	/*
 	 * rip and regs accesses must go through
 	 * kvm_{register,rip}_{read,write} functions.
@@ -286,6 +284,7 @@ struct kvm_vcpu_arch {
 	u64 shadow_efer;
 	u64 apic_base;
 	struct kvm_lapic *apic; /* kernel irqchip context */
+	int32_t apic_arb_prio;
 	int mp_state;
 	int sipi_vector;
 	u64 ia32_misc_enable_msr;
@@ -320,6 +319,8 @@ struct kvm_vcpu_arch {
 	struct kvm_pio_request pio;
 	void *pio_data;
+	u8 event_exit_inst_len;
 	struct kvm_queued_exception {
 		bool pending;
 		bool has_error_code;
@@ -329,11 +330,12 @@ struct kvm_vcpu_arch {
 	struct kvm_queued_interrupt {
 		bool pending;
+		bool soft;
 		u8 nr;
 	} interrupt;
 	struct {
-		int active;
+		int vm86_active;
 		u8 save_iopl;
 		struct kvm_save_segment {
 			u16 selector;
@@ -356,9 +358,9 @@ struct kvm_vcpu_arch {
 	unsigned int time_offset;
 	struct page *time_page;
+	bool singlestep; /* guest is single stepped by KVM */
 	bool nmi_pending;
 	bool nmi_injected;
-	bool nmi_window_open;
 	struct mtrr_state_type mtrr_state;
 	u32 pat;
@@ -392,15 +394,14 @@ struct kvm_arch{
 	 */
 	struct list_head active_mmu_pages;
 	struct list_head assigned_dev_head;
-	struct list_head oos_global_pages;
 	struct iommu_domain *iommu_domain;
+	int iommu_flags;
 	struct kvm_pic *vpic;
 	struct kvm_ioapic *vioapic;
 	struct kvm_pit *vpit;
 	struct hlist_head irq_ack_notifier_list;
 	int vapics_in_nmi_mode;
-	int round_robin_prev_vcpu;
 	unsigned int tss_addr;
 	struct page *apic_access_page;
@@ -423,7 +424,6 @@ struct kvm_vm_stat {
 	u32 mmu_recycled;
 	u32 mmu_cache_miss;
 	u32 mmu_unsync;
-	u32 mmu_unsync_global;
 	u32 remote_tlb_flush;
 	u32 lpages;
 };
@@ -443,7 +443,6 @@ struct kvm_vcpu_stat {
 	u32 halt_exits;
 	u32 halt_wakeup;
 	u32 request_irq_exits;
-	u32 request_nmi_exits;
 	u32 irq_exits;
 	u32 host_state_reload;
 	u32 efer_reload;
@@ -511,20 +510,22 @@ struct kvm_x86_ops {
 	void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
 	int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
 	void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+	void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
+	u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
 	void (*patch_hypercall)(struct kvm_vcpu *vcpu,
 				unsigned char *hypercall_addr);
-	int (*get_irq)(struct kvm_vcpu *vcpu);
-	void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
+	void (*set_irq)(struct kvm_vcpu *vcpu);
+	void (*set_nmi)(struct kvm_vcpu *vcpu);
 	void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
 				bool has_error_code, u32 error_code);
-	bool (*exception_injected)(struct kvm_vcpu *vcpu);
-	void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
-	void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
-				       struct kvm_run *run);
+	int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
+	int (*nmi_allowed)(struct kvm_vcpu *vcpu);
+	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
+	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
+	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*get_tdp_level)(void);
-	int (*get_mt_mask_shift)(void);
+	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
 };
 extern struct kvm_x86_ops *kvm_x86_ops;
@@ -538,7 +539,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu);
 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
 void kvm_mmu_set_base_ptes(u64 base_pte);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask);
+		u64 dirty_mask, u64 nx_mask, u64 x_mask);
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
@@ -552,6 +553,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 			  const void *val, int bytes);
 int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
 		  gpa_t addr, unsigned long *ret);
+u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
 extern bool tdp_enabled;
@@ -563,6 +565,7 @@ enum emulation_result {
 #define EMULTYPE_NO_DECODE (1 << 0)
 #define EMULTYPE_TRAP_UD (1 << 1)
+#define EMULTYPE_SKIP (1 << 2)
 int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
 			unsigned long cr2, u16 error_code, int emulation_type);
 void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
@@ -638,7 +641,6 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
 void kvm_mmu_unload(struct kvm_vcpu *vcpu);
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
-void kvm_mmu_sync_global(struct kvm_vcpu *vcpu);
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
@@ -769,6 +771,8 @@ enum {
 #define HF_GIF_MASK (1 << 0)
 #define HF_HIF_MASK (1 << 1)
 #define HF_VINTR_MASK (1 << 2)
+#define HF_NMI_MASK (1 << 3)
+#define HF_IRET_MASK (1 << 4)
 /*
  * Hardware virtualization extension instructions may fault if a
@@ -791,5 +795,6 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
 #define KVM_ARCH_WANT_MMU_NOTIFIER
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
 #endif /* _ASM_X86_KVM_HOST_H */
@@ -143,6 +143,9 @@ struct decode_cache {
 	struct fetch_cache fetch;
 };
+#define X86_SHADOW_INT_MOV_SS  1
+#define X86_SHADOW_INT_STI     2
 struct x86_emulate_ctxt {
 	/* Register state before/after emulation. */
 	struct kvm_vcpu *vcpu;
@@ -152,6 +155,9 @@ struct x86_emulate_ctxt {
 	int mode;
 	u32 cs_base;
+	/* interruptibility state, as a result of execution of STI or MOV SS */
+	int interruptibility;
 	/* decode cache */
 	struct decode_cache decode;
 };
...
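Context for the interruptibility field above: after MOV SS or STI, x86 suppresses interrupt delivery for exactly one instruction (the interrupt shadow), and the emulator must now report that state so it can be pushed back into the VMCS/VMCB via the new set/get_interrupt_shadow ops. A toy model of the bookkeeping, simplified from how the real vendor code handles it:

    #include <stdbool.h>

    #define X86_SHADOW_INT_MOV_SS 1
    #define X86_SHADOW_INT_STI    2

    static int interruptibility;	/* stands in for ctxt->interruptibility */

    /* Record the shadow after emulating one instruction; any other
     * instruction clears it, since the shadow lasts one instruction. */
    void update_shadow(bool was_mov_ss, bool was_sti)
    {
    	if (was_mov_ss)
    		interruptibility = X86_SHADOW_INT_MOV_SS;
    	else if (was_sti)
    		interruptibility = X86_SHADOW_INT_STI;
    	else
    		interruptibility = 0;
    }

    bool can_inject_irq(bool guest_if_flag)
    {
    	return guest_if_flag && interruptibility == 0;
    }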
@@ -225,6 +225,7 @@ struct __attribute__ ((__packed__)) vmcb {
 #define SVM_EVTINJ_VALID_ERR (1 << 11)
 #define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK
+#define SVM_EXITINTINFO_TYPE_MASK SVM_EVTINJ_TYPE_MASK
 #define SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR
 #define SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI
...
@@ -247,6 +247,7 @@ enum vmcs_field {
 #define EXIT_REASON_MSR_READ 31
 #define EXIT_REASON_MSR_WRITE 32
 #define EXIT_REASON_MWAIT_INSTRUCTION 36
+#define EXIT_REASON_MCE_DURING_VMENTRY 41
 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
 #define EXIT_REASON_APIC_ACCESS 44
 #define EXIT_REASON_EPT_VIOLATION 48
...
@@ -420,6 +420,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
 out2:
 	atomic_dec(&mce_entry);
 }
+EXPORT_SYMBOL_GPL(do_machine_check);
 #ifdef CONFIG_X86_MCE_INTEL
 /***
...
@@ -27,6 +27,7 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/hardirq.h>
+#include <asm/timer.h>
 #define MMU_QUEUE_SIZE 1024
@@ -230,6 +231,9 @@ static void paravirt_ops_setup(void)
 		pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
 		pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
 	}
+#ifdef CONFIG_X86_IO_APIC
+	no_timer_check = 1;
+#endif
 }
 void __init kvm_guest_init(void)
...
@@ -172,6 +172,9 @@ void smp_reschedule_interrupt(struct pt_regs *regs)
 {
 	ack_APIC_irq();
 	inc_irq_stat(irq_resched_count);
+	/*
+	 * KVM uses this interrupt to force a cpu out of guest mode
+	 */
 }
 void smp_call_function_interrupt(struct pt_regs *regs)
...
@@ -50,6 +50,9 @@ config KVM_INTEL
 	  Provides support for KVM on Intel processors equipped with the VT
 	  extensions.
+	  To compile this as a module, choose M here: the module
+	  will be called kvm-intel.
 config KVM_AMD
 	tristate "KVM for AMD processors support"
 	depends on KVM
@@ -57,6 +60,9 @@ config KVM_AMD
 	  Provides support for KVM on AMD processors equipped with the AMD-V
 	  (SVM) extensions.
+	  To compile this as a module, choose M here: the module
+	  will be called kvm-amd.
 config KVM_TRACE
 	bool "KVM trace support"
 	depends on KVM && SYSFS
...
@@ -14,7 +14,7 @@ endif
 EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
 kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
-	i8254.o
+	i8254.o timer.o
 obj-$(CONFIG_KVM) += kvm.o
 kvm-intel-objs = vmx.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
...
...@@ -98,6 +98,37 @@ static int pit_get_gate(struct kvm *kvm, int channel) ...@@ -98,6 +98,37 @@ static int pit_get_gate(struct kvm *kvm, int channel)
return kvm->arch.vpit->pit_state.channels[channel].gate; return kvm->arch.vpit->pit_state.channels[channel].gate;
} }
static s64 __kpit_elapsed(struct kvm *kvm)
{
s64 elapsed;
ktime_t remaining;
struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
/*
* The Counter does not stop when it reaches zero. In
* Modes 0, 1, 4, and 5 the Counter ``wraps around'' to
* the highest count, either FFFF hex for binary counting
* or 9999 for BCD counting, and continues counting.
* Modes 2 and 3 are periodic; the Counter reloads
* itself with the initial count and continues counting
* from there.
*/
remaining = hrtimer_expires_remaining(&ps->pit_timer.timer);
elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
elapsed = mod_64(elapsed, ps->pit_timer.period);
return elapsed;
}
static s64 kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c,
int channel)
{
if (channel == 0)
return __kpit_elapsed(kvm);
return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
}
static int pit_get_count(struct kvm *kvm, int channel) static int pit_get_count(struct kvm *kvm, int channel)
{ {
struct kvm_kpit_channel_state *c = struct kvm_kpit_channel_state *c =
...@@ -107,7 +138,7 @@ static int pit_get_count(struct kvm *kvm, int channel) ...@@ -107,7 +138,7 @@ static int pit_get_count(struct kvm *kvm, int channel)
WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); t = kpit_elapsed(kvm, c, channel);
d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
switch (c->mode) { switch (c->mode) {
...@@ -137,7 +168,7 @@ static int pit_get_out(struct kvm *kvm, int channel) ...@@ -137,7 +168,7 @@ static int pit_get_out(struct kvm *kvm, int channel)
WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); t = kpit_elapsed(kvm, c, channel);
d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
switch (c->mode) { switch (c->mode) {
...@@ -193,28 +224,6 @@ static void pit_latch_status(struct kvm *kvm, int channel) ...@@ -193,28 +224,6 @@ static void pit_latch_status(struct kvm *kvm, int channel)
} }
} }
static int __pit_timer_fn(struct kvm_kpit_state *ps)
{
struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0];
struct kvm_kpit_timer *pt = &ps->pit_timer;
if (!atomic_inc_and_test(&pt->pending))
set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
if (!pt->reinject)
atomic_set(&pt->pending, 1);
if (vcpu0 && waitqueue_active(&vcpu0->wq))
wake_up_interruptible(&vcpu0->wq);
hrtimer_add_expires_ns(&pt->timer, pt->period);
pt->scheduled = hrtimer_get_expires_ns(&pt->timer);
if (pt->period)
ps->channels[0].count_load_time = ktime_get();
return (pt->period == 0 ? 0 : 1);
}
int pit_has_pending_timer(struct kvm_vcpu *vcpu) int pit_has_pending_timer(struct kvm_vcpu *vcpu)
{ {
struct kvm_pit *pit = vcpu->kvm->arch.vpit; struct kvm_pit *pit = vcpu->kvm->arch.vpit;
...@@ -235,21 +244,6 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) ...@@ -235,21 +244,6 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
spin_unlock(&ps->inject_lock); spin_unlock(&ps->inject_lock);
} }
static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
{
struct kvm_kpit_state *ps;
int restart_timer = 0;
ps = container_of(data, struct kvm_kpit_state, pit_timer.timer);
restart_timer = __pit_timer_fn(ps);
if (restart_timer)
return HRTIMER_RESTART;
else
return HRTIMER_NORESTART;
}
 void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
 {
 	struct kvm_pit *pit = vcpu->kvm->arch.vpit;
@@ -263,15 +257,26 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
 	hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
 }
-static void destroy_pit_timer(struct kvm_kpit_timer *pt)
+static void destroy_pit_timer(struct kvm_timer *pt)
 {
 	pr_debug("pit: execute del timer!\n");
 	hrtimer_cancel(&pt->timer);
 }
+static bool kpit_is_periodic(struct kvm_timer *ktimer)
+{
+	struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state,
+						 pit_timer);
+	return ps->is_periodic;
+}
+
+static struct kvm_timer_ops kpit_ops = {
+	.is_periodic = kpit_is_periodic,
+};
 static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
 {
-	struct kvm_kpit_timer *pt = &ps->pit_timer;
+	struct kvm_timer *pt = &ps->pit_timer;
 	s64 interval;
 	interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
@@ -280,8 +285,14 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
 	/* TODO The new value only affected after the retriggered */
 	hrtimer_cancel(&pt->timer);
-	pt->period = (is_period == 0) ? 0 : interval;
-	pt->timer.function = pit_timer_fn;
+	pt->period = interval;
+	ps->is_periodic = is_period;
+
+	pt->timer.function = kvm_timer_fn;
+	pt->t_ops = &kpit_ops;
+	pt->kvm = ps->pit->kvm;
+	pt->vcpu_id = 0;
 	atomic_set(&pt->pending, 0);
 	ps->irq_ack = 1;
@@ -298,23 +309,23 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
 	pr_debug("pit: load_count val is %d, channel is %d\n", val, channel);
 	/*
-	 * Though spec said the state of 8254 is undefined after power-up,
-	 * seems some tricky OS like Windows XP depends on IRQ0 interrupt
-	 * when booting up.
-	 * So here setting initialize rate for it, and not a specific number
+	 * The largest possible initial count is 0; this is equivalent
+	 * to 2^16 for binary counting and 10^4 for BCD counting.
 	 */
 	if (val == 0)
 		val = 0x10000;
-	ps->channels[channel].count_load_time = ktime_get();
 	ps->channels[channel].count = val;
-	if (channel != 0)
+	if (channel != 0) {
+		ps->channels[channel].count_load_time = ktime_get();
 		return;
+	}
 	/* Two types of timer
 	 * mode 1 is one shot, mode 2 is period, otherwise del timer */
 	switch (ps->channels[0].mode) {
+	case 0:
 	case 1:
 	/* FIXME: enhance mode 4 precision */
 	case 4:
...
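A quick sanity check on the interval arithmetic (values illustrative): a load value of 0 becomes 0x10000 = 65536 ticks, and at the PIT's ~1.193182 MHz input clock create_pit_timer()'s muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ) comes out at roughly 54.9 ms, the classic 18.2 Hz DOS timer tick:

#include <stdint.h>
#include <stdio.h>

/* Standalone model of the kernel's muldiv64(); the PIT input clock is
 * assumed here to be exactly 1193182 Hz. */
static uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c)
{
	return a * b / c;	/* 65536 * 1e9 fits comfortably in 64 bits */
}

int main(void)
{
	uint64_t interval = muldiv64(0x10000, 1000000000u, 1193182u);
	printf("period for count 0x10000: %llu ns (~54.9 ms, ~18.2 Hz)\n",
	       (unsigned long long)interval);
	return 0;
}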
@@ -3,15 +3,6 @@
 #include "iodev.h"
-struct kvm_kpit_timer {
-	struct hrtimer timer;
-	int irq;
-	s64 period; /* unit: ns */
-	s64 scheduled;
-	atomic_t pending;
-	bool reinject;
-};
 struct kvm_kpit_channel_state {
 	u32 count; /* can be 65536 */
 	u16 latched_count;
@@ -30,7 +21,8 @@ struct kvm_kpit_channel_state {
 struct kvm_kpit_state {
 	struct kvm_kpit_channel_state channels[3];
-	struct kvm_kpit_timer pit_timer;
+	struct kvm_timer pit_timer;
+	bool is_periodic;
 	u32 speaker_data_on;
 	struct mutex lock;
 	struct kvm_pit *pit;
...
@@ -24,6 +24,7 @@
 #include "irq.h"
 #include "i8254.h"
+#include "x86.h"
 /*
  * check if there are pending timer events
@@ -48,6 +49,9 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
 {
 	struct kvm_pic *s;
+	if (!irqchip_in_kernel(v->kvm))
+		return v->arch.interrupt.pending;
+
 	if (kvm_apic_has_interrupt(v) == -1) {	/* LAPIC */
 		if (kvm_apic_accept_pic_intr(v)) {
 			s = pic_irqchip(v->kvm);	/* PIC */
@@ -67,6 +71,9 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
 	struct kvm_pic *s;
 	int vector;
+	if (!irqchip_in_kernel(v->kvm))
+		return v->arch.interrupt.nr;
+
 	vector = kvm_get_apic_interrupt(v);	/* APIC */
 	if (vector == -1) {
 		if (kvm_apic_accept_pic_intr(v)) {
...
+struct kvm_timer {
+	struct hrtimer timer;
+	s64 period;		/* unit: ns */
+	atomic_t pending;	/* accumulated triggered timers */
+	bool reinject;
+	struct kvm_timer_ops *t_ops;
+	struct kvm *kvm;
+	int vcpu_id;
+};
+
+struct kvm_timer_ops {
+	bool (*is_periodic)(struct kvm_timer *);
+};
+
+enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
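The point of the new kvm_timer is that the hrtimer bookkeeping the PIT and LAPIC used to duplicate now lives in one place; a device only supplies a kvm_timer_ops whose is_periodic callback answers whether the timer should re-arm. A sketch of a hypothetical consumer under that contract (all my_dev_* names are invented for illustration; hrtimer_init() on the embedded timer is assumed done at device setup):

/* Hypothetical consumer of the shared timer, mirroring the PIT wiring
 * shown earlier. */
struct my_dev_state {
	struct kvm_timer timer;
	bool periodic;
};

static bool my_dev_is_periodic(struct kvm_timer *ktimer)
{
	struct my_dev_state *s =
		container_of(ktimer, struct my_dev_state, timer);
	return s->periodic;
}

static struct kvm_timer_ops my_dev_timer_ops = {
	.is_periodic = my_dev_is_periodic,
};

static void my_dev_arm(struct my_dev_state *s, struct kvm *kvm, s64 period_ns)
{
	s->timer.timer.function = kvm_timer_fn;	/* shared expiry handler */
	s->timer.t_ops = &my_dev_timer_ops;
	s->timer.kvm = kvm;
	s->timer.vcpu_id = 0;			/* tick against vcpu 0 */
	s->timer.period = period_ns;
	s->timer.reinject = true;		/* replay missed ticks */
	atomic_set(&s->timer.pending, 0);
	hrtimer_start(&s->timer.timer,
		      ktime_add_ns(ktime_get(), period_ns), HRTIMER_MODE_ABS);
}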
[diff collapsed]
@@ -2,18 +2,15 @@
 #define __KVM_X86_LAPIC_H
 #include "iodev.h"
+#include "kvm_timer.h"
 #include <linux/kvm_host.h>
 struct kvm_lapic {
 	unsigned long base_address;
 	struct kvm_io_device dev;
-	struct {
-		atomic_t pending;
-		s64 period;	/* unit: ns */
-		u32 divide_count;
-		struct hrtimer dev;
-	} timer;
+	struct kvm_timer lapic_timer;
+	u32 divide_count;
 	struct kvm_vcpu *vcpu;
 	struct page *regs_page;
 	void *regs;
@@ -34,12 +31,13 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
 int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
 int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
 void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
 int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
+bool kvm_apic_present(struct kvm_vcpu *vcpu);
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
 void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
...
[diff collapsed]
@@ -75,4 +75,9 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
 	return vcpu->arch.cr0 & X86_CR0_PG;
 }
+static inline int is_present_pte(unsigned long pte)
+{
+	return pte & PT_PRESENT_MASK;
+}
+
 #endif
@@ -123,6 +123,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
 	gfn_t table_gfn;
 	unsigned index, pt_access, pte_access;
 	gpa_t pte_gpa;
+	int rsvd_fault = 0;
 	pgprintk("%s: addr %lx\n", __func__, addr);
 walk:
@@ -157,6 +158,10 @@ walk:
 		if (!is_present_pte(pte))
 			goto not_present;
+		rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
+		if (rsvd_fault)
+			goto access_error;
+
 		if (write_fault && !is_writeble_pte(pte))
 			if (user_fault || is_write_protection(vcpu))
 				goto access_error;
@@ -209,7 +214,6 @@ walk:
 			if (ret)
 				goto walk;
 			pte |= PT_DIRTY_MASK;
-			kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 0);
 			walker->ptes[walker->level - 1] = pte;
 		}
@@ -233,6 +237,8 @@ err:
 		walker->error_code |= PFERR_USER_MASK;
 	if (fetch_fault)
 		walker->error_code |= PFERR_FETCH_MASK;
+	if (rsvd_fault)
+		walker->error_code |= PFERR_RSVD_MASK;
 	return 0;
 }
@@ -262,8 +268,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 	kvm_get_pfn(pfn);
 	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
 		     gpte & PT_DIRTY_MASK, NULL, largepage,
-		     gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte),
-		     pfn, true);
+		     gpte_to_gfn(gpte), pfn, true);
 }
 /*
@@ -297,7 +302,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 				     user_fault, write_fault,
 				     gw->ptes[gw->level-1] & PT_DIRTY_MASK,
 				     ptwrite, largepage,
-				     gw->ptes[gw->level-1] & PT_GLOBAL_MASK,
 				     gw->gfn, pfn, false);
 			break;
 		}
@@ -380,7 +384,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 		return r;
 	/*
-	 * Look up the shadow pte for the faulting address.
+	 * Look up the guest pte for the faulting address.
 	 */
 	r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
 			     fetch_fault);
@@ -586,7 +590,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		nr_present++;
 		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
 		set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
-			 is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn,
+			 is_dirty_pte(gpte), 0, gfn,
			 spte_to_pfn(sp->spt[i]), true, false);
 	}
...
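The new rsvd_fault path in walk_addr above surfaces reserved-bit violations to the guest through the architectural #PF error code. A standalone model of the bit positions involved (the values match the x86 error-code layout; the combination chosen is just an example):

#include <stdint.h>
#include <stdio.h>

/* x86 #PF error-code bits, as used by the walker's err: path. */
#define PFERR_PRESENT_MASK (1u << 0)
#define PFERR_WRITE_MASK   (1u << 1)
#define PFERR_USER_MASK    (1u << 2)
#define PFERR_RSVD_MASK    (1u << 3)
#define PFERR_FETCH_MASK   (1u << 4)

int main(void)
{
	/* A user-mode write that also hit a reserved bit. */
	uint32_t error_code = PFERR_PRESENT_MASK | PFERR_WRITE_MASK |
			      PFERR_USER_MASK | PFERR_RSVD_MASK;
	printf("error_code = %#x\n", error_code);	/* prints 0xf */
	return 0;
}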
[diff collapsed]
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/hrtimer.h>
+#include <asm/atomic.h>
+#include "kvm_timer.h"
+
+static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
+{
+	int restart_timer = 0;
+	wait_queue_head_t *q = &vcpu->wq;
+
+	/* FIXME: this code should not know anything about vcpus */
+	if (!atomic_inc_and_test(&ktimer->pending))
+		set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+
+	if (!ktimer->reinject)
+		atomic_set(&ktimer->pending, 1);
+
+	if (waitqueue_active(q))
+		wake_up_interruptible(q);
+
+	if (ktimer->t_ops->is_periodic(ktimer)) {
+		hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+		restart_timer = 1;
+	}
+
+	return restart_timer;
+}
+
+enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
+{
+	int restart_timer;
+	struct kvm_vcpu *vcpu;
+	struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+
+	vcpu = ktimer->kvm->vcpus[ktimer->vcpu_id];
+	if (!vcpu)
+		return HRTIMER_NORESTART;
+
+	restart_timer = __kvm_timer_fn(vcpu, ktimer);
+	if (restart_timer)
+		return HRTIMER_RESTART;
+	else
+		return HRTIMER_NORESTART;
+}
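Design note: on expiry the shared handler advances the timer with hrtimer_add_expires_ns(), i.e. relative to the previous expiry rather than to "now", so a periodic timer does not drift when the callback runs late. The pending counter accumulates ticks that fired while the vCPU was descheduled so they can be re-injected later; when reinject is off, pending is clamped to 1 and missed ticks are coalesced.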
[diff collapsed]
[diff collapsed]
@@ -8,9 +8,11 @@ static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
 	vcpu->arch.exception.pending = false;
 }
-static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector)
+static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector,
+				       bool soft)
 {
 	vcpu->arch.interrupt.pending = true;
+	vcpu->arch.interrupt.soft = soft;
 	vcpu->arch.interrupt.nr = vector;
 }
@@ -19,4 +21,14 @@ static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
 	vcpu->arch.interrupt.pending = false;
 }
+static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.exception.pending || vcpu->arch.interrupt.pending ||
+		vcpu->arch.nmi_injected;
+}
+
+static inline bool kvm_exception_is_soft(unsigned int nr)
+{
+	return (nr == BP_VECTOR) || (nr == OF_VECTOR);
+}
 #endif
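For context on the new helpers: BP_VECTOR (3) and OF_VECTOR (4) are "soft" because #BP and #OF are raised by the INT3 and INTO instructions, so re-injecting them must advance the guest's instruction pointer the way a software interrupt would; that is also why kvm_queue_interrupt() now records, via the soft flag, whether an interrupt came from an INTn instruction.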
[diff collapsed]
@@ -119,7 +119,7 @@ struct kvm_run {
 			__u32 error_code;
 		} ex;
 		/* KVM_EXIT_IO */
-		struct kvm_io {
+		struct {
 #define KVM_EXIT_IO_IN  0
 #define KVM_EXIT_IO_OUT 1
 			__u8 direction;
@@ -224,10 +224,10 @@ struct kvm_interrupt {
 /* for KVM_GET_DIRTY_LOG */
 struct kvm_dirty_log {
 	__u32 slot;
-	__u32 padding;
+	__u32 padding1;
 	union {
 		void __user *dirty_bitmap; /* one bit per page */
-		__u64 padding;
+		__u64 padding2;
 	};
 };
@@ -409,6 +409,10 @@ struct kvm_trace_rec {
 #ifdef __KVM_HAVE_DEVICE_ASSIGNMENT
 #define KVM_CAP_DEVICE_DEASSIGNMENT 27
 #endif
+#ifdef __KVM_HAVE_MSIX
+#define KVM_CAP_DEVICE_MSIX 28
+#endif
+#define KVM_CAP_ASSIGN_DEV_IRQ 29
 /* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */
 #define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30
@@ -482,11 +486,18 @@ struct kvm_irq_routing {
 #define KVM_ASSIGN_PCI_DEVICE _IOR(KVMIO, 0x69, \
 				   struct kvm_assigned_pci_dev)
 #define KVM_SET_GSI_ROUTING _IOW(KVMIO, 0x6a, struct kvm_irq_routing)
+/* deprecated, replaced by KVM_ASSIGN_DEV_IRQ */
 #define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \
 			    struct kvm_assigned_irq)
+#define KVM_ASSIGN_DEV_IRQ _IOW(KVMIO, 0x70, struct kvm_assigned_irq)
 #define KVM_REINJECT_CONTROL _IO(KVMIO, 0x71)
 #define KVM_DEASSIGN_PCI_DEVICE _IOW(KVMIO, 0x72, \
 				     struct kvm_assigned_pci_dev)
+#define KVM_ASSIGN_SET_MSIX_NR \
+			_IOW(KVMIO, 0x73, struct kvm_assigned_msix_nr)
+#define KVM_ASSIGN_SET_MSIX_ENTRY \
+			_IOW(KVMIO, 0x74, struct kvm_assigned_msix_entry)
+#define KVM_DEASSIGN_DEV_IRQ _IOW(KVMIO, 0x75, struct kvm_assigned_irq)
 /*
  * ioctls for vcpu fds
@@ -577,6 +588,8 @@ struct kvm_debug_guest {
 #define KVM_TRC_STLB_INVAL       (KVM_TRC_HANDLER + 0x18)
 #define KVM_TRC_PPC_INSTR        (KVM_TRC_HANDLER + 0x19)
+#define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
+
 struct kvm_assigned_pci_dev {
 	__u32 assigned_dev_id;
 	__u32 busnr;
@@ -587,6 +600,17 @@ struct kvm_assigned_pci_dev {
 	};
 };
+#define KVM_DEV_IRQ_HOST_INTX    (1 << 0)
+#define KVM_DEV_IRQ_HOST_MSI     (1 << 1)
+#define KVM_DEV_IRQ_HOST_MSIX    (1 << 2)
+
+#define KVM_DEV_IRQ_GUEST_INTX   (1 << 8)
+#define KVM_DEV_IRQ_GUEST_MSI    (1 << 9)
+#define KVM_DEV_IRQ_GUEST_MSIX   (1 << 10)
+
+#define KVM_DEV_IRQ_HOST_MASK	 0x00ff
+#define KVM_DEV_IRQ_GUEST_MASK   0xff00
+
 struct kvm_assigned_irq {
 	__u32 assigned_dev_id;
 	__u32 host_irq;
@@ -602,9 +626,19 @@ struct kvm_assigned_irq {
 	};
 };
-#define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
-#define KVM_DEV_IRQ_ASSIGN_MSI_ACTION	KVM_DEV_IRQ_ASSIGN_ENABLE_MSI
-#define KVM_DEV_IRQ_ASSIGN_ENABLE_MSI	(1 << 0)
+struct kvm_assigned_msix_nr {
+	__u32 assigned_dev_id;
+	__u16 entry_nr;
+	__u16 padding;
+};
+
+#define KVM_MAX_MSIX_PER_DEV		512
+struct kvm_assigned_msix_entry {
+	__u32 assigned_dev_id;
+	__u32 gsi;
+	__u16 entry; /* The index of entry in the MSI-X table */
+	__u16 padding[3];
+};
 #endif
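Taken together, the new ioctls give userspace a two-step path for routing an assigned device's MSI-X vectors: declare how many table entries the device uses, then map each entry to a GSI, then enable delivery. A sketch of that calling sequence, as I read the API (the helper name, fd, device id, GSI base, and the final KVM_ASSIGN_DEV_IRQ step are assumptions; error handling elided):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Hypothetical helper: vm_fd is an open VM fd, dev_id identifies an
 * already-assigned PCI device, gsi_base is a routing base the VMM chose. */
static int assign_msix_3_vectors(int vm_fd, __u32 dev_id, __u32 gsi_base)
{
	struct kvm_assigned_msix_nr nr = {
		.assigned_dev_id = dev_id,
		.entry_nr = 3,			/* device uses 3 MSI-X entries */
	};
	if (ioctl(vm_fd, KVM_ASSIGN_SET_MSIX_NR, &nr) < 0)
		return -1;

	for (__u16 i = 0; i < 3; i++) {
		struct kvm_assigned_msix_entry e = {
			.assigned_dev_id = dev_id,
			.gsi = gsi_base + i,
			.entry = i,		/* index into the MSI-X table */
		};
		if (ioctl(vm_fd, KVM_ASSIGN_SET_MSIX_ENTRY, &e) < 0)
			return -1;
	}

	/* Finally enable host->guest MSI-X delivery. */
	struct kvm_assigned_irq irq = {
		.assigned_dev_id = dev_id,
		.flags = KVM_DEV_IRQ_HOST_MSIX | KVM_DEV_IRQ_GUEST_MSIX,
	};
	return ioctl(vm_fd, KVM_ASSIGN_DEV_IRQ, &irq);
}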
@@ -38,6 +38,7 @@
 #define KVM_REQ_UNHALT             6
 #define KVM_REQ_MMU_SYNC           7
 #define KVM_REQ_KVMCLOCK_UPDATE    8
+#define KVM_REQ_KICK               9
 #define KVM_USERSPACE_IRQ_SOURCE_ID	0
@@ -72,7 +73,6 @@ struct kvm_vcpu {
 	struct mutex mutex;
 	int   cpu;
 	struct kvm_run *run;
-	int guest_mode;
 	unsigned long requests;
 	unsigned long guest_debug;
 	int fpu_active;
@@ -298,6 +298,7 @@ int kvm_arch_hardware_setup(void);
 void kvm_arch_hardware_unsetup(void);
 void kvm_arch_check_processor_compat(void *rtn);
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
 void kvm_free_physmem(struct kvm *kvm);
@@ -319,6 +320,13 @@ struct kvm_irq_ack_notifier {
 	void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
 };
+#define KVM_ASSIGNED_MSIX_PENDING	0x1
+struct kvm_guest_msix_entry {
+	u32 vector;
+	u16 entry;
+	u16 flags;
+};
+
 struct kvm_assigned_dev_kernel {
 	struct kvm_irq_ack_notifier ack_notifier;
 	struct work_struct interrupt_work;
@@ -326,18 +334,18 @@ struct kvm_assigned_dev_kernel {
 	int assigned_dev_id;
 	int host_busnr;
 	int host_devfn;
+	unsigned int entries_nr;
 	int host_irq;
 	bool host_irq_disabled;
+	struct msix_entry *host_msix_entries;
 	int guest_irq;
-#define KVM_ASSIGNED_DEV_GUEST_INTX	(1 << 0)
-#define KVM_ASSIGNED_DEV_GUEST_MSI	(1 << 1)
-#define KVM_ASSIGNED_DEV_HOST_INTX	(1 << 8)
-#define KVM_ASSIGNED_DEV_HOST_MSI	(1 << 9)
+	struct kvm_guest_msix_entry *guest_msix_entries;
 	unsigned long irq_requested_type;
 	int irq_source_id;
 	int flags;
 	struct pci_dev *dev;
 	struct kvm *kvm;
+	spinlock_t assigned_dev_lock;
 };
 struct kvm_irq_mask_notifier {
@@ -360,6 +368,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian);
 int kvm_request_irq_source_id(struct kvm *kvm);
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
+/* For vcpu->arch.iommu_flags */
+#define KVM_IOMMU_CACHE_COHERENCY	0x1
+
 #ifdef CONFIG_IOMMU_API
 int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn,
 			unsigned long npages);
...
@@ -40,4 +40,31 @@ typedef unsigned long hfn_t;
 typedef hfn_t pfn_t;
+union kvm_ioapic_redirect_entry {
+	u64 bits;
+	struct {
+		u8 vector;
+		u8 delivery_mode:3;
+		u8 dest_mode:1;
+		u8 delivery_status:1;
+		u8 polarity:1;
+		u8 remote_irr:1;
+		u8 trig_mode:1;
+		u8 mask:1;
+		u8 reserve:7;
+		u8 reserved[4];
+		u8 dest_id;
+	} fields;
+};
+
+struct kvm_lapic_irq {
+	u32 vector;
+	u32 delivery_mode;
+	u32 dest_mode;
+	u32 level;
+	u32 trig_mode;
+	u32 shorthand;
+	u32 dest_id;
+};
+
 #endif /* __KVM_TYPES_H__ */
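The union lets a 64-bit IOAPIC redirection-table value be read either raw or field by field. A userspace model of decoding one entry (the raw value is illustrative only; the bitfield layout assumes a little-endian compiler, as the kernel does here):

#include <stdint.h>
#include <stdio.h>

/* Userspace copy of the union above, for decoding a redirection entry. */
union ioapic_redirect_entry {
	uint64_t bits;
	struct {
		uint8_t vector;
		uint8_t delivery_mode:3;
		uint8_t dest_mode:1;
		uint8_t delivery_status:1;
		uint8_t polarity:1;
		uint8_t remote_irr:1;
		uint8_t trig_mode:1;
		uint8_t mask:1;
		uint8_t reserve:7;
		uint8_t reserved[4];
		uint8_t dest_id;
	} fields;
};

int main(void)
{
	/* Example raw value: vector 0x30, level-triggered, unmasked,
	 * destination APIC id 0x01. */
	union ioapic_redirect_entry e = { .bits = 0x0100000000008030ULL };

	printf("vector=%#x trig=%s mask=%u dest=%#x\n",
	       e.fields.vector,
	       e.fields.trig_mode ? "level" : "edge",
	       e.fields.mask,
	       e.fields.dest_id);
	return 0;
}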
[diff collapsed]
@@ -40,22 +40,7 @@ struct kvm_ioapic {
 	u32 id;
 	u32 irr;
 	u32 pad;
-	union ioapic_redir_entry {
-		u64 bits;
-		struct {
-			u8 vector;
-			u8 delivery_mode:3;
-			u8 dest_mode:1;
-			u8 delivery_status:1;
-			u8 polarity:1;
-			u8 remote_irr:1;
-			u8 trig_mode:1;
-			u8 mask:1;
-			u8 reserve:7;
-			u8 reserved[4];
-			u8 dest_id;
-		} fields;
-	} redirtbl[IOAPIC_NUM_PINS];
+	union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS];
 	struct kvm_io_device dev;
 	struct kvm *kvm;
 	void (*ack_notifier)(void *opaque, int irq);
@@ -79,13 +64,13 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
 	return kvm->arch.vioapic;
 }
-struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
-				       unsigned long bitmap);
+int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+		int short_hand, int dest, int dest_mode);
+int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
 void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
 int kvm_ioapic_init(struct kvm *kvm);
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
 void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
-u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
-				u8 dest_mode);
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+		struct kvm_lapic_irq *irq);
 #endif
[diff collapsed]
[diff collapsed]
[diff collapsed]