Merge git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-lguest

* git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-lguest: (31 commits) lguest: add support for indirect ring entries lguest: suppress notifications in example Launcher lguest: try to batch interrupts on network receive lguest: avoid sending interrupts to Guest when no activity occurs. lguest: implement deferred interrupts in example Launcher lguest: remove obsolete LHREQ_BREAK call lguest: have example Launcher service all devices in separate threads lguest: use eventfds for device notification eventfd: export eventfd_signal and eventfd_fget for lguest lguest: allow any process to send interrupts lguest: PAE fixes lguest: PAE support lguest: Add support for kvm_hypercall4() lguest: replace hypercall name LHCALL_SET_PMD with LHCALL_SET_PGD lguest: use native_set_* macros, which properly handle 64-bit entries when PAE is activated lguest: map switcher with executable page table entries lguest: fix writev returning short on console output lguest: clean up length-used value in example launcher lguest: Segment selectors are 16-bit long. Fix lg_cpu.ss1 definition. lguest: beyond ARRAY_SIZE of cpu->arch.gdt ...

Merge git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-lguest
* git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-lguest: (31 commits) lguest: add support for indirect ring entries lguest: suppress notifications in example Launcher lguest: try to batch interrupts on network receive lguest: avoid sending interrupts to Guest when no activity occurs. lguest: implement deferred interrupts in example Launcher lguest: remove obsolete LHREQ_BREAK call lguest: have example Launcher service all devices in separate threads lguest: use eventfds for device notification eventfd: export eventfd_signal and eventfd_fget for lguest lguest: allow any process to send interrupts lguest: PAE fixes lguest: PAE support lguest: Add support for kvm_hypercall4() lguest: replace hypercall name LHCALL_SET_PMD with LHCALL_SET_PGD lguest: use native_set_* macros, which properly handle 64-bit entries when PAE is activated lguest: map switcher with executable page table entries lguest: fix writev returning short on console output lguest: clean up length-used value in example launcher lguest: Segment selectors are 16-bit long. Fix lg_cpu.ss1 definition. lguest: beyond ARRAY_SIZE of cpu->arch.gdt ...
7f3591cf · Linus Torvalds · 16ffc3ee · d1f0132e · 7f3591cf · 7f3591cf
Commit 7f3591cf authored Jun 12, 2009 by Linus Torvalds
21 changed files
--- a/Documentation/lguest/Makefile
+++ b/Documentation/lguest/Makefile
 # This creates the demonstration utility "lguest" which runs a Linux guest.
-CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include -I../../arch/x86/include -U_FORTIFY_SOURCE
+CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -I../../include -I../../arch/x86/include -U_FORTIFY_SOURCE
-LDLIBS:=-lz
 all: lguest

--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
--- a/Documentation/lguest/lguest.txt
+++ b/Documentation/lguest/lguest.txt
@@ -37,7 +37,6 @@ Running Lguest:
     "Paravirtualized guest support" = Y
        "Lguest guest support" = Y
     "High Memory Support" = off/4GB
-     "PAE (Physical Address Extension) Support" = N
     "Alignment value to which kernel should be aligned" = 0x100000
        (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
         CONFIG_PHYSICAL_ALIGN=0x100000)

--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -17,8 +17,13 @@
 /* Pages for switcher itself, then two pages per cpu */
 #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids)
-/* We map at -4M for ease of mapping into the guest (one PTE page). */
+/* We map at -4M (-2M when PAE is activated) for ease of mapping
+ * into the guest (one PTE page). */
+#ifdef CONFIG_X86_PAE
+#define SWITCHER_ADDR 0xFFE00000
+#else
 #define SWITCHER_ADDR 0xFFC00000
+#endif
 /* Found in switcher.S */
 extern unsigned long default_idt_entries[];

--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -12,11 +12,13 @@
 #define LHCALL_TS		8
 #define LHCALL_SET_CLOCKEVENT	9
 #define LHCALL_HALT		10
+#define LHCALL_SET_PMD		13
 #define LHCALL_SET_PTE		14
-#define LHCALL_SET_PMD		15
+#define LHCALL_SET_PGD		15
 #define LHCALL_LOAD_TLS		16
 #define LHCALL_NOTIFY		17
 #define LHCALL_LOAD_GDT_ENTRY	18
+#define LHCALL_SEND_INTERRUPTS	19
 #define LGUEST_TRAP_ENTRY 0x1F
@@ -32,10 +34,10 @@
 * operations?  There are two ways: the direct way is to make a "hypercall",
 * to make requests of the Host Itself.
 *
- * We use the KVM hypercall mechanism. Eighteen hypercalls are
+ * We use the KVM hypercall mechanism. Seventeen hypercalls are
 * available: the hypercall number is put in the %eax register, and the
- * arguments (when required) are placed in %ebx, %ecx and %edx.  If a return
+ * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
- * value makes sense, it's returned in %eax.
+ * If a return value makes sense, it's returned in %eax.
 *
 * Grossly invalid calls result in Sudden Death at the hands of the vengeful
 * Host, rather than returning failure.  This reflects Winston Churchill's
@@ -47,8 +49,9 @@
 #define LHCALL_RING_SIZE 64
 struct hcall_args {
-	/* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */
+	/* These map directly onto eax, ebx, ecx, edx and esi
-	unsigned long arg0, arg1, arg2, arg3;
+	 * in struct lguest_regs */
+	unsigned long arg0, arg1, arg2, arg3, arg4;
 };
 #endif /* !__ASSEMBLY__ */

--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -126,6 +126,7 @@ void foo(void)
 #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
 	BLANK();
 	OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
+	OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
 	OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
 	BLANK();

--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -2,7 +2,6 @@ config LGUEST_GUEST
 	bool "Lguest guest support"
 	select PARAVIRT
 	depends on X86_32
-	depends on !X86_PAE
 	select VIRTIO
 	select VIRTIO_RING
 	select VIRTIO_CONSOLE

--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -46,10 +46,64 @@ ENTRY(lguest_entry)
 	.globl lgstart_##name; .globl lgend_##name
 LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled)
 LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
-/*:*/
+/*G:033 But using those wrappers is inefficient (we'll see why that doesn't
+ * matter for save_fl and irq_disable later).  If we write our routines
+ * carefully in assembler, we can avoid clobbering any registers and avoid
+ * jumping through the wrapper functions.
+ *
+ * I skipped over our first piece of assembler, but this one is worth studying
+ * in a bit more detail so I'll describe in easy stages.  First, the routine
+ * to enable interrupts: */
+ENTRY(lg_irq_enable)
+	/* The reverse of irq_disable, this sets lguest_data.irq_enabled to
+	 * X86_EFLAGS_IF (ie. "Interrupts enabled"). */
+	movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
+	/* But now we need to check if the Host wants to know: there might have
+	 * been interrupts waiting to be delivered, in which case it will have
+	 * set lguest_data.irq_pending to X86_EFLAGS_IF.  If it's not zero, we
+	 * jump to send_interrupts, otherwise we're done. */
+	testl $0, lguest_data+LGUEST_DATA_irq_pending
+	jnz send_interrupts
+	/* One cool thing about x86 is that you can do many things without using
+	 * a register.  In this case, the normal path hasn't needed to save or
+	 * restore any registers at all! */
+	ret
+send_interrupts:
+	/* OK, now we need a register: eax is used for the hypercall number,
+	 * which is LHCALL_SEND_INTERRUPTS.
+	 *
+	 * We used not to bother with this pending detection at all, which was
+	 * much simpler.  Sooner or later the Host would realize it had to
+	 * send us an interrupt.  But that turns out to make performance 7
+	 * times worse on a simple tcp benchmark.  So now we do this the hard
+	 * way. */
+	pushl %eax
+	movl $LHCALL_SEND_INTERRUPTS, %eax
+	/* This is a vmcall instruction (same thing that KVM uses).  Older
+	 * assembler versions might not know the "vmcall" instruction, so we
+	 * create one manually here. */
+	.byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
+	popl %eax
+	ret
+/* Finally, the "popf" or "restore flags" routine.  The %eax register holds the
+ * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
+ * enabling interrupts again, if it's 0 we're leaving them off. */
+ENTRY(lg_restore_fl)
+	/* This is just "lguest_data.irq_enabled = flags;" */
+	movl %eax, lguest_data+LGUEST_DATA_irq_enabled
+	/* Now, if the %eax value has enabled interrupts and
+	 * lguest_data.irq_pending is set, we want to tell the Host so it can
+	 * deliver any outstanding interrupts.  Fortunately, both values will
+	 * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
+	 * instruction will AND them together for us.  If both are set, we
+	 * jump to send_interrupts. */
+	testl lguest_data+LGUEST_DATA_irq_pending, %eax
+	jnz send_interrupts
+	/* Again, the normal path has used no extra registers.  Clever, huh? */
+	ret
 /* These demark the EIP range where host should never deliver interrupts. */
 .global lguest_noirq_start

--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
 config LGUEST
 	tristate "Linux hypervisor example code"
-	depends on X86_32 && EXPERIMENTAL && !X86_PAE && FUTEX
+	depends on X86_32 && EXPERIMENTAL && EVENTFD
 	select HVC_DRIVER
 	---help---
 	  This is a very simple module which allows you to run

--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -95,7 +95,7 @@ static __init int map_switcher(void)
 	 * array of struct pages.  It increments that pointer, but we don't
 	 * care. */
 	pagep = switcher_page;
-	err = map_vm_area(switcher_vma, PAGE_KERNEL, &pagep);
+	err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep);
 	if (err) {
 		printk("lguest: map_vm_area failed: %i\n", err);
 		goto free_vma;
@@ -188,6 +188,9 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 {
 	/* We stop running once the Guest is dead. */
 	while (!cpu->lg->dead) {
+		unsigned int irq;
+		bool more;
 		/* First we run any hypercalls the Guest wants done. */
 		if (cpu->hcall)
 			do_hypercalls(cpu);
@@ -195,23 +198,23 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 		/* It's possible the Guest did a NOTIFY hypercall to the
 		 * Launcher, in which case we return from the read() now. */
 		if (cpu->pending_notify) {
-			if (put_user(cpu->pending_notify, user))
+			if (!send_notify_to_eventfd(cpu)) {
-				return -EFAULT;
+				if (put_user(cpu->pending_notify, user))
-			return sizeof(cpu->pending_notify);
+					return -EFAULT;
+				return sizeof(cpu->pending_notify);
+			}
 		}
 		/* Check for signals */
 		if (signal_pending(current))
 			return -ERESTARTSYS;
-		/* If Waker set break_out, return to Launcher. */
-		if (cpu->break_out)
-			return -EAGAIN;
 		/* Check if there are any interrupts which can be delivered now:
 		 * if so, this sets up the hander to be executed when we next
 		 * run the Guest. */
-		maybe_do_interrupt(cpu);
+		irq = interrupt_pending(cpu, &more);
+		if (irq < LGUEST_IRQS)
+			try_deliver_interrupt(cpu, irq, more);
 		/* All long-lived kernel loops need to check with this horrible
 		 * thing called the freezer.  If the Host is trying to suspend,
@@ -224,10 +227,15 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
 			break;
 		/* If the Guest asked to be stopped, we sleep.  The Guest's
-		 * clock timer or LHREQ_BREAK from the Waker will wake us. */
+		 * clock timer will wake us. */
 		if (cpu->halted) {
 			set_current_state(TASK_INTERRUPTIBLE);
-			schedule();
+			/* Just before we sleep, make sure no interrupt snuck in
+			 * which we should be doing. */
+			if (interrupt_pending(cpu, &more) < LGUEST_IRQS)
+				set_current_state(TASK_RUNNING);
+			else
+				schedule();
 			continue;
 		}

--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -37,6 +37,10 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
 		/* This call does nothing, except by breaking out of the Guest
 		 * it makes us process all the asynchronous hypercalls. */
 		break;
+	case LHCALL_SEND_INTERRUPTS:
+		/* This call does nothing too, but by breaking out of the Guest
+		 * it makes us process any pending interrupts. */
+		break;
 	case LHCALL_LGUEST_INIT:
 		/* You can't get here unless you're already initialized.  Don't
 		 * do that. */
@@ -73,11 +77,21 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
 		guest_set_stack(cpu, args->arg1, args->arg2, args->arg3);
 		break;
 	case LHCALL_SET_PTE:
+#ifdef CONFIG_X86_PAE
+		guest_set_pte(cpu, args->arg1, args->arg2,
+				__pte(args->arg3 | (u64)args->arg4 << 32));
+#else
 		guest_set_pte(cpu, args->arg1, args->arg2, __pte(args->arg3));
+#endif
+		break;
+	case LHCALL_SET_PGD:
+		guest_set_pgd(cpu->lg, args->arg1, args->arg2);
 		break;
+#ifdef CONFIG_X86_PAE
 	case LHCALL_SET_PMD:
 		guest_set_pmd(cpu->lg, args->arg1, args->arg2);
 		break;
+#endif
 	case LHCALL_SET_CLOCKEVENT:
 		guest_set_clockevent(cpu, args->arg1);
 		break;

--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -128,30 +128,39 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi,
 /*H:205
 * Virtual Interrupts.
 *
- * maybe_do_interrupt() gets called before every entry to the Guest, to see if
+ * interrupt_pending() returns the first pending interrupt which isn't blocked
- * we should divert the Guest to running an interrupt handler. */
+ * by the Guest.  It is called before every entry to the Guest, and just before
-void maybe_do_interrupt(struct lg_cpu *cpu)
+ * we go to sleep when the Guest has halted itself. */
+unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more)
 {
 	unsigned int irq;
 	DECLARE_BITMAP(blk, LGUEST_IRQS);
-	struct desc_struct *idt;
 	/* If the Guest hasn't even initialized yet, we can do nothing. */
 	if (!cpu->lg->lguest_data)
-		return;
+		return LGUEST_IRQS;
 	/* Take our "irqs_pending" array and remove any interrupts the Guest
 	 * wants blocked: the result ends up in "blk". */
 	if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts,
 			   sizeof(blk)))
-		return;
+		return LGUEST_IRQS;
 	bitmap_andnot(blk, cpu->irqs_pending, blk, LGUEST_IRQS);
 	/* Find the first interrupt. */
 	irq = find_first_bit(blk, LGUEST_IRQS);
-	/* None?  Nothing to do */
+	*more = find_next_bit(blk, LGUEST_IRQS, irq+1);
-	if (irq >= LGUEST_IRQS)
-		return;
+	return irq;
+}
+/* This actually diverts the Guest to running an interrupt handler, once an
+ * interrupt has been identified by interrupt_pending(). */
+void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more)
+{
+	struct desc_struct *idt;
+	BUG_ON(irq >= LGUEST_IRQS);
 	/* They may be in the middle of an iret, where they asked us never to
 	 * deliver interrupts. */
@@ -170,8 +179,12 @@ void maybe_do_interrupt(struct lg_cpu *cpu)
 		u32 irq_enabled;
 		if (get_user(irq_enabled, &cpu->lg->lguest_data->irq_enabled))
 			irq_enabled = 0;
-		if (!irq_enabled)
+		if (!irq_enabled) {
+			/* Make sure they know an IRQ is pending. */
+			put_user(X86_EFLAGS_IF,
+				 &cpu->lg->lguest_data->irq_pending);
 			return;
+		}
 	}
 	/* Look at the IDT entry the Guest gave us for this interrupt.  The
@@ -194,6 +207,25 @@ void maybe_do_interrupt(struct lg_cpu *cpu)
 	 * here is a compromise which means at least it gets updated every
 	 * timer interrupt. */
 	write_timestamp(cpu);
+	/* If there are no other interrupts we want to deliver, clear
+	 * the pending flag. */
+	if (!more)
+		put_user(0, &cpu->lg->lguest_data->irq_pending);
+}
+/* And this is the routine when we want to set an interrupt for the Guest. */
+void set_interrupt(struct lg_cpu *cpu, unsigned int irq)
+{
+	/* Next time the Guest runs, the core code will see if it can deliver
+	 * this interrupt. */
+	set_bit(irq, cpu->irqs_pending);
+	/* Make sure it sees it; it might be asleep (eg. halted), or
+	 * running the Guest right now, in which case kick_process()
+	 * will knock it out. */
+	if (!wake_up_process(cpu->tsk))
+		kick_process(cpu->tsk);
 }
 /*:*/
@@ -510,10 +542,7 @@ static enum hrtimer_restart clockdev_fn(struct hrtimer *timer)
 	struct lg_cpu *cpu = container_of(timer, struct lg_cpu, hrt);
 	/* Remember the first interrupt is the timer interrupt. */
-	set_bit(0, cpu->irqs_pending);
+	set_interrupt(cpu, 0);
-	/* If the Guest is actually stopped, we need to wake it up. */
-	if (cpu->halted)
-		wake_up_process(cpu->tsk);
 	return HRTIMER_NORESTART;
 }

--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -49,7 +49,7 @@ struct lg_cpu {
 	u32 cr2;
 	int ts;
 	u32 esp1;
-	u8 ss1;
+	u16 ss1;
 	/* Bitmap of what has changed: see CHANGED_* above. */
 	int changed;
@@ -71,9 +71,7 @@ struct lg_cpu {
 	/* Virtual clock device */
 	struct hrtimer hrt;
-	/* Do we need to stop what we're doing and return to userspace? */
+	/* Did the Guest tell us to halt? */
-	int break_out;
-	wait_queue_head_t break_wq;
 	int halted;
 	/* Pending virtual interrupts */
@@ -82,6 +80,16 @@ struct lg_cpu {
 	struct lg_cpu_arch arch;
 };
+struct lg_eventfd {
+	unsigned long addr;
+	struct file *event;
+};
+struct lg_eventfd_map {
+	unsigned int num;
+	struct lg_eventfd map[];
+};
 /* The private info the thread maintains about the guest. */
 struct lguest
 {
@@ -102,6 +110,8 @@ struct lguest
 	unsigned int stack_pages;
 	u32 tsc_khz;
+	struct lg_eventfd_map *eventfds;
 	/* Dead? */
 	const char *dead;
 };
@@ -137,9 +147,13 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
 * in the kernel. */
 #define pgd_flags(x)	(pgd_val(x) & ~PAGE_MASK)
 #define pgd_pfn(x)	(pgd_val(x) >> PAGE_SHIFT)
+#define pmd_flags(x)    (pmd_val(x) & ~PAGE_MASK)
+#define pmd_pfn(x)	(pmd_val(x) >> PAGE_SHIFT)
 /* interrupts_and_traps.c: */
-void maybe_do_interrupt(struct lg_cpu *cpu);
+unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more);
+void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more);
+void set_interrupt(struct lg_cpu *cpu, unsigned int irq);
 bool deliver_trap(struct lg_cpu *cpu, unsigned int num);
 void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int i,
 			  u32 low, u32 hi);
@@ -150,6 +164,7 @@ void setup_default_idt_entries(struct lguest_ro_state *state,
 void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
 		const unsigned long *def);
 void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta);
+bool send_notify_to_eventfd(struct lg_cpu *cpu);
 void init_clockdev(struct lg_cpu *cpu);
 bool check_syscall_vector(struct lguest *lg);
 int init_interrupts(void);
@@ -168,7 +183,10 @@ void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt);
 int init_guest_pagetable(struct lguest *lg);
 void free_guest_pagetable(struct lguest *lg);
 void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable);
+void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 i);
+#ifdef CONFIG_X86_PAE
 void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 i);
+#endif
 void guest_pagetable_clear_all(struct lg_cpu *cpu);
 void guest_pagetable_flush_user(struct lg_cpu *cpu);
 void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,

--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -7,32 +7,83 @@
 #include <linux/miscdevice.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
+#include <linux/eventfd.h>
+#include <linux/file.h>
 #include "lg.h"
-/*L:055 When something happens, the Waker process needs a way to stop the
+bool send_notify_to_eventfd(struct lg_cpu *cpu)
- * kernel running the Guest and return to the Launcher.  So the Waker writes
- * LHREQ_BREAK and the value "1" to /dev/lguest to do this.  Once the Launcher
- * has done whatever needs attention, it writes LHREQ_BREAK and "0" to release
- * the Waker. */
-static int break_guest_out(struct lg_cpu *cpu, const unsigned long __user*input)
 {
-	unsigned long on;
+	unsigned int i;
+	struct lg_eventfd_map *map;
+	/* lg->eventfds is RCU-protected */
+	rcu_read_lock();
+	map = rcu_dereference(cpu->lg->eventfds);
+	for (i = 0; i < map->num; i++) {
+		if (map->map[i].addr == cpu->pending_notify) {
+			eventfd_signal(map->map[i].event, 1);
+			cpu->pending_notify = 0;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return cpu->pending_notify == 0;
+}
-	/* Fetch whether they're turning break on or off. */
+static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
-	if (get_user(on, input) != 0)
+{
-		return -EFAULT;
+	struct lg_eventfd_map *new, *old = lg->eventfds;
-	if (on) {
+	if (!addr)
-		cpu->break_out = 1;
+		return -EINVAL;
-		/* Pop it out of the Guest (may be running on different CPU) */
-		wake_up_process(cpu->tsk);
+	/* Replace the old array with the new one, carefully: others can
-		/* Wait for them to reset it */
+	 * be accessing it at the same time */
-		return wait_event_interruptible(cpu->break_wq, !cpu->break_out);
+	new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1),
-	} else {
+		      GFP_KERNEL);
-		cpu->break_out = 0;
+	if (!new)
-		wake_up(&cpu->break_wq);
+		return -ENOMEM;
-		return 0;
+	/* First make identical copy. */
+	memcpy(new->map, old->map, sizeof(old->map[0]) * old->num);
+	new->num = old->num;
+	/* Now append new entry. */
+	new->map[new->num].addr = addr;
+	new->map[new->num].event = eventfd_fget(fd);
+	if (IS_ERR(new->map[new->num].event)) {
+		kfree(new);
+		return PTR_ERR(new->map[new->num].event);
 	}
+	new->num++;
+	/* Now put new one in place. */
+	rcu_assign_pointer(lg->eventfds, new);
+	/* We're not in a big hurry.  Wait until noone's looking at old
+	 * version, then delete it. */
+	synchronize_rcu();
+	kfree(old);
+	return 0;
+}
+static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
+{
+	unsigned long addr, fd;
+	int err;
+	if (get_user(addr, input) != 0)
+		return -EFAULT;
+	input++;
+	if (get_user(fd, input) != 0)
+		return -EFAULT;
+	mutex_lock(&lguest_lock);
+	err = add_eventfd(lg, addr, fd);
+	mutex_unlock(&lguest_lock);
+	return 0;
 }
 /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
@@ -45,9 +96,8 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
 		return -EFAULT;
 	if (irq >= LGUEST_IRQS)
 		return -EINVAL;
-	/* Next time the Guest runs, the core code will see if it can deliver
-	 * this interrupt. */
+	set_interrupt(cpu, irq);
-	set_bit(irq, cpu->irqs_pending);
 	return 0;
 }
@@ -126,9 +176,6 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
 	 * address. */
 	lguest_arch_setup_regs(cpu, start_ip);
-	/* Initialize the queue for the Waker to wait on */
-	init_waitqueue_head(&cpu->break_wq);
 	/* We keep a pointer to the Launcher task (ie. current task) for when
 	 * other Guests want to wake this one (eg. console input). */
 	cpu->tsk = current;
@@ -185,6 +232,13 @@ static int initialize(struct file *file, const unsigned long __user *input)
 		goto unlock;
 	}
+	lg->eventfds = kmalloc(sizeof(*lg->eventfds), GFP_KERNEL);
+	if (!lg->eventfds) {
+		err = -ENOMEM;
+		goto free_lg;
+	}
+	lg->eventfds->num = 0;
 	/* Populate the easy fields of our "struct lguest" */
 	lg->mem_base = (void __user *)args[0];
 	lg->pfn_limit = args[1];
@@ -192,7 +246,7 @@ static int initialize(struct file *file, const unsigned long __user *input)
 	/* This is the first cpu (cpu 0) and it will start booting at args[2] */
 	err = lg_cpu_start(&lg->cpus[0], 0, args[2]);
 	if (err)
-		goto release_guest;
+		goto free_eventfds;
 	/* Initialize the Guest's shadow page tables, using the toplevel
 	 * address the Launcher gave us.  This allocates memory, so can fail. */
@@ -211,7 +265,9 @@ static int initialize(struct file *file, const unsigned long __user *input)
 free_regs:
 	/* FIXME: This should be in free_vcpu */
 	free_page(lg->cpus[0].regs_page);
-release_guest:
+free_eventfds:
+	kfree(lg->eventfds);
+free_lg:
 	kfree(lg);
 unlock:
 	mutex_unlock(&lguest_lock);
@@ -252,11 +308,6 @@ static ssize_t write(struct file *file, const char __user *in,
 		/* Once the Guest is dead, you can only read() why it died. */
 		if (lg->dead)
 			return -ENOENT;
-		/* If you're not the task which owns the Guest, all you can do
-		 * is break the Launcher out of running the Guest. */
-		if (current != cpu->tsk && req != LHREQ_BREAK)
-			return -EPERM;
 	}
 	switch (req) {
@@ -264,8 +315,8 @@ static ssize_t write(struct file *file, const char __user *in,
 		return initialize(file, input);
 	case LHREQ_IRQ:
 		return user_send_irq(cpu, input);
-	case LHREQ_BREAK:
+	case LHREQ_EVENTFD:
-		return break_guest_out(cpu, input);
+		return attach_eventfd(lg, input);
 	default:
 		return -EINVAL;
 	}
@@ -303,6 +354,12 @@ static int close(struct inode *inode, struct file *file)
 		 * the Launcher's memory management structure. */
 		mmput(lg->cpus[i].mm);
 	}
+	/* Release any eventfds they registered. */
+	for (i = 0; i < lg->eventfds->num; i++)
+		fput(lg->eventfds->map[i].event);
+	kfree(lg->eventfds);
 	/* If lg->dead doesn't contain an error code it will be NULL or a
 	 * kmalloc()ed string, either of which is ok to hand to kfree(). */
 	if (!IS_ERR(lg->dead))

--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
--- a/drivers/lguest/segments.c
+++ b/drivers/lguest/segments.c
@@ -150,7 +150,7 @@ void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi)
 {
 	/* We assume the Guest has the same number of GDT entries as the
 	 * Host, otherwise we'd have to dynamically allocate the Guest GDT. */
-	if (num > ARRAY_SIZE(cpu->arch.gdt))
+	if (num >= ARRAY_SIZE(cpu->arch.gdt))
 		kill_guest(cpu, "too many gdt entries %i", num);
 	/* Set it up, then fix it. */

--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -16,6 +16,7 @@
 #include <linux/anon_inodes.h>
 #include <linux/eventfd.h>
 #include <linux/syscalls.h>
+#include <linux/module.h>
 struct eventfd_ctx {
 	wait_queue_head_t wqh;
@@ -56,6 +57,7 @@ int eventfd_signal(struct file *file, int n)
 	return n;
 }
+EXPORT_SYMBOL_GPL(eventfd_signal);
 static int eventfd_release(struct inode *inode, struct file *file)
 {
@@ -197,6 +199,7 @@ struct file *eventfd_fget(int fd)
 	return file;
 }
+EXPORT_SYMBOL_GPL(eventfd_fget);
 SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
 {

--- a/include/linux/lguest.h
+++ b/include/linux/lguest.h
@@ -30,6 +30,10 @@ struct lguest_data
 	/* Wallclock time set by the Host. */
 	struct timespec time;
+	/* Interrupt pending set by the Host.  The Guest should do a hypercall
+	 * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF). */
+	int irq_pending;
 	/* Async hypercall ring.  Instead of directly making hypercalls, we can
 	 * place them in here for processing the next time the Host wants.
 	 * This batching can be quite efficient. */

--- a/include/linux/lguest_launcher.h
+++ b/include/linux/lguest_launcher.h
@@ -57,7 +57,8 @@ enum lguest_req
 	LHREQ_INITIALIZE, /* + base, pfnlimit, start */
 	LHREQ_GETDMA, /* No longer used */
 	LHREQ_IRQ, /* + irq */
-	LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */
+	LHREQ_BREAK, /* No longer used */
+	LHREQ_EVENTFD, /* + address, fd. */
 };
 /* The alignment to use between consumer and producer parts of vring.

--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2192,6 +2192,7 @@ void kick_process(struct task_struct *p)
 		smp_send_reschedule(cpu);
 	preempt_enable();
 }
+EXPORT_SYMBOL_GPL(kick_process);
 /*
 * Return a low guess at the load of a migration-source cpu weighted