Commit 47436aa4 authored by Rusty Russell's avatar Rusty Russell

Boot with virtual == physical to get closer to native Linux.

1) This allows us to get alot closer to booting bzImages.

2) It means we don't have to know page_offset.

3) The Guest needs to modify the boot pagetables to create the
   PAGE_OFFSET mapping before jumping to C code.

4) guest_pa() walks the page tables rather than using page_offset.

5) We don't use page_offset to figure out whether to emulate: it was
   always kinda quesationable, and won't work for instructions done
   before remapping (bzImage unpacking in particular).

6) We still want the kernel address for tlb flushing: have the initial
   hypercall give us that, too.
Signed-off-by: default avatarRusty Russell <rusty@rustcorp.com.au>
parent c18acd73
This diff is collapsed.
......@@ -136,6 +136,7 @@ void foo(void)
#ifdef CONFIG_LGUEST_GUEST
BLANK();
OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc);
OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3);
......
......@@ -86,6 +86,7 @@ struct lguest_data lguest_data = {
.hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF },
.noirq_start = (u32)lguest_noirq_start,
.noirq_end = (u32)lguest_noirq_end,
.kernel_address = PAGE_OFFSET,
.blocked_interrupts = { 1 }, /* Block timer interrupts */
.syscall_vec = SYSCALL_VECTOR,
};
......@@ -1033,11 +1034,7 @@ __init void lguest_init(void *boot)
/*G:070 Now we've seen all the paravirt_ops, we return to
* lguest_init() where the rest of the fairly chaotic boot setup
* occurs.
*
* The Host expects our first hypercall to tell it where our "struct
* lguest_data" is, so we do that first. */
hcall(LHCALL_LGUEST_INIT, __pa(&lguest_data), 0, 0);
* occurs. */
/* The native boot code sets up initial page tables immediately after
* the kernel itself, and sets init_pg_tables_end so they're not
......
#include <linux/linkage.h>
#include <linux/lguest.h>
#include <asm/lguest_hcall.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/processor-flags.h>
......@@ -8,18 +9,48 @@
* looks for. The plan is that the Linux boot protocol will be extended with a
* "platform type" field which will guide us here from the normal entry point,
* but for the moment this suffices. The normal boot code uses %esi for the
* boot header, so we do too. We convert it to a virtual address by adding
* PAGE_OFFSET, and hand it to lguest_init() as its argument (ie. %eax).
* boot header, so we do too.
*
* WARNING: be very careful here! We're running at addresses equal to physical
* addesses (around 0), not above PAGE_OFFSET as most code expectes
* (eg. 0xC0000000). Jumps are relative, so they're OK, but we can't touch any
* data.
*
* The .section line puts this code in .init.text so it will be discarded after
* boot. */
.section .init.text, "ax", @progbits
.ascii "GenuineLguest"
/* Set up initial stack. */
movl $(init_thread_union+THREAD_SIZE),%esp
/* Make initial hypercall now, so we can set up the pagetables. */
movl $LHCALL_LGUEST_INIT, %eax
movl $lguest_data - __PAGE_OFFSET, %edx
int $LGUEST_TRAP_ENTRY
/* Set up boot information pointer to hand to lguest_init(): it wants
* a virtual address. */
movl %esi, %eax
addl $__PAGE_OFFSET, %eax
jmp lguest_init
/* The Host put the toplevel pagetable in lguest_data.pgdir. The movsl
* instruction uses %esi, so we needed to save it above. */
movl lguest_data - __PAGE_OFFSET + LGUEST_DATA_pgdir, %esi
/* Copy first 32 entries of page directory to __PAGE_OFFSET entries.
* This means the first 128M of kernel memory will be mapped at
* PAGE_OFFSET where the kernel expects to run. This will get it far
* enough through boot to switch to its own pagetables. */
movl $32, %ecx
movl %esi, %edi
addl $((__PAGE_OFFSET >> 22) * 4), %edi
rep
movsl
/* Set up the initial stack so we can run C code. */
movl $(init_thread_union+THREAD_SIZE),%esp
/* Jumps are relative, and we're running __PAGE_OFFSET too low at the
* moment. */
jmp lguest_init+__PAGE_OFFSET
/*G:055 We create a macro which puts the assembler code between lgstart_ and
* lgend_ markers. These templates are put in the .text section: they can't be
......
......@@ -181,15 +181,15 @@ static void initialize(struct lguest *lg)
/* The Guest tells us where we're not to deliver interrupts by putting
* the range of addresses into "struct lguest_data". */
if (get_user(lg->noirq_start, &lg->lguest_data->noirq_start)
|| get_user(lg->noirq_end, &lg->lguest_data->noirq_end)
/* We tell the Guest that it can't use the top 4MB of virtual
* addresses used by the Switcher. */
|| put_user(4U*1024*1024, &lg->lguest_data->reserve_mem))
|| get_user(lg->noirq_end, &lg->lguest_data->noirq_end))
kill_guest(lg, "bad guest page %p", lg->lguest_data);
/* We write the current time into the Guest's data page once now. */
write_timestamp(lg);
/* page_tables.c will also do some setup. */
page_table_guest_data_init(lg);
/* This is the one case where the above accesses might have been the
* first write to a Guest page. This may have caused a copy-on-write
* fault, but the Guest might be referring to the old (read-only)
......
......@@ -62,8 +62,9 @@ static void push_guest_stack(struct lguest *lg, unsigned long *gstack, u32 val)
* it). */
static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
{
unsigned long gstack;
unsigned long gstack, origstack;
u32 eflags, ss, irq_enable;
unsigned long virtstack;
/* There are two cases for interrupts: one where the Guest is already
* in the kernel, and a more complex one where the Guest is in
......@@ -71,8 +72,10 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
if ((lg->regs->ss&0x3) != GUEST_PL) {
/* The Guest told us their kernel stack with the SET_STACK
* hypercall: both the virtual address and the segment */
gstack = guest_pa(lg, lg->esp1);
virtstack = lg->esp1;
ss = lg->ss1;
origstack = gstack = guest_pa(lg, virtstack);
/* We push the old stack segment and pointer onto the new
* stack: when the Guest does an "iret" back from the interrupt
* handler the CPU will notice they're dropping privilege
......@@ -81,8 +84,10 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
push_guest_stack(lg, &gstack, lg->regs->esp);
} else {
/* We're staying on the same Guest (kernel) stack. */
gstack = guest_pa(lg, lg->regs->esp);
virtstack = lg->regs->esp;
ss = lg->regs->ss;
origstack = gstack = guest_pa(lg, virtstack);
}
/* Remember that we never let the Guest actually disable interrupts, so
......@@ -108,7 +113,7 @@ static void set_guest_interrupt(struct lguest *lg, u32 lo, u32 hi, int has_err)
/* Now we've pushed all the old state, we change the stack, the code
* segment and the address to execute. */
lg->regs->ss = ss;
lg->regs->esp = gstack + lg->page_offset;
lg->regs->esp = virtstack + (gstack - origstack);
lg->regs->cs = (__KERNEL_CS|GUEST_PL);
lg->regs->eip = idt_address(lo, hi);
......
......@@ -63,7 +63,7 @@ struct lguest
/* This provides the offset to the base of guest-physical
* memory in the Launcher. */
void __user *mem_base;
u32 page_offset;
unsigned long kernel_address;
u32 cr2;
int halted;
int ts;
......@@ -165,6 +165,8 @@ void guest_set_pte(struct lguest *lg, unsigned long gpgdir,
void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages);
int demand_page(struct lguest *info, unsigned long cr2, int errcode);
void pin_page(struct lguest *lg, unsigned long vaddr);
unsigned long guest_pa(struct lguest *lg, unsigned long vaddr);
void page_table_guest_data_init(struct lguest *lg);
/* <arch>/core.c: */
void lguest_arch_host_init(void);
......@@ -229,9 +231,5 @@ do { \
} while(0)
/* (End of aside) :*/
static inline unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
{
return vaddr - lg->page_offset;
}
#endif /* __ASSEMBLY__ */
#endif /* _LGUEST_H */
......@@ -111,7 +111,7 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
return run_guest(lg, (unsigned long __user *)user);
}
/*L:020 The initialization write supplies 5 pointer sized (32 or 64 bit)
/*L:020 The initialization write supplies 4 pointer sized (32 or 64 bit)
* values (in addition to the LHREQ_INITIALIZE value). These are:
*
* base: The start of the Guest-physical memory inside the Launcher memory.
......@@ -124,12 +124,6 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
* pagetables (which are set up by the Launcher).
*
* start: The first instruction to execute ("eip" in x86-speak).
*
* page_offset: The PAGE_OFFSET constant in the Guest kernel. We should
* probably wean the code off this, but it's a very useful constant! Any
* address above this is within the Guest kernel, and any kernel address can
* quickly converted from physical to virtual by adding PAGE_OFFSET. It's
* 0xC0000000 (3G) by default, but it's configurable at kernel build time.
*/
static int initialize(struct file *file, const unsigned long __user *input)
{
......@@ -137,7 +131,7 @@ static int initialize(struct file *file, const unsigned long __user *input)
* Guest. */
struct lguest *lg;
int err;
unsigned long args[5];
unsigned long args[4];
/* We grab the Big Lguest lock, which protects against multiple
* simultaneous initializations. */
......@@ -162,7 +156,6 @@ static int initialize(struct file *file, const unsigned long __user *input)
/* Populate the easy fields of our "struct lguest" */
lg->mem_base = (void __user *)(long)args[0];
lg->pfn_limit = args[1];
lg->page_offset = args[4];
/* We need a complete page for the Guest registers: they are accessible
* to the Guest and we can only grant it access to whole pages. */
......
......@@ -13,6 +13,7 @@
#include <linux/random.h>
#include <linux/percpu.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#include "lg.h"
/*M:008 We hold reference to pages, which prevents them from being swapped.
......@@ -345,7 +346,7 @@ static void flush_user_mappings(struct lguest *lg, int idx)
{
unsigned int i;
/* Release every pgd entry up to the kernel's address. */
for (i = 0; i < pgd_index(lg->page_offset); i++)
for (i = 0; i < pgd_index(lg->kernel_address); i++)
release_pgd(lg, lg->pgdirs[idx].pgdir + i);
}
......@@ -358,6 +359,25 @@ void guest_pagetable_flush_user(struct lguest *lg)
}
/*:*/
/* We walk down the guest page tables to get a guest-physical address */
unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
{
pgd_t gpgd;
pte_t gpte;
/* First step: get the top-level Guest page table entry. */
gpgd = __pgd(lgread_u32(lg, gpgd_addr(lg, vaddr)));
/* Toplevel not present? We can't map it in. */
if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
kill_guest(lg, "Bad address %#lx", vaddr);
gpte = __pte(lgread_u32(lg, gpte_addr(lg, gpgd, vaddr)));
if (!(pte_flags(gpte) & _PAGE_PRESENT))
kill_guest(lg, "Bad address %#lx", vaddr);
return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
}
/* We keep several page tables. This is a simple routine to find the page
* table (if any) corresponding to this top-level address the Guest has given
* us. */
......@@ -500,7 +520,7 @@ void guest_set_pte(struct lguest *lg,
{
/* Kernel mappings must be changed on all top levels. Slow, but
* doesn't happen often. */
if (vaddr >= lg->page_offset) {
if (vaddr >= lg->kernel_address) {
unsigned int i;
for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
if (lg->pgdirs[i].pgdir)
......@@ -550,11 +570,6 @@ void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx)
* its first page table is. We set some things up here: */
int init_guest_pagetable(struct lguest *lg, unsigned long pgtable)
{
/* In flush_user_mappings() we loop from 0 to
* "pgd_index(lg->page_offset)". This assumes it won't hit
* the Switcher mappings, so check that now. */
if (pgd_index(lg->page_offset) >= SWITCHER_PGD_INDEX)
return -EINVAL;
/* We start on the first shadow page table, and give it a blank PGD
* page. */
lg->pgdidx = 0;
......@@ -565,6 +580,24 @@ int init_guest_pagetable(struct lguest *lg, unsigned long pgtable)
return 0;
}
/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
void page_table_guest_data_init(struct lguest *lg)
{
/* We get the kernel address: above this is all kernel memory. */
if (get_user(lg->kernel_address, &lg->lguest_data->kernel_address)
/* We tell the Guest that it can't use the top 4MB of virtual
* addresses used by the Switcher. */
|| put_user(4U*1024*1024, &lg->lguest_data->reserve_mem)
|| put_user(lg->pgdirs[lg->pgdidx].gpgdir,&lg->lguest_data->pgdir))
kill_guest(lg, "bad guest page %p", lg->lguest_data);
/* In flush_user_mappings() we loop from 0 to
* "pgd_index(lg->kernel_address)". This assumes it won't hit the
* Switcher mappings, so check that now. */
if (pgd_index(lg->kernel_address) >= SWITCHER_PGD_INDEX)
kill_guest(lg, "bad kernel address %#lx", lg->kernel_address);
}
/* When a Guest dies, our cleanup is fairly simple. */
void free_guest_pagetable(struct lguest *lg)
{
......
......@@ -216,9 +216,10 @@ static int emulate_insn(struct lguest *lg)
* guest_pa just subtracts the Guest's page_offset. */
unsigned long physaddr = guest_pa(lg, lg->regs->eip);
/* The guest_pa() function only works for Guest kernel addresses, but
* that's all we're trying to do anyway. */
if (lg->regs->eip < lg->page_offset)
/* This must be the Guest kernel trying to do something, not userspace!
* The bottom two bits of the CS segment register are the privilege
* level. */
if ((lg->regs->cs & 3) != GUEST_PL)
return 0;
/* Decoding x86 instructions is icky. */
......
......@@ -2,8 +2,6 @@
#ifndef _X86_LGUEST_HCALL_H
#define _X86_LGUEST_HCALL_H
#include <asm/hw_irq.h>
#define LHCALL_FLUSH_ASYNC 0
#define LHCALL_LGUEST_INIT 1
#define LHCALL_CRASH 2
......@@ -36,6 +34,9 @@
* definition of a gentleman: "someone who is only rude intentionally". */
#define LGUEST_TRAP_ENTRY 0x1F
#ifndef __ASSEMBLY__
#include <asm/hw_irq.h>
static inline unsigned long
hcall(unsigned long call,
unsigned long arg1, unsigned long arg2, unsigned long arg3)
......@@ -66,4 +67,6 @@ struct hcall_args
/* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */
unsigned long arg0, arg2, arg3, arg1;
};
#endif /* !__ASSEMBLY__ */
#endif /* _I386_LGUEST_HCALL_H */
......@@ -44,11 +44,14 @@ struct lguest_data
unsigned long reserve_mem;
/* KHz for the TSC clock. */
u32 tsc_khz;
/* Page where the top-level pagetable is */
unsigned long pgdir;
/* Fields initialized by the Guest at boot: */
/* Instruction range to suppress interrupts even if enabled */
unsigned long noirq_start, noirq_end;
/* Address above which page tables are all identical. */
unsigned long kernel_address;
/* The vector to try to use for system calls (0x40 or 0x80). */
unsigned int syscall_vec;
};
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment