Commit 4731d4c7 authored by Marcelo Tosatti, committed by Avi Kivity

KVM: MMU: out of sync shadow core

Allow guest pagetables to go out of sync.  Instead of emulating write
accesses to guest pagetables, or unshadowing them, we un-write-protect
the page table and allow the guest to modify it at will.  We rely on
invlpg executions to synchronize individual ptes, and will synchronize
the entire pagetable on tlb flushes.
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
parent 6844dec6
...@@ -147,6 +147,10 @@ struct kvm_shadow_walk { ...@@ -147,6 +147,10 @@ struct kvm_shadow_walk {
u64 addr, u64 *spte, int level); u64 addr, u64 *spte, int level);
}; };
/*
 * Callback descriptor handed to mmu_unsync_walk().  ->entry is invoked for
 * each shadow page the walk finds with its unsync flag set; a non-zero
 * return value stops the walk and is passed back to the walk's caller.
 * Users embed this struct in a larger context struct and recover that
 * context with container_of().
 */
struct kvm_unsync_walk {
int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
};
typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
static struct kmem_cache *pte_chain_cache; static struct kmem_cache *pte_chain_cache;
...@@ -654,8 +658,6 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) ...@@ -654,8 +658,6 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
if (write_protected) if (write_protected)
kvm_flush_remote_tlbs(kvm); kvm_flush_remote_tlbs(kvm);
account_shadowed(kvm, gfn);
} }
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
...@@ -908,6 +910,41 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) ...@@ -908,6 +910,41 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{ {
} }
/*
 * Depth-first walk over the shadow pages referenced from @sp, calling
 * walker->entry() on every child whose ->unsync flag is set.
 *
 * A child's subtree is only descended when the child's ->unsync_children
 * flag is set, and the subtree is handled before the child itself.  The
 * first non-zero value returned by the callback (or by a nested walk)
 * aborts the walk and is returned unchanged; in that case
 * @sp->unsync_children is left set so a later walk can pick the work up
 * again.
 *
 * Returns 0 if @sp has no unsync children, or after a complete pass over
 * all entries, in which case @sp->unsync_children has been cleared.
 */
static int mmu_unsync_walk(struct kvm_mmu_page *sp,
			   struct kvm_unsync_walk *walker)
{
	int idx;

	if (!sp->unsync_children)
		return 0;

	for (idx = 0; idx < PT64_ENT_PER_PAGE; idx++) {
		struct kvm_mmu_page *child;
		u64 spte = sp->spt[idx];
		int err;

		if (!is_shadow_present_pte(spte))
			continue;

		child = page_header(spte & PT64_BASE_ADDR_MASK);

		/* Deal with everything below the child first ... */
		if (child->unsync_children) {
			err = mmu_unsync_walk(child, walker);
			if (err)
				return err;
		}

		/* ... then hand the child itself to the callback. */
		if (child->unsync) {
			err = walker->entry(child, walker);
			if (err)
				return err;
		}
	}

	/*
	 * The loop has no break, so getting here means every slot was
	 * visited without the callback asking to stop.
	 */
	sp->unsync_children = 0;
	return 0;
}
static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
{ {
unsigned index; unsigned index;
...@@ -928,6 +965,59 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) ...@@ -928,6 +965,59 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
return NULL; return NULL;
} }
/*
 * Clear the out-of-sync state of @sp and drop it from the VM-wide
 * mmu_unsync statistic.  Warns if @sp was not marked unsync to begin
 * with; the flag and the counter are updated regardless.
 */
static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	WARN_ON(!sp->unsync);

	kvm->stat.mmu_unsync--;
	sp->unsync = 0;
}
/* Defined further down; needed here because syncing may have to zap. */
static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);

/*
 * Bring the out-of-sync shadow page @sp back in sync on behalf of @vcpu.
 *
 * The page is zapped instead of synced when its guest paging depth
 * (role.glevels) does not match the vcpu's current root level, or when the
 * mmu's ->sync_page() hook returns non-zero.  Otherwise the guest frame is
 * write-protected again before the hook runs, and afterwards the vcpu's
 * TLB is flushed and the unsync flag dropped.
 *
 * Returns 0 if @sp was synced, 1 if it was zapped.
 */
static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
	struct kvm *kvm = vcpu->kvm;

	if (sp->role.glevels != vcpu->arch.mmu.root_level)
		goto zap;

	rmap_write_protect(kvm, sp->gfn);
	if (vcpu->arch.mmu.sync_page(vcpu, sp))
		goto zap;

	kvm_mmu_flush_tlb(vcpu);
	kvm_unlink_unsync_page(kvm, sp);
	return 0;

zap:
	kvm_mmu_zap_page(kvm, sp);
	return 1;
}
/*
 * Walk context for mmu_sync_children(): pairs the generic unsync walker with
 * the vcpu on whose behalf pages are synced.  mmu_sync_fn() gets back to this
 * struct from the embedded walker via container_of().
 */
struct sync_walker {
struct kvm_vcpu *vcpu;
struct kvm_unsync_walk walker;
};
/*
 * mmu_unsync_walk() callback: sync one unsync page for the vcpu stored in
 * the enclosing sync_walker.  The result of kvm_sync_page() is deliberately
 * not propagated.
 *
 * Returns 1 to interrupt the walk when a reschedule is pending or mmu_lock
 * needs to be released for another waiter, 0 to continue.
 */
static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
{
	struct sync_walker *ctx;
	struct kvm_vcpu *vcpu;

	ctx = container_of(walk, struct sync_walker, walker);
	vcpu = ctx->vcpu;

	kvm_sync_page(vcpu, sp);

	if (need_resched())
		return 1;
	return spin_needbreak(&vcpu->kvm->mmu_lock) ? 1 : 0;
}
static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
struct sync_walker walker = {
.walker = { .entry = mmu_sync_fn, },
.vcpu = vcpu,
};
while (mmu_unsync_walk(sp, &walker.walker))
cond_resched_lock(&vcpu->kvm->mmu_lock);
}
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn_t gfn, gfn_t gfn,
gva_t gaddr, gva_t gaddr,
...@@ -941,7 +1031,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, ...@@ -941,7 +1031,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
unsigned quadrant; unsigned quadrant;
struct hlist_head *bucket; struct hlist_head *bucket;
struct kvm_mmu_page *sp; struct kvm_mmu_page *sp;
struct hlist_node *node; struct hlist_node *node, *tmp;
role.word = 0; role.word = 0;
role.glevels = vcpu->arch.mmu.root_level; role.glevels = vcpu->arch.mmu.root_level;
...@@ -957,8 +1047,18 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, ...@@ -957,8 +1047,18 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn, role.word); gfn, role.word);
index = kvm_page_table_hashfn(gfn); index = kvm_page_table_hashfn(gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index]; bucket = &vcpu->kvm->arch.mmu_page_hash[index];
hlist_for_each_entry(sp, node, bucket, hash_link) hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
if (sp->gfn == gfn && sp->role.word == role.word) { if (sp->gfn == gfn) {
if (sp->unsync)
if (kvm_sync_page(vcpu, sp))
continue;
if (sp->role.word != role.word)
continue;
if (sp->unsync_children)
set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
mmu_page_add_parent_pte(vcpu, sp, parent_pte); mmu_page_add_parent_pte(vcpu, sp, parent_pte);
pgprintk("%s: found\n", __func__); pgprintk("%s: found\n", __func__);
return sp; return sp;
...@@ -971,8 +1071,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, ...@@ -971,8 +1071,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
sp->gfn = gfn; sp->gfn = gfn;
sp->role = role; sp->role = role;
hlist_add_head(&sp->hash_link, bucket); hlist_add_head(&sp->hash_link, bucket);
if (!metaphysical) if (!metaphysical) {
rmap_write_protect(vcpu->kvm, gfn); rmap_write_protect(vcpu->kvm, gfn);
account_shadowed(vcpu->kvm, gfn);
}
if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
vcpu->arch.mmu.prefetch_page(vcpu, sp); vcpu->arch.mmu.prefetch_page(vcpu, sp);
else else
...@@ -1078,14 +1180,47 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) ...@@ -1078,14 +1180,47 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
} }
} }
/*
 * Walk context for mmu_zap_unsync_children(): the generic unsync walker, the
 * VM whose pages are being zapped, and a flag that mmu_zap_fn() sets to 1
 * once it has zapped at least one page.
 */
struct zap_walker {
struct kvm_unsync_walk walker;
struct kvm *kvm;
int zapped;
};
/*
 * mmu_unsync_walk() callback: zap the unsync page @sp and record in the
 * enclosing zap_walker that something was zapped.  Always returns 0, so the
 * walk is never cut short.
 */
static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
{
	struct zap_walker *ctx;

	ctx = container_of(walk, struct zap_walker, walker);

	kvm_mmu_zap_page(ctx->kvm, sp);
	ctx->zapped = 1;

	return 0;
}
static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp)
{
struct zap_walker walker = {
.walker = { .entry = mmu_zap_fn, },
.kvm = kvm,
.zapped = 0,
};
if (sp->role.level == PT_PAGE_TABLE_LEVEL)
return 0;
mmu_unsync_walk(sp, &walker.walker);
return walker.zapped;
}
static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{ {
int ret;
++kvm->stat.mmu_shadow_zapped; ++kvm->stat.mmu_shadow_zapped;
ret = mmu_zap_unsync_children(kvm, sp);
kvm_mmu_page_unlink_children(kvm, sp); kvm_mmu_page_unlink_children(kvm, sp);
kvm_mmu_unlink_parents(kvm, sp); kvm_mmu_unlink_parents(kvm, sp);
kvm_flush_remote_tlbs(kvm); kvm_flush_remote_tlbs(kvm);
if (!sp->role.invalid && !sp->role.metaphysical) if (!sp->role.invalid && !sp->role.metaphysical)
unaccount_shadowed(kvm, sp->gfn); unaccount_shadowed(kvm, sp->gfn);
if (sp->unsync)
kvm_unlink_unsync_page(kvm, sp);
if (!sp->root_count) { if (!sp->root_count) {
hlist_del(&sp->hash_link); hlist_del(&sp->hash_link);
kvm_mmu_free_page(kvm, sp); kvm_mmu_free_page(kvm, sp);
...@@ -1095,7 +1230,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) ...@@ -1095,7 +1230,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_reload_remote_mmus(kvm); kvm_reload_remote_mmus(kvm);
} }
kvm_mmu_reset_last_pte_updated(kvm); kvm_mmu_reset_last_pte_updated(kvm);
return 0; return ret;
} }
/* /*
...@@ -1201,10 +1336,58 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) ...@@ -1201,10 +1336,58 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
return page; return page;
} }
/*
 * mmu_parent_walk() callback used by kvm_unsync_page(): flags @sp as having
 * at least one out-of-sync page beneath it, which is what makes
 * mmu_unsync_walk() descend into it.  @vcpu is unused.
 * NOTE(review): the constant return of 1 presumably tells mmu_parent_walk()
 * to keep climbing the parent chain -- confirm against mmu_parent_walk().
 */
static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
sp->unsync_children = 1;
return 1;
}
static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
unsigned index;
struct hlist_head *bucket;
struct kvm_mmu_page *s;
struct hlist_node *node, *n;
index = kvm_page_table_hashfn(sp->gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
/* don't unsync if pagetable is shadowed with multiple roles */
hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
if (s->gfn != sp->gfn || s->role.metaphysical)
continue;
if (s->role.word != sp->role.word)
return 1;
}
mmu_parent_walk(vcpu, sp, unsync_walk_fn);
++vcpu->kvm->stat.mmu_unsync;
sp->unsync = 1;
mmu_convert_notrap(sp);
return 0;
}
/*
 * Decide whether a mapping of guest frame @gfn has to stay read-only.
 *
 *  - no shadow page for @gfn:                     writable (0)
 *  - shadow page above PT_PAGE_TABLE_LEVEL:       read-only (1)
 *  - last-level shadow page already unsync:       writable (0)
 *  - last-level, in sync, @can_unsync set:        result of kvm_unsync_page()
 *  - last-level, in sync, @can_unsync clear:      read-only (1)
 *
 * Returns non-zero if the caller must write-protect the mapping.
 */
static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
				  bool can_unsync)
{
	struct kvm_mmu_page *shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);

	if (!shadow)
		return 0;

	if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
		return 1;

	if (shadow->unsync)
		return 0;

	if (!can_unsync)
		return 1;

	return kvm_unsync_page(vcpu, shadow);
}
static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
unsigned pte_access, int user_fault, unsigned pte_access, int user_fault,
int write_fault, int dirty, int largepage, int write_fault, int dirty, int largepage,
gfn_t gfn, pfn_t pfn, bool speculative) gfn_t gfn, pfn_t pfn, bool speculative,
bool can_unsync)
{ {
u64 spte; u64 spte;
int ret = 0; int ret = 0;
...@@ -1231,7 +1414,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, ...@@ -1231,7 +1414,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
if ((pte_access & ACC_WRITE_MASK) if ((pte_access & ACC_WRITE_MASK)
|| (write_fault && !is_write_protection(vcpu) && !user_fault)) { || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
struct kvm_mmu_page *shadow;
if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) { if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
ret = 1; ret = 1;
...@@ -1241,8 +1423,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, ...@@ -1241,8 +1423,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
spte |= PT_WRITABLE_MASK; spte |= PT_WRITABLE_MASK;
shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
if (shadow) {
pgprintk("%s: found shadow page for %lx, marking ro\n", pgprintk("%s: found shadow page for %lx, marking ro\n",
__func__, gfn); __func__, gfn);
ret = 1; ret = 1;
...@@ -1260,7 +1441,6 @@ set_pte: ...@@ -1260,7 +1441,6 @@ set_pte:
return ret; return ret;
} }
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
unsigned pt_access, unsigned pte_access, unsigned pt_access, unsigned pte_access,
int user_fault, int write_fault, int dirty, int user_fault, int write_fault, int dirty,
...@@ -1298,7 +1478,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, ...@@ -1298,7 +1478,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
} }
} }
if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
dirty, largepage, gfn, pfn, speculative)) { dirty, largepage, gfn, pfn, speculative, true)) {
if (write_fault) if (write_fault)
*ptwrite = 1; *ptwrite = 1;
kvm_x86_ops->tlb_flush(vcpu); kvm_x86_ops->tlb_flush(vcpu);
...@@ -1518,10 +1698,6 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) ...@@ -1518,10 +1698,6 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
} }
static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
}
static void mmu_sync_roots(struct kvm_vcpu *vcpu) static void mmu_sync_roots(struct kvm_vcpu *vcpu)
{ {
int i; int i;
......
...@@ -580,7 +580,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) ...@@ -580,7 +580,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
is_dirty_pte(gpte), 0, gfn, is_dirty_pte(gpte), 0, gfn,
spte_to_pfn(sp->spt[i]), true); spte_to_pfn(sp->spt[i]), true, false);
} }
return !nr_present; return !nr_present;
......
...@@ -101,6 +101,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { ...@@ -101,6 +101,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "mmu_flooded", VM_STAT(mmu_flooded) }, { "mmu_flooded", VM_STAT(mmu_flooded) },
{ "mmu_recycled", VM_STAT(mmu_recycled) }, { "mmu_recycled", VM_STAT(mmu_recycled) },
{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
{ "mmu_unsync", VM_STAT(mmu_unsync) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
{ "largepages", VM_STAT(lpages) }, { "largepages", VM_STAT(lpages) },
{ NULL } { NULL }
...@@ -3120,6 +3121,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ...@@ -3120,6 +3121,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (vcpu->requests) { if (vcpu->requests) {
if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
__kvm_migrate_timers(vcpu); __kvm_migrate_timers(vcpu);
if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
kvm_mmu_sync_roots(vcpu);
if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
kvm_x86_ops->tlb_flush(vcpu); kvm_x86_ops->tlb_flush(vcpu);
if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
......
...@@ -195,6 +195,8 @@ struct kvm_mmu_page { ...@@ -195,6 +195,8 @@ struct kvm_mmu_page {
*/ */
int multimapped; /* More than one parent_pte? */ int multimapped; /* More than one parent_pte? */
int root_count; /* Currently serving as active root */ int root_count; /* Currently serving as active root */
bool unsync;
bool unsync_children;
union { union {
u64 *parent_pte; /* !multimapped */ u64 *parent_pte; /* !multimapped */
struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
...@@ -371,6 +373,7 @@ struct kvm_vm_stat { ...@@ -371,6 +373,7 @@ struct kvm_vm_stat {
u32 mmu_flooded; u32 mmu_flooded;
u32 mmu_recycled; u32 mmu_recycled;
u32 mmu_cache_miss; u32 mmu_cache_miss;
u32 mmu_unsync;
u32 remote_tlb_flush; u32 remote_tlb_flush;
u32 lpages; u32 lpages;
}; };
......
...@@ -35,6 +35,7 @@ ...@@ -35,6 +35,7 @@
#define KVM_REQ_TRIPLE_FAULT 4 #define KVM_REQ_TRIPLE_FAULT 4
#define KVM_REQ_PENDING_TIMER 5 #define KVM_REQ_PENDING_TIMER 5
#define KVM_REQ_UNHALT 6 #define KVM_REQ_UNHALT 6
#define KVM_REQ_MMU_SYNC 7
struct kvm_vcpu; struct kvm_vcpu;
extern struct kmem_cache *kvm_vcpu_cache; extern struct kmem_cache *kvm_vcpu_cache;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment