hlist: drop the node parameter from iterators
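For context, the mmu.c hunks below track the hlist API change in which hlist_for_each_entry() and its variants stopped taking a separate 'struct hlist_node *' cursor and now derive the node from the entry pointer itself. A minimal before/after sketch of a caller is shown here; the 'head' and 'visit()' names are illustrative only and do not come from this file:

        struct kvm_mmu_page *sp;
        struct hlist_node *node;        /* old API: extra cursor variable */

        /* Old iterator: the caller supplies the hlist_node cursor. */
        hlist_for_each_entry(sp, node, head, hash_link)
                visit(sp);

        /* New iterator: the node is recovered from 'sp' (container_of-style),
         * so both the local variable and the macro parameter go away. */
        hlist_for_each_entry(sp, head, hash_link)
                visit(sp);

This is why the for_each_gfn_sp()/for_each_gfn_indirect_valid_sp() macros lose their 'pos' argument and the 'struct hlist_node *node' locals are deleted throughout the file.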
[cascardo/linux.git] / arch/x86/kvm/mmu.c
index 01d7c2a..956ca35 100644
@@ -448,7 +448,8 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
 
 static bool spte_is_locklessly_modifiable(u64 spte)
 {
-       return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
+       return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
+               (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
 }
 
 static bool spte_has_volatile_bits(u64 spte)
@@ -831,8 +832,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
        if (host_level == PT_PAGE_TABLE_LEVEL)
                return host_level;
 
-       max_level = kvm_x86_ops->get_lpage_level() < host_level ?
-               kvm_x86_ops->get_lpage_level() : host_level;
+       max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
 
        for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
                if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
@@ -1142,7 +1142,7 @@ spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
 }
 
 static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
-                                int level, bool pt_protect)
+                                bool pt_protect)
 {
        u64 *sptep;
        struct rmap_iterator iter;
@@ -1180,7 +1180,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
        while (mask) {
                rmapp = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
                                      PT_PAGE_TABLE_LEVEL, slot);
-               __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);
+               __rmap_write_protect(kvm, rmapp, false);
 
                /* clear the first set bit */
                mask &= mask - 1;
@@ -1199,7 +1199,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
        for (i = PT_PAGE_TABLE_LEVEL;
             i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
                rmapp = __gfn_to_rmap(gfn, i, slot);
-               write_protected |= __rmap_write_protect(kvm, rmapp, i, true);
+               write_protected |= __rmap_write_protect(kvm, rmapp, true);
        }
 
        return write_protected;
@@ -1460,28 +1460,14 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
        percpu_counter_add(&kvm_total_used_mmu_pages, nr);
 }
 
-/*
- * Remove the sp from shadow page cache, after call it,
- * we can not find this sp from the cache, and the shadow
- * page table is still valid.
- * It should be under the protection of mmu lock.
- */
-static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
+static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
 {
        ASSERT(is_empty_shadow_page(sp->spt));
        hlist_del(&sp->hash_link);
-       if (!sp->role.direct)
-               free_page((unsigned long)sp->gfns);
-}
-
-/*
- * Free the shadow page table and the sp, we can do it
- * out of the protection of mmu lock.
- */
-static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
-{
        list_del(&sp->link);
        free_page((unsigned long)sp->spt);
+       if (!sp->role.direct)
+               free_page((unsigned long)sp->gfns);
        kmem_cache_free(mmu_page_header_cache, sp);
 }
 
@@ -1522,7 +1508,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
                sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
        set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
        list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
-       bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM);
        sp->parent_ptes = 0;
        mmu_page_add_parent_pte(vcpu, sp, parent_pte);
        kvm_mod_used_mmu_pages(vcpu->kvm, +1);
@@ -1659,13 +1644,13 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
                                    struct list_head *invalid_list);
 
-#define for_each_gfn_sp(kvm, sp, gfn, pos)                             \
-  hlist_for_each_entry(sp, pos,                                                \
+#define for_each_gfn_sp(kvm, sp, gfn)                                  \
+  hlist_for_each_entry(sp,                                             \
    &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)  \
        if ((sp)->gfn != (gfn)) {} else
 
-#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos)              \
-  hlist_for_each_entry(sp, pos,                                                \
+#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn)                   \
+  hlist_for_each_entry(sp,                                             \
    &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)  \
                if ((sp)->gfn != (gfn) || (sp)->role.direct ||          \
                        (sp)->role.invalid) {} else
@@ -1721,11 +1706,10 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
 {
        struct kvm_mmu_page *s;
-       struct hlist_node *node;
        LIST_HEAD(invalid_list);
        bool flush = false;
 
-       for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+       for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
                if (!s->unsync)
                        continue;
 
@@ -1863,7 +1847,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
        union kvm_mmu_page_role role;
        unsigned quadrant;
        struct kvm_mmu_page *sp;
-       struct hlist_node *node;
        bool need_sync = false;
 
        role = vcpu->arch.mmu.base_role;
@@ -1878,7 +1861,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
                quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
                role.quadrant = quadrant;
        }
-       for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
+       for_each_gfn_sp(vcpu->kvm, sp, gfn) {
                if (!need_sync && sp->unsync)
                        need_sync = true;
 
@@ -1973,9 +1956,9 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
 {
        u64 spte;
 
-       spte = __pa(sp->spt)
-               | PT_PRESENT_MASK | PT_ACCESSED_MASK
-               | PT_WRITABLE_MASK | PT_USER_MASK;
+       spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
+              shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
+
        mmu_spte_set(sptep, spte);
 }
 
@@ -2126,7 +2109,6 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
        do {
                sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
                WARN_ON(!sp->role.invalid || sp->root_count);
-               kvm_mmu_isolate_page(sp);
                kvm_mmu_free_page(sp);
        } while (!list_empty(invalid_list));
 }
@@ -2144,6 +2126,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
         * change the value
         */
 
+       spin_lock(&kvm->mmu_lock);
+
        if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
                while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
                        !list_empty(&kvm->arch.active_mmu_pages)) {
@@ -2158,19 +2142,20 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
        }
 
        kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
+
+       spin_unlock(&kvm->mmu_lock);
 }
 
 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 {
        struct kvm_mmu_page *sp;
-       struct hlist_node *node;
        LIST_HEAD(invalid_list);
        int r;
 
        pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
        r = 0;
        spin_lock(&kvm->mmu_lock);
-       for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
+       for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
                pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
                         sp->role.word);
                r = 1;
@@ -2183,14 +2168,6 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
 
-static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
-{
-       int slot = memslot_id(kvm, gfn);
-       struct kvm_mmu_page *sp = page_header(__pa(pte));
-
-       __set_bit(slot, sp->slot_bitmap);
-}
-
 /*
  * The function is based on mtrr_type_lookup() in
  * arch/x86/kernel/cpu/mtrr/generic.c
@@ -2308,9 +2285,8 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
 {
        struct kvm_mmu_page *s;
-       struct hlist_node *node;
 
-       for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+       for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
                if (s->unsync)
                        continue;
                WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
@@ -2322,19 +2298,17 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
                                  bool can_unsync)
 {
        struct kvm_mmu_page *s;
-       struct hlist_node *node;
        bool need_unsync = false;
 
-       for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+       for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
                if (!can_unsync)
                        return 1;
 
                if (s->role.level != PT_PAGE_TABLE_LEVEL)
                        return 1;
 
-               if (!need_unsync && !s->unsync) {
+               if (!s->unsync)
                        need_unsync = true;
-               }
        }
        if (need_unsync)
                kvm_unsync_pages(vcpu, gfn);
@@ -2342,8 +2316,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 }
 
 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
-                   unsigned pte_access, int user_fault,
-                   int write_fault, int level,
+                   unsigned pte_access, int level,
                    gfn_t gfn, pfn_t pfn, bool speculative,
                    bool can_unsync, bool host_writable)
 {
@@ -2378,20 +2351,13 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
        spte |= (u64)pfn << PAGE_SHIFT;
 
-       if ((pte_access & ACC_WRITE_MASK)
-           || (!vcpu->arch.mmu.direct_map && write_fault
-               && !is_write_protection(vcpu) && !user_fault)) {
+       if (pte_access & ACC_WRITE_MASK) {
 
                /*
-                * There are two cases:
-                * - the one is other vcpu creates new sp in the window
-                *   between mapping_level() and acquiring mmu-lock.
-                * - the another case is the new sp is created by itself
-                *   (page-fault path) when guest uses the target gfn as
-                *   its page table.
-                * Both of these cases can be fixed by allowing guest to
-                * retry the access, it will refault, then we can establish
-                * the mapping by using small page.
+                * Other vcpu creates new sp in the window between
+                * mapping_level() and acquiring mmu-lock. We can
+                * allow guest to retry the access, the mapping can
+                * be fixed if guest refault.
                 */
                if (level > PT_PAGE_TABLE_LEVEL &&
                    has_wrprotected_page(vcpu->kvm, gfn, level))
@@ -2399,19 +2365,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
                spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
 
-               if (!vcpu->arch.mmu.direct_map
-                   && !(pte_access & ACC_WRITE_MASK)) {
-                       spte &= ~PT_USER_MASK;
-                       /*
-                        * If we converted a user page to a kernel page,
-                        * so that the kernel can write to it when cr0.wp=0,
-                        * then we should prevent the kernel from executing it
-                        * if SMEP is enabled.
-                        */
-                       if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
-                               spte |= PT64_NX_MASK;
-               }
-
                /*
                 * Optimization: for pte sync, if spte was writable the hash
                 * lookup is unnecessary (and expensive). Write protection
@@ -2441,19 +2394,15 @@ done:
 }
 
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
-                        unsigned pt_access, unsigned pte_access,
-                        int user_fault, int write_fault,
-                        int *emulate, int level, gfn_t gfn,
-                        pfn_t pfn, bool speculative,
+                        unsigned pte_access, int write_fault, int *emulate,
+                        int level, gfn_t gfn, pfn_t pfn, bool speculative,
                         bool host_writable)
 {
        int was_rmapped = 0;
        int rmap_count;
 
-       pgprintk("%s: spte %llx access %x write_fault %d"
-                " user_fault %d gfn %llx\n",
-                __func__, *sptep, pt_access,
-                write_fault, user_fault, gfn);
+       pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
+                *sptep, write_fault, gfn);
 
        if (is_rmap_spte(*sptep)) {
                /*
@@ -2477,9 +2426,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                        was_rmapped = 1;
        }
 
-       if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
-                     level, gfn, pfn, speculative, true,
-                     host_writable)) {
+       if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
+             true, host_writable)) {
                if (write_fault)
                        *emulate = 1;
                kvm_mmu_flush_tlb(vcpu);
@@ -2497,7 +2445,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                ++vcpu->kvm->stat.lpages;
 
        if (is_shadow_present_pte(*sptep)) {
-               page_header_update_slot(vcpu->kvm, sptep, gfn);
                if (!was_rmapped) {
                        rmap_count = rmap_add(vcpu, sptep, gfn);
                        if (rmap_count > RMAP_RECYCLE_THRESHOLD)
@@ -2571,10 +2518,9 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
                return -1;
 
        for (i = 0; i < ret; i++, gfn++, start++)
-               mmu_set_spte(vcpu, start, ACC_ALL,
-                            access, 0, 0, NULL,
-                            sp->role.level, gfn,
-                            page_to_pfn(pages[i]), true, true);
+               mmu_set_spte(vcpu, start, access, 0, NULL,
+                            sp->role.level, gfn, page_to_pfn(pages[i]),
+                            true, true);
 
        return 0;
 }
@@ -2633,11 +2579,9 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 
        for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
                if (iterator.level == level) {
-                       unsigned pte_access = ACC_ALL;
-
-                       mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
-                                    0, write, &emulate,
-                                    level, gfn, pfn, prefault, map_writable);
+                       mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
+                                    write, &emulate, level, gfn, pfn,
+                                    prefault, map_writable);
                        direct_pte_prefetch(vcpu, iterator.sptep);
                        ++vcpu->stat.pf_fixed;
                        break;
@@ -2652,11 +2596,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
                                              iterator.level - 1,
                                              1, ACC_ALL, iterator.sptep);
 
-                       mmu_spte_set(iterator.sptep,
-                                    __pa(sp->spt)
-                                    | PT_PRESENT_MASK | PT_WRITABLE_MASK
-                                    | shadow_user_mask | shadow_x_mask
-                                    | shadow_accessed_mask);
+                       link_shadow_page(iterator.sptep, sp);
                }
        }
        return emulate;
@@ -3719,6 +3659,7 @@ int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
        else
                r = paging32_init_context(vcpu, context);
 
+       vcpu->arch.mmu.base_role.nxe = is_nx(vcpu);
        vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
        vcpu->arch.mmu.base_role.cr0_wp  = is_write_protection(vcpu);
        vcpu->arch.mmu.base_role.smep_andnot_wp
@@ -3885,7 +3826,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
                /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
                *gpa &= ~(gpa_t)7;
                *bytes = 8;
-               r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, min(*bytes, 8));
+               r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, 8);
                if (r)
                        gentry = 0;
                new = (const u8 *)&gentry;
@@ -3987,7 +3928,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
        gfn_t gfn = gpa >> PAGE_SHIFT;
        union kvm_mmu_page_role mask = { .word = 0 };
        struct kvm_mmu_page *sp;
-       struct hlist_node *node;
        LIST_HEAD(invalid_list);
        u64 entry, gentry, *spte;
        int npte;
@@ -4018,7 +3958,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
        kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
 
        mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
-       for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
+       for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
                if (detect_write_misaligned(sp, gpa, bytes) ||
                      detect_write_flooding(sp)) {
                        zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
@@ -4039,7 +3979,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                              !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
                              & mask.word) && rmap_can_add(vcpu))
                                mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
-                       if (!remote_flush && need_remote_flush(entry, *spte))
+                       if (need_remote_flush(entry, *spte))
                                remote_flush = true;
                        ++spte;
                }
@@ -4198,26 +4138,36 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
 
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 {
-       struct kvm_mmu_page *sp;
-       bool flush = false;
+       struct kvm_memory_slot *memslot;
+       gfn_t last_gfn;
+       int i;
 
-       list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
-               int i;
-               u64 *pt;
+       memslot = id_to_memslot(kvm->memslots, slot);
+       last_gfn = memslot->base_gfn + memslot->npages - 1;
 
-               if (!test_bit(slot, sp->slot_bitmap))
-                       continue;
+       spin_lock(&kvm->mmu_lock);
 
-               pt = sp->spt;
-               for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
-                       if (!is_shadow_present_pte(pt[i]) ||
-                             !is_last_spte(pt[i], sp->role.level))
-                               continue;
+       for (i = PT_PAGE_TABLE_LEVEL;
+            i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
+               unsigned long *rmapp;
+               unsigned long last_index, index;
 
-                       spte_write_protect(kvm, &pt[i], &flush, false);
+               rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL];
+               last_index = gfn_to_index(last_gfn, memslot->base_gfn, i);
+
+               for (index = 0; index <= last_index; ++index, ++rmapp) {
+                       if (*rmapp)
+                               __rmap_write_protect(kvm, rmapp, false);
+
+                       if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+                               kvm_flush_remote_tlbs(kvm);
+                               cond_resched_lock(&kvm->mmu_lock);
+                       }
                }
        }
+
        kvm_flush_remote_tlbs(kvm);
+       spin_unlock(&kvm->mmu_lock);
 }
 
 void kvm_mmu_zap_all(struct kvm *kvm)