KVM: PPC: Book3S HV: Make sure we don't miss dirty pages
[cascardo/linux.git] / arch / powerpc / kvm / book3s_64_mmu_hv.c
index 96c9044..8056107 100644 (file)
@@ -1060,6 +1060,11 @@ void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte)
        kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
 }
 
+static int vcpus_running(struct kvm *kvm)
+{
+       return !!atomic_read(&kvm->arch.vcpus_running);
+}
+
 /*
  * Returns the number of system pages that are dirty.
  * This can be more than 1 if we find a huge-page HPTE.
@@ -1069,6 +1074,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
        struct revmap_entry *rev = kvm->arch.revmap;
        unsigned long head, i, j;
        unsigned long n;
+       unsigned long v, r;
        unsigned long *hptep;
        int npages_dirty = 0;
 
@@ -1088,7 +1094,22 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
                hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4));
                j = rev[i].forw;
 
-               if (!(hptep[1] & HPTE_R_C))
+               /*
+                * Checking the C (changed) bit here is racy since there
+                * is no guarantee about when the hardware writes it back.
+                * If the HPTE is not writable then it is stable since the
+                * page can't be written to, and we would have done a tlbie
+                * (which forces the hardware to complete any writeback)
+                * when making the HPTE read-only.
+                * If vcpus are running then this call is racy anyway
+                * since the page could get dirtied subsequently, so we
+                * expect there to be a further call which would pick up
+                * any delayed C bit writeback.
+                * Otherwise we need to do the tlbie even if C==0 in
+                * order to pick up any delayed writeback of C.
+                */
+               if (!(hptep[1] & HPTE_R_C) &&
+                   (!hpte_is_writable(hptep[1]) || vcpus_running(kvm)))
                        continue;
 
                if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
@@ -1100,23 +1121,32 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
                }
 
                /* Now check and modify the HPTE */
-               if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_C)) {
-                       /* need to make it temporarily absent to clear C */
-                       hptep[0] |= HPTE_V_ABSENT;
-                       kvmppc_invalidate_hpte(kvm, hptep, i);
-                       hptep[1] &= ~HPTE_R_C;
-                       eieio();
-                       hptep[0] = (hptep[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
+               if (!(hptep[0] & HPTE_V_VALID)) {
+                       /* drop the HVLOCK taken above before skipping */
+                       hptep[0] &= ~HPTE_V_HVLOCK;
+                       continue;
+               }
+
+               /* need to make it temporarily absent so C is stable */
+               hptep[0] |= HPTE_V_ABSENT;
+               kvmppc_invalidate_hpte(kvm, hptep, i);
+               v = hptep[0];
+               r = hptep[1];
+               if (r & HPTE_R_C) {
+                       hptep[1] = r & ~HPTE_R_C;
                        if (!(rev[i].guest_rpte & HPTE_R_C)) {
                                rev[i].guest_rpte |= HPTE_R_C;
                                note_hpte_modification(kvm, &rev[i]);
                        }
-                       n = hpte_page_size(hptep[0], hptep[1]);
+                       n = hpte_page_size(v, r);
                        n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT;
                        if (n > npages_dirty)
                                npages_dirty = n;
+                       eieio();
                }
-               hptep[0] &= ~HPTE_V_HVLOCK;
+               v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK);
+               v |= HPTE_V_VALID;
+               hptep[0] = v;
        } while ((i = j) != head);
 
        unlock_rmap(rmapp);