diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 96c90447d4bf71f0d20f0ff832958f7bab545402..80561074078d01ca8d101f7997a55699e8589e3f 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -1060,6 +1060,11 @@ void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte) kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); } +static int vcpus_running(struct kvm *kvm) +{ + return atomic_read(&kvm->arch.vcpus_running) != 0; +} + /* * Returns the number of system pages that are dirty. * This can be more than 1 if we find a huge-page HPTE. @@ -1069,6 +1074,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) struct revmap_entry *rev = kvm->arch.revmap; unsigned long head, i, j; unsigned long n; + unsigned long v, r; unsigned long *hptep; int npages_dirty = 0; @@ -1088,7 +1094,22 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) hptep = (unsigned long *) (kvm->arch.hpt_virt + (i << 4)); j = rev[i].forw; - if (!(hptep[1] & HPTE_R_C)) + /* + * Checking the C (changed) bit here is racy since there + * is no guarantee about when the hardware writes it back. + * If the HPTE is not writable then it is stable since the + * page can't be written to, and we would have done a tlbie + * (which forces the hardware to complete any writeback) + * when making the HPTE read-only. + * If vcpus are running then this call is racy anyway + * since the page could get dirtied subsequently, so we + * expect there to be a further call which would pick up + * any delayed C bit writeback. + * Otherwise we need to do the tlbie even if C==0 in + * order to pick up any delayed writeback of C. + */ + if (!(hptep[1] & HPTE_R_C) && + (!hpte_is_writable(hptep[1]) || vcpus_running(kvm))) continue; if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { @@ -1100,23 +1121,29 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) } /* Now check and modify the HPTE */ - if ((hptep[0] & HPTE_V_VALID) && (hptep[1] & HPTE_R_C)) { - /* need to make it temporarily absent to clear C */ - hptep[0] |= HPTE_V_ABSENT; - kvmppc_invalidate_hpte(kvm, hptep, i); - hptep[1] &= ~HPTE_R_C; - eieio(); - hptep[0] = (hptep[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; + if (!(hptep[0] & HPTE_V_VALID)) + continue; + + /* need to make it temporarily absent so C is stable */ + hptep[0] |= HPTE_V_ABSENT; + kvmppc_invalidate_hpte(kvm, hptep, i); + v = hptep[0]; + r = hptep[1]; + if (r & HPTE_R_C) { + hptep[1] = r & ~HPTE_R_C; if (!(rev[i].guest_rpte & HPTE_R_C)) { rev[i].guest_rpte |= HPTE_R_C; note_hpte_modification(kvm, &rev[i]); } - n = hpte_page_size(hptep[0], hptep[1]); + n = hpte_page_size(v, r); n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT; if (n > npages_dirty) npages_dirty = n; + eieio(); } - hptep[0] &= ~HPTE_V_HVLOCK; + v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK); + v |= HPTE_V_VALID; + hptep[0] = v; } while ((i = j) != head); unlock_rmap(rmapp);