Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM fixes from Paolo Bonzini: "PPC: - user-triggerable use-after-free in HPT resizing - stale TLB entries in the guest - trap-and-emulate (PR) KVM guests failing to start under pHyp x86: - Another "Spectre" fix. - async pagefault fix - Revert an old fix for x86 nested virtualization, which turned out to do more harm than good - Check shrinker registration return code, to avoid warnings from upcoming 4.16 -mm patches" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: KVM: x86: Add memory barrier on vmcs field lookup KVM: x86: emulate #UD while in guest mode x86: kvm: propagate register_shrinker return code KVM MMU: check pending exception before injecting APF KVM: PPC: Book3S HV: Always flush TLB in kvmppc_alloc_reset_hpt() KVM: PPC: Book3S PR: Fix WIMG handling under pHyp KVM: PPC: Book3S HV: Fix use after free in case of multiple resize requests KVM: PPC: Book3S HV: Drop prepare_done from struct kvm_resize_hpt

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM fixes from Paolo Bonzini: "PPC: - user-triggerable use-after-free in HPT resizing - stale TLB entries in the guest - trap-and-emulate (PR) KVM guests failing to start under pHyp x86: - Another "Spectre" fix. - async pagefault fix - Revert an old fix for x86 nested virtualization, which turned out to do more harm than good - Check shrinker registration return code, to avoid warnings from upcoming 4.16 -mm patches" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: KVM: x86: Add memory barrier on vmcs field lookup KVM: x86: emulate #UD while in guest mode x86: kvm: propagate register_shrinker return code KVM MMU: check pending exception before injecting APF KVM: PPC: Book3S HV: Always flush TLB in kvmppc_alloc_reset_hpt() KVM: PPC: Book3S PR: Fix WIMG handling under pHyp KVM: PPC: Book3S HV: Fix use after free in case of multiple resize requests KVM: PPC: Book3S HV: Drop prepare_done from struct kvm_resize_hpt
4e67483a · Linus Torvalds · 70cd9071 · 2aad9b3e · 4e67483a · 4e67483a
6 changed file
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -235,6 +235,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 		gpte->may_read = true;
 		gpte->may_write = true;
 		gpte->page_size = MMU_PAGE_4K;
+		gpte->wimg = HPTE_R_M;

 		return 0;
 	}

--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -65,11 +65,17 @@ struct kvm_resize_hpt {
 	u32 order;

 	/* These fields protected by kvm->lock */
+
+	/* Possible values and their usage:
+	 *  <0     an error occurred during allocation,
+	 *  -EBUSY allocation is in the progress,
+	 *  0      allocation made successfuly.
+	 */
 	int error;
-	bool prepare_done;

-	/* Private to the work thread, until prepare_done is true,
-	 * then protected by kvm->resize_hpt_sem */
+	/* Private to the work thread, until error != -EBUSY,
+	 * then protected by kvm->lock.
+	 */
 	struct kvm_hpt_info hpt;
 };

@@ -159,8 +165,6 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
 		 * Reset all the reverse-mapping chains for all memslots
 		 */
 		kvmppc_rmap_reset(kvm);
-		/* Ensure that each vcpu will flush its TLB on next entry. */
-		cpumask_setall(&kvm->arch.need_tlb_flush);
 		err = 0;
 		goto out;
 	}
@@ -176,6 +180,10 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
 	kvmppc_set_hpt(kvm, &info);

 out:
+	if (err == 0)
+		/* Ensure that each vcpu will flush its TLB on next entry. */
+		cpumask_setall(&kvm->arch.need_tlb_flush);
+
 	mutex_unlock(&kvm->lock);
 	return err;
 }
@@ -1413,16 +1421,20 @@ static void resize_hpt_pivot(struct kvm_resize_hpt *resize)

 static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize)
 {
-	BUG_ON(kvm->arch.resize_hpt != resize);
+	if (WARN_ON(!mutex_is_locked(&kvm->lock)))
+		return;

 	if (!resize)
 		return;

-	if (resize->hpt.virt)
-		kvmppc_free_hpt(&resize->hpt);
+	if (resize->error != -EBUSY) {
+		if (resize->hpt.virt)
+			kvmppc_free_hpt(&resize->hpt);
+		kfree(resize);
+	}

-	kvm->arch.resize_hpt = NULL;
-	kfree(resize);
+	if (kvm->arch.resize_hpt == resize)
+		kvm->arch.resize_hpt = NULL;
 }

 static void resize_hpt_prepare_work(struct work_struct *work)
@@ -1431,17 +1443,41 @@ static void resize_hpt_prepare_work(struct work_struct *work)
 						     struct kvm_resize_hpt,
 						     work);
 	struct kvm *kvm = resize->kvm;
-	int err;
+	int err = 0;

-	resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n",
-			 resize->order);
-
-	err = resize_hpt_allocate(resize);
+	if (WARN_ON(resize->error != -EBUSY))
+		return;

 	mutex_lock(&kvm->lock);

+	/* Request is still current? */
+	if (kvm->arch.resize_hpt == resize) {
+		/* We may request large allocations here:
+		 * do not sleep with kvm->lock held for a while.
+		 */
+		mutex_unlock(&kvm->lock);
+
+		resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n",
+				 resize->order);
+
+		err = resize_hpt_allocate(resize);
+
+		/* We have strict assumption about -EBUSY
+		 * when preparing for HPT resize.
+		 */
+		if (WARN_ON(err == -EBUSY))
+			err = -EINPROGRESS;
+
+		mutex_lock(&kvm->lock);
+		/* It is possible that kvm->arch.resize_hpt != resize
+		 * after we grab kvm->lock again.
+		 */
+	}
+
 	resize->error = err;
-	resize->prepare_done = true;
+
+	if (kvm->arch.resize_hpt != resize)
+		resize_hpt_release(kvm, resize);

 	mutex_unlock(&kvm->lock);
 }
@@ -1466,14 +1502,12 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,

 	if (resize) {
 		if (resize->order == shift) {
-			/* Suitable resize in progress */
-			if (resize->prepare_done) {
-				ret = resize->error;
-				if (ret != 0)
-					resize_hpt_release(kvm, resize);
-			} else {
+			/* Suitable resize in progress? */
+			ret = resize->error;
+			if (ret == -EBUSY)
 				ret = 100; /* estimated time in ms */
-			}
+			else if (ret)
+				resize_hpt_release(kvm, resize);

 			goto out;
 		}
@@ -1493,6 +1527,8 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
 		ret = -ENOMEM;
 		goto out;
 	}
+
+	resize->error = -EBUSY;
 	resize->order = shift;
 	resize->kvm = kvm;
 	INIT_WORK(&resize->work, resize_hpt_prepare_work);
@@ -1547,16 +1583,12 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
 	if (!resize || (resize->order != shift))
 		goto out;

-	ret = -EBUSY;
-	if (!resize->prepare_done)
-		goto out;
-
 	ret = resize->error;
-	if (ret != 0)
+	if (ret)
 		goto out;

 	ret = resize_hpt_rehash(resize);
-	if (ret != 0)
+	if (ret)
 		goto out;

 	resize_hpt_pivot(resize);

--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -60,6 +60,7 @@ static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
 #define MSR_USER32 MSR_USER
 #define MSR_USER64 MSR_USER
 #define HW_PAGE_SIZE PAGE_SIZE
+#define HPTE_R_M   _PAGE_COHERENT
 #endif

 static bool kvmppc_is_split_real(struct kvm_vcpu *vcpu)
@@ -557,6 +558,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		pte.eaddr = eaddr;
 		pte.vpage = eaddr >> 12;
 		pte.page_size = MMU_PAGE_64K;
+		pte.wimg = HPTE_R_M;
 	}

 	switch (kvmppc_get_msr(vcpu) & (MSR_DR|MSR_IR)) {

--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3781,7 +3781,8 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
 bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
 {
 	if (unlikely(!lapic_in_kernel(vcpu) ||
-		     kvm_event_needs_reinjection(vcpu)))
+		     kvm_event_needs_reinjection(vcpu) ||
+		     vcpu->arch.exception.pending))
 		return false;

 	if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
@@ -5465,30 +5466,34 @@ static void mmu_destroy_caches(void)

 int kvm_mmu_module_init(void)
 {
+	int ret = -ENOMEM;
+
 	kvm_mmu_clear_all_pte_masks();

 	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
 					    sizeof(struct pte_list_desc),
 					    0, SLAB_ACCOUNT, NULL);
 	if (!pte_list_desc_cache)
-		goto nomem;
+		goto out;

 	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
 						  sizeof(struct kvm_mmu_page),
 						  0, SLAB_ACCOUNT, NULL);
 	if (!mmu_page_header_cache)
-		goto nomem;
+		goto out;

 	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
-		goto nomem;
+		goto out;

-	register_shrinker(&mmu_shrinker);
+	ret = register_shrinker(&mmu_shrinker);
+	if (ret)
+		goto out;

 	return 0;

-nomem:
+out:
 	mmu_destroy_caches();
-	return -ENOMEM;
+	return ret;
 }

 /*

--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -361,7 +361,6 @@ static void recalc_intercepts(struct vcpu_svm *svm)
 {
 	struct vmcb_control_area *c, *h;
 	struct nested_state *g;
-	u32 h_intercept_exceptions;

 	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);

@@ -372,14 +371,9 @@ static void recalc_intercepts(struct vcpu_svm *svm)
 	h = &svm->nested.hsave->control;
 	g = &svm->nested;

-	/* No need to intercept #UD if L1 doesn't intercept it */
-	h_intercept_exceptions =
-		h->intercept_exceptions & ~(1U << UD_VECTOR);
-
 	c->intercept_cr = h->intercept_cr | g->intercept_cr;
 	c->intercept_dr = h->intercept_dr | g->intercept_dr;
-	c->intercept_exceptions =
-		h_intercept_exceptions | g->intercept_exceptions;
+	c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
 	c->intercept = h->intercept | g->intercept;
 }

@@ -2202,7 +2196,6 @@ static int ud_interception(struct vcpu_svm *svm)
 {
 	int er;

-	WARN_ON_ONCE(is_guest_mode(&svm->vcpu));
 	er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
 	if (er == EMULATE_USER_EXIT)
 		return 0;

--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -899,8 +899,16 @@ static inline short vmcs_field_to_offset(unsigned long field)
 {
 	BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);

-	if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) ||
-	    vmcs_field_to_offset_table[field] == 0)
+	if (field >= ARRAY_SIZE(vmcs_field_to_offset_table))
+		return -ENOENT;
+
+	/*
+	 * FIXME: Mitigation for CVE-2017-5753.  To be replaced with a
+	 * generic mechanism.
+	 */
+	asm("lfence");
+
+	if (vmcs_field_to_offset_table[field] == 0)
 		return -ENOENT;

 	return vmcs_field_to_offset_table[field];
@@ -1887,7 +1895,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 {
 	u32 eb;

-	eb = (1u << PF_VECTOR) | (1u << MC_VECTOR) |
+	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
 	     (1u << DB_VECTOR) | (1u << AC_VECTOR);
 	if ((vcpu->guest_debug &
 	     (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
@@ -1905,8 +1913,6 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 	 */
 	if (is_guest_mode(vcpu))
 		eb |= get_vmcs12(vcpu)->exception_bitmap;
-	else
-		eb |= 1u << UD_VECTOR;

 	vmcs_write32(EXCEPTION_BITMAP, eb);
 }
@@ -5917,7 +5923,6 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 		return 1;  /* already handled by vmx_vcpu_run() */

 	if (is_invalid_opcode(intr_info)) {
-		WARN_ON_ONCE(is_guest_mode(vcpu));
 		er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
 		if (er == EMULATE_USER_EXIT)
 			return 0;