KVM: PPC: Book3S HV: Align physical and virtual CPU thread numbers

On a threaded processor such as POWER7, we group VCPUs into virtual cores and arrange that the VCPUs in a virtual core run on the same physical core. Currently we don't enforce any correspondence between virtual thread numbers within a virtual core and physical thread numbers. Physical threads are allocated starting at 0 on a first-come first-served basis to runnable virtual threads (VCPUs). POWER8 implements a new "msgsndp" instruction which guest kernels can use to interrupt other threads in the same core or sub-core. Since the instruction takes the destination physical thread ID as a parameter, it becomes necessary to align the physical thread IDs with the virtual thread IDs, that is, to make sure virtual thread N within a virtual core always runs on physical thread N. This means that it's possible that thread 0, which is where we call __kvmppc_vcore_entry, may end up running some other vcpu than the one whose task called kvmppc_run_core(), or it may end up running no vcpu at all, if for example thread 0 of the virtual core is currently executing in userspace. However, we do need thread 0 to be responsible for switching the MMU -- a previous version of this patch that had other threads switching the MMU was found to be responsible for occasional memory corruption and machine check interrupts in the guest on POWER7 machines. To accommodate this, we no longer pass the vcpu pointer to __kvmppc_vcore_entry, but instead let the assembly code load it from the PACA. Since the assembly code will need to know the kvm pointer and the thread ID for threads which don't have a vcpu, we move the thread ID into the PACA and we add a kvm pointer to the virtual core structure. In the case where thread 0 has no vcpu to run, it still calls into kvmppc_hv_entry in order to do the MMU switch, and then naps until either its vcpu is ready to run in the guest, or some other thread needs to exit the guest. In the latter case, thread 0 jumps to the code that switches the MMU back to the host. This control flow means that now we switch the MMU before loading any guest vcpu state. Similarly, on guest exit we now save all the guest vcpu state before switching the MMU back to the host. This has required substantial code movement, making the diff rather large. Signed-off-by: N Paul Mackerras <paulus@samba.org> Signed-off-by: N Alexander Graf <agraf@suse.de>

KVM: PPC: Book3S HV: Align physical and virtual CPU thread numbers
On a threaded processor such as POWER7, we group VCPUs into virtual cores and arrange that the VCPUs in a virtual core run on the same physical core. Currently we don't enforce any correspondence between virtual thread numbers within a virtual core and physical thread numbers. Physical threads are allocated starting at 0 on a first-come first-served basis to runnable virtual threads (VCPUs). POWER8 implements a new "msgsndp" instruction which guest kernels can use to interrupt other threads in the same core or sub-core. Since the instruction takes the destination physical thread ID as a parameter, it becomes necessary to align the physical thread IDs with the virtual thread IDs, that is, to make sure virtual thread N within a virtual core always runs on physical thread N. This means that it's possible that thread 0, which is where we call __kvmppc_vcore_entry, may end up running some other vcpu than the one whose task called kvmppc_run_core(), or it may end up running no vcpu at all, if for example thread 0 of the virtual core is currently executing in userspace. However, we do need thread 0 to be responsible for switching the MMU -- a previous version of this patch that had other threads switching the MMU was found to be responsible for occasional memory corruption and machine check interrupts in the guest on POWER7 machines. To accommodate this, we no longer pass the vcpu pointer to __kvmppc_vcore_entry, but instead let the assembly code load it from the PACA. Since the assembly code will need to know the kvm pointer and the thread ID for threads which don't have a vcpu, we move the thread ID into the PACA and we add a kvm pointer to the virtual core structure. In the case where thread 0 has no vcpu to run, it still calls into kvmppc_hv_entry in order to do the MMU switch, and then naps until either its vcpu is ready to run in the guest, or some other thread needs to exit the guest. In the latter case, thread 0 jumps to the code that switches the MMU back to the host. This control flow means that now we switch the MMU before loading any guest vcpu state. Similarly, on guest exit we now save all the guest vcpu state before switching the MMU back to the host. This has required substantial code movement, making the diff rather large. Signed-off-by: N Paul Mackerras <paulus@samba.org> Signed-off-by: N Alexander Graf <agraf@suse.de>
e0b7ec05 · Paul Mackerras · Alexander Graf · eee7ff9d · e0b7ec05 · e0b7ec05
6 changed file
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -87,6 +87,7 @@ struct kvmppc_host_state {
 	u8 hwthread_req;
 	u8 hwthread_state;
 	u8 host_ipi;
+	u8 ptid;
 	struct kvm_vcpu *kvm_vcpu;
 	struct kvmppc_vcore *kvm_vcore;
 	unsigned long xics_phys;

--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -288,6 +288,7 @@ struct kvmppc_vcore {
 	int n_woken;
 	int nap_count;
 	int napping_threads;
+	int first_vcpuid;
 	u16 pcpu;
 	u16 last_cpu;
 	u8 vcore_state;
@@ -298,6 +299,7 @@ struct kvmppc_vcore {
 	u64 stolen_tb;
 	u64 preempt_tb;
 	struct kvm_vcpu *runner;
+	struct kvm *kvm;
 	u64 tb_offset;		/* guest timebase - host timebase */
 	ulong lpcr;
 	u32 arch_compat;

--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -506,7 +506,6 @@ int main(void)
 	DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
 	DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
 	DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
-	DEFINE(VCPU_PTID, offsetof(struct kvm_vcpu, arch.ptid));
 	DEFINE(VCPU_CFAR, offsetof(struct kvm_vcpu, arch.cfar));
 	DEFINE(VCPU_PPR, offsetof(struct kvm_vcpu, arch.ppr));
 	DEFINE(VCPU_SHADOW_SRR1, offsetof(struct kvm_vcpu, arch.shadow_srr1));
@@ -514,6 +513,7 @@ int main(void)
 	DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count));
 	DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
 	DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads));
+	DEFINE(VCORE_KVM, offsetof(struct kvmppc_vcore, kvm));
 	DEFINE(VCORE_TB_OFFSET, offsetof(struct kvmppc_vcore, tb_offset));
 	DEFINE(VCORE_LPCR, offsetof(struct kvmppc_vcore, lpcr));
 	DEFINE(VCORE_PCR, offsetof(struct kvmppc_vcore, pcr));
@@ -583,6 +583,7 @@ int main(void)
 	HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys);
 	HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
 	HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
+	HSTATE_FIELD(HSTATE_PTID, ptid);
 	HSTATE_FIELD(HSTATE_MMCR, host_mmcr);
 	HSTATE_FIELD(HSTATE_PMC, host_pmc);
 	HSTATE_FIELD(HSTATE_PURR, host_purr);

--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -990,6 +990,8 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
 			init_waitqueue_head(&vcore->wq);
 			vcore->preempt_tb = TB_NIL;
 			vcore->lpcr = kvm->arch.lpcr;
+			vcore->first_vcpuid = core * threads_per_core;
+			vcore->kvm = kvm;
 		}
 		kvm->arch.vcores[core] = vcore;
 		kvm->arch.online_vcores++;
@@ -1003,6 +1005,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
 	++vcore->num_threads;
 	spin_unlock(&vcore->lock);
 	vcpu->arch.vcore = vcore;
+	vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid;
 	vcpu->arch.cpu_type = KVM_CPU_3S_64;
 	kvmppc_sanity_check(vcpu);
@@ -1066,7 +1069,7 @@ static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
 	}
 }
-extern int __kvmppc_vcore_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
+extern void __kvmppc_vcore_entry(void);
 static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
 				   struct kvm_vcpu *vcpu)
@@ -1140,15 +1143,16 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
 	tpaca = &paca[cpu];
 	tpaca->kvm_hstate.kvm_vcpu = vcpu;
 	tpaca->kvm_hstate.kvm_vcore = vc;
-	tpaca->kvm_hstate.napping = 0;
+	tpaca->kvm_hstate.ptid = vcpu->arch.ptid;
 	vcpu->cpu = vc->pcpu;
 	smp_wmb();
 #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
-	if (vcpu->arch.ptid) {
+	if (cpu != smp_processor_id()) {
 #ifdef CONFIG_KVM_XICS
 		xics_wake_cpu(cpu);
 #endif
-		++vc->n_woken;
+		if (vcpu->arch.ptid)
+			++vc->n_woken;
 	}
 #endif
 }
@@ -1205,10 +1209,10 @@ static int on_primary_thread(void)
 */
 static void kvmppc_run_core(struct kvmppc_vcore *vc)
 {
-	struct kvm_vcpu *vcpu, *vcpu0, *vnext;
+	struct kvm_vcpu *vcpu, *vnext;
 	long ret;
 	u64 now;
-	int ptid, i, need_vpa_update;
+	int i, need_vpa_update;
 	int srcu_idx;
 	struct kvm_vcpu *vcpus_to_update[threads_per_core];
@@ -1245,25 +1249,6 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc)
 		spin_lock(&vc->lock);
 	}
-	/*
-	 * Assign physical thread IDs, first to non-ceded vcpus
-	 * and then to ceded ones.
-	 */
-	ptid = 0;
-	vcpu0 = NULL;
-	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
-		if (!vcpu->arch.ceded) {
-			if (!ptid)
-				vcpu0 = vcpu;
-			vcpu->arch.ptid = ptid++;
-		}
-	}
-	if (!vcpu0)
-		goto out;	/* nothing to run; should never happen */
-	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
-		if (vcpu->arch.ceded)
-			vcpu->arch.ptid = ptid++;
 	/*
 	 * Make sure we are running on thread 0, and that
 	 * secondary threads are offline.
@@ -1280,15 +1265,19 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc)
 		kvmppc_create_dtl_entry(vcpu, vc);
 	}
+	/* Set this explicitly in case thread 0 doesn't have a vcpu */
+	get_paca()->kvm_hstate.kvm_vcore = vc;
+	get_paca()->kvm_hstate.ptid = 0;
 	vc->vcore_state = VCORE_RUNNING;
 	preempt_disable();
 	spin_unlock(&vc->lock);
 	kvm_guest_enter();
-	srcu_idx = srcu_read_lock(&vcpu0->kvm->srcu);
+	srcu_idx = srcu_read_lock(&vc->kvm->srcu);
-	__kvmppc_vcore_entry(NULL, vcpu0);
+	__kvmppc_vcore_entry();
 	spin_lock(&vc->lock);
 	/* disable sending of IPIs on virtual external irqs */
@@ -1303,7 +1292,7 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc)
 	vc->vcore_state = VCORE_EXITING;
 	spin_unlock(&vc->lock);
-	srcu_read_unlock(&vcpu0->kvm->srcu, srcu_idx);
+	srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
 	/* make sure updates to secondary vcpu structs are visible now */
 	smp_mb();
@@ -1411,7 +1400,6 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	if (!signal_pending(current)) {
 		if (vc->vcore_state == VCORE_RUNNING &&
 		    VCORE_EXIT_COUNT(vc) == 0) {
-			vcpu->arch.ptid = vc->n_runnable - 1;
 			kvmppc_create_dtl_entry(vcpu, vc);
 			kvmppc_start_thread(vcpu);
 		} else if (vc->vcore_state == VCORE_SLEEPING) {

--- a/arch/powerpc/kvm/book3s_hv_interrupts.S
+++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
@@ -35,7 +35,7 @@
 ****************************************************************************/
 /* Registers:
- *  r4: vcpu pointer
+ *  none
 */
 _GLOBAL(__kvmppc_vcore_entry)
@@ -71,7 +71,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 	mtmsrd  r10,1
 	/* Save host PMU registers */
-	/* R4 is live here (vcpu pointer) but not r3 or r5 */
 	li	r3, 1
 	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
 	mfspr	r7, SPRN_MMCR0		/* save MMCR0 */
@@ -136,16 +135,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 	 * enters the guest with interrupts enabled.
 	 */
 BEGIN_FTR_SECTION
+	ld	r4, HSTATE_KVM_VCPU(r13)
 	ld	r0, VCPU_PENDING_EXC(r4)
 	li	r7, (1 << BOOK3S_IRQPRIO_EXTERNAL)
 	oris	r7, r7, (1 << BOOK3S_IRQPRIO_EXTERNAL_LEVEL)@h
 	and.	r0, r0, r7
 	beq	32f
-	mr	r31, r4
 	lhz	r3, PACAPACAINDEX(r13)
 	bl	smp_send_reschedule
 	nop
-	mr	r4, r31
 32:
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201)
 #endif /* CONFIG_SMP */

--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S