/*
 * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 */

#include <linux/cpu.h>
#include <linux/kvm_host.h>
#include <linux/preempt.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/sizes.h>
#include <linux/cma.h>
#include <linux/bitops.h>

#include <asm/cputable.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/archrandom.h>
#include <asm/xics.h>
#include <asm/dbell.h>
#include <asm/cputhreads.h>
#include <asm/io.h>
#include <asm/opal.h>
#include <asm/smp.h>

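/* True iff translation (MSR_IR) is off, i.e. we are running in real mode */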
static bool in_realmode(void)
{
	return !(mfmsr() & MSR_IR);
}

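/* The KVM CMA area is managed in order-18 (256 KiB) chunks */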
#define KVM_CMA_CHUNK_ORDER	18

/*
 * Hash page table alignment on newer CPUs (CPU_FTR_ARCH_206)
 * should be a power of 2.
 */
#define HPT_ALIGN_PAGES		((1 << 18) >> PAGE_SHIFT) /* 256k */
/*
 * By default we reserve 5% of memory for hash pagetable allocation.
 */
static unsigned long kvm_cma_resv_ratio = 5;

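/* CMA area from which guest hashed page tables (HPTs) are allocated */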
static struct cma *kvm_cma;

static int __init early_parse_kvm_cma_resv(char *p)
{
	pr_debug("%s(%s)\n", __func__, p);
	if (!p)
		return -EINVAL;
	return kstrtoul(p, 0, &kvm_cma_resv_ratio);
}
early_param("kvm_cma_resv_ratio", early_parse_kvm_cma_resv);

struct page *kvm_alloc_hpt_cma(unsigned long nr_pages)
{
	VM_BUG_ON(order_base_2(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);

	return cma_alloc(kvm_cma, nr_pages, order_base_2(HPT_ALIGN_PAGES));
}
EXPORT_SYMBOL_GPL(kvm_alloc_hpt_cma);

void kvm_free_hpt_cma(struct page *page, unsigned long nr_pages)
{
	cma_release(kvm_cma, page, nr_pages);
}
EXPORT_SYMBOL_GPL(kvm_free_hpt_cma);
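/*
 * Illustrative use only (not taken from this file): a caller sizing an
 * HPT of 2^order bytes might do something like
 *
 *	page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT));
 *	if (!page)
 *		return -ENOMEM;
 *	...
 *	kvm_free_hpt_cma(page, 1ul << (order - PAGE_SHIFT));
 *
 * where "order" is a hypothetical log2 of the hash page table size.
 */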

/**
 * kvm_cma_reserve() - reserve area for kvm hash pagetable
 *
 * This function reserves memory from the early allocator. It should be
 * called by arch-specific code once the memblock allocator has been
 * activated and all other subsystems have already allocated/reserved memory.
 */
void __init kvm_cma_reserve(void)
{
	unsigned long align_size;
	struct memblock_region *reg;
	phys_addr_t selected_size = 0;

	/*
	 * We need CMA reservation only when we are in HV mode
	 */
	if (!cpu_has_feature(CPU_FTR_HVMODE))
		return;
	/*
	 * We cannot use memblock_phys_mem_size() here, because
	 * memblock_analyze() has not been called yet.
	 */
	for_each_memblock(memory, reg)
		selected_size += memblock_region_memory_end_pfn(reg) -
				 memblock_region_memory_base_pfn(reg);

	selected_size = (selected_size * kvm_cma_resv_ratio / 100) << PAGE_SHIFT;
	if (selected_size) {
		pr_debug("%s: reserving %ld MiB for global area\n", __func__,
			 (unsigned long)selected_size / SZ_1M);
		align_size = HPT_ALIGN_PAGES << PAGE_SHIFT;
		cma_declare_contiguous(0, selected_size, 0, align_size,
			KVM_CMA_CHUNK_ORDER - PAGE_SHIFT, false, &kvm_cma);
	}
}

/*
 * Real-mode H_CONFER implementation.
 * We check if we are the only vcpu out of this virtual core
 * still running in the guest and not ceded.  If so, we pop up
 * to the virtual-mode implementation; if not, just return to
 * the guest.
 */
long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target,
			    unsigned int yield_count)
{
	struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
	int ptid = local_paca->kvm_hstate.ptid;
	int threads_running;
	int threads_ceded;
	int threads_conferring;
	u64 stop = get_tb() + 10 * tb_ticks_per_usec;
	int rv = H_SUCCESS; /* => don't yield */

	set_bit(ptid, &vc->conferring_threads);
	while ((get_tb() < stop) && !VCORE_IS_EXITING(vc)) {
		threads_running = VCORE_ENTRY_MAP(vc);
		threads_ceded = vc->napping_threads;
		threads_conferring = vc->conferring_threads;
		if ((threads_ceded | threads_conferring) == threads_running) {
			rv = H_TOO_HARD; /* => do yield */
			break;
		}
	}
	clear_bit(ptid, &vc->conferring_threads);
	return rv;
}

/*
 * When running HV mode KVM we need to block certain operations while KVM VMs
 * exist in the system. We use a counter of VMs to track this.
 *
 * One of the operations we need to block is onlining of secondaries, so we
 * protect hv_vm_count with get/put_online_cpus().
 */
static atomic_t hv_vm_count;

void kvm_hv_vm_activated(void)
{
	get_online_cpus();
	atomic_inc(&hv_vm_count);
	put_online_cpus();
}
EXPORT_SYMBOL_GPL(kvm_hv_vm_activated);

void kvm_hv_vm_deactivated(void)
{
	get_online_cpus();
	atomic_dec(&hv_vm_count);
	put_online_cpus();
}
EXPORT_SYMBOL_GPL(kvm_hv_vm_deactivated);

bool kvm_hv_mode_active(void)
{
	return atomic_read(&hv_vm_count) != 0;
}

extern int hcall_real_table[], hcall_real_table_end[];

int kvmppc_hcall_impl_hv_realmode(unsigned long cmd)
{
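	/* hcall numbers are multiples of 4, so scale down to index the table */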
	cmd /= 4;
	if (cmd < hcall_real_table_end - hcall_real_table &&
	    hcall_real_table[cmd])
		return 1;

	return 0;
}
EXPORT_SYMBOL_GPL(kvmppc_hcall_impl_hv_realmode);

int kvmppc_hwrng_present(void)
{
	return powernv_hwrng_present();
}
EXPORT_SYMBOL_GPL(kvmppc_hwrng_present);

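/* Real-mode H_RANDOM: returns a hardware random number in the guest's r4 */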
long kvmppc_h_random(struct kvm_vcpu *vcpu)
{
	if (powernv_get_random_real_mode(&vcpu->arch.gpr[4]))
		return H_SUCCESS;

	return H_HARDWARE;
}

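/*
 * Cache-inhibited byte store, usable with the MMU off; used below to poke
 * the XICS MFRR of another CPU from real mode.
 */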
static inline void rm_writeb(unsigned long paddr, u8 val)
{
	__asm__ __volatile__("stbcix %0,0,%1"
		: : "r" (val), "r" (paddr) : "memory");
}

/*
 * Send an interrupt or message to another CPU.
 * The caller needs to include any barrier needed to order writes
 * to memory vs. the IPI/message.
 */
void kvmhv_rm_send_ipi(int cpu)
{
	unsigned long xics_phys;
	unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);

	/* On POWER9 we can use msgsnd for any destination cpu. */
	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		msg |= get_hard_smp_processor_id(cpu);
		__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
		return;
	}
	/* On POWER8 for IPIs to threads in the same core, use msgsnd. */
	if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
	    cpu_first_thread_sibling(cpu) ==
	    cpu_first_thread_sibling(raw_smp_processor_id())) {
		msg |= cpu_thread_in_core(cpu);
		__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
		return;
	}

	/* Else poke the target with an IPI */
	xics_phys = paca[cpu].kvm_hstate.xics_phys;
	if (!in_realmode())
		opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY);
	else if (xics_phys)
		rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
	else
		opal_rm_int_set_mfrr(get_hard_smp_processor_id(cpu),
				     IPI_PRIORITY);
}

/*
 * The following functions are called from the assembly code
 * in book3s_hv_rmhandlers.S.
 */
static void kvmhv_interrupt_vcore(struct kvmppc_vcore *vc, int active)
{
	int cpu = vc->pcpu;

	/* Order setting of exit map vs. msgsnd/IPI */
	smp_mb();
	for (; active; active >>= 1, ++cpu)
		if (active & 1)
			kvmhv_rm_send_ipi(cpu);
}

void kvmhv_commence_exit(int trap)
{
	struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
	int ptid = local_paca->kvm_hstate.ptid;
	struct kvm_split_mode *sip = local_paca->kvm_hstate.kvm_split_mode;
	int me, ee, i;

	/*
	 * Set our bit in the threads-exiting-guest map in the 0xff00
	 * bits of vcore->entry_exit_map.
	 */
	me = 0x100 << ptid;
	do {
		ee = vc->entry_exit_map;
	} while (cmpxchg(&vc->entry_exit_map, ee, ee | me) != ee);

	/* Are we the first here? */
	if ((ee >> 8) != 0)
		return;

	/*
	 * Trigger the other threads in this vcore to exit the guest.
	 * If this is a hypervisor decrementer interrupt then they
	 * will be already on their way out of the guest.
	 */
	if (trap != BOOK3S_INTERRUPT_HV_DECREMENTER)
		kvmhv_interrupt_vcore(vc, ee & ~(1 << ptid));

	/*
	 * If we are doing dynamic micro-threading, interrupt the other
	 * subcores to pull them out of their guests too.
	 */
	if (!sip)
		return;

	for (i = 0; i < MAX_SUBCORES; ++i) {
		vc = sip->master_vcs[i];
		if (!vc)
			break;
		do {
			ee = vc->entry_exit_map;
			/* Already asked to exit? */
			if ((ee >> 8) != 0)
				break;
		} while (cmpxchg(&vc->entry_exit_map, ee,
				 ee | VCORE_EXIT_REQ) != ee);
		if ((ee >> 8) == 0)
			kvmhv_interrupt_vcore(vc, ee);
	}
}

struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
EXPORT_SYMBOL_GPL(kvmppc_host_rm_ops_hv);

#ifdef CONFIG_KVM_XICS
static struct kvmppc_irq_map *get_irqmap(struct kvmppc_passthru_irqmap *pimap,
					 u32 xisr)
{
	int i;

	/*
	 * We access the mapped array here without a lock.  That
	 * is safe because we never reduce the number of entries
	 * in the array and we never change the v_hwirq field of
	 * an entry once it is set.
	 *
	 * We have also carefully ordered the stores in the writer
	 * and the loads here in the reader, so that if we find a matching
	 * hwirq here, the associated GSI and irq_desc fields are valid.
	 */
	for (i = 0; i < pimap->n_mapped; i++)  {
		if (xisr == pimap->mapped[i].r_hwirq) {
			/*
			 * Order subsequent reads in the caller to serialize
			 * with the writer.
			 */
			smp_rmb();
			return &pimap->mapped[i];
		}
	}
	return NULL;
}

/*
 * If we have an interrupt that's not an IPI, check if we have a
 * passthrough adapter and if so, check if this external interrupt
 * is for the adapter.
 * We will attempt to deliver the IRQ directly to the target VCPU's
 * ICP, the virtual ICP (based on affinity - the xive value in ICS).
 *
 * If the delivery fails or if this is not for a passthrough adapter,
 * return to the host to handle this interrupt. We earlier
 * saved a copy of the XIRR in the PACA; it will be picked up by
 * the host ICP driver.
 */
static int kvmppc_check_passthru(u32 xisr, __be32 xirr, bool *again)
{
	struct kvmppc_passthru_irqmap *pimap;
	struct kvmppc_irq_map *irq_map;
	struct kvm_vcpu *vcpu;

	vcpu = local_paca->kvm_hstate.kvm_vcpu;
	if (!vcpu)
		return 1;
	pimap = kvmppc_get_passthru_irqmap(vcpu->kvm);
	if (!pimap)
		return 1;
	irq_map = get_irqmap(pimap, xisr);
	if (!irq_map)
		return 1;

	/* We're handling this interrupt, generic code doesn't need to */
	local_paca->kvm_hstate.saved_xirr = 0;

	return kvmppc_deliver_irq_passthru(vcpu, xirr, irq_map, pimap, again);
}

#else
static inline int kvmppc_check_passthru(u32 xisr, __be32 xirr, bool *again)
{
	return 1;
}
#endif

/*
 * Determine what sort of external interrupt is pending (if any).
 * Returns:
 *	0 if no interrupt is pending
 *	1 if an interrupt is pending that needs to be handled by the host
 *	2 Passthrough that needs completion in the host
 *	-1 if there was a guest wakeup IPI (which has now been cleared)
 *	-2 if there is PCI passthrough external interrupt that was handled
 */
static long kvmppc_read_one_intr(bool *again);

long kvmppc_read_intr(void)
{
	long ret = 0;
	long rc;
	bool again;

	do {
		again = false;
		rc = kvmppc_read_one_intr(&again);
		if (rc && (ret == 0 || rc > ret))
			ret = rc;
	} while (again);
	return ret;
}

static long kvmppc_read_one_intr(bool *again)
{
	unsigned long xics_phys;
	u32 h_xirr;
	__be32 xirr;
	u32 xisr;
	u8 host_ipi;
	int64_t rc;

	/* see if a host IPI is pending */
	host_ipi = local_paca->kvm_hstate.host_ipi;
	if (host_ipi)
		return 1;

	/* Now read the interrupt from the ICP */
	xics_phys = local_paca->kvm_hstate.xics_phys;
	rc = 0;
	if (!in_realmode())
		rc = opal_int_get_xirr(&xirr, false);
	else if (!xics_phys)
		rc = opal_rm_int_get_xirr(&xirr, false);
	else
		xirr = _lwzcix(xics_phys + XICS_XIRR);
	if (rc < 0)
		return 1;

	/*
	 * Save XIRR for later. Since we get control in reverse endian
	 * on LE systems, save it byte reversed and fetch it back in
	 * host endian. Note that xirr is the value read from the
	 * XIRR register, while h_xirr is the host endian version.
	 */
	h_xirr = be32_to_cpu(xirr);
	local_paca->kvm_hstate.saved_xirr = h_xirr;
	xisr = h_xirr & 0xffffff;
	/*
	 * Ensure that the store/load complete to guarantee all side
	 * effects of loading from XIRR have completed
	 */
	smp_mb();

	/* if nothing pending in the ICP */
	if (!xisr)
		return 0;

	/* We found something in the ICP...
	 *
	 * If it is an IPI, clear the MFRR and EOI it.
	 */
	if (xisr == XICS_IPI) {
		rc = 0;
		if (!in_realmode()) {
			opal_int_set_mfrr(hard_smp_processor_id(), 0xff);
			rc = opal_int_eoi(h_xirr);
		} else if (xics_phys) {
			_stbcix(xics_phys + XICS_MFRR, 0xff);
			_stwcix(xics_phys + XICS_XIRR, xirr);
		} else {
			opal_rm_int_set_mfrr(hard_smp_processor_id(), 0xff);
			rc = opal_rm_int_eoi(h_xirr);
		}
		/* If rc > 0, there is another interrupt pending */
		*again = rc > 0;

		/*
		 * Need to ensure side effects of above stores
		 * complete before proceeding.
		 */
		smp_mb();

		/*
		 * We need to re-check host IPI now in case it got set in the
		 * meantime. If it's clear, we bounce the interrupt to the
		 * guest
		 */
		host_ipi = local_paca->kvm_hstate.host_ipi;
		if (unlikely(host_ipi != 0)) {
			/* We raced with the host,
			 * we need to resend that IPI, bummer
			 */
			if (!in_realmode())
				opal_int_set_mfrr(hard_smp_processor_id(),
						  IPI_PRIORITY);
			else if (xics_phys)
				_stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY);
			else
				opal_rm_int_set_mfrr(hard_smp_processor_id(),
						     IPI_PRIORITY);
			/* Let side effects complete */
			smp_mb();
			return 1;
		}

		/* OK, it's an IPI for us */
		local_paca->kvm_hstate.saved_xirr = 0;
		return -1;
	}

	return kvmppc_check_passthru(xisr, xirr, again);
}