/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * Copyright 2010-2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/hugetlb.h>
#include <linux/module.h>

#include <asm/tlbflush.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/mmu-hash64.h>
#include <asm/hvcall.h>
#include <asm/synch.h>
#include <asm/ppc-opcode.h>

/* Translate address of a vmalloc'd thing to a linear map address */
static void *real_vmalloc_addr(void *x)
{
	unsigned long addr = (unsigned long) x;
	pte_t *p;

	p = find_linux_pte(swapper_pg_dir, addr);
	if (!p || !pte_present(*p))
		return NULL;
	/* assume we don't have huge pages in vmalloc space... */
	addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK);
	return __va(addr);
}

/* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */
static int global_invalidates(struct kvm *kvm, unsigned long flags)
{
	int global;

	/*
	 * If there is only one vcore, and it's currently running,
	 * we can use tlbiel as long as we mark all other physical
	 * cores as potentially having stale TLB entries for this lpid.
	 * If we're not using MMU notifiers, we never take pages away
	 * from the guest, so we can use tlbiel if requested.
	 * Otherwise, don't use tlbiel.
	 */
	if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcore)
		global = 0;
	else if (kvm->arch.using_mmu_notifiers)
		global = 1;
	else
		global = !(flags & H_LOCAL);

	if (!global) {
		/* any other core might now have stale TLB entries... */
		smp_wmb();
		cpumask_setall(&kvm->arch.need_tlb_flush);
		cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu,
				  &kvm->arch.need_tlb_flush);
	}

	return global;
}

/*
 * Add this HPTE into the chain for the real page.
 * Must be called with the chain locked; it unlocks the chain.
 */
void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
			     unsigned long *rmap, long pte_index, int realmode)
{
	struct revmap_entry *head, *tail;
	unsigned long i;

	if (*rmap & KVMPPC_RMAP_PRESENT) {
		i = *rmap & KVMPPC_RMAP_INDEX;
		head = &kvm->arch.revmap[i];
		if (realmode)
			head = real_vmalloc_addr(head);
		tail = &kvm->arch.revmap[head->back];
		if (realmode)
			tail = real_vmalloc_addr(tail);
		rev->forw = i;
		rev->back = head->back;
		tail->forw = pte_index;
		head->back = pte_index;
	} else {
		rev->forw = rev->back = pte_index;
		*rmap = (*rmap & ~KVMPPC_RMAP_INDEX) |
			pte_index | KVMPPC_RMAP_PRESENT;
	}
	unlock_rmap(rmap);
}
EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);

/* Remove this HPTE from the chain for a real page */
static void remove_revmap_chain(struct kvm *kvm, long pte_index,
				struct revmap_entry *rev,
				unsigned long hpte_v, unsigned long hpte_r)
{
	struct revmap_entry *next, *prev;
	unsigned long gfn, ptel, head;
	struct kvm_memory_slot *memslot;
	unsigned long *rmap;
	unsigned long rcbits;

	rcbits = hpte_r & (HPTE_R_R | HPTE_R_C);
	ptel = rev->guest_rpte |= rcbits;
	gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
	memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
	if (!memslot)
		return;

	rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]);
	lock_rmap(rmap);

	head = *rmap & KVMPPC_RMAP_INDEX;
	next = real_vmalloc_addr(&kvm->arch.revmap[rev->forw]);
	prev = real_vmalloc_addr(&kvm->arch.revmap[rev->back]);
	next->back = rev->back;
	prev->forw = rev->forw;
	if (head == pte_index) {
		head = rev->forw;
		if (head == pte_index)
			*rmap &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
		else
			*rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | head;
	}
	*rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT;
	unlock_rmap(rmap);
}

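/*
 * Look up the Linux PTE for a host virtual address and return it,
 * setting *pte_sizep to the size of the backing page.  Returns an
 * empty PTE if there is no present mapping or if the backing page
 * is smaller than the size the caller passed in via *pte_sizep.
 */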
static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
			      int writing, unsigned long *pte_sizep)
{
	pte_t *ptep;
	unsigned long ps = *pte_sizep;
	unsigned int shift;

	ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
	if (!ptep)
		return __pte(0);
	if (shift)
		*pte_sizep = 1ul << shift;
	else
		*pte_sizep = PAGE_SIZE;
	if (ps > *pte_sizep)
		return __pte(0);
	if (!pte_present(*ptep))
		return __pte(0);
	return kvmppc_read_update_linux_pte(ptep, writing);
}

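/*
 * Drop the HPTE_V_HVLOCK bit by storing a new first dword; the
 * release barrier orders our earlier updates to the HPTE before
 * the unlock becomes visible to other CPUs.
 */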
static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v)
{
	asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
	hpte[0] = hpte_v;
}

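/*
 * Worker for the H_ENTER hypercall: validate the proposed HPTE,
 * translate the guest physical address to a real address, find and
 * lock a slot in the requested HPTE group, hook the entry into the
 * reverse-map chain, and finally write and unlock the HPTE.  The
 * realmode flag says whether vmalloc'd structures must be accessed
 * via their linear-mapping addresses.
 */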
long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
		       long pte_index, unsigned long pteh, unsigned long ptel,
		       pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret)
{
	unsigned long i, pa, gpa, gfn, psize;
	unsigned long slot_fn, hva;
	unsigned long *hpte;
	struct revmap_entry *rev;
	unsigned long g_ptel;
	struct kvm_memory_slot *memslot;
	unsigned long *physp, pte_size;
	unsigned long is_io;
	unsigned long *rmap;
	pte_t pte;
	unsigned int writing;
	unsigned long mmu_seq;
	unsigned long rcbits;

	psize = hpte_page_size(pteh, ptel);
	if (!psize)
		return H_PARAMETER;
	writing = hpte_is_writable(ptel);
	pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID);
	ptel &= ~HPTE_GR_RESERVED;
	g_ptel = ptel;

	/* used later to detect if we might have been invalidated */
	mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();

	/* Find the memslot (if any) for this address */
	gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
	gfn = gpa >> PAGE_SHIFT;
	memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
	pa = 0;
	is_io = ~0ul;
	rmap = NULL;
	if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) {
		/* PPC970 can't do emulated MMIO */
		if (!cpu_has_feature(CPU_FTR_ARCH_206))
			return H_PARAMETER;
		/* Emulated MMIO - mark this with key=31 */
		pteh |= HPTE_V_ABSENT;
		ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO;
		goto do_insert;
	}

	/* Check if the requested page fits entirely in the memslot. */
	if (!slot_is_aligned(memslot, psize))
		return H_PARAMETER;
	slot_fn = gfn - memslot->base_gfn;
	rmap = &memslot->arch.rmap[slot_fn];

	if (!kvm->arch.using_mmu_notifiers) {
		physp = memslot->arch.slot_phys;
		if (!physp)
			return H_PARAMETER;
		physp += slot_fn;
		if (realmode)
			physp = real_vmalloc_addr(physp);
		pa = *physp;
		if (!pa)
			return H_TOO_HARD;
		is_io = pa & (HPTE_R_I | HPTE_R_W);
		pte_size = PAGE_SIZE << (pa & KVMPPC_PAGE_ORDER_MASK);
		pa &= PAGE_MASK;
	} else {
		/* Translate to host virtual address */
		hva = __gfn_to_hva_memslot(memslot, gfn);

		/* Look up the Linux PTE for the backing page */
		pte_size = psize;
		pte = lookup_linux_pte(pgdir, hva, writing, &pte_size);
		if (pte_present(pte)) {
			if (writing && !pte_write(pte))
				/* make the actual HPTE be read-only */
				ptel = hpte_make_readonly(ptel);
			is_io = hpte_cache_bits(pte_val(pte));
			pa = pte_pfn(pte) << PAGE_SHIFT;
		}
	}

	if (pte_size < psize)
		return H_PARAMETER;
	if (pa && pte_size > psize)
		pa |= gpa & (pte_size - 1);

	ptel &= ~(HPTE_R_PP0 - psize);
	ptel |= pa;

	if (pa)
		pteh |= HPTE_V_VALID;
	else
		pteh |= HPTE_V_ABSENT;

	/* Check WIMG */
	if (is_io != ~0ul && !hpte_cache_flags_ok(ptel, is_io)) {
		if (is_io)
			return H_PARAMETER;
		/*
		 * Allow guest to map emulated device memory as
		 * uncacheable, but actually make it cacheable.
		 */
		ptel &= ~(HPTE_R_W|HPTE_R_I|HPTE_R_G);
		ptel |= HPTE_R_M;
	}

	/* Find and lock the HPTEG slot to use */
 do_insert:
	if (pte_index >= kvm->arch.hpt_npte)
		return H_PARAMETER;
	if (likely((flags & H_EXACT) == 0)) {
		pte_index &= ~7UL;
		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
		for (i = 0; i < 8; ++i) {
			if ((*hpte & HPTE_V_VALID) == 0 &&
			    try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
					  HPTE_V_ABSENT))
				break;
			hpte += 2;
		}
		if (i == 8) {
			/*
			 * Since try_lock_hpte doesn't retry (not even stdcx.
			 * failures), it could be that there is a free slot
			 * but we transiently failed to lock it.  Try again,
			 * actually locking each slot and checking it.
			 */
			hpte -= 16;
			for (i = 0; i < 8; ++i) {
				while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
					cpu_relax();
				if (!(*hpte & (HPTE_V_VALID | HPTE_V_ABSENT)))
					break;
				*hpte &= ~HPTE_V_HVLOCK;
				hpte += 2;
			}
			if (i == 8)
				return H_PTEG_FULL;
		}
		pte_index += i;
	} else {
		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
		if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
				   HPTE_V_ABSENT)) {
			/* Lock the slot and check again */
			while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
				cpu_relax();
			if (*hpte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
				*hpte &= ~HPTE_V_HVLOCK;
				return H_PTEG_FULL;
			}
		}
	}

	/* Save away the guest's idea of the second HPTE dword */
	rev = &kvm->arch.revmap[pte_index];
	if (realmode)
		rev = real_vmalloc_addr(rev);
	if (rev) {
		rev->guest_rpte = g_ptel;
		note_hpte_modification(kvm, rev);
	}

	/* Link HPTE into reverse-map chain */
	if (pteh & HPTE_V_VALID) {
		if (realmode)
			rmap = real_vmalloc_addr(rmap);
		lock_rmap(rmap);
		/* Check for pending invalidations under the rmap chain lock */
		if (kvm->arch.using_mmu_notifiers &&
		    mmu_notifier_retry(kvm, mmu_seq)) {
			/* inval in progress, write a non-present HPTE */
			pteh |= HPTE_V_ABSENT;
			pteh &= ~HPTE_V_VALID;
			unlock_rmap(rmap);
		} else {
			kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index,
						realmode);
			/* Only set R/C in real HPTE if already set in *rmap */
			rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
			ptel &= rcbits | ~(HPTE_R_R | HPTE_R_C);
		}
	}

	hpte[1] = ptel;

	/* Write the first HPTE dword, unlocking the HPTE and making it valid */
	eieio();
	hpte[0] = pteh;
	asm volatile("ptesync" : : : "memory");

	*pte_idx_ret = pte_index;
	return H_SUCCESS;
}
EXPORT_SYMBOL_GPL(kvmppc_do_h_enter);

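/* Real-mode H_ENTER handler; the index of the new HPTE is returned in GPR4. */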
long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
		    long pte_index, unsigned long pteh, unsigned long ptel)
{
	return kvmppc_do_h_enter(vcpu->kvm, flags, pte_index, pteh, ptel,
				 vcpu->arch.pgdir, true, &vcpu->arch.gpr[4]);
}

#define LOCK_TOKEN	(*(u32 *)(&get_paca()->lock_token))

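/*
 * Try to acquire the tlbie lock; returns 1 on success.  If the lock is
 * already held we give up immediately rather than spinning.  The lock
 * word holds the owner's paca lock token, so zero means unlocked.
 */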
static inline int try_lock_tlbie(unsigned int *lock)
{
	unsigned int tmp, old;
	unsigned int token = LOCK_TOKEN;

	asm volatile("1:lwarx	%1,0,%2\n"
		     "	cmpwi	cr0,%1,0\n"
		     "	bne	2f\n"
		     "  stwcx.	%3,0,%2\n"
		     "	bne-	1b\n"
		     "  isync\n"
		     "2:"
		     : "=&r" (tmp), "=&r" (old)
		     : "r" (lock), "r" (token)
		     : "cc", "memory");
	return old == 0;
}

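/*
 * Worker for the H_REMOVE hypercall: lock the HPTE, check it against
 * the AVPN/ANDCOND criteria, invalidate it and flush the translation
 * (globally or locally as global_invalidates() decides), unhook it
 * from its reverse-map chain and return the final V and R values.
 */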
long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
			unsigned long pte_index, unsigned long avpn,
			unsigned long *hpret)
{
	unsigned long *hpte;
	unsigned long v, r, rb;
	struct revmap_entry *rev;

	if (pte_index >= kvm->arch.hpt_npte)
		return H_PARAMETER;
	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
		cpu_relax();
	if ((hpte[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn) ||
	    ((flags & H_ANDCOND) && (hpte[0] & avpn) != 0)) {
		hpte[0] &= ~HPTE_V_HVLOCK;
		return H_NOT_FOUND;
	}

	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
	v = hpte[0] & ~HPTE_V_HVLOCK;
	if (v & HPTE_V_VALID) {
		hpte[0] &= ~HPTE_V_VALID;
		rb = compute_tlbie_rb(v, hpte[1], pte_index);
		if (global_invalidates(kvm, flags)) {
			while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
				cpu_relax();
			asm volatile("ptesync" : : : "memory");
			asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
				     : : "r" (rb), "r" (kvm->arch.lpid));
			asm volatile("ptesync" : : : "memory");
			kvm->arch.tlbie_lock = 0;
		} else {
			asm volatile("ptesync" : : : "memory");
			asm volatile("tlbiel %0" : : "r" (rb));
			asm volatile("ptesync" : : : "memory");
		}
		/* Read PTE low word after tlbie to get final R/C values */
		remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]);
	}
	r = rev->guest_rpte & ~HPTE_GR_RESERVED;
	note_hpte_modification(kvm, rev);
	unlock_hpte(hpte, 0);

	hpret[0] = v;
	hpret[1] = r;
	return H_SUCCESS;
}
EXPORT_SYMBOL_GPL(kvmppc_do_h_remove);

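/* Real-mode H_REMOVE handler; the removed V and R values go in GPR4/GPR5. */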
long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
		     unsigned long pte_index, unsigned long avpn)
{
	return kvmppc_do_h_remove(vcpu->kvm, flags, pte_index, avpn,
				  &vcpu->arch.gpr[4]);
}

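/*
 * H_BULK_REMOVE: process up to four request/value pairs from GPR4-11,
 * collecting the entries to invalidate into a batch so that the TLB
 * flush (tlbie or tlbiel) is issued once per batch rather than once
 * per entry.
 */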
long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long *args = &vcpu->arch.gpr[4];
	unsigned long *hp, *hptes[4], tlbrb[4];
	long int i, j, k, n, found, indexes[4];
	unsigned long flags, req, pte_index, rcbits;
	long int local = 0;
	long int ret = H_SUCCESS;
	struct revmap_entry *rev, *revs[4];

	if (atomic_read(&kvm->online_vcpus) == 1)
		local = 1;
	for (i = 0; i < 4 && ret == H_SUCCESS; ) {
		n = 0;
		for (; i < 4; ++i) {
			j = i * 2;
			pte_index = args[j];
			flags = pte_index >> 56;
			pte_index &= ((1ul << 56) - 1);
			req = flags >> 6;
			flags &= 3;
			if (req == 3) {		/* no more requests */
				i = 4;
				break;
			}
			if (req != 1 || flags == 3 ||
			    pte_index >= kvm->arch.hpt_npte) {
				/* parameter error */
				args[j] = ((0xa0 | flags) << 56) + pte_index;
				ret = H_PARAMETER;
				break;
			}
			hp = (unsigned long *)
				(kvm->arch.hpt_virt + (pte_index << 4));
			/* to avoid deadlock, don't spin except for first */
			if (!try_lock_hpte(hp, HPTE_V_HVLOCK)) {
				if (n)
					break;
				while (!try_lock_hpte(hp, HPTE_V_HVLOCK))
					cpu_relax();
			}
			found = 0;
			if (hp[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) {
				switch (flags & 3) {
				case 0:		/* absolute */
					found = 1;
					break;
				case 1:		/* andcond */
					if (!(hp[0] & args[j + 1]))
						found = 1;
					break;
				case 2:		/* AVPN */
					if ((hp[0] & ~0x7fUL) == args[j + 1])
						found = 1;
					break;
				}
			}
			if (!found) {
				hp[0] &= ~HPTE_V_HVLOCK;
				args[j] = ((0x90 | flags) << 56) + pte_index;
				continue;
			}

			args[j] = ((0x80 | flags) << 56) + pte_index;
			rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
			note_hpte_modification(kvm, rev);

			if (!(hp[0] & HPTE_V_VALID)) {
				/* insert R and C bits from PTE */
				rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
				args[j] |= rcbits << (56 - 5);
				hp[0] = 0;
				continue;
			}

			hp[0] &= ~HPTE_V_VALID;		/* leave it locked */
			tlbrb[n] = compute_tlbie_rb(hp[0], hp[1], pte_index);
			indexes[n] = j;
			hptes[n] = hp;
			revs[n] = rev;
			++n;
		}

		if (!n)
			break;

		/* Now that we've collected a batch, do the tlbies */
		if (!local) {
			while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
				cpu_relax();
			asm volatile("ptesync" : : : "memory");
			for (k = 0; k < n; ++k)
				asm volatile(PPC_TLBIE(%1,%0) : :
					     "r" (tlbrb[k]),
					     "r" (kvm->arch.lpid));
			asm volatile("eieio; tlbsync; ptesync" : : : "memory");
			kvm->arch.tlbie_lock = 0;
		} else {
			asm volatile("ptesync" : : : "memory");
			for (k = 0; k < n; ++k)
				asm volatile("tlbiel %0" : : "r" (tlbrb[k]));
			asm volatile("ptesync" : : : "memory");
		}

		/* Read PTE low words after tlbie to get final R/C values */
		for (k = 0; k < n; ++k) {
			j = indexes[k];
			pte_index = args[j] & ((1ul << 56) - 1);
			hp = hptes[k];
			rev = revs[k];
			remove_revmap_chain(kvm, pte_index, rev, hp[0], hp[1]);
			rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
			args[j] |= rcbits << (56 - 5);
			hp[0] = 0;
		}
	}

	return ret;
}

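/*
 * H_PROTECT: update the pp0/pp/N/key protection bits of an HPTE and of
 * the guest's view of it, invalidating the old translation if the entry
 * was valid.  If the guest asks for write access to a page the host has
 * mapped read-only, the new HPTE is forced back to read-only.
 */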
long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
		      unsigned long pte_index, unsigned long avpn,
		      unsigned long va)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long *hpte;
	struct revmap_entry *rev;
	unsigned long v, r, rb, mask, bits;

	if (pte_index >= kvm->arch.hpt_npte)
		return H_PARAMETER;

	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
		cpu_relax();
	if ((hpte[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn)) {
		hpte[0] &= ~HPTE_V_HVLOCK;
		return H_NOT_FOUND;
	}

	v = hpte[0];
	bits = (flags << 55) & HPTE_R_PP0;
	bits |= (flags << 48) & HPTE_R_KEY_HI;
	bits |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);

	/* Update guest view of 2nd HPTE dword */
	mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
		HPTE_R_KEY_HI | HPTE_R_KEY_LO;
	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
	if (rev) {
		r = (rev->guest_rpte & ~mask) | bits;
		rev->guest_rpte = r;
		note_hpte_modification(kvm, rev);
	}
	r = (hpte[1] & ~mask) | bits;

	/* Update HPTE */
	if (v & HPTE_V_VALID) {
		rb = compute_tlbie_rb(v, r, pte_index);
		hpte[0] = v & ~HPTE_V_VALID;
		if (global_invalidates(kvm, flags)) {
			while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
				cpu_relax();
			asm volatile("ptesync" : : : "memory");
			asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
				     : : "r" (rb), "r" (kvm->arch.lpid));
			asm volatile("ptesync" : : : "memory");
			kvm->arch.tlbie_lock = 0;
		} else {
			asm volatile("ptesync" : : : "memory");
			asm volatile("tlbiel %0" : : "r" (rb));
			asm volatile("ptesync" : : : "memory");
		}
		/*
		 * If the host has this page as readonly but the guest
		 * wants to make it read/write, reduce the permissions.
		 * Checking the host permissions involves finding the
		 * memslot and then the Linux PTE for the page.
		 */
		if (hpte_is_writable(r) && kvm->arch.using_mmu_notifiers) {
			unsigned long psize, gfn, hva;
			struct kvm_memory_slot *memslot;
			pgd_t *pgdir = vcpu->arch.pgdir;
			pte_t pte;

			psize = hpte_page_size(v, r);
			gfn = ((r & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT;
			memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
			if (memslot) {
				hva = __gfn_to_hva_memslot(memslot, gfn);
				pte = lookup_linux_pte(pgdir, hva, 1, &psize);
				if (pte_present(pte) && !pte_write(pte))
					r = hpte_make_readonly(r);
			}
		}
	}
	hpte[1] = r;
	eieio();
	hpte[0] = v & ~HPTE_V_HVLOCK;
	asm volatile("ptesync" : : : "memory");
	return H_SUCCESS;
}

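/*
 * H_READ: return one HPTE (or a group of four with H_READ_4) in GPR4
 * onwards, presenting ABSENT entries as valid and substituting the
 * guest's view of the second dword so that paged-out pages look normal.
 */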
long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
		   unsigned long pte_index)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long *hpte, v, r;
	int i, n = 1;
	struct revmap_entry *rev = NULL;

	if (pte_index >= kvm->arch.hpt_npte)
		return H_PARAMETER;
	if (flags & H_READ_4) {
		pte_index &= ~3;
		n = 4;
	}
	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
	for (i = 0; i < n; ++i, ++pte_index) {
		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
		v = hpte[0] & ~HPTE_V_HVLOCK;
		r = hpte[1];
		if (v & HPTE_V_ABSENT) {
			v &= ~HPTE_V_ABSENT;
			v |= HPTE_V_VALID;
		}
		if (v & HPTE_V_VALID) {
			r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C));
			r &= ~HPTE_GR_RESERVED;
		}
		vcpu->arch.gpr[4 + i * 2] = v;
		vcpu->arch.gpr[5 + i * 2] = r;
	}
	return H_SUCCESS;
}

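/*
 * Invalidate a single HPTE and flush the old translation with a global
 * tlbie.  The caller is expected to already hold the HPTE lock.
 */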
void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep,
			unsigned long pte_index)
{
	unsigned long rb;

	hptep[0] &= ~HPTE_V_VALID;
	rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index);
	while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
		cpu_relax();
	asm volatile("ptesync" : : : "memory");
	asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
		     : : "r" (rb), "r" (kvm->arch.lpid));
	asm volatile("ptesync" : : : "memory");
	kvm->arch.tlbie_lock = 0;
}
EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte);

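/*
 * Clear the reference (R) bit of an HPTE and flush the old translation
 * so that the next access will set R again.  Only the byte containing
 * the R bit is rewritten, to avoid disturbing concurrent updates to the
 * rest of the entry.
 */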
void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep,
			   unsigned long pte_index)
{
	unsigned long rb;
	unsigned char rbyte;

	rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index);
	rbyte = (hptep[1] & ~HPTE_R_R) >> 8;
	/* modify only the second-last byte, which contains the ref bit */
	*((char *)hptep + 14) = rbyte;
	while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
		cpu_relax();
	asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
		     : : "r" (rb), "r" (kvm->arch.lpid));
	asm volatile("ptesync" : : : "memory");
	kvm->arch.tlbie_lock = 0;
}
EXPORT_SYMBOL_GPL(kvmppc_clear_ref_hpte);

static int slb_base_page_shift[4] = {
	24,	/* 16M */
	16,	/* 64k */
	34,	/* 16G */
	20,	/* 1M, unsupported */
};

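/*
 * Search the hashed page table for an entry matching the effective
 * address and SLB entry given, trying the primary and then the
 * secondary hash bucket.  On success the HPTE is left locked
 * (HPTE_V_HVLOCK set) and its index is returned; -1 means no match.
 */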
long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
			      unsigned long valid)
{
	unsigned int i;
	unsigned int pshift;
	unsigned long somask;
	unsigned long vsid, hash;
	unsigned long avpn;
	unsigned long *hpte;
	unsigned long mask, val;
	unsigned long v, r;

	/* Get page shift, work out hash and AVPN etc. */
	mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY;
	val = 0;
	pshift = 12;
	if (slb_v & SLB_VSID_L) {
		mask |= HPTE_V_LARGE;
		val |= HPTE_V_LARGE;
		pshift = slb_base_page_shift[(slb_v & SLB_VSID_LP) >> 4];
	}
	if (slb_v & SLB_VSID_B_1T) {
		somask = (1UL << 40) - 1;
		vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T;
		vsid ^= vsid << 25;
	} else {
		somask = (1UL << 28) - 1;
		vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT;
	}
	hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt_mask;
	avpn = slb_v & ~(somask >> 16);	/* also includes B */
	avpn |= (eaddr & somask) >> 16;

	if (pshift >= 24)
		avpn &= ~((1UL << (pshift - 16)) - 1);
	else
		avpn &= ~0x7fUL;
	val |= avpn;

	for (;;) {
		hpte = (unsigned long *)(kvm->arch.hpt_virt + (hash << 7));

		for (i = 0; i < 16; i += 2) {
			/* Read the PTE racily */
			v = hpte[i] & ~HPTE_V_HVLOCK;

			/* Check valid/absent, hash, segment size and AVPN */
			if (!(v & valid) || (v & mask) != val)
				continue;

			/* Lock the PTE and read it under the lock */
			while (!try_lock_hpte(&hpte[i], HPTE_V_HVLOCK))
				cpu_relax();
			v = hpte[i] & ~HPTE_V_HVLOCK;
			r = hpte[i+1];

			/*
			 * Check the HPTE again, including large page size
			 * Since we don't currently allow any MPSS (mixed
			 * page-size segment) page sizes, it is sufficient
			 * to check against the actual page size.
			 */
			if ((v & valid) && (v & mask) == val &&
			    hpte_page_size(v, r) == (1ul << pshift))
				/* Return with the HPTE still locked */
				return (hash << 3) + (i >> 1);

			/* Unlock and move on */
			hpte[i] = v;
		}

		if (val & HPTE_V_SECONDARY)
			break;
		val |= HPTE_V_SECONDARY;
		hash = hash ^ kvm->arch.hpt_mask;
	}
	return -1;
}
EXPORT_SYMBOL(kvmppc_hv_find_lock_hpte);

/*
 * Called in real mode to check whether an HPTE not found fault
 * is due to accessing a paged-out page or an emulated MMIO page,
 * or if a protection fault is due to accessing a page that the
 * guest wanted read/write access to but which we made read-only.
 * Returns a possibly modified status (DSISR) value if not
 * (i.e. pass the interrupt to the guest),
 * -1 to pass the fault up to host kernel mode code, -2 to do that
 * and also load the instruction word (for MMIO emulation),
 * or 0 if we should make the guest retry the access.
 */
long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
			  unsigned long slb_v, unsigned int status, bool data)
{
	struct kvm *kvm = vcpu->kvm;
	long int index;
	unsigned long v, r, gr;
	unsigned long *hpte;
	unsigned long valid;
	struct revmap_entry *rev;
	unsigned long pp, key;

	/* For protection fault, expect to find a valid HPTE */
	valid = HPTE_V_VALID;
	if (status & DSISR_NOHPTE)
		valid |= HPTE_V_ABSENT;

	index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid);
	if (index < 0) {
		if (status & DSISR_NOHPTE)
			return status;	/* there really was no HPTE */
		return 0;		/* for prot fault, HPTE disappeared */
	}
	hpte = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
	v = hpte[0] & ~HPTE_V_HVLOCK;
	r = hpte[1];
	rev = real_vmalloc_addr(&kvm->arch.revmap[index]);
	gr = rev->guest_rpte;

	unlock_hpte(hpte, v);

	/* For not found, if the HPTE is valid by now, retry the instruction */
	if ((status & DSISR_NOHPTE) && (v & HPTE_V_VALID))
		return 0;

	/* Check access permissions to the page */
	pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
	key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
	status &= ~DSISR_NOHPTE;	/* DSISR_NOHPTE == SRR1_ISI_NOPT */
	if (!data) {
		if (gr & (HPTE_R_N | HPTE_R_G))
			return status | SRR1_ISI_N_OR_G;
		if (!hpte_read_permission(pp, slb_v & key))
			return status | SRR1_ISI_PROT;
	} else if (status & DSISR_ISSTORE) {
		/* check write permission */
		if (!hpte_write_permission(pp, slb_v & key))
			return status | DSISR_PROTFAULT;
	} else {
		if (!hpte_read_permission(pp, slb_v & key))
			return status | DSISR_PROTFAULT;
	}

	/* Check storage key, if applicable */
	if (data && (vcpu->arch.shregs.msr & MSR_DR)) {
		unsigned int perm = hpte_get_skey_perm(gr, vcpu->arch.amr);
		if (status & DSISR_ISSTORE)
			perm >>= 1;
		if (perm & 1)
			return status | DSISR_KEYFAULT;
	}

	/* Save HPTE info for virtual-mode handler */
	vcpu->arch.pgfault_addr = addr;
	vcpu->arch.pgfault_index = index;
	vcpu->arch.pgfault_hpte[0] = v;
	vcpu->arch.pgfault_hpte[1] = r;

	/* Check the storage key to see if it is possibly emulated MMIO */
	if (data && (vcpu->arch.shregs.msr & MSR_IR) &&
	    (r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) ==
	    (HPTE_R_KEY_HI | HPTE_R_KEY_LO))
		return -2;	/* MMIO emulation - load instr word */

	return -1;		/* send fault up to host kernel mode */
}