/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * Copyright 2010-2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/hugetlb.h>
#include <linux/module.h>

#include <asm/tlbflush.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/mmu-hash64.h>
#include <asm/hvcall.h>
#include <asm/synch.h>
#include <asm/ppc-opcode.h>

/* Translate address of a vmalloc'd thing to a linear map address */
static void *real_vmalloc_addr(void *x)
{
	unsigned long addr = (unsigned long) x;
	pte_t *p;

	p = find_linux_pte(swapper_pg_dir, addr);
	if (!p || !pte_present(*p))
		return NULL;
	/* assume we don't have huge pages in vmalloc space... */
	addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK);
	return __va(addr);
}

/*
 * Add this HPTE into the chain for the real page.
 * Must be called with the chain locked; it unlocks the chain.
 */
void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
			     unsigned long *rmap, long pte_index, int realmode)
{
	struct revmap_entry *head, *tail;
	unsigned long i;

	if (*rmap & KVMPPC_RMAP_PRESENT) {
		i = *rmap & KVMPPC_RMAP_INDEX;
		head = &kvm->arch.revmap[i];
		if (realmode)
			head = real_vmalloc_addr(head);
		tail = &kvm->arch.revmap[head->back];
		if (realmode)
			tail = real_vmalloc_addr(tail);
		rev->forw = i;
		rev->back = head->back;
		tail->forw = pte_index;
		head->back = pte_index;
	} else {
		rev->forw = rev->back = pte_index;
		*rmap = (*rmap & ~KVMPPC_RMAP_INDEX) |
			pte_index | KVMPPC_RMAP_PRESENT;
	}
	unlock_rmap(rmap);
}
EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);

/*
 * Note modification of an HPTE; set the HPTE modified bit
 * if anyone is interested.
 */
static inline void note_hpte_modification(struct kvm *kvm,
					  struct revmap_entry *rev)
{
	if (atomic_read(&kvm->arch.hpte_mod_interest))
		rev->guest_rpte |= HPTE_GR_MODIFIED;
}

/* Remove this HPTE from the chain for a real page */
static void remove_revmap_chain(struct kvm *kvm, long pte_index,
				struct revmap_entry *rev,
				unsigned long hpte_v, unsigned long hpte_r)
{
	struct revmap_entry *next, *prev;
	unsigned long gfn, ptel, head;
	struct kvm_memory_slot *memslot;
	unsigned long *rmap;
	unsigned long rcbits;

	rcbits = hpte_r & (HPTE_R_R | HPTE_R_C);
	ptel = rev->guest_rpte |= rcbits;
	gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
	memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
	if (!memslot)
		return;

	rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]);
	lock_rmap(rmap);

	head = *rmap & KVMPPC_RMAP_INDEX;
	next = real_vmalloc_addr(&kvm->arch.revmap[rev->forw]);
	prev = real_vmalloc_addr(&kvm->arch.revmap[rev->back]);
	next->back = rev->back;
	prev->forw = rev->forw;
	if (head == pte_index) {
		head = rev->forw;
		if (head == pte_index)
			*rmap &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
		else
			*rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | head;
	}
	*rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT;
	unlock_rmap(rmap);
}

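/*
 * Look up the Linux PTE for a host virtual address and return it,
 * along with the size of the backing page in *pte_sizep.  Returns an
 * empty PTE if there is no present mapping or if the backing page is
 * smaller than the size the caller asked for.
 */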
static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
			      int writing, unsigned long *pte_sizep)
{
	pte_t *ptep;
	unsigned long ps = *pte_sizep;
	unsigned int shift;

	ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
	if (!ptep)
		return __pte(0);
	if (shift)
		*pte_sizep = 1ul << shift;
	else
		*pte_sizep = PAGE_SIZE;
	if (ps > *pte_sizep)
		return __pte(0);
	if (!pte_present(*ptep))
		return __pte(0);
	return kvmppc_read_update_linux_pte(ptep, writing);
}

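/*
 * Unlock an HPTE by rewriting its first doubleword; the release barrier
 * orders all previous stores to the HPTE before the unlocking store.
 */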
static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v)
{
	asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
	hpte[0] = hpte_v;
}

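/*
 * Real-mode worker for the H_ENTER hypercall: validate the requested
 * mapping, translate the guest real address to a host address, find and
 * lock a free slot in the HPT group, hook the entry into the reverse-map
 * chain and finally write the HPTE.  The slot actually used is returned
 * in *pte_idx_ret.
 */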
long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
		       long pte_index, unsigned long pteh, unsigned long ptel,
		       pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret)
{
	unsigned long i, pa, gpa, gfn, psize;
	unsigned long slot_fn, hva;
	unsigned long *hpte;
	struct revmap_entry *rev;
	unsigned long g_ptel;
	struct kvm_memory_slot *memslot;
	unsigned long *physp, pte_size;
	unsigned long is_io;
	unsigned long *rmap;
	pte_t pte;
	unsigned int writing;
	unsigned long mmu_seq;
	unsigned long rcbits;

	psize = hpte_page_size(pteh, ptel);
	if (!psize)
		return H_PARAMETER;
	writing = hpte_is_writable(ptel);
	pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID);
	ptel &= ~HPTE_GR_RESERVED;
	g_ptel = ptel;

	/* used later to detect if we might have been invalidated */
	mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();

	/* Find the memslot (if any) for this address */
	gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
	gfn = gpa >> PAGE_SHIFT;
	memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
	pa = 0;
	is_io = ~0ul;
	rmap = NULL;
	if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) {
		/* PPC970 can't do emulated MMIO */
		if (!cpu_has_feature(CPU_FTR_ARCH_206))
			return H_PARAMETER;
		/* Emulated MMIO - mark this with key=31 */
		pteh |= HPTE_V_ABSENT;
		ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO;
		goto do_insert;
	}

	/* Check if the requested page fits entirely in the memslot. */
	if (!slot_is_aligned(memslot, psize))
		return H_PARAMETER;
	slot_fn = gfn - memslot->base_gfn;
	rmap = &memslot->arch.rmap[slot_fn];

	if (!kvm->arch.using_mmu_notifiers) {
		physp = memslot->arch.slot_phys;
		if (!physp)
			return H_PARAMETER;
		physp += slot_fn;
		if (realmode)
			physp = real_vmalloc_addr(physp);
		pa = *physp;
		if (!pa)
			return H_TOO_HARD;
		is_io = pa & (HPTE_R_I | HPTE_R_W);
		pte_size = PAGE_SIZE << (pa & KVMPPC_PAGE_ORDER_MASK);
		pa &= PAGE_MASK;
	} else {
		/* Translate to host virtual address */
		hva = __gfn_to_hva_memslot(memslot, gfn);

		/* Look up the Linux PTE for the backing page */
		pte_size = psize;
		pte = lookup_linux_pte(pgdir, hva, writing, &pte_size);
		if (pte_present(pte)) {
			if (writing && !pte_write(pte))
				/* make the actual HPTE be read-only */
				ptel = hpte_make_readonly(ptel);
			is_io = hpte_cache_bits(pte_val(pte));
			pa = pte_pfn(pte) << PAGE_SHIFT;
		}
	}

	if (pte_size < psize)
		return H_PARAMETER;
	if (pa && pte_size > psize)
		pa |= gpa & (pte_size - 1);

	ptel &= ~(HPTE_R_PP0 - psize);
	ptel |= pa;

	if (pa)
		pteh |= HPTE_V_VALID;
	else
		pteh |= HPTE_V_ABSENT;

	/* Check WIMG */
	if (is_io != ~0ul && !hpte_cache_flags_ok(ptel, is_io)) {
		if (is_io)
			return H_PARAMETER;
		/*
		 * Allow guest to map emulated device memory as
		 * uncacheable, but actually make it cacheable.
		 */
		ptel &= ~(HPTE_R_W|HPTE_R_I|HPTE_R_G);
		ptel |= HPTE_R_M;
	}

	/* Find and lock the HPTEG slot to use */
 do_insert:
	if (pte_index >= kvm->arch.hpt_npte)
		return H_PARAMETER;
	if (likely((flags & H_EXACT) == 0)) {
		pte_index &= ~7UL;
		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
		for (i = 0; i < 8; ++i) {
			if ((*hpte & HPTE_V_VALID) == 0 &&
			    try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
					  HPTE_V_ABSENT))
				break;
			hpte += 2;
		}
		if (i == 8) {
			/*
			 * Since try_lock_hpte doesn't retry (not even stdcx.
			 * failures), it could be that there is a free slot
			 * but we transiently failed to lock it.  Try again,
			 * actually locking each slot and checking it.
			 */
			hpte -= 16;
			for (i = 0; i < 8; ++i) {
				while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
					cpu_relax();
				if (!(*hpte & (HPTE_V_VALID | HPTE_V_ABSENT)))
					break;
				*hpte &= ~HPTE_V_HVLOCK;
				hpte += 2;
			}
			if (i == 8)
				return H_PTEG_FULL;
		}
		pte_index += i;
	} else {
		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
		if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
				   HPTE_V_ABSENT)) {
			/* Lock the slot and check again */
			while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
				cpu_relax();
			if (*hpte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
				*hpte &= ~HPTE_V_HVLOCK;
				return H_PTEG_FULL;
			}
		}
	}

	/* Save away the guest's idea of the second HPTE dword */
	rev = &kvm->arch.revmap[pte_index];
	if (realmode)
		rev = real_vmalloc_addr(rev);
	if (rev) {
		rev->guest_rpte = g_ptel;
		note_hpte_modification(kvm, rev);
	}

	/* Link HPTE into reverse-map chain */
	if (pteh & HPTE_V_VALID) {
		if (realmode)
			rmap = real_vmalloc_addr(rmap);
		lock_rmap(rmap);
		/* Check for pending invalidations under the rmap chain lock */
		if (kvm->arch.using_mmu_notifiers &&
		    mmu_notifier_retry(kvm, mmu_seq)) {
			/* inval in progress, write a non-present HPTE */
			pteh |= HPTE_V_ABSENT;
			pteh &= ~HPTE_V_VALID;
			unlock_rmap(rmap);
		} else {
			kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index,
						realmode);
			/* Only set R/C in real HPTE if already set in *rmap */
			rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
			ptel &= rcbits | ~(HPTE_R_R | HPTE_R_C);
		}
	}

	hpte[1] = ptel;

	/* Write the first HPTE dword, unlocking the HPTE and making it valid */
	eieio();
	hpte[0] = pteh;
	asm volatile("ptesync" : : : "memory");

	*pte_idx_ret = pte_index;
	return H_SUCCESS;
}
EXPORT_SYMBOL_GPL(kvmppc_do_h_enter);

long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
		    long pte_index, unsigned long pteh, unsigned long ptel)
{
	return kvmppc_do_h_enter(vcpu->kvm, flags, pte_index, pteh, ptel,
				 vcpu->arch.pgdir, true, &vcpu->arch.gpr[4]);
}

#define LOCK_TOKEN	(*(u32 *)(&get_paca()->lock_token))

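/*
 * Try once to take the global tlbie lock; returns non-zero if the
 * lock was acquired, zero if it was already held.
 */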
static inline int try_lock_tlbie(unsigned int *lock)
{
	unsigned int tmp, old;
	unsigned int token = LOCK_TOKEN;

	asm volatile("1:lwarx	%1,0,%2\n"
		     "	cmpwi	cr0,%1,0\n"
		     "	bne	2f\n"
		     "  stwcx.	%3,0,%2\n"
		     "	bne-	1b\n"
		     "  isync\n"
		     "2:"
		     : "=&r" (tmp), "=&r" (old)
		     : "r" (lock), "r" (token)
		     : "cc", "memory");
	return old == 0;
}

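/*
 * Real-mode worker for the H_REMOVE hypercall: lock the HPTE, check the
 * AVPN/ANDCOND conditions, invalidate the entry, flush the TLB and
 * return the old first and second doublewords through hpret[].
 */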
long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
			unsigned long pte_index, unsigned long avpn,
			unsigned long *hpret)
{
	unsigned long *hpte;
	unsigned long v, r, rb;
	struct revmap_entry *rev;

	if (pte_index >= kvm->arch.hpt_npte)
		return H_PARAMETER;
	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
		cpu_relax();
	if ((hpte[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn) ||
	    ((flags & H_ANDCOND) && (hpte[0] & avpn) != 0)) {
		hpte[0] &= ~HPTE_V_HVLOCK;
		return H_NOT_FOUND;
	}

	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
	v = hpte[0] & ~HPTE_V_HVLOCK;
	if (v & HPTE_V_VALID) {
		hpte[0] &= ~HPTE_V_VALID;
		rb = compute_tlbie_rb(v, hpte[1], pte_index);
		if (!(flags & H_LOCAL) && atomic_read(&kvm->online_vcpus) > 1) {
			while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
				cpu_relax();
			asm volatile("ptesync" : : : "memory");
			asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
				     : : "r" (rb), "r" (kvm->arch.lpid));
			asm volatile("ptesync" : : : "memory");
			kvm->arch.tlbie_lock = 0;
		} else {
			asm volatile("ptesync" : : : "memory");
			asm volatile("tlbiel %0" : : "r" (rb));
			asm volatile("ptesync" : : : "memory");
		}
		/* Read PTE low word after tlbie to get final R/C values */
		remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]);
	}
	r = rev->guest_rpte & ~HPTE_GR_RESERVED;
	note_hpte_modification(kvm, rev);
	unlock_hpte(hpte, 0);

	hpret[0] = v;
	hpret[1] = r;
	return H_SUCCESS;
}
EXPORT_SYMBOL_GPL(kvmppc_do_h_remove);

long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
		     unsigned long pte_index, unsigned long avpn)
{
	return kvmppc_do_h_remove(vcpu->kvm, flags, pte_index, avpn,
				  &vcpu->arch.gpr[4]);
}

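/*
 * H_BULK_REMOVE: process up to four remove requests passed in the
 * guest's argument registers, batching the invalidations so that one
 * tlbie/tlbsync (or tlbiel) sequence covers the whole batch.
 */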
long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long *args = &vcpu->arch.gpr[4];
	unsigned long *hp, *hptes[4], tlbrb[4];
	long int i, j, k, n, found, indexes[4];
	unsigned long flags, req, pte_index, rcbits;
	long int local = 0;
	long int ret = H_SUCCESS;
	struct revmap_entry *rev, *revs[4];

	if (atomic_read(&kvm->online_vcpus) == 1)
		local = 1;
	for (i = 0; i < 4 && ret == H_SUCCESS; ) {
		n = 0;
		for (; i < 4; ++i) {
			j = i * 2;
			pte_index = args[j];
			flags = pte_index >> 56;
			pte_index &= ((1ul << 56) - 1);
			req = flags >> 6;
			flags &= 3;
			if (req == 3) {		/* no more requests */
				i = 4;
				break;
			}
			if (req != 1 || flags == 3 ||
			    pte_index >= kvm->arch.hpt_npte) {
				/* parameter error */
				args[j] = ((0xa0 | flags) << 56) + pte_index;
				ret = H_PARAMETER;
				break;
			}
			hp = (unsigned long *)
				(kvm->arch.hpt_virt + (pte_index << 4));
			/* to avoid deadlock, don't spin except for first */
			if (!try_lock_hpte(hp, HPTE_V_HVLOCK)) {
				if (n)
					break;
				while (!try_lock_hpte(hp, HPTE_V_HVLOCK))
					cpu_relax();
			}
			found = 0;
			if (hp[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) {
				switch (flags & 3) {
				case 0:		/* absolute */
					found = 1;
					break;
				case 1:		/* andcond */
					if (!(hp[0] & args[j + 1]))
						found = 1;
					break;
				case 2:		/* AVPN */
					if ((hp[0] & ~0x7fUL) == args[j + 1])
						found = 1;
					break;
				}
			}
			if (!found) {
				hp[0] &= ~HPTE_V_HVLOCK;
				args[j] = ((0x90 | flags) << 56) + pte_index;
				continue;
			}

			args[j] = ((0x80 | flags) << 56) + pte_index;
			rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
			note_hpte_modification(kvm, rev);

			if (!(hp[0] & HPTE_V_VALID)) {
				/* insert R and C bits from PTE */
				rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
				args[j] |= rcbits << (56 - 5);
				hp[0] = 0;
				continue;
			}

			hp[0] &= ~HPTE_V_VALID;		/* leave it locked */
			tlbrb[n] = compute_tlbie_rb(hp[0], hp[1], pte_index);
			indexes[n] = j;
			hptes[n] = hp;
			revs[n] = rev;
			++n;
		}

		if (!n)
			break;

		/* Now that we've collected a batch, do the tlbies */
		if (!local) {
			while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
				cpu_relax();
			asm volatile("ptesync" : : : "memory");
			for (k = 0; k < n; ++k)
				asm volatile(PPC_TLBIE(%1,%0) : :
					     "r" (tlbrb[k]),
					     "r" (kvm->arch.lpid));
			asm volatile("eieio; tlbsync; ptesync" : : : "memory");
			kvm->arch.tlbie_lock = 0;
		} else {
			asm volatile("ptesync" : : : "memory");
			for (k = 0; k < n; ++k)
				asm volatile("tlbiel %0" : : "r" (tlbrb[k]));
			asm volatile("ptesync" : : : "memory");
		}

		/* Read PTE low words after tlbie to get final R/C values */
		for (k = 0; k < n; ++k) {
			j = indexes[k];
			pte_index = args[j] & ((1ul << 56) - 1);
			hp = hptes[k];
			rev = revs[k];
			remove_revmap_chain(kvm, pte_index, rev, hp[0], hp[1]);
			rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
			args[j] |= rcbits << (56 - 5);
			hp[0] = 0;
		}
	}

	return ret;
}

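/*
 * H_PROTECT: update the protection and key bits of an existing HPTE,
 * keeping the guest's view in the revmap entry in sync and flushing
 * the TLB if the entry was valid.
 */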
long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
		      unsigned long pte_index, unsigned long avpn,
		      unsigned long va)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long *hpte;
	struct revmap_entry *rev;
	unsigned long v, r, rb, mask, bits;

	if (pte_index >= kvm->arch.hpt_npte)
		return H_PARAMETER;

	hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
		cpu_relax();
	if ((hpte[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
	    ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn)) {
		hpte[0] &= ~HPTE_V_HVLOCK;
		return H_NOT_FOUND;
	}

	if (atomic_read(&kvm->online_vcpus) == 1)
		flags |= H_LOCAL;
	v = hpte[0];
	bits = (flags << 55) & HPTE_R_PP0;
	bits |= (flags << 48) & HPTE_R_KEY_HI;
	bits |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);

	/* Update guest view of 2nd HPTE dword */
	mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
		HPTE_R_KEY_HI | HPTE_R_KEY_LO;
	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
	if (rev) {
		r = (rev->guest_rpte & ~mask) | bits;
		rev->guest_rpte = r;
		note_hpte_modification(kvm, rev);
	}
	r = (hpte[1] & ~mask) | bits;

	/* Update HPTE */
	if (v & HPTE_V_VALID) {
		rb = compute_tlbie_rb(v, r, pte_index);
		hpte[0] = v & ~HPTE_V_VALID;
		if (!(flags & H_LOCAL)) {
			while(!try_lock_tlbie(&kvm->arch.tlbie_lock))
				cpu_relax();
			asm volatile("ptesync" : : : "memory");
			asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
				     : : "r" (rb), "r" (kvm->arch.lpid));
			asm volatile("ptesync" : : : "memory");
			kvm->arch.tlbie_lock = 0;
		} else {
			asm volatile("ptesync" : : : "memory");
			asm volatile("tlbiel %0" : : "r" (rb));
			asm volatile("ptesync" : : : "memory");
		}
	}
	hpte[1] = r;
	eieio();
	hpte[0] = v & ~HPTE_V_HVLOCK;
	asm volatile("ptesync" : : : "memory");
	return H_SUCCESS;
}

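/*
 * H_READ: return the contents of one HPTE (or four with H_READ_4),
 * giving the guest its own view of the second doubleword rather than
 * the host's, and reporting HPTE_V_ABSENT entries as valid.
 */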
long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
		   unsigned long pte_index)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long *hpte, v, r;
	int i, n = 1;
	struct revmap_entry *rev = NULL;

	if (pte_index >= kvm->arch.hpt_npte)
		return H_PARAMETER;
	if (flags & H_READ_4) {
		pte_index &= ~3;
		n = 4;
	}
	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
	for (i = 0; i < n; ++i, ++pte_index) {
		hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
		v = hpte[0] & ~HPTE_V_HVLOCK;
		r = hpte[1];
		if (v & HPTE_V_ABSENT) {
			v &= ~HPTE_V_ABSENT;
			v |= HPTE_V_VALID;
		}
		if (v & HPTE_V_VALID) {
			r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C));
			r &= ~HPTE_GR_RESERVED;
		}
		vcpu->arch.gpr[4 + i * 2] = v;
		vcpu->arch.gpr[5 + i * 2] = r;
	}
	return H_SUCCESS;
}

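/*
 * Clear the valid bit of an HPTE and flush the corresponding entry
 * from the TLB with a global tlbie.
 */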
void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep,
			unsigned long pte_index)
{
	unsigned long rb;

	hptep[0] &= ~HPTE_V_VALID;
	rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index);
	while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
		cpu_relax();
	asm volatile("ptesync" : : : "memory");
	asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
		     : : "r" (rb), "r" (kvm->arch.lpid));
	asm volatile("ptesync" : : : "memory");
	kvm->arch.tlbie_lock = 0;
}
EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte);

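/*
 * Clear the reference (R) bit of an HPTE and flush the TLB entry it
 * maps so that the bit can be set again on the next access.
 */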
void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep,
			   unsigned long pte_index)
{
	unsigned long rb;
	unsigned char rbyte;

	rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index);
	rbyte = (hptep[1] & ~HPTE_R_R) >> 8;
	/* modify only the second-last byte, which contains the ref bit */
	*((char *)hptep + 14) = rbyte;
	while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
		cpu_relax();
	asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
		     : : "r" (rb), "r" (kvm->arch.lpid));
	asm volatile("ptesync" : : : "memory");
	kvm->arch.tlbie_lock = 0;
}
EXPORT_SYMBOL_GPL(kvmppc_clear_ref_hpte);

static int slb_base_page_shift[4] = {
	24,	/* 16M */
	16,	/* 64k */
	34,	/* 16G */
	20,	/* 1M, unsupported */
};

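/*
 * Search the hashed page table for an entry matching the given
 * effective address and SLB entry contents, much as the MMU would.
 * On success the HPTE is left locked (HPTE_V_HVLOCK set) and its
 * global index is returned; -1 is returned if no match is found.
 */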
long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
			      unsigned long valid)
{
	unsigned int i;
	unsigned int pshift;
	unsigned long somask;
	unsigned long vsid, hash;
	unsigned long avpn;
	unsigned long *hpte;
	unsigned long mask, val;
	unsigned long v, r;

	/* Get page shift, work out hash and AVPN etc. */
	mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY;
	val = 0;
	pshift = 12;
	if (slb_v & SLB_VSID_L) {
		mask |= HPTE_V_LARGE;
		val |= HPTE_V_LARGE;
		pshift = slb_base_page_shift[(slb_v & SLB_VSID_LP) >> 4];
	}
	if (slb_v & SLB_VSID_B_1T) {
		somask = (1UL << 40) - 1;
		vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T;
		vsid ^= vsid << 25;
	} else {
		somask = (1UL << 28) - 1;
		vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT;
	}
	hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt_mask;
	avpn = slb_v & ~(somask >> 16);	/* also includes B */
	avpn |= (eaddr & somask) >> 16;

	if (pshift >= 24)
		avpn &= ~((1UL << (pshift - 16)) - 1);
	else
		avpn &= ~0x7fUL;
	val |= avpn;

	for (;;) {
		hpte = (unsigned long *)(kvm->arch.hpt_virt + (hash << 7));

		for (i = 0; i < 16; i += 2) {
			/* Read the PTE racily */
			v = hpte[i] & ~HPTE_V_HVLOCK;

			/* Check valid/absent, hash, segment size and AVPN */
			if (!(v & valid) || (v & mask) != val)
				continue;

			/* Lock the PTE and read it under the lock */
			while (!try_lock_hpte(&hpte[i], HPTE_V_HVLOCK))
				cpu_relax();
			v = hpte[i] & ~HPTE_V_HVLOCK;
			r = hpte[i+1];

			/*
			 * Check the HPTE again, including large page size
			 * Since we don't currently allow any MPSS (mixed
			 * page-size segment) page sizes, it is sufficient
			 * to check against the actual page size.
			 */
			if ((v & valid) && (v & mask) == val &&
			    hpte_page_size(v, r) == (1ul << pshift))
				/* Return with the HPTE still locked */
				return (hash << 3) + (i >> 1);

			/* Unlock and move on */
			hpte[i] = v;
		}

		if (val & HPTE_V_SECONDARY)
			break;
		val |= HPTE_V_SECONDARY;
		hash = hash ^ kvm->arch.hpt_mask;
	}
	return -1;
}
EXPORT_SYMBOL(kvmppc_hv_find_lock_hpte);

/*
 * Called in real mode to check whether an HPTE not found fault
 * is due to accessing a paged-out page or an emulated MMIO page,
 * or if a protection fault is due to accessing a page that the
 * guest wanted read/write access to but which we made read-only.
 * Returns a possibly modified status (DSISR) value if not
 * (i.e. pass the interrupt to the guest),
 * -1 to pass the fault up to host kernel mode code, -2 to do that
 * and also load the instruction word (for MMIO emulation),
 * or 0 if we should make the guest retry the access.
 */
long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
			  unsigned long slb_v, unsigned int status, bool data)
{
	struct kvm *kvm = vcpu->kvm;
	long int index;
	unsigned long v, r, gr;
	unsigned long *hpte;
	unsigned long valid;
	struct revmap_entry *rev;
	unsigned long pp, key;

	/* For protection fault, expect to find a valid HPTE */
	valid = HPTE_V_VALID;
	if (status & DSISR_NOHPTE)
		valid |= HPTE_V_ABSENT;

	index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid);
	if (index < 0) {
		if (status & DSISR_NOHPTE)
			return status;	/* there really was no HPTE */
		return 0;		/* for prot fault, HPTE disappeared */
	}
	hpte = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
	v = hpte[0] & ~HPTE_V_HVLOCK;
	r = hpte[1];
	rev = real_vmalloc_addr(&kvm->arch.revmap[index]);
	gr = rev->guest_rpte;

	unlock_hpte(hpte, v);

	/* For not found, if the HPTE is valid by now, retry the instruction */
	if ((status & DSISR_NOHPTE) && (v & HPTE_V_VALID))
		return 0;

	/* Check access permissions to the page */
	pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
	key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
	status &= ~DSISR_NOHPTE;	/* DSISR_NOHPTE == SRR1_ISI_NOPT */
	if (!data) {
		if (gr & (HPTE_R_N | HPTE_R_G))
			return status | SRR1_ISI_N_OR_G;
		if (!hpte_read_permission(pp, slb_v & key))
			return status | SRR1_ISI_PROT;
	} else if (status & DSISR_ISSTORE) {
		/* check write permission */
		if (!hpte_write_permission(pp, slb_v & key))
			return status | DSISR_PROTFAULT;
	} else {
		if (!hpte_read_permission(pp, slb_v & key))
			return status | DSISR_PROTFAULT;
	}

	/* Check storage key, if applicable */
	if (data && (vcpu->arch.shregs.msr & MSR_DR)) {
		unsigned int perm = hpte_get_skey_perm(gr, vcpu->arch.amr);
		if (status & DSISR_ISSTORE)
			perm >>= 1;
		if (perm & 1)
			return status | DSISR_KEYFAULT;
	}

	/* Save HPTE info for virtual-mode handler */
	vcpu->arch.pgfault_addr = addr;
	vcpu->arch.pgfault_index = index;
	vcpu->arch.pgfault_hpte[0] = v;
	vcpu->arch.pgfault_hpte[1] = r;

	/* Check the storage key to see if it is possibly emulated MMIO */
	if (data && (vcpu->arch.shregs.msr & MSR_IR) &&
	    (r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) ==
	    (HPTE_R_KEY_HI | HPTE_R_KEY_LO))
		return -2;	/* MMIO emulation - load instr word */

	return -1;		/* send fault up to host kernel mode */
}