/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * Copyright 2010-2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/hugetlb.h>
#include <linux/module.h>

#include <asm/tlbflush.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/mmu-hash64.h>
#include <asm/hvcall.h>
#include <asm/synch.h>
#include <asm/ppc-opcode.h>

/* Translate address of a vmalloc'd thing to a linear map address */
static void *real_vmalloc_addr(void *x)
{
	unsigned long addr = (unsigned long) x;
	pte_t *p;

	p = find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL);
	if (!p || !pte_present(*p))
		return NULL;
	/* assume we don't have huge pages in vmalloc space... */
	addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK);
	return __va(addr);
}
/* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */
static int global_invalidates(struct kvm *kvm, unsigned long flags)
{
	int global;

	/*
	 * If there is only one vcore, and it's currently running,
	 * as indicated by local_paca->kvm_hstate.kvm_vcpu being set,
	 * we can use tlbiel as long as we mark all other physical
	 * cores as potentially having stale TLB entries for this lpid.
	 * If we're not using MMU notifiers, we never take pages away
	 * from the guest, so we can use tlbiel if requested.
	 * Otherwise, don't use tlbiel.
	 */
	if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcpu)
		global = 0;
	else if (kvm->arch.using_mmu_notifiers)
		global = 1;
	else
		global = !(flags & H_LOCAL);

	if (!global) {
		/* any other core might now have stale TLB entries... */
		smp_wmb();
		cpumask_setall(&kvm->arch.need_tlb_flush);
		cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu,
				  &kvm->arch.need_tlb_flush);
	}

	return global;
}

/*
 * Add this HPTE into the chain for the real page.
 * Must be called with the chain locked; it unlocks the chain.
 */
void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
			     unsigned long *rmap, long pte_index, int realmode)
{
	struct revmap_entry *head, *tail;
	unsigned long i;

	if (*rmap & KVMPPC_RMAP_PRESENT) {
		i = *rmap & KVMPPC_RMAP_INDEX;
		head = &kvm->arch.revmap[i];
		if (realmode)
			head = real_vmalloc_addr(head);
		tail = &kvm->arch.revmap[head->back];
		if (realmode)
			tail = real_vmalloc_addr(tail);
		rev->forw = i;
		rev->back = head->back;
		tail->forw = pte_index;
		head->back = pte_index;
	} else {
		rev->forw = rev->back = pte_index;
		*rmap = (*rmap & ~KVMPPC_RMAP_INDEX) |
			pte_index | KVMPPC_RMAP_PRESENT;
	}
	unlock_rmap(rmap);
}
EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);

/* Remove this HPTE from the chain for a real page */
static void remove_revmap_chain(struct kvm *kvm, long pte_index,
				struct revmap_entry *rev,
				unsigned long hpte_v, unsigned long hpte_r)
{
	struct revmap_entry *next, *prev;
	unsigned long gfn, ptel, head;
	struct kvm_memory_slot *memslot;
	unsigned long *rmap;
	unsigned long rcbits;

	rcbits = hpte_r & (HPTE_R_R | HPTE_R_C);
	ptel = rev->guest_rpte |= rcbits;
	gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
	memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
	if (!memslot)
		return;

	rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]);
	lock_rmap(rmap);

	head = *rmap & KVMPPC_RMAP_INDEX;
	next = real_vmalloc_addr(&kvm->arch.revmap[rev->forw]);
	prev = real_vmalloc_addr(&kvm->arch.revmap[rev->back]);
	next->back = rev->back;
	prev->forw = rev->forw;
	if (head == pte_index) {
		head = rev->forw;
		if (head == pte_index)
			*rmap &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
		else
			*rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | head;
	}
	*rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT;
	unlock_rmap(rmap);
}

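/*
 * Look up the Linux PTE for a host virtual address and update its
 * accessed (and, if writing, dirty) bits.  Returns the PTE, or __pte(0)
 * if there is no PTE or the mapping is smaller than the size requested
 * in *pte_sizep; on success *pte_sizep is set to the actual page size.
 */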
static pte_t lookup_linux_pte_and_update(pgd_t *pgdir, unsigned long hva,
			      int writing, unsigned long *pte_sizep)
{
	pte_t *ptep;
	unsigned long ps = *pte_sizep;
	unsigned int hugepage_shift;

	ptep = find_linux_pte_or_hugepte(pgdir, hva, &hugepage_shift);
	if (!ptep)
		return __pte(0);
	if (hugepage_shift)
		*pte_sizep = 1ul << hugepage_shift;
	else
		*pte_sizep = PAGE_SIZE;
	if (ps > *pte_sizep)
		return __pte(0);
	return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift);
}

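/*
 * Unlock an HPTE by storing the given first doubleword, which must not
 * have HPTE_V_HVLOCK set, after a release barrier so that all prior
 * updates to the HPTE are visible first.
 */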
static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v)
{
	asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
	hpte[0] = cpu_to_be64(hpte_v);
}

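/*
 * Worker for the H_ENTER hcall: validate the proposed HPTE, translate
 * the guest physical address, find and lock a free slot in the HPTE
 * group, link the entry into the rmap chain and write it into the HPT.
 * Called from both real mode and virtual mode.
 */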
long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
		       long pte_index, unsigned long pteh, unsigned long ptel,
		       pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret)
{
	unsigned long i, pa, gpa, gfn, psize;
	unsigned long slot_fn, hva;
	__be64 *hpte;
	struct revmap_entry *rev;
	unsigned long g_ptel;
	struct kvm_memory_slot *memslot;
	unsigned long *physp, pte_size;
	unsigned long is_io;
	unsigned long *rmap;
	pte_t pte;
	unsigned int writing;
	unsigned long mmu_seq;
	unsigned long rcbits;

	psize = hpte_page_size(pteh, ptel);
	if (!psize)
		return H_PARAMETER;
	writing = hpte_is_writable(ptel);
	pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID);
	ptel &= ~HPTE_GR_RESERVED;
	g_ptel = ptel;

	/* used later to detect if we might have been invalidated */
	mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();

	/* Find the memslot (if any) for this address */
	gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
	gfn = gpa >> PAGE_SHIFT;
	memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
	pa = 0;
	is_io = ~0ul;
	rmap = NULL;
	if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) {
		/* PPC970 can't do emulated MMIO */
		if (!cpu_has_feature(CPU_FTR_ARCH_206))
			return H_PARAMETER;
		/* Emulated MMIO - mark this with key=31 */
		pteh |= HPTE_V_ABSENT;
		ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO;
		goto do_insert;
	}

	/* Check if the requested page fits entirely in the memslot. */
	if (!slot_is_aligned(memslot, psize))
		return H_PARAMETER;
	slot_fn = gfn - memslot->base_gfn;
	rmap = &memslot->arch.rmap[slot_fn];

	if (!kvm->arch.using_mmu_notifiers) {
		physp = memslot->arch.slot_phys;
		if (!physp)
			return H_PARAMETER;
		physp += slot_fn;
		if (realmode)
			physp = real_vmalloc_addr(physp);
		pa = *physp;
		if (!pa)
			return H_TOO_HARD;
		is_io = pa & (HPTE_R_I | HPTE_R_W);
		pte_size = PAGE_SIZE << (pa & KVMPPC_PAGE_ORDER_MASK);
		pa &= PAGE_MASK;
		pa |= gpa & ~PAGE_MASK;
	} else {
		/* Translate to host virtual address */
		hva = __gfn_to_hva_memslot(memslot, gfn);

		/* Look up the Linux PTE for the backing page */
		pte_size = psize;
		pte = lookup_linux_pte_and_update(pgdir, hva, writing,
						  &pte_size);
		if (pte_present(pte) && !pte_numa(pte)) {
			if (writing && !pte_write(pte))
				/* make the actual HPTE be read-only */
				ptel = hpte_make_readonly(ptel);
			is_io = hpte_cache_bits(pte_val(pte));
			pa = pte_pfn(pte) << PAGE_SHIFT;
			pa |= hva & (pte_size - 1);
			pa |= gpa & ~PAGE_MASK;
		}
	}

	if (pte_size < psize)
		return H_PARAMETER;

	ptel &= ~(HPTE_R_PP0 - psize);
	ptel |= pa;

	if (pa)
		pteh |= HPTE_V_VALID;
	else
		pteh |= HPTE_V_ABSENT;

	/* Check WIMG */
	if (is_io != ~0ul && !hpte_cache_flags_ok(ptel, is_io)) {
		if (is_io)
			return H_PARAMETER;
		/*
		 * Allow guest to map emulated device memory as
		 * uncacheable, but actually make it cacheable.
		 */
		ptel &= ~(HPTE_R_W|HPTE_R_I|HPTE_R_G);
		ptel |= HPTE_R_M;
	}

	/* Find and lock the HPTEG slot to use */
 do_insert:
	if (pte_index >= kvm->arch.hpt_npte)
		return H_PARAMETER;
	if (likely((flags & H_EXACT) == 0)) {
		pte_index &= ~7UL;
		hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
		for (i = 0; i < 8; ++i) {
			if ((be64_to_cpu(*hpte) & HPTE_V_VALID) == 0 &&
			    try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
					  HPTE_V_ABSENT))
				break;
			hpte += 2;
		}
		if (i == 8) {
			/*
			 * Since try_lock_hpte doesn't retry (not even stdcx.
			 * failures), it could be that there is a free slot
			 * but we transiently failed to lock it.  Try again,
			 * actually locking each slot and checking it.
			 */
			hpte -= 16;
			for (i = 0; i < 8; ++i) {
				u64 pte;
				while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
					cpu_relax();
				pte = be64_to_cpu(*hpte);
				if (!(pte & (HPTE_V_VALID | HPTE_V_ABSENT)))
					break;
				*hpte &= ~cpu_to_be64(HPTE_V_HVLOCK);
				hpte += 2;
			}
			if (i == 8)
				return H_PTEG_FULL;
		}
		pte_index += i;
	} else {
		hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
		if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
				   HPTE_V_ABSENT)) {
			/* Lock the slot and check again */
			u64 pte;

			while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
				cpu_relax();
			pte = be64_to_cpu(*hpte);
			if (pte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
				*hpte &= ~cpu_to_be64(HPTE_V_HVLOCK);
				return H_PTEG_FULL;
			}
		}
	}

	/* Save away the guest's idea of the second HPTE dword */
	rev = &kvm->arch.revmap[pte_index];
	if (realmode)
		rev = real_vmalloc_addr(rev);
	if (rev) {
		rev->guest_rpte = g_ptel;
		note_hpte_modification(kvm, rev);
	}

	/* Link HPTE into reverse-map chain */
	if (pteh & HPTE_V_VALID) {
		if (realmode)
			rmap = real_vmalloc_addr(rmap);
		lock_rmap(rmap);
		/* Check for pending invalidations under the rmap chain lock */
		if (kvm->arch.using_mmu_notifiers &&
		    mmu_notifier_retry(kvm, mmu_seq)) {
			/* inval in progress, write a non-present HPTE */
			pteh |= HPTE_V_ABSENT;
			pteh &= ~HPTE_V_VALID;
			unlock_rmap(rmap);
		} else {
			kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index,
						realmode);
			/* Only set R/C in real HPTE if already set in *rmap */
			rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
			ptel &= rcbits | ~(HPTE_R_R | HPTE_R_C);
		}
	}

	hpte[1] = cpu_to_be64(ptel);

	/* Write the first HPTE dword, unlocking the HPTE and making it valid */
	eieio();
	hpte[0] = cpu_to_be64(pteh);
	asm volatile("ptesync" : : : "memory");

	*pte_idx_ret = pte_index;
	return H_SUCCESS;
}
EXPORT_SYMBOL_GPL(kvmppc_do_h_enter);

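/* Real-mode H_ENTER handler; the chosen HPTE index is returned in gpr[4] */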
long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
		    long pte_index, unsigned long pteh, unsigned long ptel)
{
	return kvmppc_do_h_enter(vcpu->kvm, flags, pte_index, pteh, ptel,
				 vcpu->arch.pgdir, true, &vcpu->arch.gpr[4]);
}

#ifdef __BIG_ENDIAN__
#define LOCK_TOKEN	(*(u32 *)(&get_paca()->lock_token))
#else
#define LOCK_TOKEN	(*(u32 *)(&get_paca()->paca_index))
#endif

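/*
 * Try to acquire the tlbie lock using lwarx/stwcx. with this CPU's
 * lock token; returns true if the lock was taken.
 */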
static inline int try_lock_tlbie(unsigned int *lock)
{
	unsigned int tmp, old;
	unsigned int token = LOCK_TOKEN;

	asm volatile("1:lwarx	%1,0,%2\n"
		     "	cmpwi	cr0,%1,0\n"
		     "	bne	2f\n"
		     "  stwcx.	%3,0,%2\n"
		     "	bne-	1b\n"
		     "  isync\n"
		     "2:"
		     : "=&r" (tmp), "=&r" (old)
		     : "r" (lock), "r" (token)
		     : "cc", "memory");
	return old == 0;
}

/*
 * tlbie/tlbiel is a bit different on the PPC970 compared to later
 * processors such as POWER7; the large page bit is in the instruction
 * not RB, and the top 16 bits and the bottom 12 bits of the VA
 * in RB must be 0.
 */
static void do_tlbies_970(struct kvm *kvm, unsigned long *rbvalues,
			  long npages, int global, bool need_sync)
{
	long i;

	if (global) {
		while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
			cpu_relax();
		if (need_sync)
			asm volatile("ptesync" : : : "memory");
		for (i = 0; i < npages; ++i) {
			unsigned long rb = rbvalues[i];

			if (rb & 1)		/* large page */
				asm volatile("tlbie %0,1" : :
					     "r" (rb & 0x0000fffffffff000ul));
			else
				asm volatile("tlbie %0,0" : :
					     "r" (rb & 0x0000fffffffff000ul));
		}
		asm volatile("eieio; tlbsync; ptesync" : : : "memory");
		kvm->arch.tlbie_lock = 0;
	} else {
		if (need_sync)
			asm volatile("ptesync" : : : "memory");
		for (i = 0; i < npages; ++i) {
			unsigned long rb = rbvalues[i];

			if (rb & 1)		/* large page */
				asm volatile("tlbiel %0,1" : :
					     "r" (rb & 0x0000fffffffff000ul));
			else
				asm volatile("tlbiel %0,0" : :
					     "r" (rb & 0x0000fffffffff000ul));
		}
		asm volatile("ptesync" : : : "memory");
	}
}

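/*
 * Invalidate a batch of TLB entries, using tlbie if the invalidation
 * must be global or tlbiel if it can stay local, with the required
 * ptesync/eieio/tlbsync sequencing.
 */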
static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues,
		      long npages, int global, bool need_sync)
{
	long i;

	if (cpu_has_feature(CPU_FTR_ARCH_201)) {
		/* PPC970 tlbie instruction is a bit different */
		do_tlbies_970(kvm, rbvalues, npages, global, need_sync);
		return;
	}
	if (global) {
		while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
			cpu_relax();
		if (need_sync)
			asm volatile("ptesync" : : : "memory");
		for (i = 0; i < npages; ++i)
			asm volatile(PPC_TLBIE(%1,%0) : :
				     "r" (rbvalues[i]), "r" (kvm->arch.lpid));
		asm volatile("eieio; tlbsync; ptesync" : : : "memory");
		kvm->arch.tlbie_lock = 0;
	} else {
		if (need_sync)
			asm volatile("ptesync" : : : "memory");
		for (i = 0; i < npages; ++i)
			asm volatile("tlbiel %0" : : "r" (rbvalues[i]));
		asm volatile("ptesync" : : : "memory");
	}
}

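/*
 * Worker for the H_REMOVE hcall: lock the HPTE at pte_index, check the
 * AVPN/ANDCOND conditions, invalidate it, flush the TLB, unlink it from
 * the rmap chain and return the old first and second doublewords.
 */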
long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
			unsigned long pte_index, unsigned long avpn,
			unsigned long *hpret)
{
	__be64 *hpte;
	unsigned long v, r, rb;
	struct revmap_entry *rev;
	u64 pte;

	if (pte_index >= kvm->arch.hpt_npte)
		return H_PARAMETER;
	hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
		cpu_relax();
	pte = be64_to_cpu(hpte[0]);
	if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
	    ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn) ||
	    ((flags & H_ANDCOND) && (pte & avpn) != 0)) {
		hpte[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
		return H_NOT_FOUND;
	}

	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
	v = pte & ~HPTE_V_HVLOCK;
	if (v & HPTE_V_VALID) {
		u64 pte1;

		pte1 = be64_to_cpu(hpte[1]);
		hpte[0] &= ~cpu_to_be64(HPTE_V_VALID);
		rb = compute_tlbie_rb(v, pte1, pte_index);
		do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true);
		/* Read PTE low word after tlbie to get final R/C values */
		remove_revmap_chain(kvm, pte_index, rev, v, pte1);
	}
	r = rev->guest_rpte & ~HPTE_GR_RESERVED;
	note_hpte_modification(kvm, rev);
	unlock_hpte(hpte, 0);

	hpret[0] = v;
	hpret[1] = r;
	return H_SUCCESS;
}
EXPORT_SYMBOL_GPL(kvmppc_do_h_remove);

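/* Real-mode H_REMOVE handler; the old HPTE is returned in gpr[4] and gpr[5] */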
long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
		     unsigned long pte_index, unsigned long avpn)
{
	return kvmppc_do_h_remove(vcpu->kvm, flags, pte_index, avpn,
				  &vcpu->arch.gpr[4]);
}

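/*
 * H_BULK_REMOVE: process up to four remove requests passed in gpr[4..11],
 * batching the TLB invalidations through do_tlbies().
 */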
long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long *args = &vcpu->arch.gpr[4];
	__be64 *hp, *hptes[4];
	unsigned long tlbrb[4];
	long int i, j, k, n, found, indexes[4];
	unsigned long flags, req, pte_index, rcbits;
	int global;
	long int ret = H_SUCCESS;
	struct revmap_entry *rev, *revs[4];
	u64 hp0;

	global = global_invalidates(kvm, 0);
	for (i = 0; i < 4 && ret == H_SUCCESS; ) {
		n = 0;
		for (; i < 4; ++i) {
			j = i * 2;
			pte_index = args[j];
			flags = pte_index >> 56;
			pte_index &= ((1ul << 56) - 1);
			req = flags >> 6;
			flags &= 3;
			if (req == 3) {		/* no more requests */
				i = 4;
				break;
			}
			if (req != 1 || flags == 3 ||
			    pte_index >= kvm->arch.hpt_npte) {
				/* parameter error */
				args[j] = ((0xa0 | flags) << 56) + pte_index;
				ret = H_PARAMETER;
				break;
			}
			hp = (__be64 *) (kvm->arch.hpt_virt + (pte_index << 4));
			/* to avoid deadlock, don't spin except for first */
			if (!try_lock_hpte(hp, HPTE_V_HVLOCK)) {
				if (n)
					break;
				while (!try_lock_hpte(hp, HPTE_V_HVLOCK))
					cpu_relax();
			}
			found = 0;
			hp0 = be64_to_cpu(hp[0]);
			if (hp0 & (HPTE_V_ABSENT | HPTE_V_VALID)) {
				switch (flags & 3) {
				case 0:		/* absolute */
					found = 1;
					break;
				case 1:		/* andcond */
					if (!(hp0 & args[j + 1]))
						found = 1;
					break;
				case 2:		/* AVPN */
					if ((hp0 & ~0x7fUL) == args[j + 1])
						found = 1;
					break;
				}
			}
			if (!found) {
				hp[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
				args[j] = ((0x90 | flags) << 56) + pte_index;
				continue;
			}

			args[j] = ((0x80 | flags) << 56) + pte_index;
			rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
			note_hpte_modification(kvm, rev);

			if (!(hp0 & HPTE_V_VALID)) {
				/* insert R and C bits from PTE */
				rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
				args[j] |= rcbits << (56 - 5);
				hp[0] = 0;
				continue;
			}

			/* leave it locked */
			hp[0] &= ~cpu_to_be64(HPTE_V_VALID);
			tlbrb[n] = compute_tlbie_rb(be64_to_cpu(hp[0]),
				be64_to_cpu(hp[1]), pte_index);
			indexes[n] = j;
			hptes[n] = hp;
			revs[n] = rev;
			++n;
		}

		if (!n)
			break;

		/* Now that we've collected a batch, do the tlbies */
		do_tlbies(kvm, tlbrb, n, global, true);

		/* Read PTE low words after tlbie to get final R/C values */
		for (k = 0; k < n; ++k) {
			j = indexes[k];
			pte_index = args[j] & ((1ul << 56) - 1);
			hp = hptes[k];
			rev = revs[k];
			remove_revmap_chain(kvm, pte_index, rev,
				be64_to_cpu(hp[0]), be64_to_cpu(hp[1]));
			rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
			args[j] |= rcbits << (56 - 5);
			hp[0] = 0;
		}
	}

	return ret;
}

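/*
 * H_PROTECT: update the protection and key bits of an existing HPTE and
 * of the guest's view of it, invalidating the old translation if the
 * entry was valid.
 */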
long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
		      unsigned long pte_index, unsigned long avpn,
		      unsigned long va)
{
	struct kvm *kvm = vcpu->kvm;
	__be64 *hpte;
	struct revmap_entry *rev;
	unsigned long v, r, rb, mask, bits;
	u64 pte;

	if (pte_index >= kvm->arch.hpt_npte)
		return H_PARAMETER;

	hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
		cpu_relax();
	pte = be64_to_cpu(hpte[0]);
	if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
	    ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn)) {
		hpte[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
		return H_NOT_FOUND;
	}

	v = pte;
	bits = (flags << 55) & HPTE_R_PP0;
	bits |= (flags << 48) & HPTE_R_KEY_HI;
	bits |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);

	/* Update guest view of 2nd HPTE dword */
	mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
		HPTE_R_KEY_HI | HPTE_R_KEY_LO;
	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
	if (rev) {
		r = (rev->guest_rpte & ~mask) | bits;
		rev->guest_rpte = r;
		note_hpte_modification(kvm, rev);
	}
	r = (be64_to_cpu(hpte[1]) & ~mask) | bits;

	/* Update HPTE */
	if (v & HPTE_V_VALID) {
		rb = compute_tlbie_rb(v, r, pte_index);
		hpte[0] = cpu_to_be64(v & ~HPTE_V_VALID);
		do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true);
		/*
		 * If the host has this page as readonly but the guest
		 * wants to make it read/write, reduce the permissions.
		 * Checking the host permissions involves finding the
		 * memslot and then the Linux PTE for the page.
		 */
		if (hpte_is_writable(r) && kvm->arch.using_mmu_notifiers) {
			unsigned long psize, gfn, hva;
			struct kvm_memory_slot *memslot;
			pgd_t *pgdir = vcpu->arch.pgdir;
			pte_t pte;

			psize = hpte_page_size(v, r);
			gfn = ((r & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT;
			memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
			if (memslot) {
				hva = __gfn_to_hva_memslot(memslot, gfn);
				pte = lookup_linux_pte_and_update(pgdir, hva,
								  1, &psize);
				if (pte_present(pte) && !pte_write(pte))
					r = hpte_make_readonly(r);
			}
		}
	}
	hpte[1] = cpu_to_be64(r);
	eieio();
	hpte[0] = cpu_to_be64(v & ~HPTE_V_HVLOCK);
	asm volatile("ptesync" : : : "memory");
	return H_SUCCESS;
}

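/*
 * H_READ: return one HPTE (or four, with H_READ_4) in gpr[4] onwards,
 * giving the guest's view of the second doubleword.
 */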
long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
		   unsigned long pte_index)
{
	struct kvm *kvm = vcpu->kvm;
	__be64 *hpte;
	unsigned long v, r;
	int i, n = 1;
	struct revmap_entry *rev = NULL;

	if (pte_index >= kvm->arch.hpt_npte)
		return H_PARAMETER;
	if (flags & H_READ_4) {
		pte_index &= ~3;
		n = 4;
	}
	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
	for (i = 0; i < n; ++i, ++pte_index) {
		hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
		v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK;
		r = be64_to_cpu(hpte[1]);
		if (v & HPTE_V_ABSENT) {
			v &= ~HPTE_V_ABSENT;
			v |= HPTE_V_VALID;
		}
		if (v & HPTE_V_VALID) {
			r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C));
			r &= ~HPTE_GR_RESERVED;
		}
		vcpu->arch.gpr[4 + i * 2] = v;
		vcpu->arch.gpr[5 + i * 2] = r;
	}
	return H_SUCCESS;
}

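/* Clear the valid bit of an HPTE and flush the corresponding TLB entry */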
void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep,
			unsigned long pte_index)
{
	unsigned long rb;

	hptep[0] &= ~cpu_to_be64(HPTE_V_VALID);
	rb = compute_tlbie_rb(be64_to_cpu(hptep[0]), be64_to_cpu(hptep[1]),
			      pte_index);
	do_tlbies(kvm, &rb, 1, 1, true);
}
EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte);

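/* Clear the reference (R) bit of an HPTE and flush its TLB entry */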
void kvmppc_clear_ref_hpte(struct kvm *kvm, __be64 *hptep,
			   unsigned long pte_index)
{
	unsigned long rb;
	unsigned char rbyte;

	rb = compute_tlbie_rb(be64_to_cpu(hptep[0]), be64_to_cpu(hptep[1]),
			      pte_index);
	rbyte = (be64_to_cpu(hptep[1]) & ~HPTE_R_R) >> 8;
	/* modify only the second-last byte, which contains the ref bit */
	*((char *)hptep + 14) = rbyte;
	do_tlbies(kvm, &rb, 1, 1, false);
}
EXPORT_SYMBOL_GPL(kvmppc_clear_ref_hpte);

static int slb_base_page_shift[4] = {
	24,	/* 16M */
	16,	/* 64k */
	34,	/* 16G */
	20,	/* 1M, unsupported */
};


/*
 * When called from virtual mode, this function must be protected by
 * preempt_disable(); otherwise, holding HPTE_V_HVLOCK can lead to a
 * deadlock.
 */
long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
			      unsigned long valid)
{
	unsigned int i;
	unsigned int pshift;
	unsigned long somask;
	unsigned long vsid, hash;
	unsigned long avpn;
	__be64 *hpte;
	unsigned long mask, val;
	unsigned long v, r;

	/* Get page shift, work out hash and AVPN etc. */
	mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY;
	val = 0;
	pshift = 12;
	if (slb_v & SLB_VSID_L) {
		mask |= HPTE_V_LARGE;
		val |= HPTE_V_LARGE;
		pshift = slb_base_page_shift[(slb_v & SLB_VSID_LP) >> 4];
	}
	if (slb_v & SLB_VSID_B_1T) {
		somask = (1UL << 40) - 1;
		vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T;
		vsid ^= vsid << 25;
	} else {
		somask = (1UL << 28) - 1;
		vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT;
	}
	hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt_mask;
	avpn = slb_v & ~(somask >> 16);	/* also includes B */
	avpn |= (eaddr & somask) >> 16;

	if (pshift >= 24)
		avpn &= ~((1UL << (pshift - 16)) - 1);
	else
		avpn &= ~0x7fUL;
	val |= avpn;

	for (;;) {
		hpte = (__be64 *)(kvm->arch.hpt_virt + (hash << 7));

		for (i = 0; i < 16; i += 2) {
			/* Read the PTE racily */
			v = be64_to_cpu(hpte[i]) & ~HPTE_V_HVLOCK;

			/* Check valid/absent, hash, segment size and AVPN */
			if (!(v & valid) || (v & mask) != val)
				continue;

			/* Lock the PTE and read it under the lock */
			while (!try_lock_hpte(&hpte[i], HPTE_V_HVLOCK))
				cpu_relax();
			v = be64_to_cpu(hpte[i]) & ~HPTE_V_HVLOCK;
			r = be64_to_cpu(hpte[i+1]);

			/*
			 * Check the HPTE again, including base page size
			 */
			if ((v & valid) && (v & mask) == val &&
			    hpte_base_page_size(v, r) == (1ul << pshift))
				/* Return with the HPTE still locked */
				return (hash << 3) + (i >> 1);

			/* Unlock and move on */
			hpte[i] = cpu_to_be64(v);
		}

		if (val & HPTE_V_SECONDARY)
			break;
		val |= HPTE_V_SECONDARY;
		hash = hash ^ kvm->arch.hpt_mask;
	}
	return -1;
}
EXPORT_SYMBOL(kvmppc_hv_find_lock_hpte);

/*
 * Called in real mode to check whether an HPTE not found fault
 * is due to accessing a paged-out page or an emulated MMIO page,
 * or if a protection fault is due to accessing a page that the
 * guest wanted read/write access to but which we made read-only.
 * Returns a possibly modified status (DSISR) value if the fault is
 * not one we can handle (i.e. pass the interrupt to the guest),
 * -1 to pass the fault up to host kernel mode code, -2 to do that
 * and also load the instruction word (for MMIO emulation),
 * or 0 if we should make the guest retry the access.
 */
long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
			  unsigned long slb_v, unsigned int status, bool data)
{
	struct kvm *kvm = vcpu->kvm;
	long int index;
	unsigned long v, r, gr;
	__be64 *hpte;
	unsigned long valid;
	struct revmap_entry *rev;
	unsigned long pp, key;

	/* For protection fault, expect to find a valid HPTE */
	valid = HPTE_V_VALID;
	if (status & DSISR_NOHPTE)
		valid |= HPTE_V_ABSENT;

	index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid);
	if (index < 0) {
		if (status & DSISR_NOHPTE)
			return status;	/* there really was no HPTE */
		return 0;		/* for prot fault, HPTE disappeared */
	}
	hpte = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
	v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK;
	r = be64_to_cpu(hpte[1]);
	rev = real_vmalloc_addr(&kvm->arch.revmap[index]);
	gr = rev->guest_rpte;

	unlock_hpte(hpte, v);

	/* For not found, if the HPTE is valid by now, retry the instruction */
	if ((status & DSISR_NOHPTE) && (v & HPTE_V_VALID))
		return 0;

	/* Check access permissions to the page */
	pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
	key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
	status &= ~DSISR_NOHPTE;	/* DSISR_NOHPTE == SRR1_ISI_NOPT */
	if (!data) {
		if (gr & (HPTE_R_N | HPTE_R_G))
			return status | SRR1_ISI_N_OR_G;
		if (!hpte_read_permission(pp, slb_v & key))
			return status | SRR1_ISI_PROT;
	} else if (status & DSISR_ISSTORE) {
		/* check write permission */
		if (!hpte_write_permission(pp, slb_v & key))
			return status | DSISR_PROTFAULT;
	} else {
		if (!hpte_read_permission(pp, slb_v & key))
			return status | DSISR_PROTFAULT;
	}

	/* Check storage key, if applicable */
	if (data && (vcpu->arch.shregs.msr & MSR_DR)) {
		unsigned int perm = hpte_get_skey_perm(gr, vcpu->arch.amr);
		if (status & DSISR_ISSTORE)
			perm >>= 1;
		if (perm & 1)
			return status | DSISR_KEYFAULT;
	}

	/* Save HPTE info for virtual-mode handler */
	vcpu->arch.pgfault_addr = addr;
	vcpu->arch.pgfault_index = index;
	vcpu->arch.pgfault_hpte[0] = v;
	vcpu->arch.pgfault_hpte[1] = r;

	/* Check the storage key to see if it is possibly emulated MMIO */
	if (data && (vcpu->arch.shregs.msr & MSR_IR) &&
	    (r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) ==
	    (HPTE_R_KEY_HI | HPTE_R_KEY_LO))
		return -2;	/* MMIO emulation - load instr word */

	return -1;		/* send fault up to host kernel mode */
}