/*
 * pSeries_lpar.c
 * Copyright (C) 2001 Todd Inglett, IBM Corporation
 *
 * pSeries LPAR support.
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 */

/* Enables debugging of low-level hash table routines - careful! */
#undef DEBUG
#define pr_fmt(fmt) "lpar: " fmt

#include <linux/kernel.h>
#include <linux/dma-mapping.h>
#include <linux/console.h>
#include <linux/export.h>
#include <linux/jump_label.h>
#include <linux/delay.h>
#include <linux/stop_machine.h>
#include <asm/processor.h>
#include <asm/mmu.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/machdep.h>
#include <asm/mmu_context.h>
#include <asm/iommu.h>
#include <asm/tlb.h>
#include <asm/prom.h>
#include <asm/cputable.h>
#include <asm/udbg.h>
#include <asm/smp.h>
#include <asm/trace.h>
#include <asm/firmware.h>
#include <asm/plpar_wrappers.h>
#include <asm/kexec.h>
#include <asm/fadump.h>
#include <asm/asm-prototypes.h>

#include "pseries.h"

/* Flag bits for H_BULK_REMOVE */
#define HBR_REQUEST	0x4000000000000000UL
#define HBR_RESPONSE	0x8000000000000000UL
#define HBR_END		0xc000000000000000UL
#define HBR_AVPN	0x0200000000000000UL
#define HBR_ANDCOND	0x0100000000000000UL


/* in hvCall.S */
EXPORT_SYMBOL(plpar_hcall);
EXPORT_SYMBOL(plpar_hcall9);
EXPORT_SYMBOL(plpar_hcall_norets);

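/*
 * Register this CPU's virtual processor area (VPA), SLB shadow buffer
 * and dispatch trace log with the hypervisor.
 */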
void vpa_init(int cpu)
{
	int hwcpu = get_hard_smp_processor_id(cpu);
	unsigned long addr;
	long ret;
	struct paca_struct *pp;
	struct dtl_entry *dtl;

	/*
	 * The spec says it "may be problematic" if CPU x registers the VPA of
	 * CPU y. We should never do that, but wail if we ever do.
	 */
	WARN_ON(cpu != smp_processor_id());

	if (cpu_has_feature(CPU_FTR_ALTIVEC))
		lppaca_of(cpu).vmxregs_in_use = 1;

	if (cpu_has_feature(CPU_FTR_ARCH_207S))
		lppaca_of(cpu).ebb_regs_in_use = 1;

	addr = __pa(&lppaca_of(cpu));
	ret = register_vpa(hwcpu, addr);

	if (ret) {
		pr_err("WARNING: VPA registration for cpu %d (hw %d) of area "
		       "%lx failed with %ld\n", cpu, hwcpu, addr, ret);
		return;
	}

#ifdef CONFIG_PPC_BOOK3S_64
	/*
	 * PAPR says this feature is SLB-Buffer but firmware never
	 * reports that.  All SPLPAR support SLB shadow buffer.
	 */
	if (!radix_enabled() && firmware_has_feature(FW_FEATURE_SPLPAR)) {
		addr = __pa(paca_ptrs[cpu]->slb_shadow_ptr);
		ret = register_slb_shadow(hwcpu, addr);
		if (ret)
			pr_err("WARNING: SLB shadow buffer registration for "
			       "cpu %d (hw %d) of area %lx failed with %ld\n",
			       cpu, hwcpu, addr, ret);
	}
#endif /* CONFIG_PPC_BOOK3S_64 */

	/*
	 * Register dispatch trace log, if one has been allocated.
	 */
	pp = paca_ptrs[cpu];
	dtl = pp->dispatch_log;
	if (dtl) {
		pp->dtl_ridx = 0;
		pp->dtl_curr = dtl;
		lppaca_of(cpu).dtl_idx = 0;

		/* hypervisor reads buffer length from this field */
		dtl->enqueue_to_dispatch_time = cpu_to_be32(DISPATCH_LOG_BYTES);
		ret = register_dtl(hwcpu, __pa(dtl));
		if (ret)
			pr_err("WARNING: DTL registration of cpu %d (hw %d) "
			       "failed with %ld\n", smp_processor_id(),
			       hwcpu, ret);
		lppaca_of(cpu).dtl_enable_mask = 2;
	}
}

#ifdef CONFIG_PPC_BOOK3S_64

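/*
 * Insert a new HPTE via the H_ENTER hcall. Returns the slot within the
 * group (plus the secondary-group bit), -1 if the group is full, or -2
 * on any other hypervisor failure.
 */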
static long pSeries_lpar_hpte_insert(unsigned long hpte_group,
				     unsigned long vpn, unsigned long pa,
				     unsigned long rflags, unsigned long vflags,
				     int psize, int apsize, int ssize)
{
	unsigned long lpar_rc;
	unsigned long flags;
	unsigned long slot;
	unsigned long hpte_v, hpte_r;

	if (!(vflags & HPTE_V_BOLTED))
		pr_devel("hpte_insert(group=%lx, vpn=%016lx, "
			 "pa=%016lx, rflags=%lx, vflags=%lx, psize=%d)\n",
			 hpte_group, vpn,  pa, rflags, vflags, psize);

	hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID;
	hpte_r = hpte_encode_r(pa, psize, apsize) | rflags;

	if (!(vflags & HPTE_V_BOLTED))
		pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);

	/* Now fill in the actual HPTE */
	/* Set CEC cookie to 0         */
	/* Zero page = 0               */
	/* I-cache Invalidate = 0      */
	/* I-cache synchronize = 0     */
	/* Exact = 0                   */
	flags = 0;

	if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N))
		flags |= H_COALESCE_CAND;

	lpar_rc = plpar_pte_enter(flags, hpte_group, hpte_v, hpte_r, &slot);
	if (unlikely(lpar_rc == H_PTEG_FULL)) {
		pr_devel("Hash table group is full\n");
		return -1;
	}

	/*
	 * Since we try and ioremap PHBs we don't own, the pte insert
	 * will fail. However we must catch the failure in hash_page
	 * or we will loop forever, so return -2 in this case.
	 */
	if (unlikely(lpar_rc != H_SUCCESS)) {
		pr_err("Failed hash pte insert with error %ld\n", lpar_rc);
		return -2;
	}
	if (!(vflags & HPTE_V_BOLTED))
		pr_devel(" -> slot: %lu\n", slot & 7);

	/* Because of iSeries, we have to pass down the secondary
	 * bucket bit here as well
	 */
	return (slot & 7) | (!!(vflags & HPTE_V_SECONDARY) << 3);
}

static DEFINE_SPINLOCK(pSeries_lpar_tlbie_lock);

static long pSeries_lpar_hpte_remove(unsigned long hpte_group)
{
	unsigned long slot_offset;
	unsigned long lpar_rc;
	int i;
	unsigned long dummy1, dummy2;

	/* pick a random slot to start at */
	slot_offset = mftb() & 0x7;

	for (i = 0; i < HPTES_PER_GROUP; i++) {

		/* don't remove a bolted entry */
		lpar_rc = plpar_pte_remove(H_ANDCOND, hpte_group + slot_offset,
					   (0x1UL << 4), &dummy1, &dummy2);
		if (lpar_rc == H_SUCCESS)
			return i;

		/*
		 * The test for adjunct partition is performed before the
		 * ANDCOND test.  H_RESOURCE may be returned, so we need to
		 * check for that as well.
		 */
		BUG_ON(lpar_rc != H_NOT_FOUND && lpar_rc != H_RESOURCE);

		slot_offset++;
		slot_offset &= 0x7;
	}

	return -1;
}

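/*
 * Walk the entire hash page table, reading HPTEs four at a time, and
 * invalidate every valid entry that does not belong to the VRMA. Used
 * as a fallback when H_CLEAR_HPT is not available.
 */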
static void manual_hpte_clear_all(void)
{
	unsigned long size_bytes = 1UL << ppc64_pft_size;
	unsigned long hpte_count = size_bytes >> 4;
	struct {
		unsigned long pteh;
		unsigned long ptel;
	} ptes[4];
	long lpar_rc;
	unsigned long i, j;

	/*
	 * Read in batches of 4,
	 * invalidate only valid entries not in the VRMA
	 * hpte_count will be a multiple of 4
	 */
	for (i = 0; i < hpte_count; i += 4) {
		lpar_rc = plpar_pte_read_4_raw(0, i, (void *)ptes);
		if (lpar_rc != H_SUCCESS) {
			pr_info("Failed to read hash page table at %ld err %ld\n",
				i, lpar_rc);
			continue;
		}
		for (j = 0; j < 4; j++) {
			if ((ptes[j].pteh & HPTE_V_VRMA_MASK) ==
				HPTE_V_VRMA_MASK)
				continue;
			if (ptes[j].pteh & HPTE_V_VALID)
				plpar_pte_remove_raw(0, i + j, 0,
					&(ptes[j].pteh), &(ptes[j].ptel));
		}
	}
}

static int hcall_hpte_clear_all(void)
{
	int rc;

	do {
		rc = plpar_hcall_norets(H_CLEAR_HPT);
	} while (rc == H_CONTINUE);

	return rc;
}

static void pseries_hpte_clear_all(void)
{
	int rc;

	rc = hcall_hpte_clear_all();
	if (rc != H_SUCCESS)
		manual_hpte_clear_all();

#ifdef __LITTLE_ENDIAN__
	/*
	 * Reset exceptions to big endian.
	 *
	 * FIXME this is a hack for kexec, we need to reset the exception
	 * endian before starting the new kernel and this is a convenient place
	 * to do it.
	 *
	 * This is also called on boot when a fadump happens. In that case we
	 * must not change the exception endian mode.
	 */
	if (firmware_has_feature(FW_FEATURE_SET_MODE) && !is_fadump_active())
		pseries_big_endian_exceptions();
#endif
}

/*
 * NOTE: for updatepp ops we are fortunate that the linux "newpp" bits and
 * the low 3 bits of flags happen to line up.  So no transform is needed.
 * We can probably optimize here and assume the high bits of newpp are
 * already zero.  For now I am paranoid.
 */
static long pSeries_lpar_hpte_updatepp(unsigned long slot,
				       unsigned long newpp,
				       unsigned long vpn,
				       int psize, int apsize,
				       int ssize, unsigned long inv_flags)
{
	unsigned long lpar_rc;
	unsigned long flags;
	unsigned long want_v;

	want_v = hpte_encode_avpn(vpn, psize, ssize);

	flags = (newpp & 7) | H_AVPN;
	if (mmu_has_feature(MMU_FTR_KERNEL_RO))
		/* Move pp0 into bit 8 (IBM 55) */
		flags |= (newpp & HPTE_R_PP0) >> 55;

	pr_devel("    update: avpnv=%016lx, hash=%016lx, f=%lx, psize: %d ...",
		 want_v, slot, flags, psize);

	lpar_rc = plpar_pte_protect(flags, slot, want_v);

	if (lpar_rc == H_NOT_FOUND) {
		pr_devel("not found !\n");
		return -1;
	}

	pr_devel("ok\n");

	BUG_ON(lpar_rc != H_SUCCESS);

	return 0;
}

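/*
 * Search a single HPTE group for an entry matching want_v, reading the
 * group back from the hypervisor four HPTEs at a time. Returns the
 * offset within the group, or -1 if no match is found.
 */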
static long __pSeries_lpar_hpte_find(unsigned long want_v, unsigned long hpte_group)
{
	long lpar_rc;
	unsigned long i, j;
	struct {
		unsigned long pteh;
		unsigned long ptel;
	} ptes[4];

	for (i = 0; i < HPTES_PER_GROUP; i += 4, hpte_group += 4) {

		lpar_rc = plpar_pte_read_4(0, hpte_group, (void *)ptes);
		if (lpar_rc != H_SUCCESS) {
			pr_info("Failed to read hash page table at %ld err %ld\n",
				hpte_group, lpar_rc);
			continue;
		}

		for (j = 0; j < 4; j++) {
			if (HPTE_V_COMPARE(ptes[j].pteh, want_v) &&
			    (ptes[j].pteh & HPTE_V_VALID))
				return i + j;
		}
	}

	return -1;
}

static long pSeries_lpar_hpte_find(unsigned long vpn, int psize, int ssize)
{
	long slot;
	unsigned long hash;
	unsigned long want_v;
	unsigned long hpte_group;

	hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
	want_v = hpte_encode_avpn(vpn, psize, ssize);

	/* Bolted entries are always in the primary group */
	hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
	slot = __pSeries_lpar_hpte_find(want_v, hpte_group);
	if (slot < 0)
		return -1;
	return hpte_group + slot;
}

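/*
 * Update the protection bits of a bolted kernel HPTE: look the entry up
 * by its virtual address and change its pp bits with H_PROTECT.
 */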
static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp,
					     unsigned long ea,
					     int psize, int ssize)
{
	unsigned long vpn;
	unsigned long lpar_rc, slot, vsid, flags;

	vsid = get_kernel_vsid(ea, ssize);
	vpn = hpt_vpn(ea, vsid, ssize);

	slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
	BUG_ON(slot == -1);

	flags = newpp & 7;
	if (mmu_has_feature(MMU_FTR_KERNEL_RO))
		/* Move pp0 into bit 8 (IBM 55) */
		flags |= (newpp & HPTE_R_PP0) >> 55;

	lpar_rc = plpar_pte_protect(flags, slot, 0);

	BUG_ON(lpar_rc != H_SUCCESS);
}

static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long vpn,
					 int psize, int apsize,
					 int ssize, int local)
{
	unsigned long want_v;
	unsigned long lpar_rc;
	unsigned long dummy1, dummy2;

	pr_devel("    inval : slot=%lx, vpn=%016lx, psize: %d, local: %d\n",
		 slot, vpn, psize, local);

	want_v = hpte_encode_avpn(vpn, psize, ssize);
	lpar_rc = plpar_pte_remove(H_AVPN, slot, want_v, &dummy1, &dummy2);
	if (lpar_rc == H_NOT_FOUND)
		return;

	BUG_ON(lpar_rc != H_SUCCESS);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Limit iterations holding pSeries_lpar_tlbie_lock to 3. We also need
 * to make sure that we avoid bouncing the hypervisor tlbie lock.
 */
#define PPC64_HUGE_HPTE_BATCH 12

static void __pSeries_lpar_hugepage_invalidate(unsigned long *slot,
					     unsigned long *vpn, int count,
					     int psize, int ssize)
{
	unsigned long param[PLPAR_HCALL9_BUFSIZE];
	int i = 0, pix = 0, rc;
	unsigned long flags = 0;
	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);

	if (lock_tlbie)
		spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);

	for (i = 0; i < count; i++) {

		if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
			pSeries_lpar_hpte_invalidate(slot[i], vpn[i], psize, 0,
						     ssize, 0);
		} else {
			param[pix] = HBR_REQUEST | HBR_AVPN | slot[i];
			param[pix+1] = hpte_encode_avpn(vpn[i], psize, ssize);
			pix += 2;
			if (pix == 8) {
				rc = plpar_hcall9(H_BULK_REMOVE, param,
						  param[0], param[1], param[2],
						  param[3], param[4], param[5],
						  param[6], param[7]);
				BUG_ON(rc != H_SUCCESS);
				pix = 0;
			}
		}
	}
	if (pix) {
		param[pix] = HBR_END;
		rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
				  param[2], param[3], param[4], param[5],
				  param[6], param[7]);
		BUG_ON(rc != H_SUCCESS);
	}

	if (lock_tlbie)
		spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
}

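/*
 * Invalidate all HPTEs backing a transparent huge page: walk the slot
 * array, recompute each valid sub-page's vpn and hash slot, and hand
 * them to __pSeries_lpar_hugepage_invalidate() in batches.
 */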
static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
					     unsigned long addr,
					     unsigned char *hpte_slot_array,
					     int psize, int ssize, int local)
{
	int i, index = 0;
	unsigned long s_addr = addr;
	unsigned int max_hpte_count, valid;
	unsigned long vpn_array[PPC64_HUGE_HPTE_BATCH];
	unsigned long slot_array[PPC64_HUGE_HPTE_BATCH];
	unsigned long shift, hidx, vpn = 0, hash, slot;

	shift = mmu_psize_defs[psize].shift;
	max_hpte_count = 1U << (PMD_SHIFT - shift);

	for (i = 0; i < max_hpte_count; i++) {
		valid = hpte_valid(hpte_slot_array, i);
		if (!valid)
			continue;
		hidx =  hpte_hash_index(hpte_slot_array, i);

		/* get the vpn */
		addr = s_addr + (i * (1ul << shift));
		vpn = hpt_vpn(addr, vsid, ssize);
		hash = hpt_hash(vpn, shift, ssize);
		if (hidx & _PTEIDX_SECONDARY)
			hash = ~hash;

		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
		slot += hidx & _PTEIDX_GROUP_IX;

		slot_array[index] = slot;
		vpn_array[index] = vpn;
		if (index == PPC64_HUGE_HPTE_BATCH - 1) {
			/*
			 * Now do a bulk invalidate
			 */
			__pSeries_lpar_hugepage_invalidate(slot_array,
							   vpn_array,
							   PPC64_HUGE_HPTE_BATCH,
							   psize, ssize);
			index = 0;
		} else
			index++;
	}
	if (index)
		__pSeries_lpar_hugepage_invalidate(slot_array, vpn_array,
						   index, psize, ssize);
}
#else
static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
					     unsigned long addr,
					     unsigned char *hpte_slot_array,
					     int psize, int ssize, int local)
{
	WARN(1, "%s called without THP support\n", __func__);
}
#endif

static int pSeries_lpar_hpte_removebolted(unsigned long ea,
					  int psize, int ssize)
{
	unsigned long vpn;
	unsigned long slot, vsid;

	vsid = get_kernel_vsid(ea, ssize);
	vpn = hpt_vpn(ea, vsid, ssize);

	slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
	if (slot == -1)
		return -ENOENT;

	/*
	 * lpar doesn't use the passed actual page size
	 */
	pSeries_lpar_hpte_invalidate(slot, vpn, psize, 0, ssize, 0);
	return 0;
}

/*
 * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie
 * lock.
 */
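/*
 * Flush a batch of hash PTEs. Entries are removed one at a time with
 * H_REMOVE, or packed four (slot, AVPN) pairs per H_BULK_REMOVE hcall
 * when the firmware supports bulk removal.
 */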
static void pSeries_lpar_flush_hash_range(unsigned long number, int local)
{
	unsigned long vpn;
	unsigned long i, pix, rc;
	unsigned long flags = 0;
	struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch);
	int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
	unsigned long param[PLPAR_HCALL9_BUFSIZE];
	unsigned long hash, index, shift, hidx, slot;
	real_pte_t pte;
	int psize, ssize;

	if (lock_tlbie)
		spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);

	psize = batch->psize;
	ssize = batch->ssize;
	pix = 0;
	for (i = 0; i < number; i++) {
		vpn = batch->vpn[i];
		pte = batch->pte[i];
		pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
			hash = hpt_hash(vpn, shift, ssize);
			hidx = __rpte_to_hidx(pte, index);
			if (hidx & _PTEIDX_SECONDARY)
				hash = ~hash;
			slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
			slot += hidx & _PTEIDX_GROUP_IX;
			if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
				/*
				 * lpar doesn't use the passed actual page size
				 */
				pSeries_lpar_hpte_invalidate(slot, vpn, psize,
							     0, ssize, local);
			} else {
				param[pix] = HBR_REQUEST | HBR_AVPN | slot;
				param[pix+1] = hpte_encode_avpn(vpn, psize,
								ssize);
				pix += 2;
				if (pix == 8) {
					rc = plpar_hcall9(H_BULK_REMOVE, param,
						param[0], param[1], param[2],
						param[3], param[4], param[5],
						param[6], param[7]);
					BUG_ON(rc != H_SUCCESS);
					pix = 0;
				}
			}
		} pte_iterate_hashed_end();
	}
	if (pix) {
		param[pix] = HBR_END;
		rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1],
				  param[2], param[3], param[4], param[5],
				  param[6], param[7]);
		BUG_ON(rc != H_SUCCESS);
	}

	if (lock_tlbie)
		spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
}

static int __init disable_bulk_remove(char *str)
{
	if (strcmp(str, "off") == 0 &&
	    firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
		pr_info("Disabling BULK_REMOVE firmware feature\n");
		powerpc_firmware_features &= ~FW_FEATURE_BULK_REMOVE;
	}
	return 1;
}

__setup("bulk_remove=", disable_bulk_remove);

#define HPT_RESIZE_TIMEOUT	10000 /* ms */

struct hpt_resize_state {
	unsigned long shift;
	int commit_rc;
};

static int pseries_lpar_resize_hpt_commit(void *data)
{
	struct hpt_resize_state *state = data;

	state->commit_rc = plpar_resize_hpt_commit(0, state->shift);
	if (state->commit_rc != H_SUCCESS)
		return -EIO;

	/* Hypervisor has transitioned the HTAB, update our globals */
	ppc64_pft_size = state->shift;
	htab_size_bytes = 1UL << ppc64_pft_size;
	htab_hash_mask = (htab_size_bytes >> 7) - 1;

	return 0;
}

/*
 * Must be called in process context. The caller must hold the
 * cpus_lock.
 */
static int pseries_lpar_resize_hpt(unsigned long shift)
{
	struct hpt_resize_state state = {
		.shift = shift,
		.commit_rc = H_FUNCTION,
	};
	unsigned int delay, total_delay = 0;
	int rc;
	ktime_t t0, t1, t2;

	might_sleep();

	if (!firmware_has_feature(FW_FEATURE_HPT_RESIZE))
		return -ENODEV;

	pr_info("Attempting to resize HPT to shift %lu\n", shift);

	t0 = ktime_get();

	rc = plpar_resize_hpt_prepare(0, shift);
	while (H_IS_LONG_BUSY(rc)) {
		delay = get_longbusy_msecs(rc);
		total_delay += delay;
		if (total_delay > HPT_RESIZE_TIMEOUT) {
			/* prepare with shift==0 cancels an in-progress resize */
			rc = plpar_resize_hpt_prepare(0, 0);
			if (rc != H_SUCCESS)
				pr_warn("Unexpected error %d cancelling timed out HPT resize\n",
				       rc);
			return -ETIMEDOUT;
		}
		msleep(delay);
		rc = plpar_resize_hpt_prepare(0, shift);
	}

	switch (rc) {
	case H_SUCCESS:
		/* Continue on */
		break;

	case H_PARAMETER:
		return -EINVAL;
	case H_RESOURCE:
		return -EPERM;
	default:
699
		pr_warn("Unexpected error %d from H_RESIZE_HPT_PREPARE\n", rc);
700 701 702 703 704
		return -EIO;
	}

	t1 = ktime_get();

	rc = stop_machine_cpuslocked(pseries_lpar_resize_hpt_commit,
				     &state, NULL);

	t2 = ktime_get();

	if (rc != 0) {
		switch (state.commit_rc) {
		case H_PTEG_FULL:
			pr_warn("Hash collision while resizing HPT\n");
			return -ENOSPC;

		default:
			pr_warn("Unexpected error %d from H_RESIZE_HPT_COMMIT\n",
				state.commit_rc);
			return -EIO;
		}
	}

	pr_info("HPT resize to shift %lu complete (%lld ms / %lld ms)\n",
		shift, (long long) ktime_ms_delta(t1, t0),
		(long long) ktime_ms_delta(t2, t1));

	return 0;
}

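/*
 * Register the process table (radix) or SLB information (hash) with the
 * hypervisor via H_REGISTER_PROC_TBL, retrying while the hcall reports
 * a long-busy status.
 */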
static int pseries_lpar_register_process_table(unsigned long base,
			unsigned long page_size, unsigned long table_size)
{
	long rc;
	unsigned long flags = 0;

	if (table_size)
		flags |= PROC_TABLE_NEW;
	if (radix_enabled())
		flags |= PROC_TABLE_RADIX | PROC_TABLE_GTSE;
	else
		flags |= PROC_TABLE_HPT_SLB;
	for (;;) {
		rc = plpar_hcall_norets(H_REGISTER_PROC_TBL, flags, base,
					page_size, table_size);
		if (!H_IS_LONG_BUSY(rc))
			break;
		mdelay(get_longbusy_msecs(rc));
	}
	if (rc != H_SUCCESS) {
		pr_err("Failed to register process table (rc=%ld)\n", rc);
		BUG();
	}
	return rc;
}

void __init hpte_init_pseries(void)
{
	mmu_hash_ops.hpte_invalidate	 = pSeries_lpar_hpte_invalidate;
	mmu_hash_ops.hpte_updatepp	 = pSeries_lpar_hpte_updatepp;
	mmu_hash_ops.hpte_updateboltedpp = pSeries_lpar_hpte_updateboltedpp;
	mmu_hash_ops.hpte_insert	 = pSeries_lpar_hpte_insert;
	mmu_hash_ops.hpte_remove	 = pSeries_lpar_hpte_remove;
	mmu_hash_ops.hpte_removebolted   = pSeries_lpar_hpte_removebolted;
	mmu_hash_ops.flush_hash_range	 = pSeries_lpar_flush_hash_range;
	mmu_hash_ops.hpte_clear_all      = pseries_hpte_clear_all;
	mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate;
	register_process_table		 = pseries_lpar_register_process_table;

	if (firmware_has_feature(FW_FEATURE_HPT_RESIZE))
		mmu_hash_ops.resize_hpt = pseries_lpar_resize_hpt;
}

void radix_init_pseries(void)
{
	pr_info("Using radix MMU under hypervisor\n");
	register_process_table = pseries_lpar_register_process_table;
}

#ifdef CONFIG_PPC_SMLPAR
#define CMO_FREE_HINT_DEFAULT 1
static int cmo_free_hint_flag = CMO_FREE_HINT_DEFAULT;

static int __init cmo_free_hint(char *str)
{
	char *parm;
	parm = strstrip(str);

	if (strcasecmp(parm, "no") == 0 || strcasecmp(parm, "off") == 0) {
		pr_info("%s: CMO free page hinting is not active.\n", __func__);
		cmo_free_hint_flag = 0;
		return 1;
	}

	cmo_free_hint_flag = 1;
	pr_info("%s: CMO free page hinting is active.\n", __func__);

	if (strcasecmp(parm, "yes") == 0 || strcasecmp(parm, "on") == 0)
		return 1;

	return 0;
}

__setup("cmo_free_hint=", cmo_free_hint);

static void pSeries_set_page_state(struct page *page, int order,
				   unsigned long state)
{
	int i, j;
	unsigned long cmo_page_sz, addr;

	cmo_page_sz = cmo_get_page_size();
	addr = __pa((unsigned long)page_address(page));

	for (i = 0; i < (1 << order); i++, addr += PAGE_SIZE) {
		for (j = 0; j < PAGE_SIZE; j += cmo_page_sz)
			plpar_hcall_norets(H_PAGE_INIT, state, addr + j, 0);
	}
}

void arch_free_page(struct page *page, int order)
{
	if (radix_enabled())
		return;
	if (!cmo_free_hint_flag || !firmware_has_feature(FW_FEATURE_CMO))
		return;

	pSeries_set_page_state(page, order, H_PAGE_SET_UNUSED);
}
EXPORT_SYMBOL(arch_free_page);

#endif /* CONFIG_PPC_SMLPAR */
#endif /* CONFIG_PPC_BOOK3S_64 */

#ifdef CONFIG_TRACEPOINTS
#ifdef CONFIG_JUMP_LABEL
struct static_key hcall_tracepoint_key = STATIC_KEY_INIT;

int hcall_tracepoint_regfunc(void)
{
	static_key_slow_inc(&hcall_tracepoint_key);
	return 0;
}

void hcall_tracepoint_unregfunc(void)
{
	static_key_slow_dec(&hcall_tracepoint_key);
}
#else
/*
 * We optimise our hcall path by placing hcall_tracepoint_refcount
 * directly in the TOC so we can check if the hcall tracepoints are
 * enabled via a single load.
 */

/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
extern long hcall_tracepoint_refcount;

int hcall_tracepoint_regfunc(void)
{
	hcall_tracepoint_refcount++;
	return 0;
}

void hcall_tracepoint_unregfunc(void)
{
	hcall_tracepoint_refcount--;
}
#endif

/*
 * Since the tracing code might execute hcalls we need to guard against
 * recursion. One example of this are spinlocks calling H_YIELD on
 * shared processor partitions.
 */
static DEFINE_PER_CPU(unsigned int, hcall_trace_depth);


void __trace_hcall_entry(unsigned long opcode, unsigned long *args)
{
	unsigned long flags;
	unsigned int *depth;

	/*
	 * We cannot call tracepoints inside RCU idle regions which
	 * means we must not trace H_CEDE.
	 */
	if (opcode == H_CEDE)
		return;

	local_irq_save(flags);

	depth = this_cpu_ptr(&hcall_trace_depth);

	if (*depth)
		goto out;

	(*depth)++;
	preempt_disable();
	trace_hcall_entry(opcode, args);
	(*depth)--;

out:
	local_irq_restore(flags);
}

void __trace_hcall_exit(long opcode, long retval, unsigned long *retbuf)
{
	unsigned long flags;
	unsigned int *depth;

	if (opcode == H_CEDE)
		return;

	local_irq_save(flags);

	depth = this_cpu_ptr(&hcall_trace_depth);

	if (*depth)
		goto out;

	(*depth)++;
	trace_hcall_exit(opcode, retval, retbuf);
	preempt_enable();
	(*depth)--;

out:
	local_irq_restore(flags);
}
#endif

/**
 * h_get_mpp
 * H_GET_MPP hcall returns info in 7 parms
 */
int h_get_mpp(struct hvcall_mpp_data *mpp_data)
{
	int rc;
	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];

	rc = plpar_hcall9(H_GET_MPP, retbuf);

	mpp_data->entitled_mem = retbuf[0];
	mpp_data->mapped_mem = retbuf[1];

	mpp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff;
	mpp_data->pool_num = retbuf[2] & 0xffff;

	mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff;
	mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff;
	mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffffUL;

	mpp_data->pool_size = retbuf[4];
	mpp_data->loan_request = retbuf[5];
	mpp_data->backing_mem = retbuf[6];

	return rc;
}
EXPORT_SYMBOL(h_get_mpp);

int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data)
{
	int rc;
	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE] = { 0 };

	rc = plpar_hcall9(H_GET_MPP_X, retbuf);

	mpp_x_data->coalesced_bytes = retbuf[0];
	mpp_x_data->pool_coalesced_bytes = retbuf[1];
	mpp_x_data->pool_purr_cycles = retbuf[2];
	mpp_x_data->pool_spurr_cycles = retbuf[3];

	return rc;
}

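/*
 * Invert the VSID scramble: recover the protovsid from a vsid by
 * multiplying with the modular inverse of the VSID multiplier, taking
 * care not to overflow 64 bits along the way.
 */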
static unsigned long vsid_unscramble(unsigned long vsid, int ssize)
{
	unsigned long protovsid;
	unsigned long va_bits = VA_BITS;
	unsigned long modinv, vsid_modulus;
	unsigned long max_mod_inv, tmp_modinv;

	if (!mmu_has_feature(MMU_FTR_68_BIT_VA))
		va_bits = 65;

	if (ssize == MMU_SEGSIZE_256M) {
		modinv = VSID_MULINV_256M;
		vsid_modulus = ((1UL << (va_bits - SID_SHIFT)) - 1);
	} else {
		modinv = VSID_MULINV_1T;
		vsid_modulus = ((1UL << (va_bits - SID_SHIFT_1T)) - 1);
	}

	/*
	 * vsid outside our range.
	 */
	if (vsid >= vsid_modulus)
		return 0;

	/*
	 * If modinv is the modular multiplicate inverse of (x % vsid_modulus)
	 * and vsid = (protovsid * x) % vsid_modulus, then we say:
	 *   protovsid = (vsid * modinv) % vsid_modulus
	 */

	/* Check if (vsid * modinv) overflow (63 bits) */
	max_mod_inv = 0x7fffffffffffffffull / vsid;
	if (modinv < max_mod_inv)
		return (vsid * modinv) % vsid_modulus;

	tmp_modinv = modinv/max_mod_inv;
	modinv %= max_mod_inv;

	protovsid = (((vsid * max_mod_inv) % vsid_modulus) * tmp_modinv) % vsid_modulus;
	protovsid = (protovsid + vsid * modinv) % vsid_modulus;

	return protovsid;
}

static int __init reserve_vrma_context_id(void)
{
	unsigned long protovsid;

	/*
	 * Reserve context ids which map to reserved virtual addresses. For now
	 * we only reserve the context id which maps to the VRMA VSID. We ignore
	 * the addresses in "ibm,adjunct-virtual-addresses" because we don't
	 * enable adjunct support via the "ibm,client-architecture-support"
	 * interface.
	 */
	protovsid = vsid_unscramble(VRMA_VSID, MMU_SEGSIZE_1T);
	hash__reserve_context_id(protovsid >> ESID_BITS_1T);
	return 0;
}
machine_device_initcall(pseries, reserve_vrma_context_id);