pgtable-radix.c

/*
 * Page table handling routines for radix page table.
 *
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "radix-mmu: " fmt

#include <linux/kernel.h>
#include <linux/sched/mm.h>
#include <linux/memblock.h>
#include <linux/of_fdt.h>
#include <linux/mm.h>
#include <linux/string_helpers.h>
#include <linux/stop_machine.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/mmu_context.h>
#include <asm/dma.h>
#include <asm/machdep.h>
#include <asm/mmu.h>
#include <asm/firmware.h>
#include <asm/powernv.h>
#include <asm/sections.h>
#include <asm/trace.h>

#include <trace/events/thp.h>

unsigned int mmu_pid_bits;
unsigned int mmu_base_pid;

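/*
 * Install the process table for bare-metal (native) radix: keep the
 * first doubleword of partition table entry 0 and set the second
 * doubleword to the process table base, size and PATB_GR (radix) bit.
 */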
static int native_register_process_table(unsigned long base, unsigned long pg_sz,
					 unsigned long table_size)
{
	unsigned long patb0, patb1;

	patb0 = be64_to_cpu(partition_tb[0].patb0);
	patb1 = base | table_size | PATB_GR;

	mmu_partition_table_set_entry(0, patb0, patb1);

	return 0;
}

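/*
 * Boot-time page table allocator: prefer the given physical region,
 * then the given node, then any memblock memory, and return zeroed
 * memory. Only used before the slab allocator is available.
 */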
static __ref void *early_alloc_pgtable(unsigned long size, int nid,
			unsigned long region_start, unsigned long region_end)
{
	unsigned long pa = 0;
	void *pt;

	if (region_start || region_end) /* has region hint */
		pa = memblock_alloc_range(size, size, region_start, region_end,
						MEMBLOCK_NONE);
	else if (nid != -1) /* has node hint */
		pa = memblock_alloc_base_nid(size, size,
						MEMBLOCK_ALLOC_ANYWHERE,
						nid, MEMBLOCK_NONE);

	if (!pa)
		pa = memblock_alloc_base(size, size, MEMBLOCK_ALLOC_ANYWHERE);

	BUG_ON(!pa);

	pt = __va(pa);
	memset(pt, 0, size);

	return pt;
}

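/*
 * Map one kernel page before slab is available, allocating any missing
 * intermediate page table levels from memblock. map_page_size selects a
 * 1G (PUD), 2M (PMD) or base page size mapping.
 */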
static int early_map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size,
			  int nid,
			  unsigned long region_start, unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	pgdp = pgd_offset_k(ea);
	if (pgd_none(*pgdp)) {
		pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
						region_start, region_end);
		pgd_populate(&init_mm, pgdp, pudp);
	}
	pudp = pud_offset(pgdp, ea);
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	if (pud_none(*pudp)) {
		pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
						region_start, region_end);
		pud_populate(&init_mm, pudp, pmdp);
	}
	pmdp = pmd_offset(pudp, ea);
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	if (!pmd_present(*pmdp)) {
		ptep = early_alloc_pgtable(PAGE_SIZE, nid,
						region_start, region_end);
		pmd_populate_kernel(&init_mm, pmdp, ptep);
	}
	ptep = pte_offset_kernel(pmdp, ea);

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	smp_wmb();
	return 0;
}

/*
 * nid, region_start, and region_end are hints to try to place the page
 * table memory in the same node or region.
 */
static int __map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size,
			  int nid,
			  unsigned long region_start, unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;
	/*
	 * Make sure task size is correct as per the max addr
	 */
	BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);

	if (unlikely(!slab_is_available()))
		return early_map_kernel_page(ea, pa, flags, map_page_size,
						nid, region_start, region_end);

	/*
	 * Should make page table allocation functions be able to take a
	 * node, so we can place kernel page tables on the right nodes after
	 * boot.
	 */
	pgdp = pgd_offset_k(ea);
	pudp = pud_alloc(&init_mm, pgdp, ea);
	if (!pudp)
		return -ENOMEM;
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	pmdp = pmd_alloc(&init_mm, pudp, ea);
	if (!pmdp)
		return -ENOMEM;
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	ptep = pte_alloc_kernel(pmdp, ea);
	if (!ptep)
		return -ENOMEM;

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	smp_wmb();
	return 0;
}

int radix__map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size)
{
	return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
}

#ifdef CONFIG_STRICT_KERNEL_RWX
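/*
 * Walk the kernel mapping from start to end and clear the given PTE
 * bits on each page, then flush the TLB for the range. Used below to
 * mark rodata read-only and init memory non-executable.
 */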
void radix__change_memory_range(unsigned long start, unsigned long end,
				unsigned long clear)
{
	unsigned long idx;
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	start = ALIGN_DOWN(start, PAGE_SIZE);
	end = PAGE_ALIGN(end); // aligns up

	pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
		 start, end, clear);

	for (idx = start; idx < end; idx += PAGE_SIZE) {
		pgdp = pgd_offset_k(idx);
		pudp = pud_alloc(&init_mm, pgdp, idx);
		if (!pudp)
			continue;
		if (pud_huge(*pudp)) {
			ptep = (pte_t *)pudp;
			goto update_the_pte;
		}
		pmdp = pmd_alloc(&init_mm, pudp, idx);
		if (!pmdp)
			continue;
		if (pmd_huge(*pmdp)) {
			ptep = pmdp_ptep(pmdp);
			goto update_the_pte;
		}
		ptep = pte_alloc_kernel(pmdp, idx);
		if (!ptep)
			continue;
update_the_pte:
		radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
	}

	radix__flush_tlb_kernel_range(start, end);
}

void radix__mark_rodata_ro(void)
{
	unsigned long start, end;

	/*
	 * mark_rodata_ro() will mark itself as !writable at some point.
	 * Due to DD1 workaround in radix__pte_update(), we'll end up with
	 * an invalid pte and the system will crash quite severely.
	 */
	if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
		pr_warn("Warning: Unable to mark rodata read only on P9 DD1\n");
		return;
	}

	start = (unsigned long)_stext;
	end = (unsigned long)__init_begin;

	radix__change_memory_range(start, end, _PAGE_WRITE);
}

void radix__mark_initmem_nx(void)
{
	unsigned long start = (unsigned long)__init_begin;
	unsigned long end = (unsigned long)__init_end;

	radix__change_memory_range(start, end, _PAGE_EXEC);
}
#endif /* CONFIG_STRICT_KERNEL_RWX */

static inline void __meminit print_mapping(unsigned long start,
					   unsigned long end,
					   unsigned long size)
{
	char buf[10];

	if (end <= start)
		return;

	string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));

	pr_info("Mapped 0x%016lx-0x%016lx with %s pages\n", start, end, buf);
}

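/*
 * Map a physical address range in the linear mapping, using the largest
 * page size that alignment, remaining length and the supported MMU page
 * sizes allow. With CONFIG_STRICT_KERNEL_RWX, mappings that would span
 * the kernel text boundary are split so text and data do not share a
 * large page.
 */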
static int __meminit create_physical_mapping(unsigned long start,
					     unsigned long end,
					     int nid)
{
	unsigned long vaddr, addr, mapping_size = 0;
	pgprot_t prot;
	unsigned long max_mapping_size;
#ifdef CONFIG_STRICT_KERNEL_RWX
	int split_text_mapping = 1;
#else
	int split_text_mapping = 0;
#endif

	start = _ALIGN_UP(start, PAGE_SIZE);
	for (addr = start; addr < end; addr += mapping_size) {
		unsigned long gap, previous_size;
		int rc;

		gap = end - addr;
		previous_size = mapping_size;
		max_mapping_size = PUD_SIZE;

retry:
		if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
		    mmu_psize_defs[MMU_PAGE_1G].shift &&
		    PUD_SIZE <= max_mapping_size)
			mapping_size = PUD_SIZE;
		else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
			 mmu_psize_defs[MMU_PAGE_2M].shift)
			mapping_size = PMD_SIZE;
		else
			mapping_size = PAGE_SIZE;

		if (split_text_mapping && (mapping_size == PUD_SIZE) &&
			(addr <= __pa_symbol(__init_begin)) &&
			(addr + mapping_size) >= __pa_symbol(_stext)) {
			max_mapping_size = PMD_SIZE;
			goto retry;
		}

		if (split_text_mapping && (mapping_size == PMD_SIZE) &&
		    (addr <= __pa_symbol(__init_begin)) &&
		    (addr + mapping_size) >= __pa_symbol(_stext))
			mapping_size = PAGE_SIZE;

		if (mapping_size != previous_size) {
			print_mapping(start, addr, previous_size);
			start = addr;
		}

		vaddr = (unsigned long)__va(addr);

		if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
		    overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size))
			prot = PAGE_KERNEL_X;
		else
			prot = PAGE_KERNEL;

		rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
		if (rc)
			return rc;
	}

	print_mapping(start, addr, mapping_size);
	return 0;
}

void __init radix_init_pgtable(void)
{
	unsigned long rts_field;
	struct memblock_region *reg;

	/* We don't support slb for radix */
	mmu_slb_size = 0;
	/*
	 * Create the linear mapping, using standard page size for now
	 */
	for_each_memblock(memory, reg) {
		/*
		 * The memblock allocator is up at this point, so the
		 * page tables will be allocated within the range. No
		 * need for a node (which we don't have yet).
		 */
		WARN_ON(create_physical_mapping(reg->base,
						reg->base + reg->size,
						-1));
	}

	/* Find out how many PID bits are supported */
	if (cpu_has_feature(CPU_FTR_HVMODE)) {
		if (!mmu_pid_bits)
			mmu_pid_bits = 20;
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
		/*
		 * When KVM is possible, we only use the top half of the
		 * PID space to avoid collisions between host and guest PIDs
		 * which can cause problems due to prefetch when exiting the
		 * guest with AIL=3
		 */
		mmu_base_pid = 1 << (mmu_pid_bits - 1);
#else
		mmu_base_pid = 1;
#endif
	} else {
		/* The guest uses the bottom half of the PID space */
		if (!mmu_pid_bits)
			mmu_pid_bits = 19;
		mmu_base_pid = 1;
	}

	/*
	 * Allocate Partition table and process table for the
	 * host.
	 */
	BUG_ON(PRTB_SIZE_SHIFT > 36);
	process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
	/*
	 * Fill in the process table.
	 */
	rts_field = radix__get_tree_size();
	process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
	/*
	 * Fill in the partition table. We are supposed to use the effective
	 * address of the process table here, but our linear mapping also
	 * enables us to use the physical address.
	 */
	register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12);
	pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd);
	asm volatile("ptesync" : : : "memory");
	asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
		     "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
	asm volatile("eieio; tlbsync; ptesync" : : : "memory");
	trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1);

	/*
	 * The init_mm context is given the first available (non-zero) PID,
	 * which is the "guard PID" and contains no page table. PIDR should
	 * never be set to zero because that duplicates the kernel address
	 * space at the 0x0... offset (quadrant 0)!
	 *
	 * An arbitrary PID that may later be allocated by the PID allocator
	 * for userspace processes must not be used either, because that
	 * would cause stale user mappings for that PID on CPUs outside of
	 * the TLB invalidation scheme (because it won't be in mm_cpumask).
	 *
	 * So permanently carve out one PID for the purpose of a guard PID.
	 */
	init_mm.context.id = mmu_base_pid;
	mmu_base_pid++;
}

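/*
 * Initialize the partition table and fill in entry 0 for the host with
 * the radix tree size, the kernel PGD and the host-radix (PATB_HR) bit.
 */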
static void __init radix_init_partition_table(void)
{
	unsigned long rts_field, dw0;

	mmu_partition_table_init();
	rts_field = radix__get_tree_size();
	dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
	mmu_partition_table_set_entry(0, dw0, 0);

	pr_info("Initializing Radix MMU\n");
	pr_info("Partition table %p\n", partition_tb);
}

void __init radix_init_native(void)
{
	register_process_table = native_register_process_table;
}

static int __init get_idx_from_shift(unsigned int shift)
{
	int idx = -1;

	switch (shift) {
	case 0xc:
		idx = MMU_PAGE_4K;
		break;
	case 0x10:
		idx = MMU_PAGE_64K;
		break;
	case 0x15:
		idx = MMU_PAGE_2M;
		break;
	case 0x1e:
		idx = MMU_PAGE_1G;
		break;
	}
	return idx;
}

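/*
 * Scan the flattened device tree "cpu" nodes for the radix page size
 * encodings (ibm,processor-radix-AP-encodings) and the PID width
 * (ibm,mmu-pid-bits), and fill in mmu_psize_defs accordingly.
 */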
static int __init radix_dt_scan_page_sizes(unsigned long node,
					   const char *uname, int depth,
					   void *data)
{
	int size = 0;
	int shift, idx;
	unsigned int ap;
	const __be32 *prop;
	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);

	/* We are scanning "cpu" nodes only */
	if (type == NULL || strcmp(type, "cpu") != 0)
		return 0;

	/* Find MMU PID size */
	prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
	if (prop && size == 4)
		mmu_pid_bits = be32_to_cpup(prop);

	/* Grab page size encodings */
	prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
	if (!prop)
		return 0;

	pr_info("Page sizes from device-tree:\n");
	for (; size >= 4; size -= 4, ++prop) {

		struct mmu_psize_def *def;

		/* top 3 bits are the AP encoding */
		shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
		ap = be32_to_cpu(prop[0]) >> 29;
		pr_info("Page size shift = %d AP=0x%x\n", shift, ap);

		idx = get_idx_from_shift(shift);
		if (idx < 0)
			continue;

		def = &mmu_psize_defs[idx];
		def->shift = shift;
		def->ap  = ap;
	}

	/* needed ? */
	cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
	return 1;
}

void __init radix__early_init_devtree(void)
{
	int rc;

	/*
	 * Try to find the available page sizes in the device-tree
	 */
	rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
	if (rc != 0)  /* Found */
		goto found;
	/*
	 * let's assume we have page 4k and 64k support
	 */
	mmu_psize_defs[MMU_PAGE_4K].shift = 12;
	mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;

	mmu_psize_defs[MMU_PAGE_64K].shift = 16;
	mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
found:
#ifdef CONFIG_SPARSEMEM_VMEMMAP
	if (mmu_psize_defs[MMU_PAGE_2M].shift) {
		/*
		 * map vmemmap using 2M if available
		 */
		mmu_vmemmap_psize = MMU_PAGE_2M;
	}
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
	return;
}

static void update_hid_for_radix(void)
{
	unsigned long hid0;
	unsigned long rb = 3UL << PPC_BITLSHIFT(53); /* IS = 3 */

	asm volatile("ptesync": : :"memory");
	/* prs = 0, ric = 2, rs = 0, r = 1 is = 3 */
	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
		     : : "r"(rb), "i"(1), "i"(0), "i"(2), "r"(0) : "memory");
	/* prs = 1, ric = 2, rs = 0, r = 1 is = 3 */
	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
		     : : "r"(rb), "i"(1), "i"(1), "i"(2), "r"(0) : "memory");
	asm volatile("eieio; tlbsync; ptesync; isync; slbia": : :"memory");
	trace_tlbie(0, 0, rb, 0, 2, 0, 1);
	trace_tlbie(0, 0, rb, 0, 2, 1, 1);

	/*
	 * now switch the HID
	 */
	hid0  = mfspr(SPRN_HID0);
	hid0 |= HID0_POWER9_RADIX;
	mtspr(SPRN_HID0, hid0);
	asm volatile("isync": : :"memory");

	/* Wait for it to happen */
	while (!(mfspr(SPRN_HID0) & HID0_POWER9_RADIX))
		cpu_relax();
}

static void radix_init_amor(void)
{
	/*
	 * In HV mode, we init AMOR (Authority Mask Override Register) so that
	 * the hypervisor and guest can setup IAMR (Instruction Authority Mask
	 * Register), enable key 0 and set it to 1.
	 *
	 * AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
	 */
	mtspr(SPRN_AMOR, (3ul << 62));
}

static void radix_init_iamr(void)
{
	unsigned long iamr;

	/*
	 * The IAMR should be set to 0 on DD1.
	 */
	if (cpu_has_feature(CPU_FTR_POWER9_DD1))
		iamr = 0;
	else
		iamr = (1ul << 62);

	/*
	 * Radix always uses key0 of the IAMR to determine if an access is
	 * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
	 * fetch.
	 */
	mtspr(SPRN_IAMR, iamr);
}

void __init radix__early_init_mmu(void)
{
	unsigned long lpcr;

#ifdef CONFIG_PPC_64K_PAGES
	/* PAGE_SIZE mappings */
	mmu_virtual_psize = MMU_PAGE_64K;
#else
	mmu_virtual_psize = MMU_PAGE_4K;
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
	/* vmemmap mapping */
	mmu_vmemmap_psize = mmu_virtual_psize;
#endif
	/*
	 * initialize page table size
	 */
	__pte_index_size = RADIX_PTE_INDEX_SIZE;
	__pmd_index_size = RADIX_PMD_INDEX_SIZE;
	__pud_index_size = RADIX_PUD_INDEX_SIZE;
	__pgd_index_size = RADIX_PGD_INDEX_SIZE;
	__pud_cache_index = RADIX_PUD_INDEX_SIZE;
	__pte_table_size = RADIX_PTE_TABLE_SIZE;
	__pmd_table_size = RADIX_PMD_TABLE_SIZE;
	__pud_table_size = RADIX_PUD_TABLE_SIZE;
	__pgd_table_size = RADIX_PGD_TABLE_SIZE;

	__pmd_val_bits = RADIX_PMD_VAL_BITS;
	__pud_val_bits = RADIX_PUD_VAL_BITS;
	__pgd_val_bits = RADIX_PGD_VAL_BITS;

	__kernel_virt_start = RADIX_KERN_VIRT_START;
	__kernel_virt_size = RADIX_KERN_VIRT_SIZE;
	__vmalloc_start = RADIX_VMALLOC_START;
	__vmalloc_end = RADIX_VMALLOC_END;
	__kernel_io_start = RADIX_KERN_IO_START;
	vmemmap = (struct page *)RADIX_VMEMMAP_BASE;
	ioremap_bot = IOREMAP_BASE;

#ifdef CONFIG_PCI
	pci_io_base = ISA_IO_BASE;
#endif
	__pte_frag_nr = RADIX_PTE_FRAG_NR;
	__pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
	__pmd_frag_nr = RADIX_PMD_FRAG_NR;
	__pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;

	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		radix_init_native();
		if (cpu_has_feature(CPU_FTR_POWER9_DD1))
			update_hid_for_radix();
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
		radix_init_partition_table();
		radix_init_amor();
	} else {
		radix_init_pseries();
	}

	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);

	radix_init_iamr();
	radix_init_pgtable();
	/* Switch to the guard PID before turning on MMU */
	radix__switch_mmu_context(NULL, &init_mm);
	if (cpu_has_feature(CPU_FTR_HVMODE))
		tlbiel_all();
}

void radix__early_init_mmu_secondary(void)
{
	unsigned long lpcr;
	/*
	 * update partition table control register and UPRT
	 */
	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		if (cpu_has_feature(CPU_FTR_POWER9_DD1))
			update_hid_for_radix();

		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);

		mtspr(SPRN_PTCR,
		      __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
		radix_init_amor();
	}
	radix_init_iamr();

	radix__switch_mmu_context(NULL, &init_mm);
	if (cpu_has_feature(CPU_FTR_HVMODE))
		tlbiel_all();
}

void radix__mmu_cleanup_all(void)
{
	unsigned long lpcr;

	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
		mtspr(SPRN_PTCR, 0);
		powernv_set_nmmu_ptcr(0);
		radix__flush_tlb_all();
	}
}

void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
				phys_addr_t first_memblock_size)
{
	/* We don't currently support the first MEMBLOCK not mapping 0
	 * physical on those processors
	 */
	BUG_ON(first_memblock_base != 0);

	/*
	 * Radix mode is not limited by RMA / VRMA addressing.
	 */
	ppc64_rma_size = ULONG_MAX;
}

#ifdef CONFIG_MEMORY_HOTPLUG
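/*
 * Helpers for memory hot-unplug: free a page table page once every
 * entry in it is clear, and clear the upper-level entry that pointed
 * to it.
 */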
static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
{
	pte_t *pte;
	int i;

	for (i = 0; i < PTRS_PER_PTE; i++) {
		pte = pte_start + i;
		if (!pte_none(*pte))
			return;
	}

	pte_free_kernel(&init_mm, pte_start);
	pmd_clear(pmd);
}

static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
{
	pmd_t *pmd;
	int i;

	for (i = 0; i < PTRS_PER_PMD; i++) {
		pmd = pmd_start + i;
		if (!pmd_none(*pmd))
			return;
	}

	pmd_free(&init_mm, pmd_start);
	pud_clear(pud);
}

struct change_mapping_params {
	pte_t *pte;
	unsigned long start;
	unsigned long end;
	unsigned long aligned_start;
	unsigned long aligned_end;
};

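/*
 * Called via stop_machine(): clear the large-page PTE and re-create the
 * parts of the original mapping that fall outside [start, end) with
 * smaller pages, while all other CPUs are held off.
 */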
static int __meminit stop_machine_change_mapping(void *data)
{
	struct change_mapping_params *params =
			(struct change_mapping_params *)data;

	if (!data)
		return -1;

	spin_unlock(&init_mm.page_table_lock);
	pte_clear(&init_mm, params->aligned_start, params->pte);
	create_physical_mapping(params->aligned_start, params->start, -1);
	create_physical_mapping(params->end, params->aligned_end, -1);
	spin_lock(&init_mm.page_table_lock);
	return 0;
}

static void remove_pte_table(pte_t *pte_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pte_t *pte;

	pte = pte_start + pte_index(addr);
	for (; addr < end; addr = next, pte++) {
		next = (addr + PAGE_SIZE) & PAGE_MASK;
		if (next > end)
			next = end;

		if (!pte_present(*pte))
			continue;

		if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
			/*
			 * The vmemmap_free() and remove_section_mapping()
			 * codepaths call us with aligned addresses.
			 */
			WARN_ONCE(1, "%s: unaligned range\n", __func__);
			continue;
		}

		pte_clear(&init_mm, addr, pte);
	}
}

/*
 * Helper to clear the pte and potentially split the mapping.
 */
static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end,
				unsigned long size, pte_t *pte)
{
	unsigned long mask = ~(size - 1);
	unsigned long aligned_start = addr & mask;
	unsigned long aligned_end = addr + size;
	struct change_mapping_params params;
	bool split_region = false;

	if ((end - addr) < size) {
		/*
		 * We're going to clear the PTE, but have not yet flushed
		 * the mapping; time to remap and flush. If the effects are
		 * visible outside the processor, or if we are running in
		 * code close to the mapping we cleared, we are in trouble.
		 */
		if (overlaps_kernel_text(aligned_start, addr) ||
			overlaps_kernel_text(end, aligned_end)) {
			/*
			 * Hack, just return, don't pte_clear
			 */
			WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel "
				  "text, not splitting\n", addr, end);
			return;
		}
		split_region = true;
	}

	if (split_region) {
		params.pte = pte;
		params.start = addr;
		params.end = end;
		params.aligned_start = addr & ~(size - 1);
		params.aligned_end = min_t(unsigned long, aligned_end,
				(unsigned long)__va(memblock_end_of_DRAM()));
		stop_machine(stop_machine_change_mapping, &params, NULL);
		return;
	}

	pte_clear(&init_mm, addr, pte);
}

static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pte_t *pte_base;
	pmd_t *pmd;

	pmd = pmd_start + pmd_index(addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);

		if (!pmd_present(*pmd))
			continue;

		if (pmd_huge(*pmd)) {
			split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd);
			continue;
		}

		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
		remove_pte_table(pte_base, addr, next);
		free_pte_table(pte_base, pmd);
	}
}

static void remove_pud_table(pud_t *pud_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pmd_t *pmd_base;
	pud_t *pud;

	pud = pud_start + pud_index(addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);

		if (!pud_present(*pud))
			continue;

		if (pud_huge(*pud)) {
			split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud);
			continue;
		}

		pmd_base = (pmd_t *)pud_page_vaddr(*pud);
		remove_pmd_table(pmd_base, addr, next);
		free_pmd_table(pmd_base, pud);
	}
}

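/*
 * Tear down the kernel mapping for [start, end), freeing page table
 * pages that become empty and flushing the TLB for the range.
 */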
static void __meminit remove_pagetable(unsigned long start, unsigned long end)
{
	unsigned long addr, next;
	pud_t *pud_base;
	pgd_t *pgd;

	spin_lock(&init_mm.page_table_lock);

	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);

		pgd = pgd_offset_k(addr);
		if (!pgd_present(*pgd))
			continue;

		if (pgd_huge(*pgd)) {
			split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd);
			continue;
		}

		pud_base = (pud_t *)pgd_page_vaddr(*pgd);
		remove_pud_table(pud_base, addr, next);
	}

	spin_unlock(&init_mm.page_table_lock);
	radix__flush_tlb_kernel_range(start, end);
}

int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid)
{
	return create_physical_mapping(start, end, nid);
}

int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
{
	remove_pagetable(start, end);
	return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG */

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
				 pgprot_t flags, unsigned int map_page_size,
				 int nid)
{
	return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
}

int __meminit radix__vmemmap_create_mapping(unsigned long start,
				      unsigned long page_size,
				      unsigned long phys)
{
	/* Create a PTE encoding */
	unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
	int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
	int ret;

	ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid);
	BUG_ON(ret);

	return 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
{
	remove_pagetable(start, start + page_size);
}
#endif
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

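/*
 * Clear and/or set bits in a huge page PMD and return the old value so
 * callers can see what changed. The caller must hold the PMD lock.
 */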
unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
				  pmd_t *pmdp, unsigned long clr,
				  unsigned long set)
{
	unsigned long old;

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
	assert_spin_locked(pmd_lockptr(mm, pmdp));
#endif

	old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
	trace_hugepage_update(addr, old, clr, set);

	return old;
}

pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
			pmd_t *pmdp)

{
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
	VM_BUG_ON(pmd_devmap(*pmdp));
	/*
	 * khugepaged calls this for normal pmd
	 */
	pmd = *pmdp;
	pmd_clear(pmdp);

	/*FIXME!!  Verify whether we need this kick below */
	serialize_against_pte_lookup(vma->vm_mm);

	radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);

	return pmd;
}

/*
 * For us pgtable_t is pte_t *. In order to save the deposited
 * page table, we consider the allocated page table as a list
 * head. On withdraw we need to make sure we zero out the used
 * list_head memory area.
 */
void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				 pgtable_t pgtable)
{
        struct list_head *lh = (struct list_head *) pgtable;

        assert_spin_locked(pmd_lockptr(mm, pmdp));

        /* FIFO */
        if (!pmd_huge_pte(mm, pmdp))
                INIT_LIST_HEAD(lh);
        else
                list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
        pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
        pte_t *ptep;
        pgtable_t pgtable;
        struct list_head *lh;

        assert_spin_locked(pmd_lockptr(mm, pmdp));

        /* FIFO */
        pgtable = pmd_huge_pte(mm, pmdp);
        lh = (struct list_head *) pgtable;
        if (list_empty(lh))
                pmd_huge_pte(mm, pmdp) = NULL;
        else {
                pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
                list_del(lh);
        }
        ptep = (pte_t *) pgtable;
        *ptep = __pte(0);
        ptep++;
        *ptep = __pte(0);
        return pgtable;
}


pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
			       unsigned long addr, pmd_t *pmdp)
{
	pmd_t old_pmd;
	unsigned long old;

	old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
	old_pmd = __pmd(old);
	/*
	 * Serialize against find_current_mm_pte which does lock-less
	 * lookup in page tables with local interrupts disabled. For huge pages
	 * it casts pmd_t to pte_t. Since format of pte_t is different from
	 * pmd_t we want to prevent transit from pmd pointing to page table
	 * to pmd pointing to huge page (and back) while interrupts are disabled.
	 * We clear pmd to possibly replace it with page table pointer in
	 * different code paths. So make sure we wait for the parallel
	 * find_current_mm_pte to finish.
	 */
	serialize_against_pte_lookup(mm);
	return old_pmd;
}

int radix__has_transparent_hugepage(void)
{
	/* For radix 2M at PMD level means thp */
	if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT)
		return 1;
	return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
				  pte_t entry, unsigned long address, int psize)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
					      _PAGE_RW | _PAGE_EXEC);
	/*
	 * To avoid NMMU hang while relaxing access, we need to mark
	 * the pte invalid in between.
	 */
	if (cpu_has_feature(CPU_FTR_POWER9_DD1) ||
	    atomic_read(&mm->context.copros) > 0) {
		unsigned long old_pte, new_pte;

		old_pte = __radix_pte_update(ptep, ~0, 0);
		/*
		 * new value of pte
		 */
		new_pte = old_pte | set;
		radix__flush_tlb_page_psize(mm, address, psize);
		__radix_pte_update(ptep, 0, new_pte);
	} else {
		__radix_pte_update(ptep, 0, set);
		/*
		 * Book3S does not require a TLB flush when relaxing access
		 * restrictions when the address space is not attached to a
		 * NMMU, because the core MMU will reload the pte after taking
		 * an access fault, which is defined by the architecture.
		 */
	}
	asm volatile("ptesync" : : : "memory");
}