// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Page table handling routines for radix page table.
 *
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 */

#define pr_fmt(fmt) "radix-mmu: " fmt

#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/sched/mm.h>
#include <linux/memblock.h>
#include <linux/of.h>
#include <linux/of_fdt.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/string_helpers.h>
#include <linux/memory.h>

#include <asm/pgalloc.h>
#include <asm/mmu_context.h>
#include <asm/dma.h>
#include <asm/machdep.h>
#include <asm/mmu.h>
#include <asm/firmware.h>
#include <asm/powernv.h>
#include <asm/sections.h>
#include <asm/smp.h>
#include <asm/trace.h>
#include <asm/uaccess.h>
#include <asm/ultravisor.h>
#include <asm/set_memory.h>

#include <trace/events/thp.h>

unsigned int mmu_base_pid;
unsigned long radix_mem_block_size __ro_after_init;

static __ref void *early_alloc_pgtable(unsigned long size, int nid,
			unsigned long region_start, unsigned long region_end)
{
	phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
	phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
	void *ptr;

	if (region_start)
		min_addr = region_start;
	if (region_end)
		max_addr = region_end;

	ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);

	if (!ptr)
		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
		      __func__, size, size, nid, &min_addr, &max_addr);

	return ptr;
}

/*
 * When allocating pud or pmd pointers, we allocate a complete page
 * of PAGE_SIZE rather than PUD_TABLE_SIZE or PMD_TABLE_SIZE. This
 * is to ensure that the page obtained from the memblock allocator
 * can be completely used as page table page and can be freed
 * correctly when the page table entries are removed.
 */
static int early_map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size,
			  int nid,
			  unsigned long region_start, unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	pgdp = pgd_offset_k(ea);
	p4dp = p4d_offset(pgdp, ea);
	if (p4d_none(*p4dp)) {
		pudp = early_alloc_pgtable(PAGE_SIZE, nid,
					   region_start, region_end);
		p4d_populate(&init_mm, p4dp, pudp);
	}
	pudp = pud_offset(p4dp, ea);
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	if (pud_none(*pudp)) {
		pmdp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
					   region_end);
		pud_populate(&init_mm, pudp, pmdp);
	}
	pmdp = pmd_offset(pudp, ea);
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	if (!pmd_present(*pmdp)) {
		ptep = early_alloc_pgtable(PAGE_SIZE, nid,
						region_start, region_end);
		pmd_populate_kernel(&init_mm, pmdp, ptep);
	}
	ptep = pte_offset_kernel(pmdp, ea);

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	asm volatile("ptesync": : :"memory");
	return 0;
}

/*
 * nid, region_start, and region_end are hints to try to place the page
 * table memory in the same node or region.
 */
static int __map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size,
			  int nid,
			  unsigned long region_start, unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;
	/*
	 * Make sure task size is correct as per the max addr
	 */
	BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);

#ifdef CONFIG_PPC_64K_PAGES
	BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
#endif

	if (unlikely(!slab_is_available()))
		return early_map_kernel_page(ea, pa, flags, map_page_size,
						nid, region_start, region_end);

	/*
	 * Should make page table allocation functions be able to take a
	 * node, so we can place kernel page tables on the right nodes after
	 * boot.
	 */
	pgdp = pgd_offset_k(ea);
	p4dp = p4d_offset(pgdp, ea);
	pudp = pud_alloc(&init_mm, p4dp, ea);
	if (!pudp)
		return -ENOMEM;
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	pmdp = pmd_alloc(&init_mm, pudp, ea);
	if (!pmdp)
		return -ENOMEM;
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	ptep = pte_alloc_kernel(pmdp, ea);
	if (!ptep)
		return -ENOMEM;

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	asm volatile("ptesync": : :"memory");
	return 0;
}

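/*
 * Map a kernel page with no NUMA node or region placement hints
 * (nid = -1, no region bounds).
 */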
int radix__map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size)
{
	return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
}

#ifdef CONFIG_STRICT_KERNEL_RWX
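/*
 * Clear the given PTE bits (e.g. _PAGE_WRITE or _PAGE_EXEC) on every
 * page mapping the kernel range [start, end), then flush the TLB for
 * that range.
 */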
static void radix__change_memory_range(unsigned long start, unsigned long end,
				       unsigned long clear)
{
	unsigned long idx;
	pgd_t *pgdp;
	p4d_t *p4dp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	start = ALIGN_DOWN(start, PAGE_SIZE);
	end = PAGE_ALIGN(end); // aligns up

	pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
		 start, end, clear);

	for (idx = start; idx < end; idx += PAGE_SIZE) {
		pgdp = pgd_offset_k(idx);
		p4dp = p4d_offset(pgdp, idx);
		pudp = pud_alloc(&init_mm, p4dp, idx);
		if (!pudp)
			continue;
		if (pud_is_leaf(*pudp)) {
			ptep = (pte_t *)pudp;
			goto update_the_pte;
		}
		pmdp = pmd_alloc(&init_mm, pudp, idx);
		if (!pmdp)
			continue;
		if (pmd_is_leaf(*pmdp)) {
			ptep = pmdp_ptep(pmdp);
			goto update_the_pte;
		}
		ptep = pte_alloc_kernel(pmdp, idx);
		if (!ptep)
			continue;
update_the_pte:
		radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
	}

	radix__flush_tlb_kernel_range(start, end);
}

void radix__mark_rodata_ro(void)
{
	unsigned long start, end;

	start = (unsigned long)_stext;
	end = (unsigned long)__end_rodata;

	radix__change_memory_range(start, end, _PAGE_WRITE);
}

void radix__mark_initmem_nx(void)
{
	unsigned long start = (unsigned long)__init_begin;
	unsigned long end = (unsigned long)__init_end;

	radix__change_memory_range(start, end, _PAGE_EXEC);
}
#endif /* CONFIG_STRICT_KERNEL_RWX */

static inline void __meminit
print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
{
	char buf[10];

	if (end <= start)
		return;

	string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));

	pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
		exec ? " (exec)" : "");
}

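/*
 * With STRICT_KERNEL_RWX, stop large-page mappings at the strict RWX
 * boundary so that text/rodata and data never share a mapping that
 * would need different permissions; otherwise map straight to 'end'.
 */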
static unsigned long next_boundary(unsigned long addr, unsigned long end)
{
#ifdef CONFIG_STRICT_KERNEL_RWX
	if (addr < __pa_symbol(__srwx_boundary))
		return __pa_symbol(__srwx_boundary);
#endif
	return end;
}

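/*
 * Map the physical range [start, end) into the kernel linear mapping,
 * using the largest page size (1G, 2M, or the base page size) allowed
 * by alignment, the next attribute boundary and the memory block size.
 */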
static int __meminit create_physical_mapping(unsigned long start,
					     unsigned long end,
					     int nid, pgprot_t _prot)
{
	unsigned long vaddr, addr, mapping_size = 0;
	bool prev_exec, exec = false;
	pgprot_t prot;
	int psize;
	unsigned long max_mapping_size = radix_mem_block_size;

	if (debug_pagealloc_enabled())
		max_mapping_size = PAGE_SIZE;

	start = ALIGN(start, PAGE_SIZE);
	end   = ALIGN_DOWN(end, PAGE_SIZE);
	for (addr = start; addr < end; addr += mapping_size) {
		unsigned long gap, previous_size;
		int rc;

		gap = next_boundary(addr, end) - addr;
		if (gap > max_mapping_size)
			gap = max_mapping_size;
		previous_size = mapping_size;
		prev_exec = exec;

		if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
		    mmu_psize_defs[MMU_PAGE_1G].shift) {
			mapping_size = PUD_SIZE;
			psize = MMU_PAGE_1G;
		} else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
			   mmu_psize_defs[MMU_PAGE_2M].shift) {
			mapping_size = PMD_SIZE;
			psize = MMU_PAGE_2M;
		} else {
			mapping_size = PAGE_SIZE;
			psize = mmu_virtual_psize;
		}

		vaddr = (unsigned long)__va(addr);

		if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
		    overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
			prot = PAGE_KERNEL_X;
			exec = true;
		} else {
			prot = _prot;
			exec = false;
		}

		if (mapping_size != previous_size || exec != prev_exec) {
			print_mapping(start, addr, previous_size, prev_exec);
			start = addr;
		}

		rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
		if (rc)
			return rc;

		update_page_count(psize, 1);
	}

	print_mapping(start, addr, mapping_size, exec);
	return 0;
}

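/*
 * Build the kernel radix page tables: create the linear mapping for
 * every memblock range, allocate and fill the process table, and
 * reserve the guard PID used by init_mm.
 */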
static void __init radix_init_pgtable(void)
{
	unsigned long rts_field;
	phys_addr_t start, end;
	u64 i;

	/* We don't support slb for radix */
	slb_set_size(0);

	/*
	 * Create the linear mapping
	 */
	for_each_mem_range(i, &start, &end) {
		/*
		 * The memblock allocator is up at this point, so the
		 * page tables will be allocated within the range. No
		 * need for a node (which we don't have yet).
		 */

		if (end >= RADIX_VMALLOC_START) {
			pr_warn("Outside the supported range\n");
			continue;
		}

		WARN_ON(create_physical_mapping(start, end,
						-1, PAGE_KERNEL));
	}

	if (!cpu_has_feature(CPU_FTR_HVMODE) &&
			cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
		/*
		 * Older versions of KVM on these machines prefer if the
		 * guest only uses the low 19 PID bits.
		 */
		mmu_pid_bits = 19;
	}
	mmu_base_pid = 1;

	/*
	 * Allocate Partition table and process table for the
	 * host.
	 */
	BUG_ON(PRTB_SIZE_SHIFT > 36);
	process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
	/*
	 * Fill in the process table.
	 */
	rts_field = radix__get_tree_size();
	process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);

	/*
	 * The init_mm context is given the first available (non-zero) PID,
	 * which is the "guard PID" and contains no page table. PIDR should
	 * never be set to zero because that duplicates the kernel address
	 * space at the 0x0... offset (quadrant 0)!
	 *
	 * An arbitrary PID that may later be allocated by the PID allocator
	 * for userspace processes must not be used either, because that
	 * would cause stale user mappings for that PID on CPUs outside of
	 * the TLB invalidation scheme (because it won't be in mm_cpumask).
	 *
	 * So permanently carve out one PID for the purpose of a guard PID.
	 */
	init_mm.context.id = mmu_base_pid;
	mmu_base_pid++;
}

static void __init radix_init_partition_table(void)
{
	unsigned long rts_field, dw0, dw1;

	mmu_partition_table_init();
	rts_field = radix__get_tree_size();
	dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
	dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR;
	mmu_partition_table_set_entry(0, dw0, dw1, false);

	pr_info("Initializing Radix MMU\n");
}

static int __init get_idx_from_shift(unsigned int shift)
{
	int idx = -1;

	switch (shift) {
	case 0xc:
		idx = MMU_PAGE_4K;
		break;
	case 0x10:
		idx = MMU_PAGE_64K;
		break;
	case 0x15:
		idx = MMU_PAGE_2M;
		break;
	case 0x1e:
		idx = MMU_PAGE_1G;
		break;
	}
	return idx;
}

static int __init radix_dt_scan_page_sizes(unsigned long node,
					   const char *uname, int depth,
					   void *data)
{
	int size = 0;
	int shift, idx;
	unsigned int ap;
	const __be32 *prop;
	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);

	/* We are scanning "cpu" nodes only */
	if (type == NULL || strcmp(type, "cpu") != 0)
		return 0;

	/* Grab page size encodings */
	prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
	if (!prop)
		return 0;

	pr_info("Page sizes from device-tree:\n");
	for (; size >= 4; size -= 4, ++prop) {

		struct mmu_psize_def *def;

		/* top 3 bits are the AP encoding */
		shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
		ap = be32_to_cpu(prop[0]) >> 29;
		pr_info("Page size shift = %d AP=0x%x\n", shift, ap);

		idx = get_idx_from_shift(shift);
		if (idx < 0)
			continue;

		def = &mmu_psize_defs[idx];
		def->shift = shift;
		def->ap  = ap;
		def->h_rpt_pgsize = psize_to_rpti_pgsize(idx);
	}

	/* needed ? */
	cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
	return 1;
}

#ifdef CONFIG_MEMORY_HOTPLUG
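/*
 * Flat device-tree scan callback: read the LMB size from the
 * ibm,dynamic-reconfiguration-memory node, falling back to
 * MIN_MEMORY_BLOCK_SIZE when the property is absent.
 */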
static int __init probe_memory_block_size(unsigned long node, const char *uname, int
					  depth, void *data)
{
	unsigned long *mem_block_size = (unsigned long *)data;
484
	const __be32 *prop;
485 486 487 488 489 490 491 492 493
	int len;

	if (depth != 1)
		return 0;

	if (strcmp(uname, "ibm,dynamic-reconfiguration-memory"))
		return 0;

	prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len);

	if (!prop || len < dt_root_size_cells * sizeof(__be32))
		/*
		 * Nothing in the device tree
		 */
		*mem_block_size = MIN_MEMORY_BLOCK_SIZE;
	else
		*mem_block_size = of_read_number(prop, dt_root_size_cells);
	return 1;
}

static unsigned long __init radix_memory_block_size(void)
{
	unsigned long mem_block_size = MIN_MEMORY_BLOCK_SIZE;

	/*
	 * OPAL firmware feature is set by now. Hence we are ok
	 * to test OPAL feature.
	 */
	if (firmware_has_feature(FW_FEATURE_OPAL))
		mem_block_size = 1UL * 1024 * 1024 * 1024;
	else
		of_scan_flat_dt(probe_memory_block_size, &mem_block_size);

	return mem_block_size;
}

#else   /* CONFIG_MEMORY_HOTPLUG */

static unsigned long __init radix_memory_block_size(void)
{
	return 1UL * 1024 * 1024 * 1024;
}

#endif /* CONFIG_MEMORY_HOTPLUG */


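/*
 * Early device-tree parsing for radix: discover the supported page
 * sizes and the memory block size used for the linear mapping.
 */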
void __init radix__early_init_devtree(void)
{
	int rc;

	/*
	 * Try to find the available page sizes in the device-tree
	 */
	rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
	if (!rc) {
		/*
		 * No page size details found in device tree.
		 * Let's assume we have page 4k and 64k support
		 */
		mmu_psize_defs[MMU_PAGE_4K].shift = 12;
		mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
		mmu_psize_defs[MMU_PAGE_4K].h_rpt_pgsize =
			psize_to_rpti_pgsize(MMU_PAGE_4K);

		mmu_psize_defs[MMU_PAGE_64K].shift = 16;
		mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
		mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize =
			psize_to_rpti_pgsize(MMU_PAGE_64K);
	}

	/*
	 * Max mapping size used when mapping pages. We don't use
	 * ppc_md.memory_block_size() here because this gets called
	 * early, before the machine probe has run. Also, the pseries
	 * implementation only checks for ibm,lmb-size. All hypervisors
	 * supporting radix expose that device tree node.
	 */
	radix_mem_block_size = radix_memory_block_size();
	return;
}

void __init radix__early_init_mmu(void)
{
	unsigned long lpcr;

#ifdef CONFIG_PPC_64S_HASH_MMU
#ifdef CONFIG_PPC_64K_PAGES
	/* PAGE_SIZE mappings */
	mmu_virtual_psize = MMU_PAGE_64K;
#else
	mmu_virtual_psize = MMU_PAGE_4K;
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
	/* vmemmap mapping */
	if (mmu_psize_defs[MMU_PAGE_2M].shift) {
		/*
		 * map vmemmap using 2M if available
		 */
		mmu_vmemmap_psize = MMU_PAGE_2M;
	} else
		mmu_vmemmap_psize = mmu_virtual_psize;
#endif
#endif
	/*
	 * initialize page table size
	 */
	__pte_index_size = RADIX_PTE_INDEX_SIZE;
	__pmd_index_size = RADIX_PMD_INDEX_SIZE;
	__pud_index_size = RADIX_PUD_INDEX_SIZE;
	__pgd_index_size = RADIX_PGD_INDEX_SIZE;
	__pud_cache_index = RADIX_PUD_INDEX_SIZE;
	__pte_table_size = RADIX_PTE_TABLE_SIZE;
	__pmd_table_size = RADIX_PMD_TABLE_SIZE;
	__pud_table_size = RADIX_PUD_TABLE_SIZE;
	__pgd_table_size = RADIX_PGD_TABLE_SIZE;

	__pmd_val_bits = RADIX_PMD_VAL_BITS;
	__pud_val_bits = RADIX_PUD_VAL_BITS;
	__pgd_val_bits = RADIX_PGD_VAL_BITS;

	__kernel_virt_start = RADIX_KERN_VIRT_START;
	__vmalloc_start = RADIX_VMALLOC_START;
	__vmalloc_end = RADIX_VMALLOC_END;
	__kernel_io_start = RADIX_KERN_IO_START;
	__kernel_io_end = RADIX_KERN_IO_END;
	vmemmap = (struct page *)RADIX_VMEMMAP_START;
	ioremap_bot = IOREMAP_BASE;

#ifdef CONFIG_PCI
	pci_io_base = ISA_IO_BASE;
#endif
	__pte_frag_nr = RADIX_PTE_FRAG_NR;
	__pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
	__pmd_frag_nr = RADIX_PMD_FRAG_NR;
	__pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;

	radix_init_pgtable();

	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
		radix_init_partition_table();
	} else {
		radix_init_pseries();
	}

	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);

	/* Switch to the guard PID before turning on MMU */
	radix__switch_mmu_context(NULL, &init_mm);
	tlbiel_all();
}

void radix__early_init_mmu_secondary(void)
{
	unsigned long lpcr;
	/*
	 * update partition table control register and UPRT
	 */
	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);

		set_ptcr_when_no_uv(__pa(partition_tb) |
				    (PATB_SIZE_SHIFT - 12));
	}

	radix__switch_mmu_context(NULL, &init_mm);
	tlbiel_all();

	/* Make sure userspace can't change the AMR */
	mtspr(SPRN_UAMOR, 0);
}

/* Called during kexec sequence with MMU off */
notrace void radix__mmu_cleanup_all(void)
{
	unsigned long lpcr;

	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
		set_ptcr_when_no_uv(0);
		powernv_set_nmmu_ptcr(0);
		radix__flush_tlb_all();
	}
}

#ifdef CONFIG_MEMORY_HOTPLUG
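/*
 * The free_*_table() helpers below free a page-table page only when
 * every entry in it is none, then clear the upper-level entry that
 * pointed to it.
 */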
static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
{
	pte_t *pte;
	int i;

	for (i = 0; i < PTRS_PER_PTE; i++) {
		pte = pte_start + i;
		if (!pte_none(*pte))
			return;
	}

	pte_free_kernel(&init_mm, pte_start);
	pmd_clear(pmd);
}

static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
{
	pmd_t *pmd;
	int i;

	for (i = 0; i < PTRS_PER_PMD; i++) {
		pmd = pmd_start + i;
		if (!pmd_none(*pmd))
			return;
	}

	pmd_free(&init_mm, pmd_start);
	pud_clear(pud);
}

static void free_pud_table(pud_t *pud_start, p4d_t *p4d)
{
	pud_t *pud;
	int i;

	for (i = 0; i < PTRS_PER_PUD; i++) {
		pud = pud_start + i;
		if (!pud_none(*pud))
			return;
	}

	pud_free(&init_mm, pud_start);
	p4d_clear(p4d);
}

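/*
 * The remove_*_table() helpers walk one level of the kernel page
 * tables and clear the entries covering [addr, end); non-leaf entries
 * are descended into, and the lower-level tables are freed once empty.
 */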
static void remove_pte_table(pte_t *pte_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pte_t *pte;

	pte = pte_start + pte_index(addr);
	for (; addr < end; addr = next, pte++) {
		next = (addr + PAGE_SIZE) & PAGE_MASK;
		if (next > end)
			next = end;

		if (!pte_present(*pte))
			continue;

		if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
			/*
			 * The vmemmap_free() and remove_section_mapping()
			 * codepaths call us with aligned addresses.
			 */
			WARN_ONCE(1, "%s: unaligned range\n", __func__);
			continue;
		}

		pte_clear(&init_mm, addr, pte);
	}
}

static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pte_t *pte_base;
	pmd_t *pmd;

	pmd = pmd_start + pmd_index(addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);

		if (!pmd_present(*pmd))
			continue;

		if (pmd_is_leaf(*pmd)) {
			if (!IS_ALIGNED(addr, PMD_SIZE) ||
			    !IS_ALIGNED(next, PMD_SIZE)) {
				WARN_ONCE(1, "%s: unaligned range\n", __func__);
				continue;
			}
			pte_clear(&init_mm, addr, (pte_t *)pmd);
			continue;
		}

		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
		remove_pte_table(pte_base, addr, next);
		free_pte_table(pte_base, pmd);
	}
}

static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pmd_t *pmd_base;
	pud_t *pud;

	pud = pud_start + pud_index(addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);

		if (!pud_present(*pud))
			continue;

		if (pud_is_leaf(*pud)) {
			if (!IS_ALIGNED(addr, PUD_SIZE) ||
			    !IS_ALIGNED(next, PUD_SIZE)) {
				WARN_ONCE(1, "%s: unaligned range\n", __func__);
				continue;
			}
			pte_clear(&init_mm, addr, (pte_t *)pud);
			continue;
		}

		pmd_base = pud_pgtable(*pud);
		remove_pmd_table(pmd_base, addr, next);
		free_pmd_table(pmd_base, pud);
	}
}

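/*
 * Tear down the kernel mapping for [start, end) under
 * init_mm.page_table_lock, freeing page-table pages that become
 * empty, then flush the kernel TLB for the range.
 */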
static void __meminit remove_pagetable(unsigned long start, unsigned long end)
{
	unsigned long addr, next;
	pud_t *pud_base;
	pgd_t *pgd;
	p4d_t *p4d;

	spin_lock(&init_mm.page_table_lock);

	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);

		pgd = pgd_offset_k(addr);
		p4d = p4d_offset(pgd, addr);
		if (!p4d_present(*p4d))
			continue;

		if (p4d_is_leaf(*p4d)) {
			if (!IS_ALIGNED(addr, P4D_SIZE) ||
			    !IS_ALIGNED(next, P4D_SIZE)) {
				WARN_ONCE(1, "%s: unaligned range\n", __func__);
				continue;
			}

			pte_clear(&init_mm, addr, (pte_t *)pgd);
			continue;
		}

		pud_base = p4d_pgtable(*p4d);
		remove_pud_table(pud_base, addr, next);
		free_pud_table(pud_base, p4d);
	}

	spin_unlock(&init_mm.page_table_lock);
	radix__flush_tlb_kernel_range(start, end);
}

int __meminit radix__create_section_mapping(unsigned long start,
					    unsigned long end, int nid,
					    pgprot_t prot)
{
	if (end >= RADIX_VMALLOC_START) {
		pr_warn("Outside the supported range\n");
		return -1;
	}

	return create_physical_mapping(__pa(start), __pa(end),
				       nid, prot);
}

int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
{
	remove_pagetable(start, end);
	return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG */

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
				 pgprot_t flags, unsigned int map_page_size,
				 int nid)
{
	return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
}

int __meminit radix__vmemmap_create_mapping(unsigned long start,
				      unsigned long page_size,
				      unsigned long phys)
{
	/* Create a PTE encoding */
	unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
	int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
	int ret;

	if ((start + page_size) >= RADIX_VMEMMAP_END) {
		pr_warn("Outside the supported range\n");
		return -1;
	}

	ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid);
	BUG_ON(ret);

	return 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
{
	remove_pagetable(start, start + page_size);
}
#endif
#endif

#ifdef CONFIG_DEBUG_PAGEALLOC
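/*
 * DEBUG_PAGEALLOC hook: mark the given kernel pages present or not
 * present in the linear map as they are allocated and freed.
 */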
void radix__kernel_map_pages(struct page *page, int numpages, int enable)
{
	unsigned long addr;

	addr = (unsigned long)page_address(page);

	if (enable)
		set_memory_p(addr, numpages);
	else
		set_memory_np(addr, numpages);
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
				  pmd_t *pmdp, unsigned long clr,
				  unsigned long set)
{
	unsigned long old;

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
	assert_spin_locked(pmd_lockptr(mm, pmdp));
#endif

	old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
	trace_hugepage_update(addr, old, clr, set);

	return old;
}

pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
			pmd_t *pmdp)

{
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
	VM_BUG_ON(pmd_devmap(*pmdp));
	/*
	 * khugepaged calls this for normal pmd
	 */
	pmd = *pmdp;
	pmd_clear(pmdp);

	/*
	 * pmdp_collapse_flush needs to ensure that there are no parallel GUP
	 * walks after this call. This is needed so that we can have a stable
	 * page ref count when collapsing a page. We don't allow collapsing a
	 * page if GUP has taken a reference on it. We can ensure that by
	 * sending an IPI, because the GUP walk happens with IRQs disabled.
	 */
	serialize_against_pte_lookup(vma->vm_mm);

	radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);

	return pmd;
}

/*
 * For us pgtable_t is pte_t *. In order to save the deposited
 * page table, we consider the allocated page table as a list
 * head. On withdraw we need to make sure we zero out the used
 * list_head memory area.
 */
void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				 pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pte_t *ptep;
	pgtable_t pgtable;
	struct list_head *lh;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	*ptep = __pte(0);
	ptep++;
	*ptep = __pte(0);
	return pgtable;
}

pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
				     unsigned long addr, pmd_t *pmdp)
{
	pmd_t old_pmd;
	unsigned long old;

	old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
	old_pmd = __pmd(old);
	return old_pmd;
}

#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

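/*
 * Apply relaxed permission/reference/dirty bits from 'entry' to an
 * existing PTE. See the comment below for the POWER9 nest MMU case,
 * which requires an invalidate and TLB flush first.
 */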
void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
				  pte_t entry, unsigned long address, int psize)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
					      _PAGE_RW | _PAGE_EXEC);

	unsigned long change = pte_val(entry) ^ pte_val(*ptep);
	/*
	 * On POWER9, the NMMU is not able to relax PTE access permissions
	 * for a translation with a TLB. The PTE must be invalidated, TLB
	 * flushed before the new PTE is installed.
	 *
	 * This only needs to be done for radix, because hash translation does
	 * flush when updating the linux pte (and we don't support NMMU
	 * accelerators on HPT on POWER9 anyway XXX: do we?).
	 *
	 * POWER10 (and P9P) NMMU does behave as per ISA.
	 */
	if (!cpu_has_feature(CPU_FTR_ARCH_31) && (change & _PAGE_RW) &&
	    atomic_read(&mm->context.copros) > 0) {
		unsigned long old_pte, new_pte;

		old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
		new_pte = old_pte | set;
		radix__flush_tlb_page_psize(mm, address, psize);
		__radix_pte_update(ptep, _PAGE_INVALID, new_pte);
	} else {
		__radix_pte_update(ptep, 0, set);
		/*
		 * Book3S does not require a TLB flush when relaxing access
		 * restrictions on an address space (modulo the POWER9 nest
		 * MMU issue above), because the MMU will reload the PTE after
		 * taking an access fault, as defined by the architecture. See
		 * "Setting a Reference or Change Bit or Upgrading Access
		 *  Authority (PTE Subject to Atomic Hardware Updates)" in
		 *  Power ISA Version 3.1B.
		 */
	}
	/* See ptesync comment in radix__set_pte_at */
}

void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
				    unsigned long addr, pte_t *ptep,
				    pte_t old_pte, pte_t pte)
{
	struct mm_struct *mm = vma->vm_mm;

	/*
	 * POWER9 NMMU must flush the TLB after clearing the PTE before
	 * installing a PTE with more relaxed access permissions, see
	 * radix__ptep_set_access_flags.
	 */
	if (!cpu_has_feature(CPU_FTR_ARCH_31) &&
	    is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
	    (atomic_read(&mm->context.copros) > 0))
		radix__flush_tlb_page(vma, addr);

	set_pte_at(mm, addr, ptep, pte);
}

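/*
 * Huge vmap/ioremap helpers: install and tear down 1G (PUD) and
 * 2M (PMD) leaf mappings in the kernel page tables.
 */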
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
	pte_t *ptep = (pte_t *)pud;
	pte_t new_pud = pfn_pte(__phys_to_pfn(addr), prot);

	if (!radix_enabled())
		return 0;

	set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud);

	return 1;
}

int pud_clear_huge(pud_t *pud)
{
	if (pud_is_leaf(*pud)) {
		pud_clear(pud);
		return 1;
	}

	return 0;
}

int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
	pmd_t *pmd;
	int i;

	pmd = pud_pgtable(*pud);
	pud_clear(pud);

	flush_tlb_kernel_range(addr, addr + PUD_SIZE);

	for (i = 0; i < PTRS_PER_PMD; i++) {
		if (!pmd_none(pmd[i])) {
			pte_t *pte;
			pte = (pte_t *)pmd_page_vaddr(pmd[i]);

			pte_free_kernel(&init_mm, pte);
		}
	}

	pmd_free(&init_mm, pmd);

	return 1;
}

int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
	pte_t *ptep = (pte_t *)pmd;
	pte_t new_pmd = pfn_pte(__phys_to_pfn(addr), prot);

	if (!radix_enabled())
		return 0;

	set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd);

	return 1;
}

int pmd_clear_huge(pmd_t *pmd)
{
	if (pmd_is_leaf(*pmd)) {
		pmd_clear(pmd);
		return 1;
	}

	return 0;
}

int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
	pte_t *pte;

	pte = (pte_t *)pmd_page_vaddr(*pmd);
	pmd_clear(pmd);

	flush_tlb_kernel_range(addr, addr + PMD_SIZE);

	pte_free_kernel(&init_mm, pte);

	return 1;
}