radix_pgtable.c
/*
 * Page table handling routines for radix page table.
 *
 * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "radix-mmu: " fmt

#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/sched/mm.h>
#include <linux/memblock.h>
#include <linux/of_fdt.h>
#include <linux/mm.h>
#include <linux/string_helpers.h>
#include <linux/stop_machine.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/mmu_context.h>
#include <asm/dma.h>
#include <asm/machdep.h>
#include <asm/mmu.h>
#include <asm/firmware.h>
#include <asm/powernv.h>
#include <asm/sections.h>
#include <asm/trace.h>
#include <asm/uaccess.h>

#include <trace/events/thp.h>

unsigned int mmu_pid_bits;
unsigned int mmu_base_pid;

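/*
 * Register the process table with the hardware by writing its base and
 * size into partition table entry 0; used when running bare metal,
 * where the kernel owns the partition table.
 */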
static int native_register_process_table(unsigned long base, unsigned long pg_sz,
					 unsigned long table_size)
{
	unsigned long patb0, patb1;

	patb0 = be64_to_cpu(partition_tb[0].patb0);
	patb1 = base | table_size | PATB_GR;

	mmu_partition_table_set_entry(0, patb0, patb1);

	return 0;
}

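/*
 * Boot-time page table allocation: grab naturally aligned memory from
 * memblock, optionally constrained to a node and/or a physical range.
 */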
static __ref void *early_alloc_pgtable(unsigned long size, int nid,
			unsigned long region_start, unsigned long region_end)
{
	phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
	phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
	void *ptr;

	if (region_start)
		min_addr = region_start;
	if (region_end)
		max_addr = region_end;

	ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);

	if (!ptr)
		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
		      __func__, size, size, nid, &min_addr, &max_addr);

	return ptr;
}

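/*
 * Map a kernel page before the slab allocator is available, populating
 * any missing page table levels straight from memblock.
 */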
static int early_map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size,
			  int nid,
			  unsigned long region_start, unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	pgdp = pgd_offset_k(ea);
	if (pgd_none(*pgdp)) {
		pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
						region_start, region_end);
		pgd_populate(&init_mm, pgdp, pudp);
	}
	pudp = pud_offset(pgdp, ea);
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	if (pud_none(*pudp)) {
		pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
						region_start, region_end);
		pud_populate(&init_mm, pudp, pmdp);
	}
	pmdp = pmd_offset(pudp, ea);
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	if (!pmd_present(*pmdp)) {
		ptep = early_alloc_pgtable(PAGE_SIZE, nid,
						region_start, region_end);
		pmd_populate_kernel(&init_mm, pmdp, ptep);
	}
	ptep = pte_offset_kernel(pmdp, ea);

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	smp_wmb();
	return 0;
}

/*
 * nid, region_start, and region_end are hints to try to place the page
 * table memory in the same node or region.
 */
static int __map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size,
			  int nid,
			  unsigned long region_start, unsigned long region_end)
{
	unsigned long pfn = pa >> PAGE_SHIFT;
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;
	/*
	 * Make sure the task size is correct as per the max addr
	 */
	BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);

#ifdef CONFIG_PPC_64K_PAGES
	BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
#endif

	if (unlikely(!slab_is_available()))
		return early_map_kernel_page(ea, pa, flags, map_page_size,
						nid, region_start, region_end);

	/*
	 * We should make the page table allocation functions able to take a
	 * node, so we can place kernel page tables on the right nodes after
	 * boot.
	 */
	pgdp = pgd_offset_k(ea);
	pudp = pud_alloc(&init_mm, pgdp, ea);
	if (!pudp)
		return -ENOMEM;
	if (map_page_size == PUD_SIZE) {
		ptep = (pte_t *)pudp;
		goto set_the_pte;
	}
	pmdp = pmd_alloc(&init_mm, pudp, ea);
	if (!pmdp)
		return -ENOMEM;
	if (map_page_size == PMD_SIZE) {
		ptep = pmdp_ptep(pmdp);
		goto set_the_pte;
	}
	ptep = pte_alloc_kernel(pmdp, ea);
	if (!ptep)
		return -ENOMEM;

set_the_pte:
	set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
	smp_wmb();
	return 0;
}

int radix__map_kernel_page(unsigned long ea, unsigned long pa,
			  pgprot_t flags,
			  unsigned int map_page_size)
{
	return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
}

#ifdef CONFIG_STRICT_KERNEL_RWX
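/*
 * Clear the given PTE bits on every kernel mapping in [start, end) and
 * flush the TLB for the range; used to enforce STRICT_KERNEL_RWX.
 */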
void radix__change_memory_range(unsigned long start, unsigned long end,
				unsigned long clear)
{
	unsigned long idx;
	pgd_t *pgdp;
	pud_t *pudp;
	pmd_t *pmdp;
	pte_t *ptep;

	start = ALIGN_DOWN(start, PAGE_SIZE);
	end = PAGE_ALIGN(end); // aligns up

	pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
		 start, end, clear);

	for (idx = start; idx < end; idx += PAGE_SIZE) {
		pgdp = pgd_offset_k(idx);
		pudp = pud_alloc(&init_mm, pgdp, idx);
		if (!pudp)
			continue;
		if (pud_huge(*pudp)) {
			ptep = (pte_t *)pudp;
			goto update_the_pte;
		}
		pmdp = pmd_alloc(&init_mm, pudp, idx);
		if (!pmdp)
			continue;
		if (pmd_huge(*pmdp)) {
			ptep = pmdp_ptep(pmdp);
			goto update_the_pte;
		}
		ptep = pte_alloc_kernel(pmdp, idx);
		if (!ptep)
			continue;
update_the_pte:
		radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
	}

	radix__flush_tlb_kernel_range(start, end);
}

void radix__mark_rodata_ro(void)
{
	unsigned long start, end;

	start = (unsigned long)_stext;
	end = (unsigned long)__init_begin;

	radix__change_memory_range(start, end, _PAGE_WRITE);
}

void radix__mark_initmem_nx(void)
{
	unsigned long start = (unsigned long)__init_begin;
	unsigned long end = (unsigned long)__init_end;

	radix__change_memory_range(start, end, _PAGE_EXEC);
}
#endif /* CONFIG_STRICT_KERNEL_RWX */

static inline void __meminit
print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
{
	char buf[10];

	if (end <= start)
		return;

	string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));

	pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
		exec ? " (exec)" : "");
}

static unsigned long next_boundary(unsigned long addr, unsigned long end)
{
#ifdef CONFIG_STRICT_KERNEL_RWX
	if (addr < __pa_symbol(__init_begin))
		return __pa_symbol(__init_begin);
#endif
	return end;
}

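/*
 * Map a physical range into the kernel linear mapping, stepping down
 * from 1G to 2M to base page size depending on alignment, the distance
 * to the next boundary and the page sizes the MMU supports.
 */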
static int __meminit create_physical_mapping(unsigned long start,
					     unsigned long end,
					     int nid)
{
	unsigned long vaddr, addr, mapping_size = 0;
	bool prev_exec, exec = false;
	pgprot_t prot;
	int psize;

	start = _ALIGN_UP(start, PAGE_SIZE);
	for (addr = start; addr < end; addr += mapping_size) {
		unsigned long gap, previous_size;
		int rc;

		gap = next_boundary(addr, end) - addr;
		previous_size = mapping_size;
		prev_exec = exec;

		if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
		    mmu_psize_defs[MMU_PAGE_1G].shift) {
			mapping_size = PUD_SIZE;
			psize = MMU_PAGE_1G;
		} else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
			   mmu_psize_defs[MMU_PAGE_2M].shift) {
			mapping_size = PMD_SIZE;
			psize = MMU_PAGE_2M;
		} else {
			mapping_size = PAGE_SIZE;
			psize = mmu_virtual_psize;
		}

		vaddr = (unsigned long)__va(addr);

		if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
		    overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
			prot = PAGE_KERNEL_X;
			exec = true;
		} else {
			prot = PAGE_KERNEL;
			exec = false;
		}

		if (mapping_size != previous_size || exec != prev_exec) {
			print_mapping(start, addr, previous_size, prev_exec);
			start = addr;
		}

		rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
		if (rc)
			return rc;

		update_page_count(psize, 1);
	}

	print_mapping(start, addr, mapping_size, exec);
	return 0;
}

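/*
 * Build the kernel radix page tables: linear-map all of memory, work
 * out the PID allocation range, and allocate and register the process
 * table for the host.
 */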
static void __init radix_init_pgtable(void)
{
	unsigned long rts_field;
	struct memblock_region *reg;

	/* We don't support slb for radix */
	mmu_slb_size = 0;
	/*
	 * Create the linear mapping, using standard page size for now
	 */
	for_each_memblock(memory, reg) {
		/*
		 * The memblock allocator is up at this point, so the
		 * page tables will be allocated within the range. No
		 * need for a node (which we don't have yet).
		 */

		if ((reg->base + reg->size) >= RADIX_VMALLOC_START) {
			pr_warn("Outside the supported range\n");
			continue;
		}

		WARN_ON(create_physical_mapping(reg->base,
						reg->base + reg->size,
						-1));
	}

	/* Find out how many PID bits are supported */
	if (cpu_has_feature(CPU_FTR_HVMODE)) {
		if (!mmu_pid_bits)
			mmu_pid_bits = 20;
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
		/*
		 * When KVM is possible, we only use the top half of the
		 * PID space to avoid collisions between host and guest PIDs
		 * which can cause problems due to prefetch when exiting the
		 * guest with AIL=3
		 */
		mmu_base_pid = 1 << (mmu_pid_bits - 1);
#else
		mmu_base_pid = 1;
#endif
	} else {
		/* The guest uses the bottom half of the PID space */
		if (!mmu_pid_bits)
			mmu_pid_bits = 19;
		mmu_base_pid = 1;
	}

	/*
	 * Allocate Partition table and process table for the
	 * host.
	 */
	BUG_ON(PRTB_SIZE_SHIFT > 36);
	process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
	/*
	 * Fill in the process table.
	 */
	rts_field = radix__get_tree_size();
	process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
	/*
	 * Fill in the partition table. We are supposed to use the effective
	 * address of the process table here, but our linear mapping also
	 * enables us to use the physical address.
	 */
	register_process_table(__pa(process_tb), 0, PRTB_SIZE_SHIFT - 12);
	pr_info("Process table %p and radix root for kernel: %p\n", process_tb, init_mm.pgd);
	asm volatile("ptesync" : : : "memory");
	asm volatile(PPC_TLBIE_5(%0,%1,2,1,1) : :
		     "r" (TLBIEL_INVAL_SET_LPID), "r" (0));
	asm volatile("eieio; tlbsync; ptesync" : : : "memory");
	trace_tlbie(0, 0, TLBIEL_INVAL_SET_LPID, 0, 2, 1, 1);

	/*
	 * The init_mm context is given the first available (non-zero) PID,
	 * which is the "guard PID" and contains no page table. PIDR should
	 * never be set to zero because that duplicates the kernel address
	 * space at the 0x0... offset (quadrant 0)!
	 *
	 * An arbitrary PID that may later be allocated by the PID allocator
	 * for userspace processes must not be used either, because that
	 * would cause stale user mappings for that PID on CPUs outside of
	 * the TLB invalidation scheme (because it won't be in mm_cpumask).
	 *
	 * So permanently carve out one PID for the purpose of a guard PID.
	 */
	init_mm.context.id = mmu_base_pid;
	mmu_base_pid++;
}

static void __init radix_init_partition_table(void)
{
	unsigned long rts_field, dw0;

	mmu_partition_table_init();
	rts_field = radix__get_tree_size();
	dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
	mmu_partition_table_set_entry(0, dw0, 0);

	pr_info("Initializing Radix MMU\n");
	pr_info("Partition table %p\n", partition_tb);
}

void __init radix_init_native(void)
{
	register_process_table = native_register_process_table;
}

static int __init get_idx_from_shift(unsigned int shift)
{
	int idx = -1;

	switch (shift) {
	case 0xc:
		idx = MMU_PAGE_4K;
		break;
	case 0x10:
		idx = MMU_PAGE_64K;
		break;
	case 0x15:
		idx = MMU_PAGE_2M;
		break;
	case 0x1e:
		idx = MMU_PAGE_1G;
		break;
	}
	return idx;
}

static int __init radix_dt_scan_page_sizes(unsigned long node,
					   const char *uname, int depth,
					   void *data)
{
	int size = 0;
	int shift, idx;
	unsigned int ap;
	const __be32 *prop;
	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);

	/* We are scanning "cpu" nodes only */
	if (type == NULL || strcmp(type, "cpu") != 0)
		return 0;

	/* Find MMU PID size */
	prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size);
	if (prop && size == 4)
		mmu_pid_bits = be32_to_cpup(prop);

	/* Grab page size encodings */
	prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
	if (!prop)
		return 0;

	pr_info("Page sizes from device-tree:\n");
	for (; size >= 4; size -= 4, ++prop) {

		struct mmu_psize_def *def;

		/* top 3 bit is AP encoding */
		shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
		ap = be32_to_cpu(prop[0]) >> 29;
		pr_info("Page size shift = %d AP=0x%x\n", shift, ap);

		idx = get_idx_from_shift(shift);
		if (idx < 0)
			continue;

		def = &mmu_psize_defs[idx];
		def->shift = shift;
		def->ap  = ap;
	}

	/* needed ? */
	cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
	return 1;
}

void __init radix__early_init_devtree(void)
{
	int rc;

	/*
	 * Try to find the available page sizes in the device-tree
	 */
	rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
	if (rc != 0)  /* Found */
		goto found;
	/*
	 * let's assume we have page 4k and 64k support
	 */
	mmu_psize_defs[MMU_PAGE_4K].shift = 12;
	mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;

	mmu_psize_defs[MMU_PAGE_64K].shift = 16;
	mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
found:
#ifdef CONFIG_SPARSEMEM_VMEMMAP
	if (mmu_psize_defs[MMU_PAGE_2M].shift) {
		/*
		 * map vmemmap using 2M if available
		 */
		mmu_vmemmap_psize = MMU_PAGE_2M;
	}
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
	return;
}

static void radix_init_amor(void)
{
	/*
	* In HV mode, we init AMOR (Authority Mask Override Register) so that
	* the hypervisor and guest can setup IAMR (Instruction Authority Mask
	* Register), enable key 0 and set it to 1.
	*
	* AMOR = 0b1100 .... 0000 (Mask for key 0 is 11)
	*/
	mtspr(SPRN_AMOR, (3ul << 62));
}

#ifdef CONFIG_PPC_KUEP
void setup_kuep(bool disabled)
{
	if (disabled || !early_radix_enabled())
		return;

	if (smp_processor_id() == boot_cpuid)
		pr_info("Activating Kernel Userspace Execution Prevention\n");

	/*
	 * Radix always uses key0 of the IAMR to determine if an access is
	 * allowed. We set bit 0 (IBM bit 1) of key0, to prevent instruction
	 * fetch.
	 */
	mtspr(SPRN_IAMR, (1ul << 62));
}
#endif

#ifdef CONFIG_PPC_KUAP
void setup_kuap(bool disabled)
{
	if (disabled || !early_radix_enabled())
		return;

	if (smp_processor_id() == boot_cpuid) {
		pr_info("Activating Kernel Userspace Access Prevention\n");
		cur_cpu_spec->mmu_features |= MMU_FTR_RADIX_KUAP;
	}

	/* Make sure userspace can't change the AMR */
	mtspr(SPRN_UAMOR, 0);
	mtspr(SPRN_AMR, AMR_KUAP_BLOCKED);
	isync();
}
#endif

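/*
 * Boot-CPU MMU setup for radix: choose page sizes, publish the radix
 * geometry through the generic page table constants, set up the
 * partition and process tables and switch to the guard PID.
 */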
void __init radix__early_init_mmu(void)
{
	unsigned long lpcr;

#ifdef CONFIG_PPC_64K_PAGES
	/* PAGE_SIZE mappings */
	mmu_virtual_psize = MMU_PAGE_64K;
#else
	mmu_virtual_psize = MMU_PAGE_4K;
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
	/* vmemmap mapping */
	mmu_vmemmap_psize = mmu_virtual_psize;
#endif
	/*
	 * initialize page table size
	 */
	__pte_index_size = RADIX_PTE_INDEX_SIZE;
	__pmd_index_size = RADIX_PMD_INDEX_SIZE;
	__pud_index_size = RADIX_PUD_INDEX_SIZE;
	__pgd_index_size = RADIX_PGD_INDEX_SIZE;
	__pud_cache_index = RADIX_PUD_INDEX_SIZE;
	__pte_table_size = RADIX_PTE_TABLE_SIZE;
	__pmd_table_size = RADIX_PMD_TABLE_SIZE;
	__pud_table_size = RADIX_PUD_TABLE_SIZE;
	__pgd_table_size = RADIX_PGD_TABLE_SIZE;

	__pmd_val_bits = RADIX_PMD_VAL_BITS;
	__pud_val_bits = RADIX_PUD_VAL_BITS;
	__pgd_val_bits = RADIX_PGD_VAL_BITS;

	__kernel_virt_start = RADIX_KERN_VIRT_START;
	__vmalloc_start = RADIX_VMALLOC_START;
	__vmalloc_end = RADIX_VMALLOC_END;
	__kernel_io_start = RADIX_KERN_IO_START;
	__kernel_io_end = RADIX_KERN_IO_END;
	vmemmap = (struct page *)RADIX_VMEMMAP_START;
	ioremap_bot = IOREMAP_BASE;

#ifdef CONFIG_PCI
	pci_io_base = ISA_IO_BASE;
#endif
	__pte_frag_nr = RADIX_PTE_FRAG_NR;
	__pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
	__pmd_frag_nr = RADIX_PMD_FRAG_NR;
	__pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;

	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		radix_init_native();
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
		radix_init_partition_table();
		radix_init_amor();
	} else {
		radix_init_pseries();
	}

	memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);

	radix_init_pgtable();
	/* Switch to the guard PID before turning on MMU */
	radix__switch_mmu_context(NULL, &init_mm);
	if (cpu_has_feature(CPU_FTR_HVMODE))
		tlbiel_all();
}

void radix__early_init_mmu_secondary(void)
{
	unsigned long lpcr;
	/*
	 * update partition table control register and UPRT
	 */
	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);

		mtspr(SPRN_PTCR,
		      __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
		radix_init_amor();
	}

	radix__switch_mmu_context(NULL, &init_mm);
	if (cpu_has_feature(CPU_FTR_HVMODE))
		tlbiel_all();
}

void radix__mmu_cleanup_all(void)
{
	unsigned long lpcr;

	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
		lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
		mtspr(SPRN_PTCR, 0);
		powernv_set_nmmu_ptcr(0);
		radix__flush_tlb_all();
	}
}

void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
				phys_addr_t first_memblock_size)
{
	/*
	 * We don't currently support the first MEMBLOCK not mapping 0
	 * physical on those processors
	 */
	BUG_ON(first_memblock_base != 0);

	/*
	 * Radix mode is not limited by RMA / VRMA addressing.
	 */
	ppc64_rma_size = ULONG_MAX;
}

#ifdef CONFIG_MEMORY_HOTPLUG
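/*
 * Free a PTE page and clear its PMD entry, but only once every slot in
 * the page is pte_none(); free_pmd_table() below does the same one
 * level up.
 */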
static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
{
	pte_t *pte;
	int i;

	for (i = 0; i < PTRS_PER_PTE; i++) {
		pte = pte_start + i;
		if (!pte_none(*pte))
			return;
	}

	pte_free_kernel(&init_mm, pte_start);
	pmd_clear(pmd);
}

static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
{
	pmd_t *pmd;
	int i;

	for (i = 0; i < PTRS_PER_PMD; i++) {
		pmd = pmd_start + i;
		if (!pmd_none(*pmd))
			return;
	}

	pmd_free(&init_mm, pmd_start);
	pud_clear(pud);
}

struct change_mapping_params {
	pte_t *pte;
	unsigned long start;
	unsigned long end;
	unsigned long aligned_start;
	unsigned long aligned_end;
};

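/*
 * stop_machine() callback used when splitting a huge kernel mapping:
 * clear the huge PTE and re-create the portions outside the range
 * being removed with smaller mappings.
 */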
static int __meminit stop_machine_change_mapping(void *data)
{
	struct change_mapping_params *params =
			(struct change_mapping_params *)data;

	if (!data)
		return -1;

	spin_unlock(&init_mm.page_table_lock);
	pte_clear(&init_mm, params->aligned_start, params->pte);
	create_physical_mapping(params->aligned_start, params->start, -1);
	create_physical_mapping(params->end, params->aligned_end, -1);
	spin_lock(&init_mm.page_table_lock);
	return 0;
}

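/*
 * Clear every present PTE in [addr, end); callers are expected to pass
 * page-aligned ranges, anything else is warned about and skipped.
 */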
static void remove_pte_table(pte_t *pte_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pte_t *pte;

	pte = pte_start + pte_index(addr);
	for (; addr < end; addr = next, pte++) {
		next = (addr + PAGE_SIZE) & PAGE_MASK;
		if (next > end)
			next = end;

		if (!pte_present(*pte))
			continue;

		if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) {
			/*
			 * The vmemmap_free() and remove_section_mapping()
			 * codepaths call us with aligned addresses.
			 */
			WARN_ONCE(1, "%s: unaligned range\n", __func__);
			continue;
		}

		pte_clear(&init_mm, addr, pte);
	}
}

/*
 * Helper to clear the pte and potentially split the mapping.
 */
static void __meminit split_kernel_mapping(unsigned long addr, unsigned long end,
				unsigned long size, pte_t *pte)
{
	unsigned long mask = ~(size - 1);
	unsigned long aligned_start = addr & mask;
	unsigned long aligned_end = addr + size;
	struct change_mapping_params params;
	bool split_region = false;

	if ((end - addr) < size) {
		/*
		 * We're going to clear the PTE, but we haven't flushed
		 * the mapping yet, so we need to remap and flush. If the
		 * change is visible outside the processor, or if we are
		 * running in code close to the mapping we cleared, we
		 * are in trouble.
		 */
		if (overlaps_kernel_text(aligned_start, addr) ||
			overlaps_kernel_text(end, aligned_end)) {
			/*
			 * Hack, just return, don't pte_clear
			 */
			WARN_ONCE(1, "Linear mapping %lx->%lx overlaps kernel "
				  "text, not splitting\n", addr, end);
			return;
		}
		split_region = true;
	}

	if (split_region) {
		params.pte = pte;
		params.start = addr;
		params.end = end;
		params.aligned_start = addr & ~(size - 1);
		params.aligned_end = min_t(unsigned long, aligned_end,
				(unsigned long)__va(memblock_end_of_DRAM()));
		stop_machine(stop_machine_change_mapping, &params, NULL);
		return;
	}

	pte_clear(&init_mm, addr, pte);
}

static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pte_t *pte_base;
	pmd_t *pmd;

	pmd = pmd_start + pmd_index(addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);

		if (!pmd_present(*pmd))
			continue;

		if (pmd_huge(*pmd)) {
			split_kernel_mapping(addr, end, PMD_SIZE, (pte_t *)pmd);
			continue;
		}

		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
		remove_pte_table(pte_base, addr, next);
		free_pte_table(pte_base, pmd);
	}
}

static void remove_pud_table(pud_t *pud_start, unsigned long addr,
			     unsigned long end)
{
	unsigned long next;
	pmd_t *pmd_base;
	pud_t *pud;

	pud = pud_start + pud_index(addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);

		if (!pud_present(*pud))
			continue;

		if (pud_huge(*pud)) {
			split_kernel_mapping(addr, end, PUD_SIZE, (pte_t *)pud);
			continue;
		}

		pmd_base = (pmd_t *)pud_page_vaddr(*pud);
		remove_pmd_table(pmd_base, addr, next);
		free_pmd_table(pmd_base, pud);
	}
}

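/*
 * Tear down the kernel mapping for [start, end): walk every level,
 * clear or split the entries, free page table pages that become empty
 * and flush the TLB for the range.
 */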
static void __meminit remove_pagetable(unsigned long start, unsigned long end)
{
	unsigned long addr, next;
	pud_t *pud_base;
	pgd_t *pgd;

	spin_lock(&init_mm.page_table_lock);

	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);

		pgd = pgd_offset_k(addr);
		if (!pgd_present(*pgd))
			continue;

		if (pgd_huge(*pgd)) {
			split_kernel_mapping(addr, end, PGDIR_SIZE, (pte_t *)pgd);
			continue;
		}

		pud_base = (pud_t *)pgd_page_vaddr(*pgd);
		remove_pud_table(pud_base, addr, next);
	}

	spin_unlock(&init_mm.page_table_lock);
	radix__flush_tlb_kernel_range(start, end);
}

int __meminit radix__create_section_mapping(unsigned long start, unsigned long end, int nid)
{
	if (end >= RADIX_VMALLOC_START) {
		pr_warn("Outside the supported range\n");
		return -1;
	}

	return create_physical_mapping(start, end, nid);
}

int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
{
	remove_pagetable(start, end);
	return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG */

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
				 pgprot_t flags, unsigned int map_page_size,
				 int nid)
{
	return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
}

int __meminit radix__vmemmap_create_mapping(unsigned long start,
				      unsigned long page_size,
				      unsigned long phys)
{
	/* Create a PTE encoding */
	unsigned long flags = _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW;
	int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
	int ret;

	if ((start + page_size) >= RADIX_VMEMMAP_END) {
		pr_warn("Outside the supported range\n");
		return -1;
	}

	ret = __map_kernel_page_nid(start, phys, __pgprot(flags), page_size, nid);
	BUG_ON(ret);

	return 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
{
	remove_pagetable(start, start + page_size);
}
#endif
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
				  pmd_t *pmdp, unsigned long clr,
				  unsigned long set)
{
	unsigned long old;

#ifdef CONFIG_DEBUG_VM
	WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
	assert_spin_locked(pmd_lockptr(mm, pmdp));
#endif

	old = radix__pte_update(mm, addr, (pte_t *)pmdp, clr, set, 1);
	trace_hugepage_update(addr, old, clr, set);

	return old;
}

pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
			pmd_t *pmdp)

{
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
	VM_BUG_ON(pmd_devmap(*pmdp));
	/*
	 * khugepaged calls this for normal pmd
	 */
	pmd = *pmdp;
	pmd_clear(pmdp);

	/*FIXME!!  Verify whether we need this kick below */
	serialize_against_pte_lookup(vma->vm_mm);

	radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);

	return pmd;
}

/*
 * For us pgtable_t is pte_t *. In order to save the deposited
 * page table, we consider the allocated page table as a list
 * head. On withdraw we need to make sure we zero out the used
 * list_head memory area.
 */
void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				 pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
	pmd_huge_pte(mm, pmdp) = pgtable;
}

pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pte_t *ptep;
	pgtable_t pgtable;
	struct list_head *lh;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		pmd_huge_pte(mm, pmdp) = NULL;
	else {
		pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	*ptep = __pte(0);
	ptep++;
	*ptep = __pte(0);
	return pgtable;
}

pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
				     unsigned long addr, pmd_t *pmdp)
{
	pmd_t old_pmd;
	unsigned long old;

	old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
	old_pmd = __pmd(old);
	/*
	 * Serialize against find_current_mm_pte which does lock-less
	 * lookup in page tables with local interrupts disabled. For huge pages
	 * it casts pmd_t to pte_t. Since format of pte_t is different from
	 * pmd_t we want to prevent transit from pmd pointing to page table
	 * to pmd pointing to huge page (and back) while interrupts are disabled.
	 * We clear pmd to possibly replace it with page table pointer in
	 * different code paths. So make sure we wait for the parallel
	 * find_current_mm_pte to finish.
	 */
	serialize_against_pte_lookup(mm);
	return old_pmd;
}

int radix__has_transparent_hugepage(void)
{
	/* For radix 2M at PMD level means thp */
	if (mmu_psize_defs[MMU_PAGE_2M].shift == PMD_SHIFT)
		return 1;
	return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
				  pte_t entry, unsigned long address, int psize)
{
	struct mm_struct *mm = vma->vm_mm;
	unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
					      _PAGE_RW | _PAGE_EXEC);

	unsigned long change = pte_val(entry) ^ pte_val(*ptep);
	/*
	 * To avoid NMMU hang while relaxing access, we need to mark
	 * the pte invalid in between.
	 */
	if ((change & _PAGE_RW) && atomic_read(&mm->context.copros) > 0) {
		unsigned long old_pte, new_pte;

		old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
		/*
		 * new value of pte
		 */
		new_pte = old_pte | set;
		radix__flush_tlb_page_psize(mm, address, psize);
		__radix_pte_update(ptep, _PAGE_INVALID, new_pte);
	} else {
		__radix_pte_update(ptep, 0, set);
		/*
		 * Book3S does not require a TLB flush when relaxing access
		 * restrictions when the address space is not attached to a
		 * NMMU, because the core MMU will reload the pte after taking
		 * an access fault, which is defined by the architecture.
		 */
	}
	/* See ptesync comment in radix__set_pte_at */
}

void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
				    unsigned long addr, pte_t *ptep,
				    pte_t old_pte, pte_t pte)
{
	struct mm_struct *mm = vma->vm_mm;

	/*
	 * To avoid NMMU hang while relaxing access we need to flush the tlb before
	 * we set the new value. We need to do this only for radix, because hash
	 * translation does flush when updating the linux pte.
	 */
	if (is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
	    (atomic_read(&mm->context.copros) > 0))
		radix__flush_tlb_page(vma, addr);

	set_pte_at(mm, addr, ptep, pte);
}

int radix__ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long size,
			pgprot_t prot, int nid)
{
	if (likely(slab_is_available())) {
		int err = ioremap_page_range(ea, ea + size, pa, prot);
		if (err)
			unmap_kernel_range(ea, size);
		return err;
	} else {
		unsigned long i;

		for (i = 0; i < size; i += PAGE_SIZE) {
			int err = map_kernel_page(ea + i, pa + i, prot);
			if (WARN_ON_ONCE(err)) /* Should clean up */
				return err;
		}
		return 0;
	}
}