/*
 *  PowerPC version
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
 *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
 *    Copyright (C) 1996 Paul Mackerras
 *
 *  Derived from "arch/i386/mm/init.c"
 *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Dave Engebretsen <engebret@us.ibm.com>
 *      Rework for PPC64 port.
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version
 *  2 of the License, or (at your option) any later version.
 *
 */

#undef DEBUG

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/stddef.h>
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/highmem.h>
#include <linux/idr.h>
#include <linux/nodemask.h>
#include <linux/module.h>
#include <linux/poison.h>
#include <linux/memblock.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/of_fdt.h>
#include <linux/libfdt.h>
#include <linux/memremap.h>

#include <asm/pgalloc.h>
#include <asm/page.h>
#include <asm/prom.h>
#include <asm/rtas.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <linux/uaccess.h>
#include <asm/smp.h>
#include <asm/machdep.h>
#include <asm/tlb.h>
#include <asm/eeh.h>
#include <asm/processor.h>
#include <asm/mmzone.h>
#include <asm/cputable.h>
#include <asm/sections.h>
#include <asm/iommu.h>
#include <asm/vdso.h>

#include "mmu_decl.h"

phys_addr_t memstart_addr = ~0;
EXPORT_SYMBOL_GPL(memstart_addr);
phys_addr_t kernstart_addr;
EXPORT_SYMBOL_GPL(kernstart_addr);

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Given an address within the vmemmap, determine the pfn of the page that
 * represents the start of the section it is within.  Note that we have to
 * do this by hand as the proffered address may not be correctly aligned.
 * Subtraction of non-aligned pointers produces undefined results.
 */
static unsigned long __meminit vmemmap_section_start(unsigned long page)
{
	unsigned long offset = page - ((unsigned long)(vmemmap));

	/* Return the pfn of the start of the section. */
	return (offset / sizeof(struct page)) & PAGE_SECTION_MASK;
}

/*
 * Check if this vmemmap page is already initialised.  If any section
 * which overlaps this vmemmap page is initialised then this page is
 * initialised already.
 */
static int __meminit vmemmap_populated(unsigned long start, int page_size)
{
	unsigned long end = start + page_size;
	start = (unsigned long)(pfn_to_page(vmemmap_section_start(start)));

	for (; start < end; start += (PAGES_PER_SECTION * sizeof(struct page)))
		if (pfn_valid(page_to_pfn((struct page *)start)))
			return 1;

	return 0;
}

/*
 * vmemmap virtual address space management does not have a traditional page
 * table to track which virtual struct pages are backed by a physical mapping.
 * The virtual to physical mappings are tracked in a simple linked list
 * format. 'vmemmap_list' maintains the entire vmemmap physical mapping at
 * all times, whereas the 'next' list maintains the available
 * vmemmap_backing structures which have been deleted from 'vmemmap_list'
 * during system runtime (memory hotplug remove operation). The freed
 * 'vmemmap_backing' structures are reused later when new requests come in
 * without allocating fresh memory. This pointer also tracks the allocated
 * 'vmemmap_backing' structures as we allocate one full page of memory at a
 * time when we don't have any.
 */
struct vmemmap_backing *vmemmap_list;
static struct vmemmap_backing *next;

/*
 * The same pointer 'next' tracks individual chunks inside the allocated
 * full page during boot time and again tracks the freed nodes during
 * runtime. In principle this is racy, but the two uses never overlap as
 * they are separated by the boot process. It would become a problem if we
 * somehow had a memory hotplug operation during boot!
 */
static int num_left;
static int num_freed;

static __meminit struct vmemmap_backing *vmemmap_list_alloc(int node)
{
	struct vmemmap_backing *vmem_back;
	/* get from freed entries first */
	if (num_freed) {
		num_freed--;
		vmem_back = next;
		next = next->list;

		return vmem_back;
	}

	/* allocate a page when required and hand out chunks */
	if (!num_left) {
		next = vmemmap_alloc_block(PAGE_SIZE, node);
		if (unlikely(!next)) {
			WARN_ON(1);
			return NULL;
		}
		num_left = PAGE_SIZE / sizeof(struct vmemmap_backing);
	}

	num_left--;

	return next++;
}

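/*
 * Record the physical memory backing the vmemmap chunk at virtual
 * address @start so it can be located (and freed) later during a
 * memory hot-remove operation.
 */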
static __meminit void vmemmap_list_populate(unsigned long phys,
					    unsigned long start,
					    int node)
{
	struct vmemmap_backing *vmem_back;

	vmem_back = vmemmap_list_alloc(node);
	if (unlikely(!vmem_back)) {
		WARN_ON(1);
		return;
	}

	vmem_back->phys = phys;
	vmem_back->virt_addr = start;
	vmem_back->list = vmemmap_list;

	vmemmap_list = vmem_back;
}

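/*
 * Populate the virtual memory map for [start, end), allocating backing
 * memory from @altmap when available and recording each allocation in
 * vmemmap_list.
 */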
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
		struct vmem_altmap *altmap)
{
	unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;

	/* Align to the page size of the linear mapping. */
	start = _ALIGN_DOWN(start, page_size);

	pr_debug("vmemmap_populate %lx..%lx, node %d\n", start, end, node);

	for (; start < end; start += page_size) {
		void *p = NULL;
		int rc;

		if (vmemmap_populated(start, page_size))
			continue;

		/*
		 * Allocate from the altmap first if we have one. This may
		 * fail due to alignment issues when using 16MB hugepages, so
		 * fall back to system memory if the altmap allocation fails.
		 */
		if (altmap)
			p = altmap_alloc_block_buf(page_size, altmap);
		if (!p)
			p = vmemmap_alloc_block_buf(page_size, node);
		if (!p)
			return -ENOMEM;

		vmemmap_list_populate(__pa(p), start, node);

		pr_debug("      * %016lx..%016lx allocated at %p\n",
			 start, start + page_size, p);

		rc = vmemmap_create_mapping(start, page_size, __pa(p));
		if (rc < 0) {
			pr_warn("%s: Unable to create vmemmap mapping: %d\n",
				__func__, rc);
			return -EFAULT;
		}
	}

	return 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
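/*
 * Unlink the vmemmap_backing entry for the chunk at virtual address
 * @start from vmemmap_list and stash it on the free list for reuse.
 * Returns the physical address of the backing memory, or 0 if no
 * matching entry was found.
 */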
static unsigned long vmemmap_list_free(unsigned long start)
{
	struct vmemmap_backing *vmem_back, *vmem_back_prev;

	vmem_back_prev = vmem_back = vmemmap_list;

	/* look for it, recording the previous entry as we go */
	for (; vmem_back; vmem_back = vmem_back->list) {
		if (vmem_back->virt_addr == start)
			break;
		vmem_back_prev = vmem_back;
	}

	if (unlikely(!vmem_back)) {
		WARN_ON(1);
		return 0;
	}

	/* remove it from vmemmap_list */
	if (vmem_back == vmemmap_list) /* remove head */
		vmemmap_list = vmem_back->list;
	else
		vmem_back_prev->list = vmem_back->list;

	/* make 'next' point to this freed entry */
	vmem_back->list = next;
	next = vmem_back;
	num_freed++;

	return vmem_back->phys;
}

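/*
 * Tear down the virtual memory map for [start, end), returning the
 * backing pages to the altmap, to bootmem, or to the page allocator
 * depending on how they were originally allocated.
 */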
void __ref vmemmap_free(unsigned long start, unsigned long end,
		struct vmem_altmap *altmap)
{
	unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
	unsigned long page_order = get_order(page_size);
	unsigned long alt_start = ~0, alt_end = ~0;
	unsigned long base_pfn;

	start = _ALIGN_DOWN(start, page_size);
	if (altmap) {
		alt_start = altmap->base_pfn;
		alt_end = altmap->base_pfn + altmap->reserve +
			  altmap->free + altmap->alloc + altmap->align;
	}

	pr_debug("vmemmap_free %lx...%lx\n", start, end);

	for (; start < end; start += page_size) {
		unsigned long nr_pages, addr;
		struct page *section_base;
		struct page *page;

		/*
		 * The section has already been marked as invalid, so if
		 * vmemmap_populated() returns true some other sections in
		 * this page are still in use; skip it.
		 */
		if (vmemmap_populated(start, page_size))
			continue;

		addr = vmemmap_list_free(start);
		if (!addr)
			continue;

		page = pfn_to_page(addr >> PAGE_SHIFT);
		section_base = pfn_to_page(vmemmap_section_start(start));
		nr_pages = 1 << page_order;
		base_pfn = PHYS_PFN(addr);

		if (base_pfn >= alt_start && base_pfn < alt_end) {
			vmem_altmap_free(altmap, nr_pages);
		} else if (PageReserved(page)) {
			/* allocated from bootmem */
			if (page_size < PAGE_SIZE) {
				/*
				 * This shouldn't happen, but if it does,
				 * leave the memory in place.
				 */
				WARN_ON_ONCE(1);
			} else {
				while (nr_pages--)
					free_reserved_page(page++);
			}
		} else {
			free_pages((unsigned long)(__va(addr)), page_order);
		}

		vmemmap_remove_mapping(start, page_size);
	}
}
#endif
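/* Nothing to do here; the hook is required by the generic sparsemem code. */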
void register_page_bootmem_memmap(unsigned long section_nr,
				  struct page *start_page, unsigned long size)
{
}

#endif /* CONFIG_SPARSEMEM_VMEMMAP */

#ifdef CONFIG_PPC_BOOK3S_64
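/*
 * Whether to boot with the hash MMU instead of radix.  Radix is disabled
 * by default unless CONFIG_PPC_RADIX_MMU_DEFAULT is set; either way the
 * choice can be overridden with the "disable_radix" command line option.
 */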
static bool disable_radix = !IS_ENABLED(CONFIG_PPC_RADIX_MMU_DEFAULT);

static int __init parse_disable_radix(char *p)
{
	bool val;

	if (!p)
		val = true;
	else if (kstrtobool(p, &val))
		return -EINVAL;

	disable_radix = val;

	return 0;
}
early_param("disable_radix", parse_disable_radix);

/*
 * If we're running under a hypervisor, we need to check the contents of
 * /chosen/ibm,architecture-vec-5 to see if the hypervisor is willing to do
 * radix.  If not, we clear the radix feature bit so we fall back to hash.
 */
static void __init early_check_vec5(void)
{
	unsigned long root, chosen;
	int size;
	const u8 *vec5;
	u8 mmu_supported;

	root = of_get_flat_dt_root();
	chosen = of_get_flat_dt_subnode_by_name(root, "chosen");
	if (chosen == -FDT_ERR_NOTFOUND) {
		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
		return;
	}
	vec5 = of_get_flat_dt_prop(chosen, "ibm,architecture-vec-5", &size);
	if (!vec5) {
		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
		return;
	}
	if (size <= OV5_INDX(OV5_MMU_SUPPORT)) {
		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
		return;
	}

	/* Check for supported configuration */
	mmu_supported = vec5[OV5_INDX(OV5_MMU_SUPPORT)] &
			OV5_FEAT(OV5_MMU_SUPPORT);
	if (mmu_supported == OV5_FEAT(OV5_MMU_RADIX)) {
		/* Hypervisor only supports radix - check enabled && GTSE */
		if (!early_radix_enabled()) {
			pr_warn("WARNING: Ignoring cmdline option disable_radix\n");
		}
		if (!(vec5[OV5_INDX(OV5_RADIX_GTSE)] &
						OV5_FEAT(OV5_RADIX_GTSE))) {
			pr_warn("WARNING: Hypervisor doesn't support RADIX with GTSE\n");
		}
		/* Do radix anyway - the hypervisor said we had to */
		cur_cpu_spec->mmu_features |= MMU_FTR_TYPE_RADIX;
	} else if (mmu_supported == OV5_FEAT(OV5_MMU_HASH)) {
		/* Hypervisor only supports hash - disable radix */
		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
	}
}

void __init mmu_early_init_devtree(void)
{
	/* Disable radix mode based on kernel command line. */
	if (disable_radix)
		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;

	/*
	 * Check /chosen/ibm,architecture-vec-5 if running as a guest.
	 * When running bare-metal, we can use radix if we like
	 * even though the ibm,architecture-vec-5 property created by
	 * skiboot doesn't have the necessary bits set.
	 */
	if (!(mfmsr() & MSR_HV))
		early_check_vec5();

	if (early_radix_enabled())
		radix__early_init_devtree();
	else
		hash__early_init_devtree();
}
#endif /* CONFIG_PPC_BOOK3S_64 */