/*
 *  PowerPC version
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
 *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
 *    Copyright (C) 1996 Paul Mackerras
 *
 *  Derived from "arch/i386/mm/init.c"
 *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Dave Engebretsen <engebret@us.ibm.com>
 *      Rework for PPC64 port.
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version
 *  2 of the License, or (at your option) any later version.
 *
 */

#undef DEBUG

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/stddef.h>
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/highmem.h>
#include <linux/idr.h>
#include <linux/nodemask.h>
#include <linux/module.h>
#include <linux/poison.h>
#include <linux/memblock.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/of_fdt.h>
#include <linux/libfdt.h>
#include <linux/memremap.h>

#include <asm/pgalloc.h>
#include <asm/page.h>
#include <asm/prom.h>
#include <asm/rtas.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <linux/uaccess.h>
#include <asm/smp.h>
#include <asm/machdep.h>
#include <asm/tlb.h>
#include <asm/eeh.h>
#include <asm/processor.h>
#include <asm/mmzone.h>
#include <asm/cputable.h>
#include <asm/sections.h>
#include <asm/iommu.h>
#include <asm/vdso.h>

#include "mmu_decl.h"

#ifdef CONFIG_PPC_BOOK3S_64
#if H_PGTABLE_RANGE > USER_VSID_RANGE
#warning Limited user VSID range means pagetable space is wasted
#endif
#endif /* CONFIG_PPC_BOOK3S_64 */

phys_addr_t memstart_addr = ~0;
EXPORT_SYMBOL_GPL(memstart_addr);
phys_addr_t kernstart_addr;
EXPORT_SYMBOL_GPL(kernstart_addr);

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Given an address within the vmemmap, determine the pfn of the page that
 * represents the start of the section it is within.  Note that we have to
 * do this by hand as the proffered address may not be correctly aligned.
 * Subtraction of non-aligned pointers produces undefined results.
 */
static unsigned long __meminit vmemmap_section_start(unsigned long page)
{
	unsigned long offset = page - ((unsigned long)(vmemmap));

	/* Return the pfn of the start of the section. */
	return (offset / sizeof(struct page)) & PAGE_SECTION_MASK;
}

/*
 * Check if this vmemmap page is already initialised.  If any section
 * which overlaps this vmemmap page is initialised then this page is
 * initialised already.
 */
static int __meminit vmemmap_populated(unsigned long start, int page_size)
{
	unsigned long end = start + page_size;
	start = (unsigned long)(pfn_to_page(vmemmap_section_start(start)));

	for (; start < end; start += (PAGES_PER_SECTION * sizeof(struct page)))
		if (pfn_valid(page_to_pfn((struct page *)start)))
			return 1;

	return 0;
}

/*
 * vmemmap virtual address space management does not have a traditional page
 * table to track which virtual struct pages are backed by a physical mapping.
 * The virtual to physical mappings are tracked in a simple linked list
 * format. 'vmemmap_list' maintains the entire vmemmap physical mapping at
 * all times, whereas the 'next' pointer maintains the available
 * vmemmap_backing structures which have been deleted from the
 * 'vmemmap_list' during system runtime (memory hotplug remove
 * operation). The freed 'vmemmap_backing' structures are reused later when
 * new requests come in without allocating fresh memory. This pointer also
 * tracks the allocated 'vmemmap_backing' structures as we allocate one
 * full page of memory at a time when we don't have any.
 */
struct vmemmap_backing *vmemmap_list;
static struct vmemmap_backing *next;

/*
 * The same pointer 'next' tracks individual chunks inside the allocated
 * full page during boot time and again tracks the freed nodes during
 * runtime. It is racy, but the two uses never overlap as they are
 * separated by the boot process. This will create problems if somehow a
 * memory hotplug operation happens during boot!
 */
static int num_left;
static int num_freed;

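/*
 * Hand out one vmemmap_backing structure, preferring entries previously
 * freed by vmemmap_list_free() over carving new ones out of a fresh page.
 */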
static __meminit struct vmemmap_backing *vmemmap_list_alloc(int node)
{
	struct vmemmap_backing *vmem_back;
	/* get from freed entries first */
	if (num_freed) {
		num_freed--;
		vmem_back = next;
		next = next->list;

		return vmem_back;
	}

	/* allocate a page when required and hand out chunks */
	if (!num_left) {
		next = vmemmap_alloc_block(PAGE_SIZE, node);
		if (unlikely(!next)) {
			WARN_ON(1);
			return NULL;
		}
		num_left = PAGE_SIZE / sizeof(struct vmemmap_backing);
	}

	num_left--;

	return next++;
}

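/*
 * Record the physical backing of one vmemmap page on 'vmemmap_list' so it
 * can be looked up later (e.g. by realmode_pfn_to_page()) or returned on
 * memory hot-remove.
 */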
static __meminit void vmemmap_list_populate(unsigned long phys,
					    unsigned long start,
					    int node)
{
	struct vmemmap_backing *vmem_back;

	vmem_back = vmemmap_list_alloc(node);
	if (unlikely(!vmem_back)) {
		WARN_ON(1);
		return;
	}

	vmem_back->phys = phys;
	vmem_back->virt_addr = start;
	vmem_back->list = vmemmap_list;

	vmemmap_list = vmem_back;
}

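/*
 * Map the vmemmap range [start, end) with pages of the configured vmemmap
 * page size, recording each backing allocation on 'vmemmap_list'. Pages
 * whose section is already populated are skipped.
 */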
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
{
	unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;

	/* Align to the page size of the linear mapping. */
	start = _ALIGN_DOWN(start, page_size);

	pr_debug("vmemmap_populate %lx..%lx, node %d\n", start, end, node);

	for (; start < end; start += page_size) {
		struct vmem_altmap *altmap;
		void *p;
		int rc;

		if (vmemmap_populated(start, page_size))
			continue;

		/* altmap lookups only work at section boundaries */
		altmap = to_vmem_altmap(SECTION_ALIGN_DOWN(start));

		p = __vmemmap_alloc_block_buf(page_size, node, altmap);
		if (!p)
			return -ENOMEM;

		vmemmap_list_populate(__pa(p), start, node);

		pr_debug("      * %016lx..%016lx allocated at %p\n",
			 start, start + page_size, p);

		rc = vmemmap_create_mapping(start, page_size, __pa(p));
		if (rc < 0) {
			pr_warning(
				"vmemmap_populate: Unable to create vmemmap mapping: %d\n",
				rc);
			return -EFAULT;
		}
	}

	return 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
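/*
 * Unlink the vmemmap_backing entry for 'start' from 'vmemmap_list' and
 * return the physical address it recorded, or 0 if no entry is found.
 */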
static unsigned long vmemmap_list_free(unsigned long start)
{
	struct vmemmap_backing *vmem_back, *vmem_back_prev;

	vmem_back_prev = vmem_back = vmemmap_list;

	/* look for it with prev pointer recorded */
	for (; vmem_back; vmem_back = vmem_back->list) {
		if (vmem_back->virt_addr == start)
			break;
		vmem_back_prev = vmem_back;
	}

	if (unlikely(!vmem_back)) {
		WARN_ON(1);
		return 0;
	}

	/* remove it from vmemmap_list */
	if (vmem_back == vmemmap_list) /* remove head */
		vmemmap_list = vmem_back->list;
	else
		vmem_back_prev->list = vmem_back->list;

	/* make 'next' point to this freed entry */
	vmem_back->list = next;
	next = vmem_back;
	num_freed++;

	return vmem_back->phys;
}

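/*
 * Tear down the vmemmap for [start, end): each vmemmap page that no longer
 * backs any valid section has its backing memory returned to wherever it
 * came from (altmap, bootmem or the page allocator) and its mapping removed.
 */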
void __ref vmemmap_free(unsigned long start, unsigned long end)
{
	unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
	unsigned long page_order = get_order(page_size);

	start = _ALIGN_DOWN(start, page_size);

	pr_debug("vmemmap_free %lx...%lx\n", start, end);

	for (; start < end; start += page_size) {
		unsigned long nr_pages, addr;
		struct vmem_altmap *altmap;
		struct page *section_base;
		struct page *page;

		/*
		 * The section has already been marked as invalid, so if
		 * vmemmap_populated() returns true some other section still
		 * maps into this page; skip it.
		 */
			continue;

L
284 285 286 287
		if (!addr)
			continue;

		page = pfn_to_page(addr >> PAGE_SHIFT);
288
		section_base = pfn_to_page(vmemmap_section_start(start));
289 290
		nr_pages = 1 << page_order;

291 292 293 294
		altmap = to_vmem_altmap((unsigned long) section_base);
		if (altmap) {
			vmem_altmap_free(altmap, nr_pages);
		} else if (PageReserved(page)) {
295 296 297 298 299 300 301 302 303 304 305 306 307
			/* allocated from bootmem */
			if (page_size < PAGE_SIZE) {
				/*
				 * this shouldn't happen, but if it is
				 * the case, leave the memory there
				 */
				WARN_ON_ONCE(1);
			} else {
				while (nr_pages--)
					free_reserved_page(page++);
			}
		} else {
			free_pages((unsigned long)(__va(addr)), page_order);
L
		}

		vmemmap_remove_mapping(start, page_size);
	}
}
#endif
				  struct page *start_page, unsigned long size)
{
}
318

319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344
/*
 * We do not have access to the sparsemem vmemmap, so we fallback to
 * walking the list of sparsemem blocks which we already maintain for
 * the sake of crashdump. In the long run, we might want to maintain
 * a tree if performance of that linear walk becomes a problem.
 *
 * realmode_pfn_to_page functions can fail due to:
 * 1) As real sparsemem blocks do not lay in RAM continously (they
 * are in virtual address space which is not available in the real mode),
 * the requested page struct can be split between blocks so get_page/put_page
 * may fail.
 * 2) When huge pages are used, the get_page/put_page API will fail
 * in real mode as the linked addresses in the page struct are virtual
 * too.
 */
struct page *realmode_pfn_to_page(unsigned long pfn)
{
	struct vmemmap_backing *vmem_back;
	struct page *page;
	unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
	unsigned long pg_va = (unsigned long) pfn_to_page(pfn);

	for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list) {
		if (pg_va < vmem_back->virt_addr)
			continue;

		/* Entries may have been freed from vmemmap_list; check them all */
		if ((pg_va + sizeof(struct page)) <=
				(vmem_back->virt_addr + page_size)) {
			page = (struct page *) (vmem_back->phys + pg_va -
				vmem_back->virt_addr);
			return page;
		}
	}

	/* The page struct is probably split across real pages */
	return NULL;
}
EXPORT_SYMBOL_GPL(realmode_pfn_to_page);

#else

struct page *realmode_pfn_to_page(unsigned long pfn)
{
	struct page *page = pfn_to_page(pfn);
	return page;
}
EXPORT_SYMBOL_GPL(realmode_pfn_to_page);

#endif /* CONFIG_SPARSEMEM_VMEMMAP */

#ifdef CONFIG_PPC_BOOK3S_64
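/* "disable_radix" on the kernel command line forces the hash MMU. */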
static bool disable_radix;
static int __init parse_disable_radix(char *p)
{
	disable_radix = true;
	return 0;
}
early_param("disable_radix", parse_disable_radix);

/*
 * If we're running under a hypervisor, we need to check the contents of
 * /chosen/ibm,architecture-vec-5 to see if the hypervisor is willing to do
 * radix.  If not, we clear the radix feature bit so we fall back to hash.
 */
static void __init early_check_vec5(void)
{
	unsigned long root, chosen;
	int size;
	const u8 *vec5;
	u8 mmu_supported;

	root = of_get_flat_dt_root();
	chosen = of_get_flat_dt_subnode_by_name(root, "chosen");
	if (chosen == -FDT_ERR_NOTFOUND) {
		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
		return;
	}
	vec5 = of_get_flat_dt_prop(chosen, "ibm,architecture-vec-5", &size);
	if (!vec5) {
		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
		return;
	}
	if (size <= OV5_INDX(OV5_MMU_SUPPORT)) {
		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
		return;
	}

	/* Check for supported configuration */
	mmu_supported = vec5[OV5_INDX(OV5_MMU_SUPPORT)] &
			OV5_FEAT(OV5_MMU_SUPPORT);
	if (mmu_supported == OV5_FEAT(OV5_MMU_RADIX)) {
		/* Hypervisor only supports radix - check enabled && GTSE */
		if (!early_radix_enabled()) {
			pr_warn("WARNING: Ignoring cmdline option disable_radix\n");
		}
		if (!(vec5[OV5_INDX(OV5_RADIX_GTSE)] &
						OV5_FEAT(OV5_RADIX_GTSE))) {
			pr_warn("WARNING: Hypervisor doesn't support RADIX with GTSE\n");
		}
		/* Do radix anyway - the hypervisor said we had to */
		cur_cpu_spec->mmu_features |= MMU_FTR_TYPE_RADIX;
	} else if (mmu_supported == OV5_FEAT(OV5_MMU_HASH)) {
		/* Hypervisor only supports hash - disable radix */
		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
	}
}

void __init mmu_early_init_devtree(void)
{
	/* Disable radix mode based on kernel command line. */
	if (disable_radix)
		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;

	/*
	 * Check /chosen/ibm,architecture-vec-5 if running as a guest.
	 * When running bare-metal, we can use radix if we like
	 * even though the ibm,architecture-vec-5 property created by
	 * skiboot doesn't have the necessary bits set.
	 */
	if (!(mfmsr() & MSR_HV))
		early_check_vec5();

	if (early_radix_enabled())
		radix__early_init_devtree();
	else
		hash__early_init_devtree();
}
#endif /* CONFIG_PPC_BOOK3S_64 */