/*
 *  PowerPC version
 *    Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 *  Modifications by Paul Mackerras (PowerMac) (paulus@cs.anu.edu.au)
 *  and Cort Dougan (PReP) (cort@cs.nmt.edu)
 *    Copyright (C) 1996 Paul Mackerras
 *
 *  Derived from "arch/i386/mm/init.c"
 *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Dave Engebretsen <engebret@us.ibm.com>
 *      Rework for PPC64 port.
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU General Public License
 *  as published by the Free Software Foundation; either version
 *  2 of the License, or (at your option) any later version.
 *
 */

#undef DEBUG

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/stddef.h>
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/highmem.h>
#include <linux/idr.h>
#include <linux/nodemask.h>
#include <linux/module.h>
#include <linux/poison.h>
#include <linux/memblock.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/of_fdt.h>
#include <linux/libfdt.h>

#include <asm/pgalloc.h>
#include <asm/page.h>
#include <asm/prom.h>
#include <asm/rtas.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <linux/uaccess.h>
#include <asm/smp.h>
#include <asm/machdep.h>
#include <asm/tlb.h>
#include <asm/eeh.h>
#include <asm/processor.h>
#include <asm/mmzone.h>
#include <asm/cputable.h>
#include <asm/sections.h>
#include <asm/iommu.h>
#include <asm/vdso.h>

#include "mmu_decl.h"

#ifdef CONFIG_PPC_STD_MMU_64
#if H_PGTABLE_RANGE > USER_VSID_RANGE
#warning Limited user VSID range means pagetable space is wasted
#endif
#endif /* CONFIG_PPC_STD_MMU_64 */

phys_addr_t memstart_addr = ~0;
EXPORT_SYMBOL_GPL(memstart_addr);
phys_addr_t kernstart_addr;
EXPORT_SYMBOL_GPL(kernstart_addr);

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Given an address within the vmemmap, determine the pfn of the page that
 * represents the start of the section it is within.  Note that we have to
 * do this by hand as the proffered address may not be correctly aligned.
 * Subtraction of non-aligned pointers produces undefined results.
 */
static unsigned long __meminit vmemmap_section_start(unsigned long page)
{
	unsigned long offset = page - ((unsigned long)(vmemmap));

	/* Return the pfn of the start of the section. */
	return (offset / sizeof(struct page)) & PAGE_SECTION_MASK;
}
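
/*
 * Worked example for vmemmap_section_start() above (values are purely
 * illustrative, not taken from this configuration): with
 * sizeof(struct page) == 64 and PAGES_PER_SECTION == 0x1000, a vmemmap
 * address ten struct pages past a section boundary gives
 * offset / 64 == section_start_pfn + 10, and masking with
 * PAGE_SECTION_MASK (~(PAGES_PER_SECTION - 1)) clears the low bits,
 * yielding the pfn of the first page in the section.
 */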

/*
 * Check if this vmemmap page is already initialised.  If any section
 * which overlaps this vmemmap page is initialised then this page is
 * initialised already.
 */
static int __meminit vmemmap_populated(unsigned long start, int page_size)
{
	unsigned long end = start + page_size;
	start = (unsigned long)(pfn_to_page(vmemmap_section_start(start)));

	for (; start < end; start += (PAGES_PER_SECTION * sizeof(struct page)))
		if (pfn_valid(page_to_pfn((struct page *)start)))
			return 1;

	return 0;
}

/*
 * vmemmap virtual address space management does not have a traditional
 * page table to track which virtual struct pages are backed by a physical
 * mapping. The virtual to physical mappings are tracked in a simple linked
 * list format. 'vmemmap_list' maintains the entire vmemmap physical mapping
 * at all times, whereas the 'next' list maintains the available
 * vmemmap_backing structures which have been deleted from the
 * 'vmemmap_list' during system runtime (memory hotplug remove operation).
 * The freed 'vmemmap_backing' structures are reused later when new requests
 * come in without allocating fresh memory. This pointer also tracks the
 * allocated 'vmemmap_backing' structures as we allocate one full page of
 * memory at a time when we don't have any.
 */
struct vmemmap_backing *vmemmap_list;
static struct vmemmap_backing *next;
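
/*
 * For reference, a minimal sketch of the layout assumed here for struct
 * vmemmap_backing (the real definition lives in the arch pgalloc headers
 * and may differ in detail); only the three fields used in this file are
 * shown:
 *
 *	struct vmemmap_backing {
 *		struct vmemmap_backing *list;	   next entry / freelist link
 *		unsigned long		phys;	   physical address backing the range
 *		unsigned long		virt_addr; vmemmap virtual address covered
 *	};
 */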

/*
 * The same pointer 'next' tracks individual chunks inside the allocated
 * full page during boot time and again tracks the freed nodes during
 * runtime. It is racy but that does not matter, as the two uses are
 * separated by the boot process. It would create a problem if we somehow
 * had a memory hotplug operation during boot!
 */
static int num_left;
static int num_freed;

static __meminit struct vmemmap_backing *vmemmap_list_alloc(int node)
{
	struct vmemmap_backing *vmem_back;
	/* get from freed entries first */
	if (num_freed) {
		num_freed--;
		vmem_back = next;
		next = next->list;

		return vmem_back;
	}

	/* allocate a page when required and hand out chunks */
	if (!num_left) {
		next = vmemmap_alloc_block(PAGE_SIZE, node);
		if (unlikely(!next)) {
			WARN_ON(1);
			return NULL;
		}
		num_left = PAGE_SIZE / sizeof(struct vmemmap_backing);
	}

	num_left--;

	return next++;
}

static __meminit void vmemmap_list_populate(unsigned long phys,
					    unsigned long start,
					    int node)
{
	struct vmemmap_backing *vmem_back;

	vmem_back = vmemmap_list_alloc(node);
	if (unlikely(!vmem_back)) {
		WARN_ON(1);
		return;
	}

	vmem_back->phys = phys;
	vmem_back->virt_addr = start;
	vmem_back->list = vmemmap_list;

	vmemmap_list = vmem_back;
}

int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
{
	unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;

	/* Align to the page size of the linear mapping. */
	start = _ALIGN_DOWN(start, page_size);

	pr_debug("vmemmap_populate %lx..%lx, node %d\n", start, end, node);

	for (; start < end; start += page_size) {
		void *p;
		int rc;

		if (vmemmap_populated(start, page_size))
			continue;

		p = vmemmap_alloc_block(page_size, node);
		if (!p)
			return -ENOMEM;

		vmemmap_list_populate(__pa(p), start, node);

		pr_debug("      * %016lx..%016lx allocated at %p\n",
			 start, start + page_size, p);

		rc = vmemmap_create_mapping(start, page_size, __pa(p));
		if (rc < 0) {
			pr_warning(
				"vmemmap_populate: Unable to create vmemmap mapping: %d\n",
				rc);
			return -EFAULT;
		}
	}

	return 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
static unsigned long vmemmap_list_free(unsigned long start)
{
	struct vmemmap_backing *vmem_back, *vmem_back_prev;

	vmem_back_prev = vmem_back = vmemmap_list;

	/* look for it with prev pointer recorded */
	for (; vmem_back; vmem_back = vmem_back->list) {
		if (vmem_back->virt_addr == start)
			break;
		vmem_back_prev = vmem_back;
	}

	if (unlikely(!vmem_back)) {
		WARN_ON(1);
		return 0;
	}

	/* remove it from vmemmap_list */
	if (vmem_back == vmemmap_list) /* remove head */
		vmemmap_list = vmem_back->list;
	else
		vmem_back_prev->list = vmem_back->list;

	/* next point to this freed entry */
	vmem_back->list = next;
	next = vmem_back;
	num_freed++;

	return vmem_back->phys;
}

void __ref vmemmap_free(unsigned long start, unsigned long end)
{
	unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
	unsigned long page_order = get_order(page_size);

	start = _ALIGN_DOWN(start, page_size);

	pr_debug("vmemmap_free %lx...%lx\n", start, end);

	for (; start < end; start += page_size) {
		unsigned long nr_pages, addr;
		struct page *page;

		/*
		 * The section has already been marked as invalid, so if
		 * vmemmap_populated() returns true, some other sections
		 * still use this page; skip it.
		 */
		if (vmemmap_populated(start, page_size))
			continue;

		addr = vmemmap_list_free(start);
		if (!addr)
			continue;

		page = pfn_to_page(addr >> PAGE_SHIFT);
		nr_pages = 1 << page_order;

		if (PageReserved(page)) {
			/* allocated from bootmem */
			if (page_size < PAGE_SIZE) {
				/*
				 * this shouldn't happen, but if it is
				 * the case, leave the memory there
				 */
				WARN_ON_ONCE(1);
			} else {
				while (nr_pages--)
					free_reserved_page(page++);
			}
		} else {
			free_pages((unsigned long)(__va(addr)), page_order);
		}

		vmemmap_remove_mapping(start, page_size);
	}
}
#endif
void register_page_bootmem_memmap(unsigned long section_nr,
				  struct page *start_page, unsigned long size)
{
}

/*
 * We do not have access to the sparsemem vmemmap, so we fall back to
 * walking the list of sparsemem blocks which we already maintain for
 * the sake of crashdump. In the long run, we might want to maintain
 * a tree if performance of that linear walk becomes a problem.
 *
 * realmode_pfn_to_page can fail due to:
 * 1) As real sparsemem blocks do not lie in RAM contiguously (they
 * are in virtual address space which is not available in real mode),
 * the requested page struct can be split between blocks so get_page/put_page
 * may fail.
 * 2) When huge pages are used, the get_page/put_page API will fail
 * in real mode as the linked addresses in the page struct are virtual
 * too.
 */
struct page *realmode_pfn_to_page(unsigned long pfn)
{
	struct vmemmap_backing *vmem_back;
	struct page *page;
	unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
	unsigned long pg_va = (unsigned long) pfn_to_page(pfn);

	for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list) {
		if (pg_va < vmem_back->virt_addr)
			continue;

		/* Entries may have been freed from vmemmap_list, so check every one */
		if ((pg_va + sizeof(struct page)) <=
				(vmem_back->virt_addr + page_size)) {
			page = (struct page *) (vmem_back->phys + pg_va -
				vmem_back->virt_addr);
			return page;
		}
	}

	/* The page struct is probably split between real pages */
	return NULL;
}
EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
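
/*
 * Hypothetical usage sketch (not from this file): a real-mode caller that
 * must not dereference virtual addresses could do
 *
 *	struct page *page = realmode_pfn_to_page(pfn);
 *
 *	if (!page)
 *		return H_TOO_HARD;	(e.g. a KVM hcall handler punting to
 *					 virtual mode)
 *
 * treating a NULL return only as "could not be resolved in real mode",
 * not as an error in the underlying memory map.
 */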

#elif defined(CONFIG_FLATMEM)

struct page *realmode_pfn_to_page(unsigned long pfn)
{
	struct page *page = pfn_to_page(pfn);
	return page;
}
EXPORT_SYMBOL_GPL(realmode_pfn_to_page);

#endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */

#ifdef CONFIG_PPC_STD_MMU_64
static bool disable_radix;
static int __init parse_disable_radix(char *p)
{
	disable_radix = true;
	return 0;
}
early_param("disable_radix", parse_disable_radix);
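
/*
 * Example: booting with "disable_radix" on the kernel command line sets
 * disable_radix above, and mmu_early_init_devtree() below then clears
 * MMU_FTR_TYPE_RADIX so the kernel falls back to the hash MMU (unless a
 * radix-only hypervisor overrides this in early_check_vec5()).
 */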

/*
 * If we're running under a hypervisor, we need to check the contents of
 * /chosen/ibm,architecture-vec-5 to see if the hypervisor is willing to do
 * radix.  If not, we clear the radix feature bit so we fall back to hash.
 */
static void early_check_vec5(void)
{
	unsigned long root, chosen;
	int size;
	const u8 *vec5;
	u8 mmu_supported;

	root = of_get_flat_dt_root();
	chosen = of_get_flat_dt_subnode_by_name(root, "chosen");
	if (chosen == -FDT_ERR_NOTFOUND) {
		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
		return;
	}
	vec5 = of_get_flat_dt_prop(chosen, "ibm,architecture-vec-5", &size);
	if (!vec5) {
		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
		return;
	}
	if (size <= OV5_INDX(OV5_MMU_SUPPORT)) {
		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
		return;
	}

	/* Check for supported configuration */
	mmu_supported = vec5[OV5_INDX(OV5_MMU_SUPPORT)] &
			OV5_FEAT(OV5_MMU_SUPPORT);
	if (mmu_supported == OV5_FEAT(OV5_MMU_RADIX)) {
		/* Hypervisor only supports radix - check enabled && GTSE */
		if (!early_radix_enabled()) {
			pr_warn("WARNING: Ignoring cmdline option disable_radix\n");
		}
		if (!(vec5[OV5_INDX(OV5_RADIX_GTSE)] &
						OV5_FEAT(OV5_RADIX_GTSE))) {
			pr_warn("WARNING: Hypervisor doesn't support RADIX with GTSE\n");
		}
		/* Do radix anyway - the hypervisor said we had to */
		cur_cpu_spec->mmu_features |= MMU_FTR_TYPE_RADIX;
	} else if (mmu_supported == OV5_FEAT(OV5_MMU_HASH)) {
		/* Hypervisor only supports hash - disable radix */
		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;
	}
}

void __init mmu_early_init_devtree(void)
{
	/* Disable radix mode based on kernel command line. */
	if (disable_radix)
		cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX;

	/*
	 * Check /chosen/ibm,architecture-vec-5 if running as a guest.
	 * When running bare-metal, we can use radix if we like
	 * even though the ibm,architecture-vec-5 property created by
	 * skiboot doesn't have the necessary bits set.
	 */
	if (!(mfmsr() & MSR_HV))
		early_check_vec5();

	if (early_radix_enabled())
		radix__early_init_devtree();
	else
		hash__early_init_devtree();
}
#endif /* CONFIG_PPC_STD_MMU_64 */