setup_64.c 25.6 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12
/*
 * 
 * Common boot and setup code.
 *
 * Copyright (C) 2001 PPC64 Team, IBM Corp
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

13
#include <linux/export.h>
14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/reboot.h>
#include <linux/delay.h>
#include <linux/initrd.h>
#include <linux/seq_file.h>
#include <linux/ioport.h>
#include <linux/console.h>
#include <linux/utsname.h>
#include <linux/tty.h>
#include <linux/root_dev.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/unistd.h>
#include <linux/serial.h>
#include <linux/serial_8250.h>
32
#include <linux/bootmem.h>
33
#include <linux/pci.h>
34
#include <linux/lockdep.h>
Y
Yinghai Lu 已提交
35
#include <linux/memblock.h>
36
#include <linux/memory.h>
37
#include <linux/nmi.h>
38

39
#include <asm/debugfs.h>
40
#include <asm/io.h>
41
#include <asm/kdump.h>
42 43 44 45 46 47 48 49 50
#include <asm/prom.h>
#include <asm/processor.h>
#include <asm/pgtable.h>
#include <asm/smp.h>
#include <asm/elf.h>
#include <asm/machdep.h>
#include <asm/paca.h>
#include <asm/time.h>
#include <asm/cputable.h>
51
#include <asm/dt_cpu_ftrs.h>
52 53 54 55 56 57 58 59 60 61 62
#include <asm/sections.h>
#include <asm/btext.h>
#include <asm/nvram.h>
#include <asm/setup.h>
#include <asm/rtas.h>
#include <asm/iommu.h>
#include <asm/serial.h>
#include <asm/cache.h>
#include <asm/page.h>
#include <asm/mmu.h>
#include <asm/firmware.h>
P
Paul Mackerras 已提交
63
#include <asm/xmon.h>
D
David Gibson 已提交
64
#include <asm/udbg.h>
65
#include <asm/kexec.h>
66
#include <asm/code-patching.h>
67
#include <asm/livepatch.h>
68
#include <asm/opal.h>
69
#include <asm/cputhreads.h>
70
#include <asm/hw_irq.h>
71
#include <asm/feature-fixups.h>
72

73 74
#include "setup.h"

75 76 77 78 79 80
#ifdef DEBUG
#define DBG(fmt...) udbg_printf(fmt)
#else
#define DBG(fmt...)
#endif

81
int spinning_secondaries;
82 83
u64 ppc64_pft_size;

84
struct ppc64_caches ppc64_caches = {
85 86 87 88 89 90 91 92
	.l1d = {
		.block_size = 0x40,
		.log_block_size = 6,
	},
	.l1i = {
		.block_size = 0x40,
		.log_block_size = 6
	},
93
};
94 95
EXPORT_SYMBOL_GPL(ppc64_caches);

96
#if defined(CONFIG_PPC_BOOK3E) && defined(CONFIG_SMP)
97
/*
 * Point each possible CPU's paca at the tlb_core_data shared by its core
 * (the tcd embedded in the paca of the core's first thread, or of the
 * boot CPU when the boot CPU lives on that core), and warn once if the
 * MMU configuration cannot safely support multiple threads.
 */
void __init setup_tlb_core_data(void)
{
	int cpu;

	/* The lock must be the first member of struct tlb_core_data */
	BUILD_BUG_ON(offsetof(struct tlb_core_data, lock) != 0);

	for_each_possible_cpu(cpu) {
		int first = cpu_first_thread_sibling(cpu);

		/*
		 * If we boot via kdump on a non-primary thread,
		 * make sure we point at the thread that actually
		 * set up this TLB.
		 */
		if (cpu_first_thread_sibling(boot_cpuid) == first)
			first = boot_cpuid;

		paca_ptrs[cpu]->tcd_ptr = &paca_ptrs[first]->tcd;

		/*
		 * If we have threads, we need either tlbsrx.
		 * or e6500 tablewalk mode, or else TLB handlers
		 * will be racy and could produce duplicate entries.
		 * Should we panic instead?
		 */
		WARN_ONCE(smt_enabled_at_boot >= 2 &&
			  !mmu_has_feature(MMU_FTR_USE_TLBRSRV) &&
			  book3e_htw_mode != PPC_HTW_E6500,
			  "%s: unsupported MMU configuration\n", __func__);
	}
}
#endif

130 131
#ifdef CONFIG_SMP

132
static char *smt_enabled_cmdline;
133 134

/* Look for ibm,smt-enabled OF option */
135
void __init check_smt_enabled(void)
136 137
{
	struct device_node *dn;
138
	const char *smt_option;
139

140 141
	/* Default to enabling all threads */
	smt_enabled_at_boot = threads_per_core;
142

143 144 145 146 147 148 149
	/* Allow the command line to overrule the OF option */
	if (smt_enabled_cmdline) {
		if (!strcmp(smt_enabled_cmdline, "on"))
			smt_enabled_at_boot = threads_per_core;
		else if (!strcmp(smt_enabled_cmdline, "off"))
			smt_enabled_at_boot = 0;
		else {
150
			int smt;
151 152
			int rc;

153
			rc = kstrtoint(smt_enabled_cmdline, 10, &smt);
154 155
			if (!rc)
				smt_enabled_at_boot =
156
					min(threads_per_core, smt);
157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173
		}
	} else {
		dn = of_find_node_by_path("/options");
		if (dn) {
			smt_option = of_get_property(dn, "ibm,smt-enabled",
						     NULL);

			if (smt_option) {
				if (!strcmp(smt_option, "on"))
					smt_enabled_at_boot = threads_per_core;
				else if (!strcmp(smt_option, "off"))
					smt_enabled_at_boot = 0;
			}

			of_node_put(dn);
		}
	}
174 175 176 177 178
}

/*
 * Early handler for the "smt-enabled=" command line option: only records
 * the raw value; check_smt_enabled() interprets it later.
 */
static int __init early_smt_enabled(char *p)
{
	smt_enabled_cmdline = p;
	return 0;
}
early_param("smt-enabled", early_smt_enabled);

#endif /* CONFIG_SMP */

186
/** Fix up paca fields required for the boot cpu */
187
static void __init fixup_boot_paca(void)
188 189 190 191 192
{
	/* The boot cpu is started */
	get_paca()->cpu_start = 1;
	/* Allow percpu accesses to work until we setup percpu data */
	get_paca()->data_offset = 0;
193
	/* Mark interrupts disabled in PACA */
194
	irq_soft_mask_set(IRQS_DISABLED);
195 196
}

197
/*
 * Set up exception delivery for the boot CPU: install the kdump
 * trampolines, then configure relocation and endianness of exceptions
 * through the hypervisor (FW_FEATURE_SET_MODE) or through OPAL.
 */
static void __init configure_exceptions(void)
{
	/*
	 * Setup the trampolines from the lowmem exception vectors
	 * to the kdump kernel when not using a relocatable kernel.
	 */
	setup_kdump_trampoline();

	/* Under a PAPR hypervisor, we need hypercalls */
	if (firmware_has_feature(FW_FEATURE_SET_MODE)) {
		/* Enable AIL if possible */
		pseries_enable_reloc_on_exc();

		/*
		 * Tell the hypervisor that we want our exceptions to
		 * be taken in little endian mode.
		 *
		 * We don't call this for big endian as our calling convention
		 * makes us always enter in BE, and the call may fail under
		 * some circumstances with kdump.
		 */
#ifdef __LITTLE_ENDIAN__
		pseries_little_endian_exceptions();
#endif
	} else {
		/* Set endian mode using OPAL */
		if (firmware_has_feature(FW_FEATURE_OPAL))
			opal_configure_cores();

		/* AIL on native is done in cpu_ready_for_interrupts() */
	}
}

230 231
static void cpu_ready_for_interrupts(void)
{
232 233 234 235 236 237 238
	/*
	 * Enable AIL if supported, and we are in hypervisor mode. This
	 * is called once for every processor.
	 *
	 * If we are not in hypervisor mode the job is done once for
	 * the whole partition in configure_exceptions().
	 */
239 240
	if (cpu_has_feature(CPU_FTR_HVMODE) &&
	    cpu_has_feature(CPU_FTR_ARCH_207S)) {
241 242 243 244
		unsigned long lpcr = mfspr(SPRN_LPCR);
		mtspr(SPRN_LPCR, lpcr | LPCR_AIL_3);
	}

245
	/*
246 247 248 249 250 251
	 * Set HFSCR:TM based on CPU features:
	 * In the special case of TM no suspend (P9N DD2.1), Linux is
	 * told TM is off via the dt-ftrs but told to (partially) use
	 * it via OPAL_REINIT_CPUS_TM_SUSPEND_DISABLED. So HFSCR[TM]
	 * will be off from dt-ftrs but we need to turn it on for the
	 * no suspend case.
252
	 */
253 254 255 256 257 258
	if (cpu_has_feature(CPU_FTR_HVMODE)) {
		if (cpu_has_feature(CPU_FTR_TM_COMP))
			mtspr(SPRN_HFSCR, mfspr(SPRN_HFSCR) | HFSCR_TM);
		else
			mtspr(SPRN_HFSCR, mfspr(SPRN_HFSCR) & ~HFSCR_TM);
	}
259

260 261 262 263
	/* Set IR and DR in PACA MSR */
	get_paca()->kernel_msr = MSR_KERNEL;
}

264 265 266 267 268 269 270 271
unsigned long spr_default_dscr = 0;

/*
 * Snapshot the current DSCR value into spr_default_dscr, provided the
 * CPU has a DSCR at all.
 */
void __init record_spr_defaults(void)
{
	/* Nothing to record on CPUs without the DSCR feature */
	if (!early_cpu_has_feature(CPU_FTR_DSCR))
		return;

	spr_default_dscr = mfspr(SPRN_DSCR);
}

272 273 274 275 276 277
/*
 * Early initialization entry point. This is called by head.S
 * with MMU translation disabled. We rely on the "feature" of
 * the CPU that ignores the top 2 bits of the address in real
 * mode so we can access kernel globals normally provided we
 * only toy with things in the RMO region. From here, we do
 * some early parsing of the device-tree to set up our MEMBLOCK
 * data structures, and allocate & initialize the hash table
 * and segment tables so we can start running with translation
 * enabled.
 *
 * It is this function which will call the probe() callback of
 * the various platform types and copy the matching one to the
 * global ppc_md structure. Your platform can eventually do
 * some very early initializations from the probe() routine, but
 * this is not recommended, be very careful as, for example, the
 * device-tree is not accessible via normal means at this point.
 */

/*
 * early_setup - first C code run on the boot CPU
 * @dt_ptr: address of the flattened device tree; it is converted with
 *          __va() before use and printed as-is in the debug output
 *
 * The order of the steps below matters: a temporary boot paca is
 * installed first, the device tree is then scanned to learn the real
 * boot CPU id, and only after that are the final paca, exception
 * configuration, feature fixups and MMU set up.
 */
void __init early_setup(unsigned long dt_ptr)
{
	static __initdata struct paca_struct boot_paca;

	/* -------- printk is _NOT_ safe to use here ! ------- */

	/* Try new device tree based feature discovery ... */
	if (!dt_cpu_ftrs_init(__va(dt_ptr)))
		/* Otherwise use the old style CPU table */
		identify_cpu(0, mfspr(SPRN_PVR));

	/* Assume we're on cpu 0 for now. Don't write to the paca yet! */
	initialise_paca(&boot_paca, 0);
	setup_paca(&boot_paca);
	fixup_boot_paca();

	/* -------- printk is now safe to use ------- */

	/* Enable early debugging if any specified (see udbg.h) */
	udbg_early_init();

	DBG(" -> early_setup(), dt_ptr: 0x%lx\n", dt_ptr);

	/*
	 * Do early initialization using the flattened device
	 * tree, such as retrieving the physical memory map or
	 * calculating/retrieving the hash table size.
	 */
	early_init_devtree(__va(dt_ptr));

	/* Now we know the logical id of our boot cpu, setup the paca. */
	if (boot_cpuid != 0) {
		/* Poison paca_ptrs[0] again if it's not the boot cpu */
		memset(&paca_ptrs[0], 0x88, sizeof(paca_ptrs[0]));
	}
	setup_paca(paca_ptrs[boot_cpuid]);
	fixup_boot_paca();

	/*
	 * Configure exception handlers. This include setting up trampolines
	 * if needed, setting exception endian mode, etc...
	 */
	configure_exceptions();

	/* Apply all the dynamic patching */
	apply_feature_fixups();
	setup_feature_keys();

	/* Initialize the hash table or TLB handling */
	early_init_mmu();

	/*
	 * After firmware and early platform setup code has set things up,
	 * we note the SPR values for configurable control/performance
	 * registers, and use those as initial defaults.
	 */
	record_spr_defaults();

	/*
	 * At this point, we can let interrupts switch to virtual mode
	 * (the MMU has been setup), so adjust the MSR in the PACA to
	 * have IR and DR set and enable AIL if it exists
	 */
	cpu_ready_for_interrupts();

	/*
	 * We enable ftrace here, but since we only support DYNAMIC_FTRACE, it
	 * will only actually get enabled on the boot cpu much later once
	 * ftrace itself has been initialized.
	 */
	this_cpu_enable_ftrace();

	DBG(" <- early_setup()\n");

#ifdef CONFIG_PPC_EARLY_DEBUG_BOOTX
	/*
	 * This needs to be done *last* (after the above DBG() even)
	 *
	 * Right after we return from this function, we turn on the MMU
	 * which means the real-mode access trick that btext does will
	 * no longer work, it needs to switch to using a real MMU
	 * mapping. This call will ensure that it does
	 */
	btext_map();
#endif /* CONFIG_PPC_EARLY_DEBUG_BOOTX */
}

378 379 380
#ifdef CONFIG_SMP
void early_setup_secondary(void)
{
381
	/* Mark interrupts disabled in PACA */
382
	irq_soft_mask_set(IRQS_DISABLED);
383

384 385
	/* Initialize the hash table or TLB handling */
	early_init_mmu_secondary();
386 387 388 389 390 391

	/*
	 * At this point, we can let interrupts switch to virtual mode
	 * (the MMU has been setup), so adjust the MSR in the PACA to
	 * have IR and DR set.
	 */
392
	cpu_ready_for_interrupts();
393 394 395
}

#endif /* CONFIG_SMP */
396

397 398 399 400 401 402 403 404
/*
 * Park the calling CPU: hard-disable interrupts and spin forever.
 * Never returns.
 */
void panic_smp_self_stop(void)
{
	hard_irq_disable();
	spin_begin();

	for (;;)
		spin_cpu_relax();
}

405
#if defined(CONFIG_SMP) || defined(CONFIG_KEXEC_CORE)
406 407
static bool use_spinloop(void)
{
408 409 410 411 412 413 414 415
	if (IS_ENABLED(CONFIG_PPC_BOOK3S)) {
		/*
		 * See comments in head_64.S -- not all platforms insert
		 * secondaries at __secondary_hold and wait at the spin
		 * loop.
		 */
		if (firmware_has_feature(FW_FEATURE_OPAL))
			return false;
416
		return true;
417
	}
418 419 420 421 422 423 424 425

	/*
	 * When book3e boots from kexec, the ePAPR spin table does
	 * not get used.
	 */
	return of_property_read_bool(of_chosen, "linux,booted-from-kexec");
}

426 427
void smp_release_cpus(void)
{
428
	unsigned long *ptr;
429
	int i;
430

431 432 433
	if (!use_spinloop())
		return;

434 435 436 437 438 439
	DBG(" -> smp_release_cpus()\n");

	/* All secondary cpus are spinning on a common spinloop, release them
	 * all now so they can start to spin on their individual paca
	 * spinloops. For non SMP kernels, the secondary cpus never get out
	 * of the common spinloop.
440
	 */
441

442 443
	ptr  = (unsigned long *)((unsigned long)&__secondary_hold_spinloop
			- PHYSICAL_START);
444
	*ptr = ppc_function_entry(generic_secondary_smp_init);
445 446 447 448 449

	/* And wait a bit for them to catch up */
	for (i = 0; i < 100000; i++) {
		mb();
		HMT_low();
450
		if (spinning_secondaries == 0)
451 452 453
			break;
		udelay(1);
	}
454
	DBG("spinning_secondaries = %d\n", spinning_secondaries);
455 456 457

	DBG(" <- smp_release_cpus()\n");
}
458
#endif /* CONFIG_SMP || CONFIG_KEXEC_CORE */
459

460
/*
 * Initialize some remaining members of the ppc64_caches and systemcfg
 * structures
 * (at least until we get rid of them completely). This is mostly some
 * cache information about the CPU that will be used by cache flush
 * routines and/or provided to userland
 */
467 468 469 470 471 472 473 474 475

/*
 * Fill in one ppc_cache_info record.
 *
 * @info:  record to populate
 * @size:  total cache size
 * @lsize: line size
 * @bsize: block size; 0 is accepted and yields blocks_per_page == 0
 * @sets:  number of sets; 0 is recorded as associativity 0xffff
 *
 * The associativity is size / (sets * lsize). If that 32-bit product is
 * zero while sets is non-zero (a zero line size, or a product that
 * wraps), assoc is set to 0 instead of dividing by zero.
 * NOTE(review): 0 is intended to read as "unknown" -- confirm against
 * the consumers of ->assoc.
 */
static void init_cache_info(struct ppc_cache_info *info, u32 size, u32 lsize,
			    u32 bsize, u32 sets)
{
	u32 set_bytes = sets * lsize;

	info->size = size;
	info->sets = sets;
	info->line_size = lsize;
	info->block_size = bsize;
	info->log_block_size = __ilog2(bsize);
	if (bsize)
		info->blocks_per_page = PAGE_SIZE / bsize;
	else
		info->blocks_per_page = 0;

	if (sets == 0)
		info->assoc = 0xffff;
	else if (set_bytes == 0)
		info->assoc = 0;
	else
		info->assoc = size / set_bytes;
}

/*
 * Read one cache's geometry from a device-tree node into @info.
 *
 * @np:     node carrying the cache properties
 * @icache: true to read the "i-cache-*" properties, false for "d-cache-*"
 * @info:   record to fill in (always written, even on partial data)
 *
 * Missing properties fall back to defaults: size 0, sets -1u, and a
 * line/block size taken from cur_cpu_spec for the matching cache side
 * (icache_bsize for the I-cache, dcache_bsize otherwise). A missing
 * block size falls back to the line size property.
 *
 * Returns false if the size, line-size or (after fallback) block-size
 * property was absent, true otherwise.
 */
static bool __init parse_cache_info(struct device_node *np,
				    bool icache,
				    struct ppc_cache_info *info)
{
	static const char *ipropnames[] __initdata = {
		"i-cache-size",
		"i-cache-sets",
		"i-cache-block-size",
		"i-cache-line-size",
	};
	static const char *dpropnames[] __initdata = {
		"d-cache-size",
		"d-cache-sets",
		"d-cache-block-size",
		"d-cache-line-size",
	};
	const char **propnames = icache ? ipropnames : dpropnames;
	const __be32 *sizep, *lsizep, *bsizep, *setsp;
	u32 size, lsize, bsize, sets;
	bool success = true;

	size = 0;
	sets = -1u;
	/* Default to the CPU table value for the side being parsed */
	lsize = bsize = icache ? cur_cpu_spec->icache_bsize :
				 cur_cpu_spec->dcache_bsize;
	sizep = of_get_property(np, propnames[0], NULL);
	if (sizep != NULL)
		size = be32_to_cpu(*sizep);
	setsp = of_get_property(np, propnames[1], NULL);
	if (setsp != NULL)
		sets = be32_to_cpu(*setsp);
	bsizep = of_get_property(np, propnames[2], NULL);
	lsizep = of_get_property(np, propnames[3], NULL);
	if (bsizep == NULL)
		bsizep = lsizep;
	if (lsizep != NULL)
		lsize = be32_to_cpu(*lsizep);
	if (bsizep != NULL)
		bsize = be32_to_cpu(*bsizep);
	if (sizep == NULL || bsizep == NULL || lsizep == NULL)
		success = false;

	/*
	 * OF is weird .. it represents fully associative caches
	 * as "1 way" which doesn't make much sense and doesn't
	 * leave room for direct mapped. We'll assume that 0
	 * in OF means direct mapped for that reason.
	 */
	if (sets == 1)
		sets = 0;
	else if (sets == 0)
		sets = 1;

	init_cache_info(info, size, lsize, bsize, sets);

	return success;
}

544
/*
 * Populate ppc64_caches (L1I, L1D and, when present, L2/L3) and publish
 * the resulting L1 block sizes through dcache_bsize/icache_bsize and
 * cur_cpu_spec. POWER8 parts use hard-coded values; everything else is
 * read from the first "cpu" device-tree node and its cache nodes.
 */
void __init initialize_cache_info(void)
{
	struct device_node *cpu = NULL, *l2, *l3 = NULL;
	u32 pvr;

	DBG(" -> initialize_cache_info()\n");

	/*
	 * All shipping POWER8 machines have a firmware bug that
	 * puts incorrect information in the device-tree. This will
	 * be (hopefully) fixed for future chips but for now hard
	 * code the values if we are running on one of these
	 */
	pvr = PVR_VER(mfspr(SPRN_PVR));
	if (pvr == PVR_POWER8 || pvr == PVR_POWER8E ||
	    pvr == PVR_POWER8NVL) {
						/* size    lsize   blk  sets */
		init_cache_info(&ppc64_caches.l1i, 0x8000,   128,  128, 32);
		init_cache_info(&ppc64_caches.l1d, 0x10000,  128,  128, 64);
		init_cache_info(&ppc64_caches.l2,  0x80000,  128,  0,   512);
		init_cache_info(&ppc64_caches.l3,  0x800000, 128,  0,   8192);
	} else
		cpu = of_find_node_by_type(NULL, "cpu");

	/*
	 * We're assuming *all* of the CPUs have the same
	 * d-cache and i-cache sizes... -Peter
	 */
	if (cpu) {
		if (!parse_cache_info(cpu, false, &ppc64_caches.l1d))
			DBG("Argh, can't find dcache properties !\n");

		if (!parse_cache_info(cpu, true, &ppc64_caches.l1i))
			DBG("Argh, can't find icache properties !\n");

		/*
		 * Try to find the L2 and L3 if any. Assume they are
		 * unified and use the D-side properties.
		 */
		l2 = of_find_next_cache_node(cpu);
		of_node_put(cpu);
		if (l2) {
			parse_cache_info(l2, false, &ppc64_caches.l2);
			l3 = of_find_next_cache_node(l2);
			of_node_put(l2);
		}
		if (l3) {
			parse_cache_info(l3, false, &ppc64_caches.l3);
			of_node_put(l3);
		}
	}

	/* For use by binfmt_elf */
	dcache_bsize = ppc64_caches.l1d.block_size;
	icache_bsize = ppc64_caches.l1i.block_size;

	cur_cpu_spec->dcache_bsize = dcache_bsize;
	cur_cpu_spec->icache_bsize = icache_bsize;

	DBG(" <- initialize_cache_info()\n");
}

606 607 608 609 610 611 612 613
/*
 * This returns the limit below which memory accesses to the linear
 * mapping are guarnateed not to cause an architectural exception (e.g.,
 * TLB or SLB miss fault).
 *
 * This is used to allocate PACAs and various interrupt stacks that
 * that are accessed early in interrupt handlers that must not cause
 * re-entrant interrupts.
614
 */
615
__init u64 ppc64_bolted_size(void)
616
{
617 618
#ifdef CONFIG_PPC_BOOK3E
	/* Freescale BookE bolts the entire linear mapping */
619 620
	/* XXX: BookE ppc64_rma_limit setup seems to disagree? */
	if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E))
621 622 623 624
		return linear_map_top;
	/* Other BookE, we assume the first GB is bolted */
	return 1ul << 30;
#else
625
	/* BookS radix, does not take faults on linear mapping */
626 627 628
	if (early_radix_enabled())
		return ULONG_MAX;

629 630
	/* BookS hash, the first segment is bolted */
	if (early_mmu_has_feature(MMU_FTR_1T_SEGMENT))
631 632
		return 1UL << SID_SHIFT_1T;
	return 1UL << SID_SHIFT;
633
#endif
634 635
}

636 637 638 639 640 641 642 643 644 645 646 647 648 649 650
/*
 * Allocate one THREAD_SIZE-aligned, THREAD_SIZE-byte stack below @limit.
 * The node of @cpu is tried first, then any node; panics if both
 * attempts fail. Returns the stack's virtual address.
 */
static void *__init alloc_stack(unsigned long limit, int cpu)
{
	unsigned long phys;

	/* Preferred: memory local to the CPU's node */
	phys = memblock_alloc_base_nid(THREAD_SIZE, THREAD_SIZE, limit,
				       early_cpu_to_node(cpu), MEMBLOCK_NONE);
	if (phys)
		return __va(phys);

	/* Fallback: anywhere below the limit */
	phys = memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit);
	if (!phys)
		panic("cannot allocate stacks");

	return __va(phys);
}

651
/*
 * Allocate the softirq and hardirq stacks for every possible CPU, all
 * below the bolted limit returned by ppc64_bolted_size().
 */
void __init irqstack_early_init(void)
{
	u64 limit = ppc64_bolted_size();
	unsigned int i;

	/*
	 * Interrupt stacks must be in the first segment since we
	 * cannot afford to take SLB misses on them. They are not
	 * accessed in realmode.
	 */
	for_each_possible_cpu(i) {
		softirq_ctx[i] = alloc_stack(limit, i);
		hardirq_ctx[i] = alloc_stack(limit, i);
	}
}

667
#ifdef CONFIG_PPC_BOOK3E
668
/*
 * Allocate the critical, debug and machine-check stacks for every
 * possible CPU and record the top of each in the CPU's paca. When the
 * CPU has CPU_FTR_DEBUG_LVL_EXC, vector 0x040 is patched to
 * exc_debug_debug_book3e.
 */
void __init exc_lvl_early_init(void)
{
	unsigned int i;

	for_each_possible_cpu(i) {
		void *sp;

		sp = alloc_stack(ULONG_MAX, i);
		critirq_ctx[i] = sp;
		paca_ptrs[i]->crit_kstack = sp + THREAD_SIZE;

		sp = alloc_stack(ULONG_MAX, i);
		dbgirq_ctx[i] = sp;
		paca_ptrs[i]->dbg_kstack = sp + THREAD_SIZE;

		sp = alloc_stack(ULONG_MAX, i);
		mcheckirq_ctx[i] = sp;
		paca_ptrs[i]->mc_kstack = sp + THREAD_SIZE;
	}

	if (cpu_has_feature(CPU_FTR_DEBUG_LVL_EXC))
		patch_exception(0x040, exc_debug_debug_book3e);
}
#endif

693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710
/*
 * Emergency stacks are used for a range of things, from asynchronous
 * NMIs (system reset, machine check) to synchronous, process context.
 * We set preempt_count to zero, even though that isn't necessarily correct. To
 * get the right value we'd need to copy it from the previous thread_info, but
 * doing that might fault causing more problems.
 * TODO: what to do with accounting?
 */
static void emerg_stack_init_thread_info(struct thread_info *ti, int cpu)
{
	/* Owning CPU; there is no task behind an emergency stack */
	ti->cpu = cpu;
	ti->task = NULL;

	/* Start with clean flags and a zero preempt count */
	ti->flags = 0;
	ti->local_flags = 0;
	ti->preempt_count = 0;

	klp_init_thread_info(ti);
}

711 712
/*
 * Stack space used when we detect a bad kernel stack pointer, and
713 714
 * early in SMP boots before relocation is enabled. Exclusive emergency
 * stack for machine checks.
715
 */
716
void __init emergency_stack_init(void)
717
{
718
	u64 limit;
719 720 721 722 723 724 725 726
	unsigned int i;

	/*
	 * Emergency stacks must be under 256MB, we cannot afford to take
	 * SLB misses on them. The ABI also requires them to be 128-byte
	 * aligned.
	 *
	 * Since we use these as temporary stacks during secondary CPU
727 728 729
	 * bringup, machine check, system reset, and HMI, we need to get
	 * at them in real mode. This means they must also be within the RMO
	 * region.
730 731 732 733
	 *
	 * The IRQ stacks allocated elsewhere in this file are zeroed and
	 * initialized in kernel/irq.c. These are initialized here in order
	 * to have emergency stacks available as early as possible.
734
	 */
735
	limit = min(ppc64_bolted_size(), ppc64_rma_size);
736

737
	for_each_possible_cpu(i) {
738
		struct thread_info *ti;
739 740

		ti = alloc_stack(limit, i);
741 742
		memset(ti, 0, THREAD_SIZE);
		emerg_stack_init_thread_info(ti, i);
743
		paca_ptrs[i]->emergency_sp = (void *)ti + THREAD_SIZE;
744 745

#ifdef CONFIG_PPC_BOOK3S_64
746
		/* emergency stack for NMI exception handling. */
747
		ti = alloc_stack(limit, i);
748 749
		memset(ti, 0, THREAD_SIZE);
		emerg_stack_init_thread_info(ti, i);
750
		paca_ptrs[i]->nmi_emergency_sp = (void *)ti + THREAD_SIZE;
751

752
		/* emergency stack for machine check exception handling. */
753
		ti = alloc_stack(limit, i);
754 755
		memset(ti, 0, THREAD_SIZE);
		emerg_stack_init_thread_info(ti, i);
756
		paca_ptrs[i]->mc_emergency_sp = (void *)ti + THREAD_SIZE;
757
#endif
758
	}
759 760
}

761
#ifdef CONFIG_SMP
762 763 764
#define PCPU_DYN_SIZE		()

/*
 * Allocation callback handed to pcpu_embed_first_chunk(): bootmem
 * allocation of @size bytes with alignment @align on the node of @cpu.
 */
static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
{
	return __alloc_bootmem_node(NODE_DATA(early_cpu_to_node(cpu)), size, align,
				    __pa(MAX_DMA_ADDRESS));
}
769

770 771 772 773
/*
 * Free callback handed to pcpu_embed_first_chunk(): returns @size bytes
 * at virtual address @ptr to bootmem.
 */
static void __init pcpu_fc_free(void *ptr, size_t size)
{
	unsigned long phys = __pa(ptr);

	free_bootmem(phys, size);
}
774

775 776
static int pcpu_cpu_distance(unsigned int from, unsigned int to)
{
777
	if (early_cpu_to_node(from) == early_cpu_to_node(to))
778 779 780 781 782
		return LOCAL_DISTANCE;
	else
		return REMOTE_DISTANCE;
}

783 784 785
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);

786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809
/*
 * Set up the first per-cpu chunk with pcpu_embed_first_chunk() and
 * record each possible CPU's per-cpu offset both in __per_cpu_offset[]
 * and in that CPU's paca. Panics if the chunk cannot be set up.
 */
void __init setup_per_cpu_areas(void)
{
	const size_t dyn_size = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
	size_t atom_size;
	unsigned long delta;
	unsigned int cpu;
	int rc;

	/*
	 * Linear mapping is one of 4K, 1M and 16M.  For 4K, no need
	 * to group units.  For larger mappings, use 1M atom which
	 * should be large enough to contain a number of units.
	 */
	if (mmu_linear_psize == MMU_PAGE_4K)
		atom_size = PAGE_SIZE;
	else
		atom_size = 1 << 20;

	rc = pcpu_embed_first_chunk(0, dyn_size, atom_size, pcpu_cpu_distance,
				    pcpu_fc_alloc, pcpu_fc_free);
	if (rc < 0)
		panic("cannot initialize percpu area (err=%d)", rc);

	/* Offset from the static per-cpu section to the allocated chunk */
	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu) {
		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
		paca_ptrs[cpu]->data_offset = __per_cpu_offset[cpu];
	}
}
#endif
816

817 818 819 820 821 822 823 824 825
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
/*
 * Memory block size: the platform's ppc_md.memory_block_size() hook if
 * one is installed, MIN_MEMORY_BLOCK_SIZE otherwise.
 */
unsigned long memory_block_size_bytes(void)
{
	return ppc_md.memory_block_size ? ppc_md.memory_block_size()
					: MIN_MEMORY_BLOCK_SIZE;
}
#endif
826

827
#if defined(CONFIG_PPC_INDIRECT_PIO) || defined(CONFIG_PPC_INDIRECT_MMIO)
828 829
struct ppc_pci_io ppc_pci_io;
EXPORT_SYMBOL(ppc_pci_io);
830
#endif
831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858

#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
/*
 * Sample period for the perf-based hardlockup detector: ppc_proc_freq
 * multiplied by @watchdog_thresh.
 */
u64 hw_nmi_get_sample_period(int watchdog_thresh)
{
	u64 period = ppc_proc_freq * watchdog_thresh;

	return period;
}
#endif

/*
 * The perf based hardlockup detector breaks PMU event based branches, so
 * disable it by default. Book3S has a soft-nmi hardlockup detector based
 * on the decrementer interrupt, so it does not suffer from this problem.
 *
 * It is likely to get false positives in VM guests, so disable it there
 * by default too.
 */
static int __init disable_hardlockup_detector(void)
{
	/*
	 * With the perf based detector the disable is unconditional;
	 * otherwise it only applies when running as an LPAR guest.
	 */
	if (IS_ENABLED(CONFIG_HARDLOCKUP_DETECTOR_PERF) ||
	    firmware_has_feature(FW_FEATURE_LPAR))
		hardlockup_detector_disable();

	return 0;
}
early_initcall(disable_hardlockup_detector);
859 860 861 862

#ifdef CONFIG_PPC_BOOK3S_64
static enum l1d_flush_type enabled_flush_types;
static void *l1d_flush_fallback_area;
863
static bool no_rfi_flush;
864 865
bool rfi_flush;

866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885
/*
 * Early handler for the "no_rfi_flush" command line option: logs the
 * fact and sets no_rfi_flush so setup_rfi_flush() will not enable the
 * flush. @p is unused (handle_no_pti() passes NULL).
 */
static int __init handle_no_rfi_flush(char *p)
{
	/* Terminate the message so it is not merged with the next one */
	pr_info("rfi-flush: disabled on command line.\n");
	no_rfi_flush = true;
	return 0;
}
early_param("no_rfi_flush", handle_no_rfi_flush);

/*
 * The RFI flush is not KPTI, but because users will see doco that says to use
 * nopti we hijack that option here to also disable the RFI flush.
 */
static int __init handle_no_pti(char *p)
{
	pr_info("rfi-flush: disabling due to 'nopti' on command line.\n");

	/* Same effect as no_rfi_flush; its return value is not needed */
	(void)handle_no_rfi_flush(NULL);

	return 0;
}
early_param("nopti", handle_no_pti);

886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904
/*
 * Deliberately empty callback run on every CPU by rfi_flush_enable()
 * via on_each_cpu(). @unused is ignored.
 */
static void do_nothing(void *unused)
{
	/*
	 * We don't need to do the flush explicitly, just enter+exit kernel is
	 * sufficient, the RFI exit handlers will do the right thing.
	 */
}

/*
 * Switch the RFI flush on or off by re-applying the flush fixups, and
 * record the new state in rfi_flush.
 */
void rfi_flush_enable(bool enable)
{
	if (!enable) {
		/* Patch the flush out entirely */
		do_rfi_flush_fixups(L1D_FLUSH_NONE);
	} else {
		/* Patch in the available flush types, then poke every CPU */
		do_rfi_flush_fixups(enabled_flush_types);
		on_each_cpu(do_nothing, NULL, 1);
	}

	rfi_flush = enable;
}

905
/*
 * Allocate and zero the buffer used by the fallback (displacement) L1D
 * flush, and publish its address and the L1D size in every possible
 * CPU's paca. Only the first call allocates; later calls return early.
 */
static void __ref init_fallback_flush(void)
{
	u64 l1d_size, limit;
	int cpu;

	/* Only allocate the fallback flush area once (at boot time). */
	if (l1d_flush_fallback_area)
		return;

	l1d_size = ppc64_caches.l1d.size;

	/*
	 * If there is no d-cache-size property in the device tree, l1d_size
	 * could be zero. That leads to the loop in the asm wrapping around to
	 * 2^64-1, and then walking off the end of the fallback area and
	 * eventually causing a page fault which is fatal. Just default to
	 * something vaguely sane.
	 */
	if (!l1d_size)
		l1d_size = (64 * 1024);

	limit = min(ppc64_bolted_size(), ppc64_rma_size);

	/*
	 * Align to L1d size, and size it at 2x L1d size, to catch possible
	 * hardware prefetch runoff. We don't have a recipe for load patterns to
	 * reliably avoid the prefetcher.
	 */
	l1d_flush_fallback_area = __va(memblock_alloc_base(l1d_size * 2, l1d_size, limit));
	memset(l1d_flush_fallback_area, 0, l1d_size * 2);

	for_each_possible_cpu(cpu) {
		struct paca_struct *paca = paca_ptrs[cpu];
		paca->rfi_flush_fallback_area = l1d_flush_fallback_area;
		paca->l1d_flush_size = l1d_size;
	}
}

943
/*
 * Record which L1D flush mechanisms are available and, unless the flush
 * was disabled on the command line, apply the requested state.
 *
 * @types:  mask of L1D_FLUSH_* mechanisms available on this system
 * @enable: whether the flush should be switched on
 */
void setup_rfi_flush(enum l1d_flush_type types, bool enable)
{
	if (types & L1D_FLUSH_FALLBACK) {
		pr_info("rfi-flush: fallback displacement flush available\n");
		init_fallback_flush();
	}

	if (types & L1D_FLUSH_ORI)
		pr_info("rfi-flush: ori type flush available\n");

	if (types & L1D_FLUSH_MTTRIG)
		pr_info("rfi-flush: mttrig type flush available\n");

	enabled_flush_types = types;

	if (!no_rfi_flush)
		rfi_flush_enable(enable);
}
961

962 963 964
#ifdef CONFIG_DEBUG_FS
/*
 * debugfs write handler for "rfi_flush": accepts 1 (enable) or
 * 0 (disable) and returns -EINVAL for anything else. @data is unused.
 */
static int rfi_flush_set(void *data, u64 val)
{
	bool enable;

	if (val == 1)
		enable = true;
	else if (val == 0)
		enable = false;
	else
		return -EINVAL;

	/* Only do anything if we're changing state */
	if (enable != rfi_flush)
		rfi_flush_enable(enable);

	return 0;
}

/*
 * debugfs read handler for "rfi_flush": stores 1 in *val when the flush
 * is enabled and 0 otherwise. @data is unused.
 */
static int rfi_flush_get(void *data, u64 *val)
{
	if (rfi_flush)
		*val = 1;
	else
		*val = 0;

	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(fops_rfi_flush, rfi_flush_get, rfi_flush_set, "%llu\n");

/* Create the "rfi_flush" control file (mode 0600) under powerpc_debugfs_root. */
static __init int rfi_flush_debugfs_init(void)
{
	debugfs_create_file("rfi_flush", 0600, powerpc_debugfs_root,
			    NULL, &fops_rfi_flush);

	return 0;
}
device_initcall(rfi_flush_debugfs_init);
#endif
996
#endif /* CONFIG_PPC_BOOK3S_64 */