/*
 *	x86 SMP booting functions
 *
 *	(c) 1995 Alan Cox, Building #3 <alan@redhat.com>
 *	(c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
 *
 *	Much of the core SMP work is based on previous work by Thomas Radke, to
 *	whom a great many thanks are extended.
 *
 *	Thanks to Intel for making available several different Pentium,
 *	Pentium Pro and Pentium-II/Xeon MP machines.
 *	Original development of Linux SMP code supported by Caldera.
 *
 *	This code is released under the GNU General Public License version 2 or
 *	later.
 *
 *	Fixes
 *		Felix Koop	:	NR_CPUS used properly
 *		Jose Renau	:	Handle single CPU case.
 *		Alan Cox	:	By repeated request 8) - Total BogoMIPS report.
 *		Greg Wright	:	Fix for kernel stacks panic.
 *		Erich Boleyn	:	MP v1.4 and additional changes.
 *	Matthias Sattler	:	Changes for 2.1 kernel map.
 *	Michel Lespinasse	:	Changes for 2.1 kernel map.
 *	Michael Chastain	:	Change trampoline.S to gnu as.
 *		Alan Cox	:	Dumb bug: 'B' step PPro's are fine
 *		Ingo Molnar	:	Added APIC timers, based on code
 *					from Jose Renau
 *		Ingo Molnar	:	various cleanups and rewrites
 *		Tigran Aivazian	:	fixed "0.00 in /proc/uptime on SMP" bug.
 *	Maciej W. Rozycki	:	Bits for genuine 82489DX APICs
 *		Martin J. Bligh	: 	Added support for multi-quad systems
 *		Dave Jones	:	Report invalid combinations of Athlon CPUs.
 *		Rusty Russell	:	Hacked into shape for new "hotplug" boot process. */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/bootmem.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/nmi.h>

#include <linux/delay.h>
#include <linux/mc146818rtc.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#include <asm/arch_hooks.h>
#include <asm/nmi.h>

#include <mach_apic.h>
#include <mach_wakecpu.h>
#include <smpboot_hooks.h>
#include <asm/vmi.h>
#include <asm/mtrr.h>

/* Set if we find a B stepping CPU */
static int __cpuinitdata smp_b_stepping;

static cpumask_t smp_commenced_mask;

/* which logical CPU number maps to which CPU (physical APIC ID) */
u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata =
			{ [0 ... NR_CPUS-1] = BAD_APICID };
void *x86_cpu_to_apicid_early_ptr;
DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);

u8 apicid_2_node[MAX_APICID];

/*
 * Trampoline 80x86 program as an array.
 */

extern const unsigned char trampoline_data [];
extern const unsigned char trampoline_end  [];
static unsigned char *trampoline_base;

static void map_cpu_to_logical_apicid(void);

/* State of each CPU. */
DEFINE_PER_CPU(int, cpu_state) = { 0 };

/*
 * Currently trivial. Write the real->protected mode
 * bootstrap into the page concerned. The caller
 * has made sure it's suitably aligned.
 */

static unsigned long __cpuinit setup_trampoline(void)
{
	memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data);
	return virt_to_phys(trampoline_base);
}

/*
 * We are called very early to get the low memory for the
 * SMP bootup trampoline page.
 */
void __init smp_alloc_memory(void)
{
	trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE);
	/*
	 * Has to be in very low memory so we can execute
	 * real-mode AP code.
	 */
	if (__pa(trampoline_base) >= 0x9F000)
		BUG();
}

/*
 * The bootstrap kernel entry code has set these up. Save them for
 * a given CPU
 */

void __cpuinit smp_store_cpu_info(int id)
{
	struct cpuinfo_x86 *c = &cpu_data(id);

	*c = boot_cpu_data;
	c->cpu_index = id;
	if (id != 0)
		identify_secondary_cpu(c);
	/*
	 * Mask B, Pentium, but not Pentium MMX
	 */
	if (c->x86_vendor == X86_VENDOR_INTEL &&
	    c->x86 == 5 &&
	    c->x86_mask >= 1 && c->x86_mask <= 4 &&
	    c->x86_model <= 3)
		/*
		 * Remember we have B step Pentia with bugs
		 */
		smp_b_stepping = 1;

	/*
	 * Certain Athlons might work (for various values of 'work') in SMP
	 * but they are not certified as MP capable.
	 */
	if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {

		if (num_possible_cpus() == 1)
			goto valid_k7;

		/* Athlon 660/661 is valid. */	
		if ((c->x86_model==6) && ((c->x86_mask==0) || (c->x86_mask==1)))
			goto valid_k7;

		/* Duron 670 is valid */
		if ((c->x86_model==7) && (c->x86_mask==0))
			goto valid_k7;

		/*
		 * Athlon 662, Duron 671, and Athlon >model 7 have capability bit.
		 * It's worth noting that the A5 stepping (662) of some Athlon XP's
		 * have the MP bit set.
		 * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for more.
		 */
		if (((c->x86_model==6) && (c->x86_mask>=2)) ||
		    ((c->x86_model==7) && (c->x86_mask>=1)) ||
		     (c->x86_model> 7))
			if (cpu_has_mp)
				goto valid_k7;

		/* If we get here, it's not a certified SMP capable AMD system. */
		add_taint(TAINT_UNSAFE_SMP);
	}

valid_k7:
	;
}

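/*
 * Set by the boot CPU once the INIT-deassert IPI has gone out;
 * secondary CPUs wait on this in smp_callin().
 */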
static atomic_t init_deasserted;

static void __cpuinit smp_callin(void)
{
	int cpuid, phys_id;
	unsigned long timeout;

	/*
	 * If waken up by an INIT in an 82489DX configuration
	 * we may get here before an INIT-deassert IPI reaches
	 * our local APIC.  We have to wait for the IPI or we'll
	 * lock up on an APIC access.
	 */
	wait_for_init_deassert(&init_deasserted);

	/*
	 * (This works even if the APIC is not enabled.)
	 */
	phys_id = GET_APIC_ID(apic_read(APIC_ID));
	cpuid = smp_processor_id();
	if (cpu_isset(cpuid, cpu_callin_map)) {
		printk("huh, phys CPU#%d, CPU#%d already present??\n",
					phys_id, cpuid);
		BUG();
	}
	Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);

	/*
	 * STARTUP IPIs are fragile beasts as they might sometimes
	 * trigger some glue motherboard logic. Wait for complete APIC bus
	 * silence for 1 second; this overestimates the time the boot CPU
	 * spends sending the up to 2 STARTUP IPIs by a factor of two, so
	 * it should be enough.
	 */

	/*
	 * Waiting 2s total for startup (udelay is not yet working)
	 */
	timeout = jiffies + 2*HZ;
	while (time_before(jiffies, timeout)) {
		/*
		 * Has the boot CPU finished its STARTUP sequence?
		 */
		if (cpu_isset(cpuid, cpu_callout_map))
			break;
		rep_nop();
	}

	if (!time_before(jiffies, timeout)) {
		printk("BUG: CPU%d started up but did not get a callout!\n",
			cpuid);
		BUG();
	}

	/*
	 * the boot CPU has finished the init stage and is spinning
	 * on callin_map until we finish. We are free to set up this
	 * CPU, first the APIC. (this is probably redundant on most
	 * boards)
	 */

	Dprintk("CALLIN, before setup_local_APIC().\n");
	smp_callin_clear_local_apic();
	setup_local_APIC();
	map_cpu_to_logical_apicid();

	/*
	 * Get our bogomips.
	 */
	calibrate_delay();
	Dprintk("Stack at about %p\n", &cpuid);

	/*
	 * Save our processor parameters
	 */
	smp_store_cpu_info(cpuid);

	/*
	 * Allow the master to continue.
	 */
	cpu_set(cpuid, cpu_callin_map);
}

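/* Number of secondary CPUs brought up so far; the boot CPU is not counted. */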
static int cpucount;

/*
 * Activate a secondary processor.
 */
static void __cpuinit start_secondary(void *unused)
{
	/*
	 * Don't put *anything* before cpu_init(), SMP booting is too
	 * fragile that we want to limit the things done here to the
	 * most necessary things.
	 */
#ifdef CONFIG_VMI
	vmi_bringup();
#endif
	cpu_init();
	preempt_disable();
	smp_callin();
	while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
		rep_nop();
	/*
	 * Check TSC synchronization with the BP:
	 */
	check_tsc_sync_target();

	setup_secondary_clock();
	if (nmi_watchdog == NMI_IO_APIC) {
		disable_8259A_irq(0);
		enable_NMI_through_LVT0();
		enable_8259A_irq(0);
	}
	/*
	 * low-memory mappings have been cleared, flush them from
	 * the local TLBs too.
	 */
	local_flush_tlb();

	/* This must be done before setting cpu_online_map */
	set_cpu_sibling_map(raw_smp_processor_id());
	wmb();

	/*
	 * We need to hold call_lock, so there is no inconsistency
	 * between the time smp_call_function() determines number of
	 * IPI recipients, and the time when the determination is made
	 * for which cpus receive the IPI. Holding this
	 * lock helps us to not include this cpu in a currently in progress
	 * smp_call_function().
	 */
	lock_ipi_call_lock();
	cpu_set(smp_processor_id(), cpu_online_map);
	unlock_ipi_call_lock();
	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;

	/* We can take interrupts now: we're officially "up". */
	local_irq_enable();

	wmb();
	cpu_idle();
}

/*
 * Everything has been set up for the secondary
 * CPUs - they just need to reload everything
 * from the task structure
 * This function must not return.
 */
void __devinit initialize_secondary(void)
{
	/*
	 * We don't actually need to load the full TSS,
	 * basically just the stack pointer and the ip.
	 */

	asm volatile(
		"movl %0,%%esp\n\t"
		"jmp *%1"
		:
		:"m" (current->thread.sp),"m" (current->thread.ip));
}

/* Static state in head.S used to set up a CPU */
extern struct {
	void * sp;
	unsigned short ss;
} stack_start;

#ifdef CONFIG_NUMA

/* which logical CPUs are on which nodes */
cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly =
				{ [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
EXPORT_SYMBOL(node_to_cpumask_map);
/* which node each logical CPU is on */
int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
EXPORT_SYMBOL(cpu_to_node_map);

/* set up a mapping between cpu and node. */
static inline void map_cpu_to_node(int cpu, int node)
{
	printk("Mapping cpu %d to node %d\n", cpu, node);
	cpu_set(cpu, node_to_cpumask_map[node]);
	cpu_to_node_map[cpu] = node;
}

/* undo a mapping between cpu and node. */
static inline void unmap_cpu_to_node(int cpu)
{
	int node;

	printk("Unmapping cpu %d from all nodes\n", cpu);
	for (node = 0; node < MAX_NUMNODES; node ++)
		cpu_clear(cpu, node_to_cpumask_map[node]);
	cpu_to_node_map[cpu] = 0;
}
#else /* !CONFIG_NUMA */

#define map_cpu_to_node(cpu, node)	({})
#define unmap_cpu_to_node(cpu)	({})

#endif /* CONFIG_NUMA */

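/* Logical APIC ID of each CPU; BAD_APICID while a CPU is unmapped. */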
u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };

static void map_cpu_to_logical_apicid(void)
{
	int cpu = smp_processor_id();
	int apicid = logical_smp_processor_id();
	int node = apicid_to_node(apicid);

	if (!node_online(node))
		node = first_online_node;

	cpu_2_logical_apicid[cpu] = apicid;
	map_cpu_to_node(cpu, node);
}

static void unmap_cpu_to_logical_apicid(int cpu)
{
	cpu_2_logical_apicid[cpu] = BAD_APICID;
	unmap_cpu_to_node(cpu);
}

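/*
 * Debug helper: use remote-read ICR commands to dump the ID, version
 * and spurious-vector registers of an APIC that failed to respond.
 */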
static inline void __inquire_remote_apic(int apicid)
{
	int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
	char *names[] = { "ID", "VERSION", "SPIV" };
	int timeout;
	unsigned long status;

	printk("Inquiring remote APIC #%d...\n", apicid);

	for (i = 0; i < ARRAY_SIZE(regs); i++) {
		printk("... APIC #%d %s: ", apicid, names[i]);

		/*
		 * Wait for idle.
		 */
		status = safe_apic_wait_icr_idle();
		if (status)
			printk("a previous APIC delivery may have failed\n");

		apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
		apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);

		timeout = 0;
		do {
			udelay(100);
			status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
		} while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);

		switch (status) {
		case APIC_ICR_RR_VALID:
			status = apic_read(APIC_RRR);
			printk("%lx\n", status);
			break;
		default:
			printk("failed\n");
		}
	}
}

#ifdef WAKE_SECONDARY_VIA_NMI
/* 
 * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
 * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
 * won't ... remember to clear down the APIC, etc later.
 */
static int __devinit
wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
{
	unsigned long send_status, accept_status = 0;
	int maxlvt;

	/* Target chip */
	apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));

	/* Boot on the stack */
	/* Kick the second */
	apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);

	Dprintk("Waiting for send to finish...\n");
	send_status = safe_apic_wait_icr_idle();

	/*
	 * Give the other CPU some time to accept the IPI.
	 */
	udelay(200);
	/*
	 * Due to the Pentium erratum 3AP.
	 */
	maxlvt = lapic_get_maxlvt();
	if (maxlvt > 3) {
		apic_read_around(APIC_SPIV);
		apic_write(APIC_ESR, 0);
	}
	accept_status = (apic_read(APIC_ESR) & 0xEF);
	Dprintk("NMI sent.\n");

	if (send_status)
		printk("APIC never delivered???\n");
	if (accept_status)
		printk("APIC delivery error (%lx).\n", accept_status);

	return (send_status | accept_status);
}
#endif	/* WAKE_SECONDARY_VIA_NMI */

#ifdef WAKE_SECONDARY_VIA_INIT
static int __devinit
wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
{
	unsigned long send_status, accept_status = 0;
	int maxlvt, num_starts, j;

	/*
	 * Be paranoid about clearing APIC errors.
	 */
	if (APIC_INTEGRATED(apic_version[phys_apicid])) {
		apic_read_around(APIC_SPIV);
		apic_write(APIC_ESR, 0);
		apic_read(APIC_ESR);
	}

	Dprintk("Asserting INIT.\n");

	/*
	 * Turn INIT on target chip
	 */
	apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));

	/*
	 * Send IPI
	 */
	apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
				| APIC_DM_INIT);

	Dprintk("Waiting for send to finish...\n");
	send_status = safe_apic_wait_icr_idle();

	mdelay(10);

	Dprintk("Deasserting INIT.\n");

	/* Target chip */
	apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));

	/* Send IPI */
	apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);

	Dprintk("Waiting for send to finish...\n");
	send_status = safe_apic_wait_icr_idle();

	atomic_set(&init_deasserted, 1);

	/*
	 * Should we send STARTUP IPIs ?
	 *
	 * Determine this based on the APIC version.
	 * If we don't have an integrated APIC, don't send the STARTUP IPIs.
	 */
	if (APIC_INTEGRATED(apic_version[phys_apicid]))
		num_starts = 2;
	else
		num_starts = 0;

	/*
	 * Paravirt / VMI wants a startup IPI hook here to set up the
	 * target processor state.
	 */
	startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
		         (unsigned long) stack_start.sp);

	/*
	 * Run STARTUP IPI loop.
	 */
	Dprintk("#startup loops: %d.\n", num_starts);

	maxlvt = lapic_get_maxlvt();

	for (j = 1; j <= num_starts; j++) {
		Dprintk("Sending STARTUP #%d.\n",j);
		apic_read_around(APIC_SPIV);
		apic_write(APIC_ESR, 0);
		apic_read(APIC_ESR);
		Dprintk("After apic_write.\n");

		/*
		 * STARTUP IPI
		 */

		/* Target chip */
		apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));

		/* Boot on the stack */
		/* Kick the second */
		apic_write_around(APIC_ICR, APIC_DM_STARTUP
					| (start_eip >> 12));

		/*
		 * Give the other CPU some time to accept the IPI.
		 */
		udelay(300);

		Dprintk("Startup point 1.\n");

		Dprintk("Waiting for send to finish...\n");
		send_status = safe_apic_wait_icr_idle();

		/*
		 * Give the other CPU some time to accept the IPI.
		 */
		udelay(200);
		/*
		 * Due to the Pentium erratum 3AP.
		 */
		if (maxlvt > 3) {
			apic_read_around(APIC_SPIV);
			apic_write(APIC_ESR, 0);
		}
		accept_status = (apic_read(APIC_ESR) & 0xEF);
		if (send_status || accept_status)
			break;
	}
	Dprintk("After Startup.\n");

	if (send_status)
		printk("APIC never delivered???\n");
	if (accept_status)
		printk("APIC delivery error (%lx).\n", accept_status);

	return (send_status | accept_status);
}
#endif	/* WAKE_SECONDARY_VIA_INIT */

extern cpumask_t cpu_initialized;
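/* Pick the lowest CPU number that is not yet marked present. */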
static inline int alloc_cpu_id(void)
{
	cpumask_t	tmp_map;
	int cpu;
	cpus_complement(tmp_map, cpu_present_map);
	cpu = first_cpu(tmp_map);
	if (cpu >= NR_CPUS)
		return -ENODEV;
	return cpu;
}

#ifdef CONFIG_HOTPLUG_CPU
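/* Cache each CPU's idle task so a re-onlined CPU can reuse its old idle thread. */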
static struct task_struct * __cpuinitdata cpu_idle_tasks[NR_CPUS];
static inline struct task_struct * __cpuinit alloc_idle_task(int cpu)
{
	struct task_struct *idle;

	if ((idle = cpu_idle_tasks[cpu]) != NULL) {
		/* initialize thread_struct; we really want to avoid
		 * destroying the idle thread
		 */
		idle->thread.sp = (unsigned long)task_pt_regs(idle);
		init_idle(idle, cpu);
		return idle;
	}
	idle = fork_idle(cpu);

	if (!IS_ERR(idle))
		cpu_idle_tasks[cpu] = idle;
	return idle;
}
#else
#define alloc_idle_task(cpu) fork_idle(cpu)
#endif

static int __cpuinit do_boot_cpu(int apicid, int cpu)
/*
 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
 * Returns zero if CPU booted OK, else error code from wakeup_secondary_cpu.
 */
{
	struct task_struct *idle;
	unsigned long boot_error;
	int timeout;
	unsigned long start_eip;
	unsigned short nmi_high = 0, nmi_low = 0;

	/*
	 * Save current MTRR state in case it was changed since early boot
	 * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
	 */
	mtrr_save_state();

	/*
	 * We can't use kernel_thread since we must avoid to
	 * reschedule the child.
	 */
	idle = alloc_idle_task(cpu);
	if (IS_ERR(idle))
		panic("failed fork for CPU %d", cpu);

	init_gdt(cpu);
 	per_cpu(current_task, cpu) = idle;
	early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);

	idle->thread.ip = (unsigned long) start_secondary;
	/* start_eip had better be page-aligned! */
	start_eip = setup_trampoline();

	++cpucount;
	alternatives_smp_switch(1);

	/* So we see what's up   */
	printk("Booting processor %d/%d ip %lx\n", cpu, apicid, start_eip);
	/* Stack for startup_32 can be just as for start_secondary onwards */
	stack_start.sp = (void *) idle->thread.sp;

	irq_ctx_init(cpu);

	per_cpu(x86_cpu_to_apicid, cpu) = apicid;
	/*
	 * This grunge runs the startup process for
	 * the targeted processor.
	 */

	atomic_set(&init_deasserted, 0);

	Dprintk("Setting warm reset code and vector.\n");

	store_NMI_vector(&nmi_high, &nmi_low);

	smpboot_setup_warm_reset_vector(start_eip);

	/*
	 * Starting actual IPI sequence...
	 */
	boot_error = wakeup_secondary_cpu(apicid, start_eip);

	if (!boot_error) {
		/*
		 * allow APs to start initializing.
		 */
		Dprintk("Before Callout %d.\n", cpu);
		cpu_set(cpu, cpu_callout_map);
		Dprintk("After Callout %d.\n", cpu);

		/*
		 * Wait 5s total for a response
		 */
		for (timeout = 0; timeout < 50000; timeout++) {
			if (cpu_isset(cpu, cpu_callin_map))
				break;	/* It has booted */
			udelay(100);
		}

		if (cpu_isset(cpu, cpu_callin_map)) {
			/* number CPUs logically, starting from 1 (BSP is 0) */
			Dprintk("OK.\n");
			printk("CPU%d: ", cpu);
			print_cpu_info(&cpu_data(cpu));
			Dprintk("CPU has booted.\n");
		} else {
			boot_error = 1;
			if (*((volatile unsigned char *)trampoline_base)
					== 0xA5)
				/* trampoline started but...? */
				printk("Stuck ??\n");
			else
				/* trampoline code not run */
				printk("Not responding.\n");
			inquire_remote_apic(apicid);
		}
	}

	if (boot_error) {
		/* Try to put things back the way they were before ... */
		unmap_cpu_to_logical_apicid(cpu);
		cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
		cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
		cpucount--;
	} else {
		per_cpu(x86_cpu_to_apicid, cpu) = apicid;
		cpu_set(cpu, cpu_present_map);
	}

	/* mark "stuck" area as not stuck */
	*((volatile unsigned long *)trampoline_base) = 0;

	return boot_error;
}

#ifdef CONFIG_HOTPLUG_CPU
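/* Undo the per-CPU bookkeeping set up during boot when a CPU goes offline. */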
void cpu_exit_clear(void)
{
	int cpu = raw_smp_processor_id();

	idle_task_exit();

	cpucount --;
	cpu_uninit();
	irq_ctx_exit(cpu);

	cpu_clear(cpu, cpu_callout_map);
	cpu_clear(cpu, cpu_callin_map);

	cpu_clear(cpu, smp_commenced_mask);
	unmap_cpu_to_logical_apicid(cpu);
}

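/* Arguments handed to do_warm_boot_cpu() through a work item. */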
struct warm_boot_cpu_info {
	struct completion *complete;
	struct work_struct task;
	int apicid;
	int cpu;
};

static void __cpuinit do_warm_boot_cpu(struct work_struct *work)
{
	struct warm_boot_cpu_info *info =
		container_of(work, struct warm_boot_cpu_info, task);
	do_boot_cpu(info->apicid, info->cpu);
	complete(info->complete);
}

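/*
 * Warm-boot a CPU that was online before: temporarily restore the low
 * identity mappings needed by the trampoline, boot the CPU from a work
 * item on the boot CPU, then zap the low mappings again.
 */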
static int __cpuinit __smp_prepare_cpu(int cpu)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct warm_boot_cpu_info info;
	int	apicid, ret;

	apicid = per_cpu(x86_cpu_to_apicid, cpu);
	if (apicid == BAD_APICID) {
		ret = -ENODEV;
		goto exit;
	}

	info.complete = &done;
	info.apicid = apicid;
	info.cpu = cpu;
	INIT_WORK(&info.task, do_warm_boot_cpu);

	/* init low mem mapping */
	clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
			min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
	flush_tlb_all();
	schedule_work(&info.task);
	wait_for_completion(&done);

	zap_low_mappings();
	ret = 0;
exit:
	return ret;
}
#endif

/*
 * Cycle through the processors sending APIC IPIs to boot each.
 */

static int boot_cpu_logical_apicid;
/* Where the IO area was mapped on multiquad, always 0 otherwise */
void *xquad_portio;
#ifdef CONFIG_X86_NUMAQ
EXPORT_SYMBOL(xquad_portio);
#endif

static void __init smp_boot_cpus(unsigned int max_cpus)
{
	int apicid, cpu, bit, kicked;
	unsigned long bogosum = 0;

	/*
	 * Setup boot CPU information
	 */
	smp_store_cpu_info(0); /* Final full version of the data */
	printk("CPU%d: ", 0);
	print_cpu_info(&cpu_data(0));

	boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
	boot_cpu_logical_apicid = logical_smp_processor_id();
	per_cpu(x86_cpu_to_apicid, 0) = boot_cpu_physical_apicid;

	current_thread_info()->cpu = 0;

	set_cpu_sibling_map(0);

	/*
	 * If we couldn't find an SMP configuration at boot time,
	 * get out of here now!
	 */
	if (!smp_found_config && !acpi_lapic) {
		printk(KERN_NOTICE "SMP motherboard not detected.\n");
		smpboot_clear_io_apic_irqs();
		phys_cpu_present_map = physid_mask_of_physid(0);
		if (APIC_init_uniprocessor())
			printk(KERN_NOTICE "Local APIC not detected."
					   " Using dummy APIC emulation.\n");
		map_cpu_to_logical_apicid();
		cpu_set(0, per_cpu(cpu_sibling_map, 0));
		cpu_set(0, per_cpu(cpu_core_map, 0));
		return;
	}

	/*
	 * Should not be necessary because the MP table should list the boot
	 * CPU too, but we do it for the sake of robustness anyway.
	 * Makes no sense to do this check in clustered apic mode, so skip it
	 */
	if (!check_phys_apicid_present(boot_cpu_physical_apicid)) {
		printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
				boot_cpu_physical_apicid);
		physid_set(hard_smp_processor_id(), phys_cpu_present_map);
	}

	/*
	 * If we couldn't find a local APIC, then get out of here now!
	 */
	if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && !cpu_has_apic) {
		printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
			boot_cpu_physical_apicid);
		printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
		smpboot_clear_io_apic_irqs();
		phys_cpu_present_map = physid_mask_of_physid(0);
		map_cpu_to_logical_apicid();
		cpu_set(0, per_cpu(cpu_sibling_map, 0));
		cpu_set(0, per_cpu(cpu_core_map, 0));
		return;
	}

	verify_local_APIC();

	/*
	 * If SMP should be disabled, then really disable it!
	 */
	if (!max_cpus) {
		smp_found_config = 0;
		printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");

		if (nmi_watchdog == NMI_LOCAL_APIC) {
			printk(KERN_INFO "activating minimal APIC for NMI watchdog use.\n");
			connect_bsp_APIC();
			setup_local_APIC();
		}
		smpboot_clear_io_apic_irqs();
		phys_cpu_present_map = physid_mask_of_physid(0);
		map_cpu_to_logical_apicid();
		cpu_set(0, per_cpu(cpu_sibling_map, 0));
		cpu_set(0, per_cpu(cpu_core_map, 0));
		return;
	}

	connect_bsp_APIC();
	setup_local_APIC();
	map_cpu_to_logical_apicid();


	setup_portio_remap();

	/*
	 * Scan the CPU present map and fire up the other CPUs via do_boot_cpu
	 *
	 * In clustered apic mode, phys_cpu_present_map is a constructed thus:
	 * bits 0-3 are quad0, 4-7 are quad1, etc. A perverse twist on the 
	 * clustered apic ID.
	 */
	Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map));

	kicked = 1;
	for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) {
		apicid = cpu_present_to_apicid(bit);
		/*
		 * Don't even attempt to start the boot CPU!
		 */
		if ((apicid == boot_cpu_apicid) || (apicid == BAD_APICID))
			continue;

		if (!check_apicid_present(bit))
			continue;
		if (max_cpus <= cpucount+1)
			continue;

		if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu))
			printk("CPU #%d not responding - cannot use it.\n",
								apicid);
		else
			++kicked;
	}

	/*
	 * Cleanup possible dangling ends...
	 */
	smpboot_restore_warm_reset_vector();

	/*
	 * Allow the user to impress friends.
	 */
	Dprintk("Before bogomips.\n");
	for_each_possible_cpu(cpu)
		if (cpu_isset(cpu, cpu_callout_map))
			bogosum += cpu_data(cpu).loops_per_jiffy;
	printk(KERN_INFO
		"Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
		cpucount+1,
		bogosum/(500000/HZ),
		(bogosum/(5000/HZ))%100);
	
	Dprintk("Before bogocount - setting activated=1.\n");

	if (smp_b_stepping)
		printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n");

	/*
	 * Don't taint if we are running SMP kernel on a single non-MP
	 * approved Athlon
	 */
	if (tainted & TAINT_UNSAFE_SMP) {
		if (cpucount)
			printk(KERN_INFO "WARNING: This combination of AMD processors is not suitable for SMP.\n");
		else
			tainted &= ~TAINT_UNSAFE_SMP;
	}

	Dprintk("Boot done.\n");

	/*
	 * construct cpu_sibling_map, so that we can tell sibling CPUs
	 * efficiently.
	 */
	for_each_possible_cpu(cpu) {
		cpus_clear(per_cpu(cpu_sibling_map, cpu));
		cpus_clear(per_cpu(cpu_core_map, cpu));
	}

	cpu_set(0, per_cpu(cpu_sibling_map, 0));
	cpu_set(0, per_cpu(cpu_core_map, 0));

	smpboot_setup_io_apic();

	setup_boot_clock();
}

/* These are wrappers to interface to the new boot process.  Someone
   who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
void __init native_smp_prepare_cpus(unsigned int max_cpus)
{
	smp_commenced_mask = cpumask_of_cpu(0);
	cpu_callin_map = cpumask_of_cpu(0);
	mb();
	smp_boot_cpus(max_cpus);
}

void __init native_smp_prepare_boot_cpu(void)
{
	unsigned int cpu = smp_processor_id();

	init_gdt(cpu);
	switch_to_new_gdt();

	cpu_set(cpu, cpu_online_map);
	cpu_set(cpu, cpu_callout_map);
	cpu_set(cpu, cpu_present_map);
	cpu_set(cpu, cpu_possible_map);
	__get_cpu_var(cpu_state) = CPU_ONLINE;
}

#ifdef CONFIG_HOTPLUG_CPU
static void __ref remove_cpu_from_maps(int cpu)
{
	cpu_clear(cpu, cpu_online_map);
}

int __cpu_disable(void)
{
	cpumask_t map = cpu_online_map;
	int cpu = smp_processor_id();

	/*
	 * Perhaps use cpufreq to drop frequency, but that could go
	 * into generic code.
 	 *
	 * We won't take down the boot processor on i386 due to some
	 * interrupts only being able to be serviced by the BSP.
	 * Especially so if we're not using an IOAPIC	-zwane
	 */
	if (cpu == 0)
		return -EBUSY;
	if (nmi_watchdog == NMI_LOCAL_APIC)
		stop_apic_nmi_watchdog(NULL);
	clear_local_APIC();
	/* Allow any queued timer interrupts to get serviced */
	local_irq_enable();
	mdelay(1);
	local_irq_disable();

	remove_siblinginfo(cpu);

	remove_cpu_from_maps(cpu);
	fixup_irqs(map);
	/* It's now safe to remove this processor from the online map */
	cpu_clear(cpu, cpu_online_map);
	return 0;
}

void __cpu_die(unsigned int cpu)
{
	/* We don't do anything here: idle task is faking death itself. */
	unsigned int i;

	for (i = 0; i < 10; i++) {
		/* They ack this in play_dead by setting CPU_DEAD */
		if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
			printk("CPU %d is now offline\n", cpu);
			if (1 == num_online_cpus())
				alternatives_smp_switch(0);
			return;
		}
		msleep(100);
	}
	printk(KERN_ERR "CPU %u didn't die...\n", cpu);
}
#else /* ... !CONFIG_HOTPLUG_CPU */
int __cpu_disable(void)
{
	return -ENOSYS;
}

void __cpu_die(unsigned int cpu)
{
	/* We said "no" in __cpu_disable */
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */

int __cpuinit native_cpu_up(unsigned int cpu)
{
	unsigned long flags;
#ifdef CONFIG_HOTPLUG_CPU
	int ret = 0;

	/*
	 * We do warm boot only on cpus that had booted earlier
	 * Otherwise cold boot is all handled from smp_boot_cpus().
	 * cpu_callin_map is set during AP kickstart process. Its reset
	 * when a cpu is taken offline from cpu_exit_clear().
	 */
	if (!cpu_isset(cpu, cpu_callin_map))
		ret = __smp_prepare_cpu(cpu);

	if (ret)
		return -EIO;
#endif

	/* In case one didn't come up */
	if (!cpu_isset(cpu, cpu_callin_map)) {
		printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu);
		return -EIO;
	}

	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
	/* Unleash the CPU! */
	cpu_set(cpu, smp_commenced_mask);

	/*
	 * Check TSC synchronization with the AP (keep irqs disabled
	 * while doing so):
	 */
	local_irq_save(flags);
	check_tsc_sync_source(cpu);
	local_irq_restore(flags);

	while (!cpu_isset(cpu, cpu_online_map)) {
		cpu_relax();
		touch_nmi_watchdog();
	}

	return 0;
}

void __init native_smp_cpus_done(unsigned int max_cpus)
{
#ifdef CONFIG_X86_IO_APIC
	setup_ioapic_dest();
#endif
	zap_low_mappings();
}

void __init smp_intr_init(void)
{
	/*
	 * IRQ0 must be given a fixed assignment and initialized,
	 * because it's used before the IO-APIC is set up.
	 */
	set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);

	/*
	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
	 * IPI, driven by wakeup.
	 */
	set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);

	/* IPI for invalidation */
	set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);

	/* IPI for generic function call */
	set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
}

/*
 * If the BIOS enumerates physical processors before logical,
 * maxcpus=N at enumeration-time can be used to disable HT.
 */
static int __init parse_maxcpus(char *arg)
{
	extern unsigned int maxcpus;

	maxcpus = simple_strtoul(arg, NULL, 0);
	return 0;
}
early_param("maxcpus", parse_maxcpus);