cpu.c 13.7 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/* CPU control.
 * (C) 2001, 2002, 2003, 2004 Rusty Russell
 *
 * This code is licenced under the GPL.
 */
#include <linux/proc_fs.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/sched.h>
#include <linux/unistd.h>
#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/stop_machine.h>
16
#include <linux/mutex.h>
17
#include <linux/gfp.h>
L
Linus Torvalds 已提交
18

19
#ifdef CONFIG_SMP
20
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
21
static DEFINE_MUTEX(cpu_add_remove_lock);
L
Linus Torvalds 已提交
22

23
static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
L
Linus Torvalds 已提交
24

25 26 27 28 29
/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
 * Should always be manipulated under cpu_add_remove_lock
 */
static int cpu_hotplug_disabled;

30 31 32 33 34 35 36 37
/*
 * Reader/writer bookkeeping for CPU hotplug operations.
 */
static struct {
	/* Task currently running a hotplug operation, or NULL when idle. */
	struct task_struct *active_writer;
	/*
	 * Synchronizes accesses to refcount.
	 * Also blocks the new readers during
	 * an ongoing cpu hotplug operation.
	 */
	struct mutex lock;
	/*
	 * Number of outstanding reader references: incremented by
	 * get_online_cpus(), decremented by put_online_cpus().
	 */
	int refcount;
} cpu_hotplug = {
	.active_writer = NULL,
	.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
	.refcount = 0,
};
43 44

#ifdef CONFIG_HOTPLUG_CPU
45

46
void get_online_cpus(void)
47
{
48 49
	might_sleep();
	if (cpu_hotplug.active_writer == current)
50
		return;
51 52 53 54
	mutex_lock(&cpu_hotplug.lock);
	cpu_hotplug.refcount++;
	mutex_unlock(&cpu_hotplug.lock);

55
}
56
EXPORT_SYMBOL_GPL(get_online_cpus);
57

58
void put_online_cpus(void)
59
{
60
	if (cpu_hotplug.active_writer == current)
61
		return;
62
	mutex_lock(&cpu_hotplug.lock);
63 64
	if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
		wake_up_process(cpu_hotplug.active_writer);
65 66
	mutex_unlock(&cpu_hotplug.lock);

67
}
68
EXPORT_SYMBOL_GPL(put_online_cpus);
69 70

#endif	/* CONFIG_HOTPLUG_CPU */
71

72 73
/*
 * The following two APIs must be used when attempting
 * to serialize the updates to cpu_online_mask, cpu_present_mask.
 */
void cpu_maps_update_begin(void)
{
	/* Acquire cpu_add_remove_lock; cpu_maps_update_done() releases it. */
	mutex_lock(&cpu_add_remove_lock);
}

void cpu_maps_update_done(void)
{
	/* Release cpu_add_remove_lock taken by cpu_maps_update_begin(). */
	mutex_unlock(&cpu_add_remove_lock);
}

/*
 * This ensures that the hotplug operation can begin only when the
 * refcount goes to zero.
 *
 * Note that during a cpu-hotplug operation, the new readers, if any,
 * will be blocked by the cpu_hotplug.lock.
 *
 * Since cpu_hotplug_begin() is always called after invoking
 * cpu_maps_update_begin(), we can be sure that only one writer is active.
 *
 * Note that theoretically, there is a possibility of a livelock:
 * - Refcount goes to zero, last reader wakes up the sleeping
 *   writer.
 * - Last reader unlocks the cpu_hotplug.lock.
 * - A new reader arrives at this moment, bumps up the refcount.
 * - The writer acquires the cpu_hotplug.lock, finds the refcount
 *   non-zero and goes to sleep again.
 *
 * However, this is very difficult to achieve in practice since
 * get_online_cpus() is not an API which is called all that often.
 */
static void cpu_hotplug_begin(void)
{
	cpu_hotplug.active_writer = current;
111 112 113 114 115 116

	for (;;) {
		mutex_lock(&cpu_hotplug.lock);
		if (likely(!cpu_hotplug.refcount))
			break;
		__set_current_state(TASK_UNINTERRUPTIBLE);
117 118 119 120 121 122 123 124 125 126
		mutex_unlock(&cpu_hotplug.lock);
		schedule();
	}
}

/*
 * Finish a hotplug operation: clear the registered writer and release
 * cpu_hotplug.lock, which cpu_hotplug_begin() leaves held on return.
 */
static void cpu_hotplug_done(void)
{
	cpu_hotplug.active_writer = NULL;
	mutex_unlock(&cpu_hotplug.lock);
}
L
Linus Torvalds 已提交
127
/* Need to know about CPUs going up/down? */
128
/*
 * Add @nb to the CPU notifier chain.
 *
 * The chain is modified under cpu_add_remove_lock (via
 * cpu_maps_update_begin()/cpu_maps_update_done()).
 *
 * Returns the result of raw_notifier_chain_register().
 */
int __ref register_cpu_notifier(struct notifier_block *nb)
{
	int ret;

	cpu_maps_update_begin();
	ret = raw_notifier_chain_register(&cpu_chain, nb);
	cpu_maps_update_done();
	return ret;
}
136 137 138

#ifdef CONFIG_HOTPLUG_CPU

L
Linus Torvalds 已提交
139 140
EXPORT_SYMBOL(register_cpu_notifier);

141
/*
 * Remove @nb from the CPU notifier chain.
 *
 * The chain is modified under cpu_add_remove_lock (via
 * cpu_maps_update_begin()/cpu_maps_update_done()).
 */
void __ref unregister_cpu_notifier(struct notifier_block *nb)
{
	cpu_maps_update_begin();
	raw_notifier_chain_unregister(&cpu_chain, nb);
	cpu_maps_update_done();
}
EXPORT_SYMBOL(unregister_cpu_notifier);

/*
 * Warn about every process that is still assigned to @cpu, is in
 * TASK_RUNNING state and has accumulated non-zero user or system time.
 * The process list is walked with tasklist_lock held and interrupts off.
 */
static inline void check_for_tasks(int cpu)
{
	struct task_struct *p;

	write_lock_irq(&tasklist_lock);
	for_each_process(p) {
		if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
		    (!cputime_eq(p->utime, cputime_zero) ||
		     !cputime_eq(p->stime, cputime_zero)))
			printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
				"(state = %ld, flags = %x)\n",
				p->comm, task_pid_nr(p), cpu,
				p->state, p->flags);
	}
	write_unlock_irq(&tasklist_lock);
}

A
Avi Kivity 已提交
166
/* Argument bundle handed from _cpu_down() to take_cpu_down(). */
struct take_cpu_down_param {
	/* Task that requested the CPU to go down. */
	struct task_struct *caller;
	/* Modifier OR-ed into the notifier event (0 or CPU_TASKS_FROZEN). */
	unsigned long mod;
	/* CPU number encoded as a pointer, as passed to the notifiers. */
	void *hcpu;
};

L
Linus Torvalds 已提交
172
/* Take this CPU down. */
173
/*
 * Callback passed to __stop_machine() by _cpu_down().
 *
 * @_param: pointer to a struct take_cpu_down_param.
 *
 * Returns the negative error from __cpu_disable() if it fails,
 * otherwise 0.
 */
static int __ref take_cpu_down(void *_param)
{
	struct take_cpu_down_param *param = _param;
	unsigned int cpu = (unsigned long)param->hcpu;
	int err;

	/* Ensure this CPU doesn't handle any more interrupts. */
	err = __cpu_disable();
	if (err < 0)
		return err;

	raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
				param->hcpu);

	/* The requesting task must not stay on the CPU being removed. */
	if (task_cpu(param->caller) == cpu)
		move_task_off_dead_cpu(cpu, param->caller);
	/* Force idle task to run as soon as we yield: it should
	   immediately notice cpu is offline and die quickly. */
	sched_idle_next();
	return 0;
}

195
/* Requires cpu_add_remove_lock to be held */
196
/*
 * Take @cpu offline.
 *
 * @cpu:          CPU to take down.
 * @tasks_frozen: non-zero to OR CPU_TASKS_FROZEN into every notifier event.
 *
 * Returns 0 on success, -EBUSY if only one CPU is online, -EINVAL if
 * @cpu is not online or a CPU_DOWN_PREPARE notifier vetoed the request,
 * or the error returned by __stop_machine().
 */
static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
{
	int err, nr_calls = 0;
	void *hcpu = (void *)(long)cpu;
	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
	struct take_cpu_down_param tcd_param = {
		.caller = current,
		.mod = mod,
		.hcpu = hcpu,
	};

	if (num_online_cpus() == 1)
		return -EBUSY;

	if (!cpu_online(cpu))
		return -EINVAL;

	cpu_hotplug_begin();
	set_cpu_active(cpu, false);
	err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
					hcpu, -1, &nr_calls);
	if (err == NOTIFY_BAD) {
		set_cpu_active(cpu, true);

		/* Only notify the callbacks that ran before the veto. */
		nr_calls--;
		__raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
					  hcpu, nr_calls, NULL);
		printk("%s: attempt to take down CPU %u failed\n",
				__func__, cpu);
		err = -EINVAL;
		goto out_release;
	}

	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
	if (err) {
		set_cpu_active(cpu, true);
		/* CPU didn't die: tell everyone.  Can't complain. */
		if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
					    hcpu) == NOTIFY_BAD)
			BUG();

		goto out_release;
	}
	BUG_ON(cpu_online(cpu));

	/* Wait for it to sleep (leaving idle task). */
	while (!idle_cpu(cpu))
		yield();

	/* This actually kills the CPU. */
	__cpu_die(cpu);

	/* CPU is completely dead: tell everyone.  Too late to complain. */
	if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod,
				    hcpu) == NOTIFY_BAD)
		BUG();

	check_for_tasks(cpu);

out_release:
	cpu_hotplug_done();
	/* CPU_POST_DEAD is sent after the hotplug lock has been dropped. */
	if (!err) {
		if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod,
					    hcpu) == NOTIFY_BAD)
			BUG();
	}
	return err;
}

265
/*
 * Take @cpu offline, serialized against other CPU map updates.
 *
 * Returns -EBUSY while cpu_hotplug_disabled is set, otherwise the
 * result of _cpu_down(cpu, 0).
 */
int __ref cpu_down(unsigned int cpu)
{
	int err;

	cpu_maps_update_begin();

	if (cpu_hotplug_disabled) {
		err = -EBUSY;
		goto out;
	}

	err = _cpu_down(cpu, 0);

out:
	cpu_maps_update_done();
	return err;
}
282
EXPORT_SYMBOL(cpu_down);
L
Linus Torvalds 已提交
283 284
#endif /*CONFIG_HOTPLUG_CPU*/

285
/* Requires cpu_add_remove_lock to be held */
286
/*
 * Bring @cpu online.
 *
 * @cpu:          CPU to bring up.
 * @tasks_frozen: non-zero to OR CPU_TASKS_FROZEN into every notifier event.
 *
 * Returns 0 on success, -EINVAL if @cpu is already online, not present,
 * or a CPU_UP_PREPARE notifier vetoed the request, or the error returned
 * by __cpu_up().
 */
static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
{
	int ret, nr_calls = 0;
	void *hcpu = (void *)(long)cpu;
	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;

	if (cpu_online(cpu) || !cpu_present(cpu))
		return -EINVAL;

	cpu_hotplug_begin();
	ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
							-1, &nr_calls);
	if (ret == NOTIFY_BAD) {
		/* Only cancel on the callbacks that ran before the veto. */
		nr_calls--;
		printk("%s: attempt to bring up CPU %u failed\n",
				__func__, cpu);
		ret = -EINVAL;
		goto out_notify;
	}

	/* Arch-specific enabling code. */
	ret = __cpu_up(cpu);
	if (ret != 0)
		goto out_notify;
	BUG_ON(!cpu_online(cpu));

	set_cpu_active(cpu, true);

	/* Now tell everyone the CPU is online. */
	raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);

out_notify:
	if (ret != 0)
		__raw_notifier_call_chain(&cpu_chain,
				CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
	cpu_hotplug_done();

	return ret;
}

326
/*
 * Bring @cpu online, serialized against other CPU map updates.
 *
 * Returns -EINVAL if @cpu is not a possible CPU, -EBUSY while
 * cpu_hotplug_disabled is set, otherwise the result of _cpu_up(cpu, 0).
 * With CONFIG_MEMORY_HOTPLUG it may also return the error from
 * mem_online_node(), or -ENOMEM when the node has no pgdat.
 */
int __cpuinit cpu_up(unsigned int cpu)
{
	int err = 0;

#ifdef	CONFIG_MEMORY_HOTPLUG
	int nid;
	pg_data_t	*pgdat;
#endif

	if (!cpu_possible(cpu)) {
		printk(KERN_ERR "can't online cpu %d because it is not "
			"configured as may-hotadd at boot time\n", cpu);
#if defined(CONFIG_IA64)
		printk(KERN_ERR "please check additional_cpus= boot "
				"parameter\n");
#endif
		return -EINVAL;
	}

#ifdef	CONFIG_MEMORY_HOTPLUG
	/* Make sure the CPU's memory node is online before the CPU is. */
	nid = cpu_to_node(cpu);
	if (!node_online(nid)) {
		err = mem_online_node(nid);
		if (err)
			return err;
	}

	pgdat = NODE_DATA(nid);
	if (!pgdat) {
		printk(KERN_ERR
			"Can't online cpu %d due to NULL pgdat\n", cpu);
		return -ENOMEM;
	}

	/* Rebuild the zonelists if this node's first entry has no zone. */
	if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
		mutex_lock(&zonelists_mutex);
		build_all_zonelists(NULL);
		mutex_unlock(&zonelists_mutex);
	}
#endif

	cpu_maps_update_begin();

	if (cpu_hotplug_disabled) {
		err = -EBUSY;
		goto out;
	}

	err = _cpu_up(cpu, 0);

out:
	cpu_maps_update_done();
	return err;
}

381
#ifdef CONFIG_PM_SLEEP_SMP
R
Rusty Russell 已提交
382
static cpumask_var_t frozen_cpus;
383 384 385

int disable_nonboot_cpus(void)
{
386
	int cpu, first_cpu, error;
387

388
	cpu_maps_update_begin();
R
Rusty Russell 已提交
389
	first_cpu = cpumask_first(cpu_online_mask);
390 391
	/*
	 * We take down all of the non-boot CPUs in one shot to avoid races
392 393
	 * with the userspace trying to use the CPU hotplug at the same time
	 */
R
Rusty Russell 已提交
394
	cpumask_clear(frozen_cpus);
395

396 397 398 399
	printk("Disabling non-boot CPUs ...\n");
	for_each_online_cpu(cpu) {
		if (cpu == first_cpu)
			continue;
400
		error = _cpu_down(cpu, 1);
401
		if (!error)
R
Rusty Russell 已提交
402
			cpumask_set_cpu(cpu, frozen_cpus);
403
		else {
404 405 406 407 408
			printk(KERN_ERR "Error taking CPU%d down: %d\n",
				cpu, error);
			break;
		}
	}
409

410 411 412 413 414
	if (!error) {
		BUG_ON(num_online_cpus() > 1);
		/* Make sure the CPUs won't be enabled by someone else */
		cpu_hotplug_disabled = 1;
	} else {
415
		printk(KERN_ERR "Non-boot CPUs are not disabled\n");
416
	}
417
	cpu_maps_update_done();
418 419 420
	return error;
}

421 422 423 424 425 426 427 428
/*
 * Weak no-op default; enable_nonboot_cpus() calls it before bringing the
 * frozen CPUs back up.
 */
void __weak arch_enable_nonboot_cpus_begin(void)
{
}

/*
 * Weak no-op default; enable_nonboot_cpus() calls it after it has tried
 * to bring every frozen CPU back up.
 */
void __weak arch_enable_nonboot_cpus_end(void)
{
}

429
/*
 * Re-enable CPU hotplug and bring back up every CPU recorded in
 * frozen_cpus by disable_nonboot_cpus().
 *
 * A CPU that fails to come up is reported and skipped; the remaining
 * CPUs are still attempted. frozen_cpus is cleared afterwards.
 */
void __ref enable_nonboot_cpus(void)
{
	int cpu, error;

	/* Allow everyone to use the CPU hotplug again */
	cpu_maps_update_begin();
	cpu_hotplug_disabled = 0;
	if (cpumask_empty(frozen_cpus))
		goto out;

	printk("Enabling non-boot CPUs ...\n");

	arch_enable_nonboot_cpus_begin();

	for_each_cpu(cpu, frozen_cpus) {
		error = _cpu_up(cpu, 1);
		if (!error) {
			printk("CPU%d is up\n", cpu);
			continue;
		}
		printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
	}

	arch_enable_nonboot_cpus_end();

	cpumask_clear(frozen_cpus);
out:
	cpu_maps_update_done();
}
R
Rusty Russell 已提交
458 459 460 461 462 463 464 465

static int alloc_frozen_cpus(void)
{
	if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
		return -ENOMEM;
	return 0;
}
core_initcall(alloc_frozen_cpus);
466
#endif /* CONFIG_PM_SLEEP_SMP */
467

468 469 470 471 472 473 474 475
/**
 * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
 * @cpu: cpu that just started
 *
 * This function calls the cpu_chain notifiers with CPU_STARTING.
 * It must be called by the arch code on the new cpu, before the new cpu
 * enables interrupts and before the "boot" cpu returns from __cpu_up().
 */
A
Al Viro 已提交
476
void __cpuinit notify_cpu_starting(unsigned int cpu)
{
	unsigned long val = CPU_STARTING;

#ifdef CONFIG_PM_SLEEP_SMP
	/* A CPU recorded in frozen_cpus gets the _FROZEN event variant. */
	if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus))
		val = CPU_STARTING_FROZEN;
#endif /* CONFIG_PM_SLEEP_SMP */
	raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
}

487
#endif /* CONFIG_SMP */
488

489 490 491 492
/*
 * cpu_bit_bitmap[] is a special, "compressed" data structure that
 * represents all NR_CPUS bits binary values of 1<<nr.
 *
R
Rusty Russell 已提交
493
 * It is used by cpumask_of() to get a constant address to a CPU
494 495
 * mask value that has a single bit set only.
 */
496

497 498 499 500 501
/* cpu_bit_bitmap[0] is empty - so we can back into it */
#define MASK_DECLARE_1(x)	[x+1][0] = 1UL << (x)
#define MASK_DECLARE_2(x)	MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
#define MASK_DECLARE_4(x)	MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
#define MASK_DECLARE_8(x)	MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
502

503 504 505 506 507 508 509
const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {

	MASK_DECLARE_8(0),	MASK_DECLARE_8(8),
	MASK_DECLARE_8(16),	MASK_DECLARE_8(24),
#if BITS_PER_LONG > 32
	MASK_DECLARE_8(32),	MASK_DECLARE_8(40),
	MASK_DECLARE_8(48),	MASK_DECLARE_8(56),
510 511
#endif
};
512
EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
513 514 515

/* Constant NR_CPUS-bit bitmap initialized with CPU_BITS_ALL; exported. */
const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
EXPORT_SYMBOL(cpu_all_bits);
516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536

/*
 * Backing bitmaps for the possible/online/present/active CPU masks.
 * Each bitmap is file-local; the rest of the kernel sees it only through
 * the exported const cpumask pointer that follows it. Writes go through
 * the set_cpu_*() and init_cpu_*() helpers in this file.
 */

/* With CONFIG_INIT_ALL_POSSIBLE the possible map starts as CPU_BITS_ALL. */
#ifdef CONFIG_INIT_ALL_POSSIBLE
static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly
	= CPU_BITS_ALL;
#else
static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly;
#endif
const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits);
EXPORT_SYMBOL(cpu_possible_mask);

static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly;
const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits);
EXPORT_SYMBOL(cpu_online_mask);

static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly;
const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits);
EXPORT_SYMBOL(cpu_present_mask);

static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly;
const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits);
EXPORT_SYMBOL(cpu_active_mask);
537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583

/* Set (@possible true) or clear (@possible false) @cpu in the possible map. */
void set_cpu_possible(unsigned int cpu, bool possible)
{
	struct cpumask *mask = to_cpumask(cpu_possible_bits);

	if (!possible) {
		cpumask_clear_cpu(cpu, mask);
		return;
	}
	cpumask_set_cpu(cpu, mask);
}

/* Set (@present true) or clear (@present false) @cpu in the present map. */
void set_cpu_present(unsigned int cpu, bool present)
{
	struct cpumask *mask = to_cpumask(cpu_present_bits);

	if (!present) {
		cpumask_clear_cpu(cpu, mask);
		return;
	}
	cpumask_set_cpu(cpu, mask);
}

/* Set (@online true) or clear (@online false) @cpu in the online map. */
void set_cpu_online(unsigned int cpu, bool online)
{
	struct cpumask *mask = to_cpumask(cpu_online_bits);

	if (!online) {
		cpumask_clear_cpu(cpu, mask);
		return;
	}
	cpumask_set_cpu(cpu, mask);
}

/* Set (@active true) or clear (@active false) @cpu in the active map. */
void set_cpu_active(unsigned int cpu, bool active)
{
	struct cpumask *mask = to_cpumask(cpu_active_bits);

	if (!active) {
		cpumask_clear_cpu(cpu, mask);
		return;
	}
	cpumask_set_cpu(cpu, mask);
}

/* Overwrite the whole present map with a copy of @src. */
void init_cpu_present(const struct cpumask *src)
{
	cpumask_copy(to_cpumask(cpu_present_bits), src);
}

/* Overwrite the whole possible map with a copy of @src. */
void init_cpu_possible(const struct cpumask *src)
{
	cpumask_copy(to_cpumask(cpu_possible_bits), src);
}

/* Overwrite the whole online map with a copy of @src. */
void init_cpu_online(const struct cpumask *src)
{
	cpumask_copy(to_cpumask(cpu_online_bits), src);
}