setup_percpu.c 10.2 KB
Newer Older
1 2 3 4 5
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/percpu.h>
6
#include <linux/kexec.h>
7
#include <linux/crash_dump.h>
8 9
#include <linux/smp.h>
#include <linux/topology.h>
10 11 12
#include <asm/sections.h>
#include <asm/processor.h>
#include <asm/setup.h>
13
#include <asm/mpspec.h>
14
#include <asm/apicdef.h>
15
#include <asm/highmem.h>
16
#include <asm/cpumask.h>
17

18 19 20 21 22 23
/*
 * DBG() forwards its arguments to printk() at KERN_DEBUG level when
 * CONFIG_DEBUG_PER_CPU_MAPS is defined, and expands to nothing otherwise.
 */
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
# define DBG(x...) printk(KERN_DEBUG x)
#else
# define DBG(x...)
#endif

24
#ifdef CONFIG_X86_LOCAL_APIC
/*
 * Local APIC bookkeeping globals.  They are only defined here; the code
 * that updates them is not part of this section of the file.
 */
unsigned int num_processors;
unsigned disabled_cpus __cpuinitdata;
/* Processor that is doing the boot up */
unsigned int boot_cpu_physical_apicid = -1U;
EXPORT_SYMBOL(boot_cpu_physical_apicid);
unsigned int max_physical_apicid;

/* Bitmask of physically existing CPUs */
physid_mask_t phys_cpu_present_map;
#endif
35

36 37 38
/*
 * Map cpu index to physical APIC ID
 *
 * Both maps start out as BAD_APICID and are exported for use outside
 * this file.
 */
DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);

#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
#define	X86_64_NUMA	1	/* (used later) */

/*
 * Map cpu index to node index
 */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);

/*
 * Which logical CPUs are on which nodes
 */
cpumask_t *node_to_cpumask_map;
EXPORT_SYMBOL(node_to_cpumask_map);

/*
 * Setup node_to_cpumask_map
 */
static void __init setup_node_to_cpumask_map(void);

#else
/* Without X86_64_NUMA there is no node map to set up. */
static inline void setup_node_to_cpumask_map(void) { }
#endif

68
#ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
69 70 71 72 73 74 75 76 77 78
/*
 * Copy data used in early init routines from the initial arrays to the
 * per cpu data areas.  These arrays then become expendable and the
 * *_early_ptr's are zeroed indicating that the static arrays are gone.
 *
 * The apicid and bios apicid maps are always copied; the cpu-to-node map
 * is copied only when X86_64_NUMA is defined.
 */
static void __init setup_per_cpu_maps(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		per_cpu(x86_cpu_to_apicid, cpu) =
				early_per_cpu_map(x86_cpu_to_apicid, cpu);
		per_cpu(x86_bios_cpu_apicid, cpu) =
				early_per_cpu_map(x86_bios_cpu_apicid, cpu);
#ifdef X86_64_NUMA
		per_cpu(x86_cpu_to_node_map, cpu) =
				early_per_cpu_map(x86_cpu_to_node_map, cpu);
#endif
	}

	/* indicate the early static arrays will soon be gone */
	early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
	early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
#ifdef X86_64_NUMA
	early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
#endif
}

#ifdef CONFIG_X86_32
/*
 * Great future not-so-futuristic plan: make i386 and x86_64 do it
 * the same way
 */
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);
/* No cpu_pda map is set up in this configuration. */
static inline void setup_cpu_pda_map(void) { }

#elif !defined(CONFIG_SMP)
/* No cpu_pda map is set up in this configuration. */
static inline void setup_cpu_pda_map(void) { }

#else /* CONFIG_SMP && CONFIG_X86_64 */

/*
 * Allocate cpu_pda pointer table and array via alloc_bootmem.
 *
 * A single bootmem chunk is laid out as the pointer table (nr_cpu_ids
 * entries, padded to a multiple of the cache line size) followed by
 * nr_cpu_ids - 1 pda slots, each rounded up to the cache line size.
 * Cpu 0 keeps the pda that cpu_pda(0) already returns, so it needs no
 * slot of its own.
 */
static void __init setup_cpu_pda_map(void)
{
	unsigned long pda_size, table_size, array_size;
	struct x8664_pda **table;
	char *slot;
	int cpu;

	pda_size = roundup(sizeof(struct x8664_pda), cache_line_size());
	table_size = roundup(nr_cpu_ids * sizeof(void *), cache_line_size());
	array_size = pda_size * (nr_cpu_ids - 1);

	/* pointer table first, the pda slots start right behind it */
	table = alloc_bootmem(table_size + array_size);
	slot = (char *)table + table_size;

	/* fill in one table entry per possible cpu */
	for_each_possible_cpu(cpu) {
		if (cpu != 0) {
			/* hand out the next bootmem slot and flag it as such */
			table[cpu] = (struct x8664_pda *)slot;
			table[cpu]->in_bootmem = 1;
			slot += pda_size;
		} else {
			/* leave boot cpu pda in place */
			table[0] = cpu_pda(0);
		}
	}

	/* switch over to the freshly built table */
	_cpu_pda = table;
}
148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168

#endif /* CONFIG_SMP && CONFIG_X86_64 */

#ifdef CONFIG_X86_64

/*
 * correctly size the local cpu masks
 *
 * Allocates the initialized, callin, callout and sibling-setup cpumasks
 * from bootmem.  Marked __init because it uses the boot-time allocator and
 * its only caller in this file, setup_per_cpu_areas(), is __init as well.
 */
static void __init setup_cpu_local_masks(void)
{
	alloc_bootmem_cpumask_var(&cpu_initialized_mask);
	alloc_bootmem_cpumask_var(&cpu_callin_mask);
	alloc_bootmem_cpumask_var(&cpu_callout_mask);
	alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
}

#else /* CONFIG_X86_32 */

/* Nothing is allocated for the local cpu masks in this configuration. */
static inline void setup_cpu_local_masks(void)
{
}

#endif /* CONFIG_X86_32 */
169 170 171 172 173 174 175 176

/*
 * Great future plan:
 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
 * Always point %gs to its beginning
 *
 * Current behaviour: set up the cpu_pda map, then give every possible cpu
 * its own bootmem copy of the per cpu section (PERCPU_ENOUGH_ROOM bytes
 * rounded up to at least PAGE_SIZE alignment), record the resulting offset
 * in per_cpu_offset(cpu), and finally set up the per cpu maps, the node to
 * cpumask map and the local cpu masks.
 */
void __init setup_per_cpu_areas(void)
{
	ssize_t size, old_size;
	char *ptr;
	int cpu;
	unsigned long align = 1;

	/* Setup cpu_pda map */
	setup_cpu_pda_map();

	/* Copy section for each CPU (we discard the original) */
	old_size = PERCPU_ENOUGH_ROOM;
	align = max_t(unsigned long, PAGE_SIZE, align);
	size = roundup(old_size, align);

	pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);

	pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size);

	for_each_possible_cpu(cpu) {
#ifndef CONFIG_NEED_MULTIPLE_NODES
		ptr = __alloc_bootmem(size, align,
				 __pa(MAX_DMA_ADDRESS));
#else
		int node = early_cpu_to_node(cpu);
		if (!node_online(node) || !NODE_DATA(node)) {
			/* no usable node: fall back to a node-agnostic allocation */
			ptr = __alloc_bootmem(size, align,
					 __pa(MAX_DMA_ADDRESS));
			pr_info("cpu %d has no node %d or node-local memory\n",
				cpu, node);
			pr_debug("per cpu data for cpu%d at %016lx\n",
				 cpu, __pa(ptr));
		} else {
			ptr = __alloc_bootmem_node(NODE_DATA(node), size, align,
							__pa(MAX_DMA_ADDRESS));
			pr_debug("per cpu data for cpu%d on node%d at %016lx\n",
				cpu, node, __pa(ptr));
		}
#endif
		/* offset from the section start to this cpu's private copy */
		per_cpu_offset(cpu) = ptr - __per_cpu_start;
		memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);

		DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
	}

	/* Setup percpu data maps */
	setup_per_cpu_maps();

	/* Setup node to cpumask map */
	setup_node_to_cpumask_map();

	/* Setup cpu initialized, callin, callout masks */
	setup_cpu_local_masks();
}

#endif
232

233
#ifdef X86_64_NUMA
234 235 236 237 238 239

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: node_to_cpumask() is not valid until after this is done.
 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
 */
static void __init setup_node_to_cpumask_map(void)
{
	unsigned int node, num = 0;
	cpumask_t *map;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES) {
		/* num ends up as the last node id the iteration visits */
		for_each_node_mask(node, node_possible_map)
			num = node;
		nr_node_ids = num + 1;
	}

	/* allocate the map */
	map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
	DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids);

	pr_debug("Node to cpumask map at %p for %d nodes\n",
		 map, nr_node_ids);

	/* node_to_cpumask() will now work */
	node_to_cpumask_map = map;
}

265 266 267 268
/*
 * Record that @cpu belongs to @node.
 *
 * While the early cpu-to-node array still exists the value is stored there
 * and nothing else is touched.  Afterwards it is stored in the per cpu
 * x86_cpu_to_node_map and, unless @node is NUMA_NO_NODE, also in the
 * nodenumber field of the cpu's pda.
 */
void __cpuinit numa_set_node(int cpu, int node)
{
	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

	/* early setting, no percpu area yet */
	if (cpu_to_node_map) {
		cpu_to_node_map[cpu] = node;
		return;
	}

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
	/* debug builds reject cpus without a per cpu area instead of writing */
	if (cpu >= nr_cpu_ids || !per_cpu_offset(cpu)) {
		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
		dump_stack();
		return;
	}
#endif
	per_cpu(x86_cpu_to_node_map, cpu) = node;

	if (node != NUMA_NO_NODE)
		cpu_pda(cpu)->nodenumber = node;
}

/* Reset the node recorded for @cpu to NUMA_NO_NODE via numa_set_node(). */
void __cpuinit numa_clear_node(int cpu)
{
	numa_set_node(cpu, NUMA_NO_NODE);
}

293 294
#ifndef CONFIG_DEBUG_PER_CPU_MAPS

295 296 297 298 299 300 301
/*
 * Set @cpu in the cpumask of the node that early_cpu_to_node() reports
 * for it.
 */
void __cpuinit numa_add_cpu(int cpu)
{
	int node = early_cpu_to_node(cpu);

	cpu_set(cpu, node_to_cpumask_map[node]);
}

/*
 * Clear @cpu in the cpumask of the node that early_cpu_to_node() reports
 * for it.
 */
void __cpuinit numa_remove_cpu(int cpu)
{
	cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}

305 306 307 308 309 310 311
#else /* CONFIG_DEBUG_PER_CPU_MAPS */

/*
 * --------- debug versions of the numa functions ---------
 */
/*
 * Debug helper behind numa_add_cpu()/numa_remove_cpu(): set (@enable != 0)
 * or clear (@enable == 0) @cpu in its node's cpumask and log the resulting
 * mask at KERN_DEBUG level.
 *
 * Nothing is modified when node_to_cpumask_map is not allocated yet or
 * when no node can be determined for @cpu.
 */
static void __cpuinit numa_set_cpumask(int cpu, int enable)
{
	int node = early_cpu_to_node(cpu);
	cpumask_t *mask;
	char buf[64];

	if (node_to_cpumask_map == NULL) {
		printk(KERN_ERR "node_to_cpumask_map NULL\n");
		dump_stack();
		return;
	}

	/*
	 * The debug early_cpu_to_node() returns NUMA_NO_NODE (after warning
	 * and dumping the stack itself) when the cpu has no per cpu area;
	 * that value must not be used as an index into the map.
	 */
	if (node == NUMA_NO_NODE)
		return;

	mask = &node_to_cpumask_map[node];
	if (enable)
		cpu_set(cpu, *mask);
	else
		cpu_clear(cpu, *mask);

	cpulist_scnprintf(buf, sizeof(buf), mask);
	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
		enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
}
332 333 334 335 336 337 338 339 340 341

/* Debug variant: add @cpu to its node's cpumask through numa_set_cpumask(). */
void __cpuinit numa_add_cpu(int cpu)
{
	numa_set_cpumask(cpu, 1);
}

/* Debug variant: remove @cpu from its node's cpumask through numa_set_cpumask(). */
void __cpuinit numa_remove_cpu(int cpu)
{
	numa_set_cpumask(cpu, 0);
}
342 343 344 345 346 347 348 349 350 351 352 353 354

/*
 * Debug variant of cpu_to_node(): return the node recorded for @cpu.
 *
 * If the early cpu-to-node array is still installed the caller is running
 * before the per cpu copy is authoritative; a warning and stack trace are
 * emitted and the value is taken from the early array instead.
 */
int cpu_to_node(int cpu)
{
	int *early_map = early_per_cpu_ptr(x86_cpu_to_node_map);

	/* normal case: early array already retired */
	if (!early_map)
		return per_cpu(x86_cpu_to_node_map, cpu);

	printk(KERN_WARNING
		"cpu_to_node(%d): usage too early!\n", cpu);
	dump_stack();
	return early_map[cpu];
}
EXPORT_SYMBOL(cpu_to_node);

355 356 357 358
/*
 * Same function as cpu_to_node() but used if called before the
 * per_cpu areas are setup.
 */
359 360 361 362 363 364 365 366
int early_cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map))
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];

	if (!per_cpu_offset(cpu)) {
		printk(KERN_WARNING
			"early_cpu_to_node(%d): no per_cpu area!\n", cpu);
367
		dump_stack();
368 369 370 371
		return NUMA_NO_NODE;
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}
372

373 374 375 376

/*
 * empty cpumask: never written and zero-initialized as a static object;
 * handed back by the debug node lookups for out-of-range nodes
 */
static const cpumask_t cpu_mask_none;

377 378 379
/*
 * Returns a pointer to the bitmask of CPUs on Node 'node'.
 */
380
const cpumask_t *cpumask_of_node(int node)
381 382 383
{
	if (node_to_cpumask_map == NULL) {
		printk(KERN_WARNING
384
			"cpumask_of_node(%d): no node_to_cpumask_map!\n",
385 386
			node);
		dump_stack();
387
		return (const cpumask_t *)&cpu_online_map;
388
	}
389 390
	if (node >= nr_node_ids) {
		printk(KERN_WARNING
391
			"cpumask_of_node(%d): node > nr_node_ids(%d)\n",
392 393
			node, nr_node_ids);
		dump_stack();
394
		return &cpu_mask_none;
395
	}
396
	return &node_to_cpumask_map[node];
397
}
398
EXPORT_SYMBOL(cpumask_of_node);
399 400 401

/*
 * Returns a bitmask of CPUs on Node 'node'.
402 403 404 405
 *
 * Side note: this function creates the returned cpumask on the stack
 * so with a high NR_CPUS count, excessive stack space is used.  The
 * node_to_cpumask_ptr function should be used whenever possible.
406 407 408 409 410 411 412 413 414
 */
cpumask_t node_to_cpumask(int node)
{
	if (node_to_cpumask_map == NULL) {
		printk(KERN_WARNING
			"node_to_cpumask(%d): no node_to_cpumask_map!\n", node);
		dump_stack();
		return cpu_online_map;
	}
415 416 417 418 419 420 421
	if (node >= nr_node_ids) {
		printk(KERN_WARNING
			"node_to_cpumask(%d): node > nr_node_ids(%d)\n",
			node, nr_node_ids);
		dump_stack();
		return cpu_mask_none;
	}
422 423 424 425 426 427 428 429 430 431 432
	return node_to_cpumask_map[node];
}
EXPORT_SYMBOL(node_to_cpumask);

/*
 * --------- end of debug versions of the numa functions ---------
 */

#endif /* CONFIG_DEBUG_PER_CPU_MAPS */

#endif /* X86_64_NUMA */
433