setup_percpu.c 13.3 KB
Newer Older
1 2 3 4 5
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/percpu.h>
6
#include <linux/kexec.h>
7
#include <linux/crash_dump.h>
8 9
#include <linux/smp.h>
#include <linux/topology.h>
10
#include <linux/pfn.h>
11 12 13
#include <asm/sections.h>
#include <asm/processor.h>
#include <asm/setup.h>
14
#include <asm/mpspec.h>
15
#include <asm/apicdef.h>
16
#include <asm/highmem.h>
17
#include <asm/proto.h>
18
#include <asm/cpumask.h>
B
Brian Gerst 已提交
19
#include <asm/cpu.h>
20
#include <asm/stackprotector.h>
21

22 23 24 25 26 27
/*
 * DBG() - debug printk for percpu map setup.  Expands to nothing unless
 * CONFIG_DEBUG_PER_CPU_MAPS is enabled.
 */
#ifndef CONFIG_DEBUG_PER_CPU_MAPS
# define DBG(args...)
#else
# define DBG(args...) printk(KERN_DEBUG args)
#endif

28 29 30
/* Per-cpu copy of the cpu's own number; filled in by setup_per_cpu_areas(). */
DEFINE_PER_CPU(int, cpu_number);
EXPORT_PER_CPU_SYMBOL(cpu_number);

B
Brian Gerst 已提交
31 32 33 34 35 36 37 38 39
/*
 * Percpu offset in effect before setup_per_cpu_areas() has run: the
 * address of __per_cpu_load on x86_64, zero otherwise.
 */
#ifdef CONFIG_X86_64
#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
#else
#define BOOT_PERCPU_OFFSET 0
#endif

/*
 * Per-cpu copy of the cpu's percpu offset.  Starts out as
 * BOOT_PERCPU_OFFSET and is set to per_cpu_offset(cpu) by
 * setup_per_cpu_areas().
 */
DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
EXPORT_PER_CPU_SYMBOL(this_cpu_off);

40
/*
 * Percpu offset of every cpu, indexed by cpu number.  All slots hold
 * BOOT_PERCPU_OFFSET until setup_per_cpu_areas() writes the real offsets.
 */
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
	[0 ... NR_CPUS - 1] = BOOT_PERCPU_OFFSET,
};
EXPORT_SYMBOL(__per_cpu_offset);
44

45 46 47 48 49 50 51 52 53 54 55 56 57
/*
 * On x86_64, symbols referenced from code should be reachable using
 * 32-bit relocations.  Reserve space for static percpu variables in
 * modules so that they are always served from the first chunk, which
 * is located at the percpu segment base.  On x86_32, anything can
 * address anywhere, so there is no need to reserve space in the first
 * chunk.
 */
#ifdef CONFIG_X86_64
#define PERCPU_FIRST_CHUNK_RESERVE	PERCPU_MODULE_RESERVE
#else
#define PERCPU_FIRST_CHUNK_RESERVE	0
#endif

58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
/**
 * pcpu_need_numa - tell whether percpu allocation has to be NUMA aware
 *
 * Walk the possible cpus and compare the pg_data_t of each cpu's node
 * with the one seen for the previous cpu.  NUMA matters as soon as a
 * cpu sits on an online node whose pg_data_t differs from the previous
 * one.  Without CONFIG_NEED_MULTIPLE_NODES the answer is always no.
 *
 * RETURNS:
 * true if NUMA should be considered; otherwise, false.
 */
static bool __init pcpu_need_numa(void)
{
#ifdef CONFIG_NEED_MULTIPLE_NODES
	pg_data_t *prev = NULL;
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		int nid = early_cpu_to_node(cpu);
		pg_data_t *pgdat = NODE_DATA(nid);

		if (node_online(nid) && pgdat && prev && prev != pgdat)
			return true;

		/* remembered even when NULL, so the next cpu can't match it */
		prev = pgdat;
	}
#endif
	return false;
}

87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
/**
 * pcpu_alloc_bootmem - bootmem allocation helper for percpu areas
 * @cpu: cpu the memory is meant for
 * @size: number of bytes to allocate
 * @align: required alignment
 *
 * With CONFIG_NEED_MULTIPLE_NODES the memory is requested from the node
 * @cpu belongs to; if that node is offline or has no pg_data_t, the
 * generic bootmem allocator is used and a message is logged.  Without
 * CONFIG_NEED_MULTIPLE_NODES the generic allocator is always used.
 * Allocations aim above MAX_DMA_ADDRESS and never panic.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
					unsigned long align)
{
	const unsigned long goal = __pa(MAX_DMA_ADDRESS);
#ifdef CONFIG_NEED_MULTIPLE_NODES
	int nid = early_cpu_to_node(cpu);
	void *area;

	if (node_online(nid) && NODE_DATA(nid)) {
		area = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
						    size, align, goal);
		pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
			 "%016lx\n", cpu, size, nid, __pa(area));
		return area;
	}

	/* no usable node for this cpu, take memory from wherever */
	area = __alloc_bootmem_nopanic(size, align, goal);
	pr_info("cpu %d has no node %d or node-local memory\n",
		cpu, nid);
	pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
		 cpu, size, __pa(area));
	return area;
#else
	return __alloc_bootmem_nopanic(size, align, goal);
#endif
}

126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156
/*
 * Remap allocator
 *
 * This allocator uses PMD page as unit.  A PMD page is allocated for
 * each cpu and each is remapped into vmalloc area using PMD mapping.
 * As PMD page is quite large, only part of it is used for the first
 * chunk.  Unused part is returned to the bootmem allocator.
 *
 * So, the PMD pages are mapped twice - once to the physical mapping
 * and to the vmalloc area for the first percpu chunk.  The double
 * mapping does add one more PMD TLB entry pressure but still is much
 * better than only using 4k mappings while still being NUMA friendly.
 */
#ifdef CONFIG_NEED_MULTIPLE_NODES
/* Page-aligned number of bytes of each cpu's large page used by the first chunk. */
static size_t pcpur_size __initdata;
/* Array, indexed by cpu, of pointers to the large page allocated for that cpu. */
static void **pcpur_ptrs __initdata;

/*
 * Page lookup callback for the remap allocator: return the struct page
 * backing page @pageno of @cpu's area, or NULL when @pageno lies beyond
 * the pcpur_size bytes in use.
 */
static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
{
	size_t offset = (size_t)pageno << PAGE_SHIFT;

	return offset < pcpur_size ?
		virt_to_page(pcpur_ptrs[cpu] + offset) : NULL;
}

/**
 * setup_pcpu_remap - set up the first percpu chunk with remapped large pages
 * @static_size: size of the static percpu area in bytes
 *
 * Allocate one PMD-sized, PMD-aligned bootmem page per possible cpu,
 * copy the static percpu image into it, give the unused tail back to
 * bootmem and map each page into an early-registered vm area with a
 * large page PMD entry.  The mapped area is then handed to
 * pcpu_setup_first_chunk().
 *
 * This allocator only applies when large pages are supported and
 * pcpu_need_numa() reports that NUMA has to be considered; on non-NUMA
 * configurations embedding is better.
 *
 * RETURNS:
 * The return value of pcpu_setup_first_chunk() once the pages are
 * mapped, -EINVAL if this allocator is not applicable or the area does
 * not fit in one large page, -ENOMEM if a large page can't be allocated.
 */
static ssize_t __init setup_pcpu_remap(size_t static_size)
{
	static struct vm_struct vm;
	size_t ptrs_size, dyn_size;
	unsigned int cpu;
	ssize_t ret;

	/*
	 * If large page isn't supported, there's no benefit in doing
	 * this.  Also, on non-NUMA, embedding is better.
	 */
	if (!cpu_has_pse || !pcpu_need_numa())
		return -EINVAL;

	/*
	 * Currently supports only single page.  Supporting multiple
	 * pages won't be too difficult if it ever becomes necessary.
	 */
	pcpur_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
			       PERCPU_DYNAMIC_RESERVE);
	if (pcpur_size > PMD_SIZE) {
		pr_warning("PERCPU: static data is larger than large page, "
			   "can't use large page\n");
		return -EINVAL;
	}
	dyn_size = pcpur_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;

	/* allocate pointer array and alloc large pages */
	ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
	pcpur_ptrs = alloc_bootmem(ptrs_size);

	for_each_possible_cpu(cpu) {
		pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE);
		if (!pcpur_ptrs[cpu])
			goto enomem;

		/*
		 * Only use pcpur_size bytes and give back the rest.
		 *
		 * Ingo: The 2MB up-rounding bootmem is needed to make
		 * sure the partial 2MB page is still fully RAM - it's
		 * not well-specified to have a PAT-incompatible area
		 * (unmapped RAM, device memory, etc.) in that hole.
		 */
		free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
			     PMD_SIZE - pcpur_size);

		memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
	}

	/* allocate address and map */
	vm.flags = VM_ALLOC;
	vm.size = num_possible_cpus() * PMD_SIZE;
	vm_area_register_early(&vm, PMD_SIZE);

	for_each_possible_cpu(cpu) {
		pmd_t *pmd;

		pmd = populate_extra_pmd((unsigned long)vm.addr
					 + cpu * PMD_SIZE);
		set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])),
				     PAGE_KERNEL_LARGE));
	}

	/* we're ready, commit */
	pr_info("PERCPU: Remapped at %p with large pages, static data "
		"%zu bytes\n", vm.addr, static_size);

	ret = pcpu_setup_first_chunk(pcpur_get_page, static_size,
				     PERCPU_FIRST_CHUNK_RESERVE,
				     PMD_SIZE, dyn_size, vm.addr, NULL);
	goto out_free_ar;

enomem:
	/*
	 * Every cpu with a non-NULL pointer already had the tail beyond
	 * pcpur_size returned to bootmem in the loop above, so only the
	 * first pcpur_size bytes are still ours to free.
	 */
	for_each_possible_cpu(cpu)
		if (pcpur_ptrs[cpu])
			free_bootmem(__pa(pcpur_ptrs[cpu]), pcpur_size);
	ret = -ENOMEM;
out_free_ar:
	free_bootmem(__pa(pcpur_ptrs), ptrs_size);
	return ret;
}
#else
/*
 * Stub used when CONFIG_NEED_MULTIPLE_NODES is not set: the remap
 * allocator is unavailable, so always report -EINVAL.
 */
static ssize_t __init setup_pcpu_remap(size_t static_size)
{
	return -EINVAL;
}
#endif

256 257 258 259
/*
 * Embedding allocator
 *
 * The first chunk is sized to just contain the static area plus
 * module and dynamic reserves, and allocated as a contiguous area
 * using bootmem allocator and used as-is without being mapped into
 * vmalloc area.  This enables the first chunk to piggy back on the
 * linear physical PMD mapping and doesn't add any additional pressure
 * to TLB.  Note that if the needed size is smaller than the minimum
 * unit size, the leftover is returned to the bootmem allocator.
 */
static void *pcpue_ptr __initdata;
268
static size_t pcpue_size __initdata;
269 270 271 272
static size_t pcpue_unit_size __initdata;

/*
 * Page lookup callback for the embedding allocator: return the struct
 * page backing page @pageno of @cpu's unit, or NULL when @pageno lies
 * beyond the pcpue_size bytes in use.
 */
static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
{
	size_t offset = (size_t)pageno << PAGE_SHIFT;

	if (offset < pcpue_size)
		return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + offset);

	return NULL;
}

/**
 * setup_pcpu_embed - set up the first percpu chunk inside a bootmem area
 * @static_size: size of the static percpu area in bytes
 *
 * Allocate one contiguous bootmem area with a unit for every possible
 * cpu, copy the static percpu image into each unit, return the part of
 * each unit beyond pcpue_size to bootmem and pass the area as-is to
 * pcpu_setup_first_chunk().
 *
 * RETURNS:
 * The return value of pcpu_setup_first_chunk() once the area is set up,
 * -EINVAL if large pages are unsupported or NUMA has to be considered,
 * -ENOMEM if the area can't be allocated.
 */
static ssize_t __init setup_pcpu_embed(size_t static_size)
{
	size_t dyn_bytes, slack;
	unsigned int cpu;

	/*
	 * Without large page support embedding buys nothing, and it
	 * doesn't play well with NUMA either.
	 */
	if (!cpu_has_pse || pcpu_need_numa())
		return -EINVAL;

	/* work out sizes, then allocate and copy */
	pcpue_size = PFN_ALIGN(static_size + PERCPU_MODULE_RESERVE +
			       PERCPU_DYNAMIC_RESERVE);
	pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
	dyn_bytes = pcpue_size - static_size - PERCPU_FIRST_CHUNK_RESERVE;

	pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size,
				       PAGE_SIZE);
	if (!pcpue_ptr)
		return -ENOMEM;

	/* unused tail of every unit goes back to bootmem */
	slack = pcpue_unit_size - pcpue_size;
	for_each_possible_cpu(cpu) {
		void *unit = pcpue_ptr + cpu * pcpue_unit_size;

		free_bootmem(__pa(unit + pcpue_size), slack);
		memcpy(unit, __per_cpu_load, static_size);
	}

	/* we're ready, commit */
	pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
		pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);

	return pcpu_setup_first_chunk(pcpue_get_page, static_size,
				      PERCPU_FIRST_CHUNK_RESERVE,
				      pcpue_unit_size, dyn_bytes,
				      pcpue_ptr, NULL);
}

323 324 325 326 327 328 329
/*
 * 4k page allocator
 *
 * This is the basic allocator.  Static percpu area is allocated
 * page-by-page and most of initialization is done by the generic
 * setup function.
 */
330 331 332 333 334 335 336 337 338 339
/*
 * Flat array of the pages allocated by setup_pcpu_4k(): each cpu owns
 * pcpu4k_nr_static_pages consecutive entries.
 */
static struct page **pcpu4k_pages __initdata;
/* Number of pages needed to hold the static percpu area (PFN_UP of its size). */
static int pcpu4k_nr_static_pages __initdata;

/*
 * Page lookup callback for the 4k allocator: return the page recorded
 * for page @pageno of @cpu, or NULL when @pageno is not below
 * pcpu4k_nr_static_pages.
 */
static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
{
	if (pageno >= pcpu4k_nr_static_pages)
		return NULL;

	return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
}

340 341 342 343 344
/*
 * PTE population callback handed to pcpu_setup_first_chunk() by the 4k
 * allocator; simply forwards @addr to populate_extra_pte().
 */
static void __init pcpu4k_populate_pte(unsigned long addr)
{
	populate_extra_pte(addr);
}

345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376
/**
 * setup_pcpu_4k - set up the first percpu chunk from individual 4k pages
 * @static_size: size of the static percpu area in bytes
 *
 * Allocate PFN_UP(@static_size) pages for every possible cpu, copy the
 * static percpu image into them page by page, record the pages in
 * pcpu4k_pages and let pcpu_setup_first_chunk() do the rest.  The page
 * array itself is freed again before returning.
 *
 * RETURNS:
 * The return value of pcpu_setup_first_chunk() once all pages are
 * allocated, -ENOMEM if a page allocation fails.
 */
static ssize_t __init setup_pcpu_4k(size_t static_size)
{
	size_t array_bytes;
	unsigned int cpu;
	int pg, nr_alloc = 0;
	ssize_t ret;

	pcpu4k_nr_static_pages = PFN_UP(static_size);

	/* unaligned allocations can't be freed, round up to page size */
	array_bytes = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
				* sizeof(pcpu4k_pages[0]));
	pcpu4k_pages = alloc_bootmem(array_bytes);

	/* allocate and copy */
	for_each_possible_cpu(cpu) {
		for (pg = 0; pg < pcpu4k_nr_static_pages; pg++) {
			void *va = pcpu_alloc_bootmem(cpu, PAGE_SIZE,
						      PAGE_SIZE);

			if (!va)
				goto enomem;

			memcpy(va, __per_cpu_load + pg * PAGE_SIZE, PAGE_SIZE);
			pcpu4k_pages[nr_alloc++] = virt_to_page(va);
		}
	}

	/* we're ready, commit */
	pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
		pcpu4k_nr_static_pages, static_size);

	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
				     PERCPU_FIRST_CHUNK_RESERVE, -1, -1, NULL,
				     pcpu4k_populate_pte);
	goto out_free_ar;

enomem:
	/* undo in reverse order everything recorded so far */
	while (nr_alloc-- > 0)
		free_bootmem(__pa(page_address(pcpu4k_pages[nr_alloc])),
			     PAGE_SIZE);
	ret = -ENOMEM;
out_free_ar:
	free_bootmem(__pa(pcpu4k_pages), array_bytes);
	return ret;
}

391 392 393 394 395 396 397 398 399 400 401 402 403
/*
 * On x86_32, write a GDT_ENTRY_PERCPU descriptor based at
 * per_cpu_offset(@cpu) into @cpu's GDT.  Does nothing on other
 * configurations.
 */
static inline void setup_percpu_segment(int cpu)
{
#ifdef CONFIG_X86_32
	struct desc_struct desc;

	pack_descriptor(&desc, per_cpu_offset(cpu), 0xFFFFF,
			0x2 | DESCTYPE_S, 0x8);
	desc.s = 1;
	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PERCPU, &desc,
			DESCTYPE_S);
#endif
}

404 405 406 407 408 409 410
/**
 * setup_per_cpu_areas - allocate and initialize the per-cpu areas
 *
 * Set up the first percpu chunk with the remap, embedding or 4k
 * allocator (tried in that order, panicking if all of them fail), then
 * record every possible cpu's percpu offset and seed its per-cpu
 * variables from the early boot-time maps.
 *
 * Great future plan:
 * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
 * Always point %gs to its beginning
 */
void __init setup_per_cpu_areas(void)
{
	size_t static_size = __per_cpu_end - __per_cpu_start;
	unsigned long base_delta;
	size_t unit_size;
	unsigned int cpu;
	ssize_t rc;

	pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);

	/*
	 * Allocate percpu area.  If PSE is supported, try to make use
	 * of large page mappings.  Please read comments on top of
	 * each allocator for details.
	 */
	rc = setup_pcpu_remap(static_size);
	if (rc < 0)
		rc = setup_pcpu_embed(static_size);
	if (rc < 0)
		rc = setup_pcpu_4k(static_size);
	if (rc < 0)
		panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
		      static_size, rc);

	/* a non-negative result is the size of one percpu unit */
	unit_size = rc;

	/* alrighty, percpu areas up and running */
	base_delta = (unsigned long)pcpu_base_addr -
		     (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu) {
		unsigned long offset = base_delta + cpu * unit_size;

		per_cpu_offset(cpu) = offset;
		per_cpu(this_cpu_off, cpu) = offset;
		per_cpu(cpu_number, cpu) = cpu;
		setup_percpu_segment(cpu);
		setup_stack_canary_segment(cpu);
		/*
		 * Copy data used in early init routines from the
		 * initial arrays to the per cpu data areas.  These
		 * arrays then become expendable and the *_early_ptr's
		 * are zeroed indicating that the static arrays are
		 * gone.
		 */
#ifdef CONFIG_X86_LOCAL_APIC
		per_cpu(x86_cpu_to_apicid, cpu) =
			early_per_cpu_map(x86_cpu_to_apicid, cpu);
		per_cpu(x86_bios_cpu_apicid, cpu) =
			early_per_cpu_map(x86_bios_cpu_apicid, cpu);
#endif
#ifdef CONFIG_X86_64
		per_cpu(irq_stack_ptr, cpu) =
			per_cpu(irq_stack_union.irq_stack, cpu) +
			IRQ_STACK_SIZE - 64;
#ifdef CONFIG_NUMA
		per_cpu(x86_cpu_to_node_map, cpu) =
			early_per_cpu_map(x86_cpu_to_node_map, cpu);
#endif
#endif
		/*
		 * Up to this point, the boot CPU has been using .data.init
		 * area.  Reload any changed state for the boot CPU.
		 */
		if (cpu == boot_cpu_id)
			switch_to_new_gdt(cpu);
	}

	/* indicate the early static arrays will soon be gone */
#ifdef CONFIG_X86_LOCAL_APIC
	early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
	early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
#endif
#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
	early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
#endif

	/* Setup node to cpumask map */
	setup_node_to_cpumask_map();

	/* Setup cpu initialized, callin, callout masks */
	setup_cpu_local_masks();
}