#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/percpu.h>
#include <linux/kexec.h>
#include <linux/crash_dump.h>
#include <linux/smp.h>
#include <linux/topology.h>
#include <linux/pfn.h>
#include <asm/sections.h>
#include <asm/processor.h>
#include <asm/setup.h>
#include <asm/mpspec.h>
#include <asm/apicdef.h>
#include <asm/highmem.h>
#include <asm/proto.h>
#include <asm/cpumask.h>
#include <asm/cpu.h>
#include <asm/stackprotector.h>

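/* Logical CPU number of the current CPU; backs smp_processor_id(). */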
DEFINE_PER_CPU(int, cpu_number);
EXPORT_PER_CPU_SYMBOL(cpu_number);

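/*
 * Initial percpu offset for the boot CPU: on 64-bit the boot-time
 * percpu area is the initial percpu load image at __per_cpu_load;
 * on 32-bit it sits at offset 0.
 */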
#ifdef CONFIG_X86_64
#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
#else
#define BOOT_PERCPU_OFFSET 0
#endif

DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
EXPORT_PER_CPU_SYMBOL(this_cpu_off);

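/*
 * Offset added to a percpu variable's address to reach each CPU's
 * copy; every slot starts out pointing at the boot percpu area.
 */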
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
	[0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
};
EXPORT_SYMBOL(__per_cpu_offset);

/*
 * On x86_64 symbols referenced from code should be reachable using
 * 32bit relocations.  Reserve space for static percpu variables in
 * modules so that they are always served from the first chunk which
 * is located at the percpu segment base.  On x86_32, anything can
 * address anywhere.  No need to reserve space in the first chunk.
 */
#ifdef CONFIG_X86_64
#define PERCPU_FIRST_CHUNK_RESERVE	PERCPU_MODULE_RESERVE
#else
#define PERCPU_FIRST_CHUNK_RESERVE	0
#endif

#ifdef CONFIG_X86_32
/**
 * pcpu_need_numa - determine whether percpu allocation needs to consider NUMA
 *
 * If NUMA is not configured or there is only one NUMA node available,
 * there is no reason to consider NUMA.  This function determines
 * whether percpu allocation should consider NUMA or not.
 *
 * RETURNS:
 * true if NUMA should be considered; otherwise, false.
 */
static bool __init pcpu_need_numa(void)
{
#ifdef CONFIG_NEED_MULTIPLE_NODES
	pg_data_t *last = NULL;
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		int node = early_cpu_to_node(cpu);

		if (node_online(node) && NODE_DATA(node) &&
		    last && last != NODE_DATA(node))
			return true;

		last = NODE_DATA(node);
	}
#endif
	return false;
}
#endif

/**
 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
 * @cpu: cpu to allocate for
 * @size: size of allocation in bytes
 * @align: alignment
 *
 * Allocate @size bytes aligned at @align for cpu @cpu.  This wrapper
 * does the right thing for NUMA regardless of the current
 * configuration.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
					unsigned long align)
{
	const unsigned long goal = __pa(MAX_DMA_ADDRESS);
#ifdef CONFIG_NEED_MULTIPLE_NODES
	int node = early_cpu_to_node(cpu);
	void *ptr;

	if (!node_online(node) || !NODE_DATA(node)) {
		ptr = __alloc_bootmem_nopanic(size, align, goal);
		pr_info("cpu %d has no node %d or node-local memory\n",
			cpu, node);
		pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
			 cpu, size, __pa(ptr));
	} else {
		ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
						   size, align, goal);
		pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
			 cpu, size, node, __pa(ptr));
	}
	return ptr;
#else
	return __alloc_bootmem_nopanic(size, align, goal);
#endif
}

/*
 * Helpers for first chunk memory allocation
 */
static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
{
	return pcpu_alloc_bootmem(cpu, size, align);
}

static void __init pcpu_fc_free(void *ptr, size_t size)
{
#ifdef CONFIG_NO_BOOTMEM
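	/* No bootmem: the allocation was served by early_res, free it there. */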
	u64 start = __pa(ptr);
	u64 end = start + size;
	free_early_partial(start, end);
#else
	free_bootmem(__pa(ptr), size);
#endif
}

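/*
 * Relative distance between two CPUs for first-chunk grouping: CPUs
 * on the same NUMA node are local to each other, all others remote.
 */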
static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
{
#ifdef CONFIG_NEED_MULTIPLE_NODES
	if (early_cpu_to_node(from) == early_cpu_to_node(to))
		return LOCAL_DISTANCE;
	else
		return REMOTE_DISTANCE;
#else
	return LOCAL_DISTANCE;
#endif
}

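/* Populate intermediate page tables for a percpu first-chunk address. */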
static void __init pcpup_populate_pte(unsigned long addr)
{
	populate_extra_pte(addr);
}

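/*
 * On 32-bit, percpu data is reached through the %fs segment, so each
 * CPU gets a GDT_ENTRY_PERCPU descriptor based at its percpu offset.
 * 64-bit reaches percpu data through the %gs base MSR instead, so
 * there is nothing to do here.
 */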
static inline void setup_percpu_segment(int cpu)
{
#ifdef CONFIG_X86_32
	struct desc_struct gdt;

	pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
			0x2 | DESCTYPE_S, 0x8);
	gdt.s = 1;
	write_gdt_entry(get_cpu_gdt_table(cpu),
			GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
#endif
}

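/*
 * Allocate and initialize the real percpu areas for all possible
 * CPUs, then migrate the early boot-time state into them.
 */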
void __init setup_per_cpu_areas(void)
{
	unsigned int cpu;
	unsigned long delta;
	int rc;

	pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);

	/*
	 * Allocate percpu area.  Embedding allocator is our favorite;
	 * however, on NUMA configurations, it can result in very
	 * sparse unit mapping and vmalloc area isn't spacious enough
	 * on 32bit.  Use page in that case.
	 */
#ifdef CONFIG_X86_32
	if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
		pcpu_chosen_fc = PCPU_FC_PAGE;
#endif
	rc = -EINVAL;
	if (pcpu_chosen_fc != PCPU_FC_PAGE) {
		const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
		const size_t dyn_size = PERCPU_MODULE_RESERVE +
			PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;

		rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
					    dyn_size, atom_size,
					    pcpu_cpu_distance,
					    pcpu_fc_alloc, pcpu_fc_free);
		if (rc < 0)
			pr_warning("%s allocator failed (%d), falling back to page size\n",
				   pcpu_fc_names[pcpu_chosen_fc], rc);
	}
	if (rc < 0)
		rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
					   pcpu_fc_alloc, pcpu_fc_free,
					   pcpup_populate_pte);
	if (rc < 0)
		panic("cannot initialize percpu area (err=%d)", rc);

	/* alrighty, percpu areas up and running */
	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu) {
		per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
		per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
		per_cpu(cpu_number, cpu) = cpu;
		setup_percpu_segment(cpu);
		setup_stack_canary_segment(cpu);
		/*
		 * Copy data used in early init routines from the
		 * initial arrays to the per cpu data areas.  These
		 * arrays then become expendable and the *_early_ptr's
		 * are zeroed indicating that the static arrays are
		 * gone.
		 */
#ifdef CONFIG_X86_LOCAL_APIC
		per_cpu(x86_cpu_to_apicid, cpu) =
			early_per_cpu_map(x86_cpu_to_apicid, cpu);
		per_cpu(x86_bios_cpu_apicid, cpu) =
			early_per_cpu_map(x86_bios_cpu_apicid, cpu);
#endif
#ifdef CONFIG_X86_64
		per_cpu(irq_stack_ptr, cpu) =
			per_cpu(irq_stack_union.irq_stack, cpu) +
			IRQ_STACK_SIZE - 64;
#ifdef CONFIG_NUMA
		per_cpu(x86_cpu_to_node_map, cpu) =
			early_per_cpu_map(x86_cpu_to_node_map, cpu);
#endif
#endif
		/*
		 * Up to this point, the boot CPU has been using .init.data
		 * area.  Reload any changed state for the boot CPU.
		 */
		if (cpu == boot_cpu_id)
			switch_to_new_gdt(cpu);
	}

	/* indicate the early static arrays will soon be gone */
#ifdef CONFIG_X86_LOCAL_APIC
	early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
	early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
#endif
#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
	early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
#endif

#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
	/*
	 * Make sure the boot CPU's numa_node is right even when the boot
	 * CPU sits on a node that has no memory installed.
	 */
	set_cpu_numa_node(boot_cpu_id, early_cpu_to_node(boot_cpu_id));
#endif

	/* Setup node to cpumask map */
	setup_node_to_cpumask_map();

	/* Setup cpu initialized, callin, callout masks */
	setup_cpu_local_masks();
}