#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/percpu.h>
#include <linux/kexec.h>
#include <linux/crash_dump.h>
#include <linux/smp.h>
#include <linux/topology.h>
#include <linux/pfn.h>
#include <asm/sections.h>
#include <asm/processor.h>
#include <asm/setup.h>
#include <asm/mpspec.h>
#include <asm/apicdef.h>
#include <asm/highmem.h>
#include <asm/proto.h>
#include <asm/cpumask.h>
#include <asm/cpu.h>
#include <asm/stackprotector.h>

DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number);
EXPORT_PER_CPU_SYMBOL(cpu_number);

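/*
 * Boot-time per-cpu offset: on 64-bit, per-cpu symbols are zero-based, so
 * the initial offset is the load address of the per-cpu init section; on
 * 32-bit, per-cpu symbols are absolute and no boot-time offset is needed.
 */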
#ifdef CONFIG_X86_64
#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
#else
#define BOOT_PERCPU_OFFSET 0
#endif

DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
EXPORT_PER_CPU_SYMBOL(this_cpu_off);

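/*
 * Offset added to a per-cpu symbol's address to reach a given CPU's copy.
 * Every entry starts out pointing at the boot per-cpu area and is rewritten
 * by setup_per_cpu_areas() once the real areas are allocated.
 */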
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
	[0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
};
EXPORT_SYMBOL(__per_cpu_offset);

/*
 * On x86_64 symbols referenced from code should be reachable using
 * 32bit relocations.  Reserve space for static percpu variables in
 * modules so that they are always served from the first chunk which
 * is located at the percpu segment base.  On x86_32, anything can
 * address anywhere.  No need to reserve space in the first chunk.
 */
#ifdef CONFIG_X86_64
#define PERCPU_FIRST_CHUNK_RESERVE	PERCPU_MODULE_RESERVE
#else
#define PERCPU_FIRST_CHUNK_RESERVE	0
#endif

#ifdef CONFIG_X86_32
/**
 * pcpu_need_numa - determine percpu allocation needs to consider NUMA
 *
 * If NUMA is not configured or there is only one NUMA node available,
 * there is no reason to consider NUMA.  This function determines
 * whether percpu allocation should consider NUMA or not.
 *
 * RETURNS:
 * true if NUMA should be considered; otherwise, false.
 */
static bool __init pcpu_need_numa(void)
{
#ifdef CONFIG_NEED_MULTIPLE_NODES
	pg_data_t *last = NULL;
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		int node = early_cpu_to_node(cpu);

		if (node_online(node) && NODE_DATA(node) &&
		    last && last != NODE_DATA(node))
			return true;

		last = NODE_DATA(node);
	}
#endif
	return false;
}
#endif

/**
 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
 * @cpu: cpu to allocate for
 * @size: size allocation in bytes
 * @align: alignment
 *
 * Allocate @size bytes aligned at @align for cpu @cpu.  This wrapper
 * does the right thing for NUMA regardless of the current
 * configuration.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
					unsigned long align)
{
	const unsigned long goal = __pa(MAX_DMA_ADDRESS);
#ifdef CONFIG_NEED_MULTIPLE_NODES
	int node = early_cpu_to_node(cpu);
	void *ptr;

	if (!node_online(node) || !NODE_DATA(node)) {
		ptr = __alloc_bootmem_nopanic(size, align, goal);
		pr_info("cpu %d has no node %d or node-local memory\n",
			cpu, node);
		pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
			 cpu, size, __pa(ptr));
	} else {
		ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
						   size, align, goal);
		pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
			 cpu, size, node, __pa(ptr));
	}
	return ptr;
#else
	return __alloc_bootmem_nopanic(size, align, goal);
#endif
}

/*
 * Helpers for first chunk memory allocation
 */
static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
{
	return pcpu_alloc_bootmem(cpu, size, align);
}

static void __init pcpu_fc_free(void *ptr, size_t size)
{
	free_bootmem(__pa(ptr), size);
}

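/*
 * Tell the percpu allocator how "far apart" two CPUs are so it can group
 * CPUs on the same node into the same allocation unit group.
 */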
static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
{
#ifdef CONFIG_NEED_MULTIPLE_NODES
	if (early_cpu_to_node(from) == early_cpu_to_node(to))
		return LOCAL_DISTANCE;
	else
		return REMOTE_DISTANCE;
#else
	return LOCAL_DISTANCE;
#endif
}

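/*
 * Used by the page-granular first-chunk allocator to make sure a kernel
 * page table entry exists for each per-cpu page it maps in vmalloc space.
 */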
static void __init pcpup_populate_pte(unsigned long addr)
{
	populate_extra_pte(addr);
}

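/*
 * On 32-bit, per-cpu data is reached through a dedicated GDT segment whose
 * base is this CPU's per-cpu offset; install that descriptor here.  64-bit
 * addresses per-cpu data via the GS base MSR, so there is nothing to do.
 */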
static inline void setup_percpu_segment(int cpu)
{
#ifdef CONFIG_X86_32
	struct desc_struct gdt;

	pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
			0x2 | DESCTYPE_S, 0x8);
	gdt.s = 1;
	write_gdt_entry(get_cpu_gdt_table(cpu),
			GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
#endif
}

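/*
 * Allocate the real per-cpu areas, point every CPU's offset at its own
 * copy, and migrate the early boot-time per-cpu state over to them.
 */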
void __init setup_per_cpu_areas(void)
{
	unsigned int cpu;
	unsigned long delta;
	int rc;

	pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);

	/*
	 * Allocate percpu area.  Embedding allocator is our favorite;
	 * however, on NUMA configurations, it can result in very
	 * sparse unit mapping and vmalloc area isn't spacious enough
	 * on 32bit.  Use page in that case.
	 */
#ifdef CONFIG_X86_32
	if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
		pcpu_chosen_fc = PCPU_FC_PAGE;
#endif
	rc = -EINVAL;
	if (pcpu_chosen_fc != PCPU_FC_PAGE) {
		const size_t dyn_size = PERCPU_MODULE_RESERVE +
			PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
		size_t atom_size;

		/*
		 * On 64bit, use PMD_SIZE for atom_size so that embedded
		 * percpu areas are aligned to PMD.  This, in the future,
		 * can also allow using PMD mappings in vmalloc area.  Use
		 * PAGE_SIZE on 32bit as vmalloc space is highly contended
		 * and large vmalloc area allocs can easily fail.
		 */
#ifdef CONFIG_X86_64
		atom_size = PMD_SIZE;
#else
		atom_size = PAGE_SIZE;
#endif
		rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
					    dyn_size, atom_size,
					    pcpu_cpu_distance,
					    pcpu_fc_alloc, pcpu_fc_free);
		if (rc < 0)
			pr_warning("%s allocator failed (%d), falling back to page size\n",
				   pcpu_fc_names[pcpu_chosen_fc], rc);
	}
	if (rc < 0)
		rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
					   pcpu_fc_alloc, pcpu_fc_free,
					   pcpup_populate_pte);
	if (rc < 0)
		panic("cannot initialize percpu area (err=%d)", rc);

	/* alrighty, percpu areas up and running */
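	/*
	 * delta is how far the allocator placed the first chunk from the
	 * statically linked per-cpu section; adding each CPU's unit offset
	 * to it yields that CPU's final per-cpu offset.
	 */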
	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu) {
		per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
		per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
		per_cpu(cpu_number, cpu) = cpu;
		setup_percpu_segment(cpu);
		setup_stack_canary_segment(cpu);
		/*
		 * Copy data used in early init routines from the
		 * initial arrays to the per cpu data areas.  These
		 * arrays then become expendable and the *_early_ptr's
		 * are zeroed indicating that the static arrays are
		 * gone.
		 */
#ifdef CONFIG_X86_LOCAL_APIC
		per_cpu(x86_cpu_to_apicid, cpu) =
			early_per_cpu_map(x86_cpu_to_apicid, cpu);
		per_cpu(x86_bios_cpu_apicid, cpu) =
			early_per_cpu_map(x86_bios_cpu_apicid, cpu);
		per_cpu(x86_cpu_to_acpiid, cpu) =
			early_per_cpu_map(x86_cpu_to_acpiid, cpu);
#endif
#ifdef CONFIG_X86_32
		per_cpu(x86_cpu_to_logical_apicid, cpu) =
			early_per_cpu_map(x86_cpu_to_logical_apicid, cpu);
#endif
#ifdef CONFIG_X86_64
		per_cpu(irq_stack_ptr, cpu) =
			per_cpu(irq_stack_union.irq_stack, cpu) +
			IRQ_STACK_SIZE - 64;
#endif
#ifdef CONFIG_NUMA
		per_cpu(x86_cpu_to_node_map, cpu) =
			early_per_cpu_map(x86_cpu_to_node_map, cpu);
		/*
		 * Ensure that the boot cpu numa_node is correct when the boot
		 * cpu is on a node that doesn't have memory installed.
		 * Also cpu_up() will call cpu_to_node() for APs when
		 * MEMORY_HOTPLUG is defined, before per_cpu(numa_node) is set
		 * up later with c_init aka intel_init/amd_init.
		 * So set them all (boot cpu and all APs).
		 */
		set_cpu_numa_node(cpu, early_cpu_to_node(cpu));
#endif
		/*
		 * Up to this point, the boot CPU has been using .init.data
		 * area.  Reload any changed state for the boot CPU.
		 */
		if (!cpu)
			switch_to_new_gdt(cpu);
	}

	/* indicate the early static arrays will soon be gone */
#ifdef CONFIG_X86_LOCAL_APIC
	early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
	early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
	early_per_cpu_ptr(x86_cpu_to_acpiid) = NULL;
#endif
#ifdef CONFIG_X86_32
	early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL;
#endif
#ifdef CONFIG_NUMA
	early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
#endif

	/* Setup node to cpumask map */
	setup_node_to_cpumask_map();

	/* Setup cpu initialized, callin, callout masks */
	setup_cpu_local_masks();
}