setup_percpu.c
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/percpu.h>
#include <linux/kexec.h>
#include <linux/crash_dump.h>
#include <linux/smp.h>
#include <linux/topology.h>
#include <linux/pfn.h>
#include <asm/sections.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/mpspec.h>
#include <asm/apicdef.h>
#include <asm/highmem.h>
#include <asm/proto.h>
#include <asm/cpumask.h>
#include <asm/cpu.h>
#include <asm/stackprotector.h>

DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number);
EXPORT_PER_CPU_SYMBOL(cpu_number);

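/*
 * Offset used by the boot CPU before the real percpu areas exist: the
 * 64-bit kernel links the percpu section at virtual address 0 and loads
 * it at __per_cpu_load, so that load address is the initial offset; on
 * 32-bit the static percpu variables are addressed in place, so 0 works.
 */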
#ifdef CONFIG_X86_64
#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
#else
#define BOOT_PERCPU_OFFSET 0
#endif

DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
EXPORT_PER_CPU_SYMBOL(this_cpu_off);

unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init = {
	[0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
};
EXPORT_SYMBOL(__per_cpu_offset);

/*
 * On x86_64 symbols referenced from code should be reachable using
 * 32bit relocations.  Reserve space for static percpu variables in
 * modules so that they are always served from the first chunk which
 * is located at the percpu segment base.  On x86_32, anything can
 * address anywhere.  No need to reserve space in the first chunk.
 */
#ifdef CONFIG_X86_64
#define PERCPU_FIRST_CHUNK_RESERVE	PERCPU_MODULE_RESERVE
#else
#define PERCPU_FIRST_CHUNK_RESERVE	0
#endif

#ifdef CONFIG_X86_32
/**
 * pcpu_need_numa - determine percpu allocation needs to consider NUMA
 *
 * If NUMA is not configured or there is only one NUMA node available,
 * there is no reason to consider NUMA.  This function determines
 * whether percpu allocation should consider NUMA or not.
 *
 * RETURNS:
 * true if NUMA should be considered; otherwise, false.
 */
static bool __init pcpu_need_numa(void)
{
#ifdef CONFIG_NEED_MULTIPLE_NODES
	pg_data_t *last = NULL;
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		int node = early_cpu_to_node(cpu);

		if (node_online(node) && NODE_DATA(node) &&
		    last && last != NODE_DATA(node))
			return true;

		last = NODE_DATA(node);
	}
#endif
	return false;
}
#endif

/**
 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
 * @cpu: cpu to allocate for
 * @size: size allocation in bytes
 * @align: alignment
 *
 * Allocate @size bytes aligned at @align for cpu @cpu.  This wrapper
 * does the right thing for NUMA regardless of the current
 * configuration.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
					unsigned long align)
{
	const unsigned long goal = __pa(MAX_DMA_ADDRESS);
#ifdef CONFIG_NEED_MULTIPLE_NODES
	int node = early_cpu_to_node(cpu);
	void *ptr;

	if (!node_online(node) || !NODE_DATA(node)) {
		ptr = __alloc_bootmem_nopanic(size, align, goal);
		pr_info("cpu %d has no node %d or node-local memory\n",
			cpu, node);
		pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
			 cpu, size, __pa(ptr));
	} else {
		ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
						   size, align, goal);
		pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
			 cpu, size, node, __pa(ptr));
	}
	return ptr;
#else
	return __alloc_bootmem_nopanic(size, align, goal);
#endif
}

/*
 * Helpers for first chunk memory allocation
 */
static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
{
	return pcpu_alloc_bootmem(cpu, size, align);
}

static void __init pcpu_fc_free(void *ptr, size_t size)
{
	free_bootmem(__pa(ptr), size);
}

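/*
 * Distance callback for the percpu first-chunk allocator: CPUs on the
 * same NUMA node are "local" so their units can be grouped together.
 */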
static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
{
#ifdef CONFIG_NEED_MULTIPLE_NODES
	if (early_cpu_to_node(from) == early_cpu_to_node(to))
		return LOCAL_DISTANCE;
	else
		return REMOTE_DISTANCE;
#else
	return LOCAL_DISTANCE;
#endif
}

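/* Populate a kernel page table entry for the page-mapped first chunk. */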
static void __init pcpup_populate_pte(unsigned long addr)
{
	populate_extra_pte(addr);
}

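/*
 * On 32-bit, percpu variables are reached through the %fs segment, so
 * point this CPU's GDT_ENTRY_PERCPU descriptor at its percpu offset.
 * 64-bit loads the offset into the GS base MSR and needs no descriptor.
 */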
static inline void setup_percpu_segment(int cpu)
{
#ifdef CONFIG_X86_32
	struct desc_struct d = GDT_ENTRY_INIT(0x8092, per_cpu_offset(cpu),
					      0xFFFFF);

	write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &d, DESCTYPE_S);
#endif
}

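/*
 * Allocate the real percpu areas for every possible CPU, switch each
 * CPU's offset to its own area, and copy state needed during early
 * boot (APIC ids, NUMA node) out of the temporary early_per_cpu arrays.
 */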
void __init setup_per_cpu_areas(void)
{
	unsigned int cpu;
	unsigned long delta;
	int rc;

	pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);

	/*
	 * Allocate percpu area.  Embedding allocator is our favorite;
	 * however, on NUMA configurations, it can result in very
	 * sparse unit mapping and vmalloc area isn't spacious enough
	 * on 32bit.  Use page in that case.
	 */
#ifdef CONFIG_X86_32
	if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
		pcpu_chosen_fc = PCPU_FC_PAGE;
#endif
	rc = -EINVAL;
	if (pcpu_chosen_fc != PCPU_FC_PAGE) {
		const size_t dyn_size = PERCPU_MODULE_RESERVE +
			PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;
		size_t atom_size;

		/*
		 * On 64bit, use PMD_SIZE for atom_size so that embedded
		 * percpu areas are aligned to PMD.  This, in the future,
		 * can also allow using PMD mappings in vmalloc area.  Use
		 * PAGE_SIZE on 32bit as vmalloc space is highly contended
		 * and large vmalloc area allocs can easily fail.
		 */
#ifdef CONFIG_X86_64
		atom_size = PMD_SIZE;
#else
		atom_size = PAGE_SIZE;
#endif
		rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
					    dyn_size, atom_size,
					    pcpu_cpu_distance,
					    pcpu_fc_alloc, pcpu_fc_free);
		if (rc < 0)
			pr_warning("%s allocator failed (%d), falling back to page size\n",
				   pcpu_fc_names[pcpu_chosen_fc], rc);
	}
	if (rc < 0)
		rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
					   pcpu_fc_alloc, pcpu_fc_free,
					   pcpup_populate_pte);
	if (rc < 0)
		panic("cannot initialize percpu area (err=%d)", rc);

	/* alrighty, percpu areas up and running */
	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu) {
		per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
		per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
		per_cpu(cpu_number, cpu) = cpu;
		setup_percpu_segment(cpu);
		setup_stack_canary_segment(cpu);
		/*
		 * Copy data used in early init routines from the
		 * initial arrays to the per cpu data areas.  These
		 * arrays then become expendable and the *_early_ptr's
		 * are zeroed indicating that the static arrays are
		 * gone.
		 */
#ifdef CONFIG_X86_LOCAL_APIC
		per_cpu(x86_cpu_to_apicid, cpu) =
			early_per_cpu_map(x86_cpu_to_apicid, cpu);
		per_cpu(x86_bios_cpu_apicid, cpu) =
			early_per_cpu_map(x86_bios_cpu_apicid, cpu);
		per_cpu(x86_cpu_to_acpiid, cpu) =
			early_per_cpu_map(x86_cpu_to_acpiid, cpu);
#endif
#ifdef CONFIG_X86_32
		per_cpu(x86_cpu_to_logical_apicid, cpu) =
			early_per_cpu_map(x86_cpu_to_logical_apicid, cpu);
#endif
#ifdef CONFIG_X86_64
		per_cpu(irq_stack_ptr, cpu) =
			per_cpu(irq_stack_union.irq_stack, cpu) +
			IRQ_STACK_SIZE;
#endif
#ifdef CONFIG_NUMA
		per_cpu(x86_cpu_to_node_map, cpu) =
			early_per_cpu_map(x86_cpu_to_node_map, cpu);
		/*
		 * Ensure that the boot cpu numa_node is correct when the boot
		 * cpu is on a node that doesn't have memory installed.
		 * Also cpu_up() will call cpu_to_node() for APs when
		 * MEMORY_HOTPLUG is defined, before per_cpu(numa_node) is set
		 * up later with c_init aka intel_init/amd_init.
		 * So set them all (boot cpu and all APs).
		 */
		set_cpu_numa_node(cpu, early_cpu_to_node(cpu));
#endif
		/*
		 * Up to this point, the boot CPU has been using .init.data
		 * area.  Reload any changed state for the boot CPU.
		 */
		if (!cpu)
			switch_to_new_gdt(cpu);
	}

	/* indicate the early static arrays will soon be gone */
#ifdef CONFIG_X86_LOCAL_APIC
	early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
	early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
	early_per_cpu_ptr(x86_cpu_to_acpiid) = NULL;
#endif
#ifdef CONFIG_X86_32
	early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL;
#endif
#ifdef CONFIG_NUMA
	early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
#endif

	/* Setup node to cpumask map */
	setup_node_to_cpumask_map();

	/* Setup cpu initialized, callin, callout masks */
	setup_cpu_local_masks();

#ifdef CONFIG_X86_32
	/*
	 * Sync back kernel address range again.  We already did this in
	 * setup_arch(), but percpu data also needs to be available in
	 * the smpboot asm.  We can't reliably pick up percpu mappings
	 * using vmalloc_fault(), because exception dispatch needs
	 * percpu data.
	 */
	clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
			swapper_pg_dir     + KERNEL_PGD_BOUNDARY,
			KERNEL_PGD_PTRS);

	/*
	 * sync back low identity map too.  It is used for example
	 * in the 32-bit EFI stub.
	 */
	clone_pgd_range(initial_page_table,
			swapper_pg_dir     + KERNEL_PGD_BOUNDARY,
			min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
#endif
}