#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/percpu.h>
#include <linux/kexec.h>
#include <linux/crash_dump.h>
#include <linux/smp.h>
#include <linux/topology.h>
#include <linux/pfn.h>
#include <asm/sections.h>
#include <asm/processor.h>
#include <asm/setup.h>
#include <asm/mpspec.h>
#include <asm/apicdef.h>
#include <asm/highmem.h>
#include <asm/proto.h>
#include <asm/cpumask.h>
#include <asm/cpu.h>
#include <asm/stackprotector.h>

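/* Logical number of each CPU, set for every possible CPU in setup_per_cpu_areas(). */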
DEFINE_PER_CPU(int, cpu_number);
EXPORT_PER_CPU_SYMBOL(cpu_number);

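/*
 * Initial per-cpu offset used by the boot CPU before setup_per_cpu_areas()
 * runs.  On x86_64 the per-cpu symbols are zero-based, so the boot-time
 * offset is the load address of the initial .data..percpu section
 * (__per_cpu_load); on x86_32 the symbols are linked at their final
 * addresses, so the initial offset is 0.
 */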
#ifdef CONFIG_X86_64
#define BOOT_PERCPU_OFFSET ((unsigned long)__per_cpu_load)
#else
#define BOOT_PERCPU_OFFSET 0
#endif

DEFINE_PER_CPU(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET;
EXPORT_PER_CPU_SYMBOL(this_cpu_off);

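/*
 * Per-cpu offsets consumed by per_cpu()/per_cpu_offset(); every slot starts
 * at the boot value and is rewritten for each possible CPU in
 * setup_per_cpu_areas() below.
 */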
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
	[0 ... NR_CPUS-1] = BOOT_PERCPU_OFFSET,
};
EXPORT_SYMBOL(__per_cpu_offset);

/*
 * On x86_64 symbols referenced from code should be reachable using
 * 32bit relocations.  Reserve space for static percpu variables in
 * modules so that they are always served from the first chunk which
 * is located at the percpu segment base.  On x86_32, anything can
 * address anywhere.  No need to reserve space in the first chunk.
 */
#ifdef CONFIG_X86_64
#define PERCPU_FIRST_CHUNK_RESERVE	PERCPU_MODULE_RESERVE
#else
#define PERCPU_FIRST_CHUNK_RESERVE	0
#endif

#ifdef CONFIG_X86_32
/**
 * pcpu_need_numa - determine whether percpu allocation needs to consider NUMA
 *
 * If NUMA is not configured or there is only one NUMA node available,
 * there is no reason to consider NUMA.  This function determines
 * whether percpu allocation should consider NUMA or not.
 *
 * RETURNS:
 * true if NUMA should be considered; otherwise, false.
 */
static bool __init pcpu_need_numa(void)
{
#ifdef CONFIG_NEED_MULTIPLE_NODES
	pg_data_t *last = NULL;
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		int node = early_cpu_to_node(cpu);

		if (node_online(node) && NODE_DATA(node) &&
		    last && last != NODE_DATA(node))
			return true;

		last = NODE_DATA(node);
	}
#endif
	return false;
}
#endif

/**
 * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
 * @cpu: cpu to allocate for
 * @size: size of the allocation in bytes
 * @align: alignment
 *
 * Allocate @size bytes aligned at @align for cpu @cpu.  This wrapper
 * does the right thing for NUMA regardless of the current
 * configuration.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
					unsigned long align)
{
	const unsigned long goal = __pa(MAX_DMA_ADDRESS);
#ifdef CONFIG_NEED_MULTIPLE_NODES
	int node = early_cpu_to_node(cpu);
	void *ptr;

	if (!node_online(node) || !NODE_DATA(node)) {
		ptr = __alloc_bootmem_nopanic(size, align, goal);
		pr_info("cpu %d has no node %d or node-local memory\n",
			cpu, node);
		pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
			 cpu, size, __pa(ptr));
	} else {
		ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
						   size, align, goal);
		pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
			 cpu, size, node, __pa(ptr));
	}
	return ptr;
#else
	return __alloc_bootmem_nopanic(size, align, goal);
#endif
}

/*
 * Helpers for first chunk memory allocation
 */
static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
{
	return pcpu_alloc_bootmem(cpu, size, align);
}

static void __init pcpu_fc_free(void *ptr, size_t size)
{
	free_bootmem(__pa(ptr), size);
}

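/*
 * Distance callback for the embedding first-chunk allocator: CPUs on the
 * same NUMA node are considered local so their units can be grouped.
 */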
static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
{
#ifdef CONFIG_NEED_MULTIPLE_NODES
	if (early_cpu_to_node(from) == early_cpu_to_node(to))
		return LOCAL_DISTANCE;
	else
		return REMOTE_DISTANCE;
#else
	return LOCAL_DISTANCE;
#endif
}

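/*
 * PTE-populate callback for the page-mapped first chunk: make sure the
 * kernel page tables cover @addr before a per-cpu page is mapped there.
 */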
static void __init pcpup_populate_pte(unsigned long addr)
{
	populate_extra_pte(addr);
}

static inline void setup_percpu_segment(int cpu)
{
#ifdef CONFIG_X86_32
	struct desc_struct gdt;

	pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF,
			0x2 | DESCTYPE_S, 0x8);
	gdt.s = 1;
	write_gdt_entry(get_cpu_gdt_table(cpu),
			GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
#endif
}

void __init setup_per_cpu_areas(void)
{
	unsigned int cpu;
	unsigned long delta;
	int rc;

	pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
		NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);

	/*
	 * Allocate the percpu area.  The embedding allocator is our favorite;
	 * however, on NUMA configurations it can result in a very sparse
	 * unit mapping, and the vmalloc area isn't spacious enough on 32-bit.
	 * Use the page allocator in that case.
	 */
#ifdef CONFIG_X86_32
	if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
		pcpu_chosen_fc = PCPU_FC_PAGE;
#endif
	rc = -EINVAL;
	if (pcpu_chosen_fc != PCPU_FC_PAGE) {
		const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
		const size_t dyn_size = PERCPU_MODULE_RESERVE +
			PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;

		rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
					    dyn_size, atom_size,
					    pcpu_cpu_distance,
					    pcpu_fc_alloc, pcpu_fc_free);
		if (rc < 0)
			pr_warning("%s allocator failed (%d), falling back to page size\n",
				   pcpu_fc_names[pcpu_chosen_fc], rc);
	}
	if (rc < 0)
		rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
					   pcpu_fc_alloc, pcpu_fc_free,
					   pcpup_populate_pte);
	if (rc < 0)
		panic("cannot initialize percpu area (err=%d)", rc);

	/* alrighty, percpu areas up and running */
	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu) {
		per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
		per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
		per_cpu(cpu_number, cpu) = cpu;
		setup_percpu_segment(cpu);
		setup_stack_canary_segment(cpu);
		/*
		 * Copy data used in early init routines from the
		 * initial arrays to the per cpu data areas.  These
		 * arrays then become expendable and the *_early_ptr's
		 * are zeroed indicating that the static arrays are
		 * gone.
		 */
#ifdef CONFIG_X86_LOCAL_APIC
		per_cpu(x86_cpu_to_apicid, cpu) =
			early_per_cpu_map(x86_cpu_to_apicid, cpu);
		per_cpu(x86_bios_cpu_apicid, cpu) =
			early_per_cpu_map(x86_bios_cpu_apicid, cpu);
#endif
#ifdef CONFIG_X86_64
		per_cpu(irq_stack_ptr, cpu) =
			per_cpu(irq_stack_union.irq_stack, cpu) +
			IRQ_STACK_SIZE - 64;
#ifdef CONFIG_NUMA
		per_cpu(x86_cpu_to_node_map, cpu) =
			early_per_cpu_map(x86_cpu_to_node_map, cpu);
		/*
		 * Ensure that the boot cpu numa_node is correct when the boot
		 * cpu is on a node that doesn't have memory installed.
		 * Also, cpu_up() will call cpu_to_node() for APs when
		 * MEMORY_HOTPLUG is defined, before per_cpu(numa_node) is set
		 * up later by c_init (intel_init()/amd_init()).
		 * So set them all (boot cpu and all APs).
		 */
		set_cpu_numa_node(cpu, early_cpu_to_node(cpu));
#endif
#endif
		/*
		 * Up to this point, the boot CPU has been using .init.data
		 * area.  Reload any changed state for the boot CPU.
		 */
		if (!cpu)
			switch_to_new_gdt(cpu);
	}

	/* indicate the early static arrays will soon be gone */
#ifdef CONFIG_X86_LOCAL_APIC
	early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
	early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
#endif
#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
	early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
#endif

	/* Setup node to cpumask map */
	setup_node_to_cpumask_map();

	/* Setup cpu initialized, callin, callout masks */
	setup_cpu_local_masks();
}