percpu-vm.c 10.1 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
/*
 * mm/percpu-vm.c - vmalloc area based chunk allocation
 *
 * Copyright (C) 2010		SUSE Linux Products GmbH
 * Copyright (C) 2010		Tejun Heo <tj@kernel.org>
 *
 * This file is released under the GPLv2.
 *
 * Chunks are mapped into vmalloc areas and populated page by page.
 * This is the default chunk allocator.
 */

static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
				    unsigned int cpu, int page_idx)
{
	/* must not be used on pre-mapped chunk */
	WARN_ON(chunk->immutable);

	return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
}

/**
23
 * pcpu_get_pages - get temp pages array
24
 *
25
 * Returns pointer to array of pointers to struct page which can be indexed
26 27
 * with pcpu_page_idx().  Note that there is only one array and accesses
 * should be serialized by pcpu_alloc_mutex.
28 29
 *
 * RETURNS:
30
 * Pointer to temp pages array on success.
31
 */
32
static struct page **pcpu_get_pages(void)
33 34 35 36
{
	static struct page **pages;
	size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);

37 38 39
	lockdep_assert_held(&pcpu_alloc_mutex);

	if (!pages)
40
		pages = pcpu_mem_zalloc(pages_size);
41 42 43 44 45 46 47 48 49 50 51 52 53 54
	return pages;
}

/**
 * pcpu_free_pages - free pages which were allocated for @chunk
 * @chunk: chunk pages were allocated for
 * @pages: array of pages to be freed, indexed by pcpu_page_idx()
 * @page_start: page index of the first page to be freed
 * @page_end: page index of the last page to be freed + 1
 *
 * Free pages [@page_start and @page_end) in @pages for all units.
 * The pages were allocated for @chunk.
 */
static void pcpu_free_pages(struct pcpu_chunk *chunk,
55
			    struct page **pages, int page_start, int page_end)
56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81
{
	unsigned int cpu;
	int i;

	for_each_possible_cpu(cpu) {
		for (i = page_start; i < page_end; i++) {
			struct page *page = pages[pcpu_page_idx(cpu, i)];

			if (page)
				__free_page(page);
		}
	}
}

/**
 * pcpu_alloc_pages - allocates pages for @chunk
 * @chunk: target chunk
 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
 * @page_start: page index of the first page to be allocated
 * @page_end: page index of the last page to be allocated + 1
 *
 * Allocate pages [@page_start,@page_end) into @pages for all units.
 * The allocation is for @chunk.  Percpu core doesn't care about the
 * content of @pages and will pass it verbatim to pcpu_map_pages().
 */
static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
82
			    struct page **pages, int page_start, int page_end)
83 84
{
	const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
85
	unsigned int cpu, tcpu;
86 87 88 89 90 91 92
	int i;

	for_each_possible_cpu(cpu) {
		for (i = page_start; i < page_end; i++) {
			struct page **pagep = &pages[pcpu_page_idx(cpu, i)];

			*pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
93 94
			if (!*pagep)
				goto err;
95 96 97
		}
	}
	return 0;
98 99 100 101 102 103 104 105 106 107 108 109

err:
	while (--i >= page_start)
		__free_page(pages[pcpu_page_idx(cpu, i)]);

	for_each_possible_cpu(tcpu) {
		if (tcpu == cpu)
			break;
		for (i = page_start; i < page_end; i++)
			__free_page(pages[pcpu_page_idx(tcpu, i)]);
	}
	return -ENOMEM;
110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127
}

/**
 * pcpu_pre_unmap_flush - flush cache prior to unmapping
 * @chunk: chunk the regions to be flushed belongs to
 * @page_start: page index of the first page to be flushed
 * @page_end: page index of the last page to be flushed + 1
 *
 * Pages in [@page_start,@page_end) of @chunk are about to be
 * unmapped.  Flush cache.  As each flushing trial can be very
 * expensive, issue flush on the whole region at once rather than
 * doing it for each cpu.  This could be an overkill but is more
 * scalable.
 */
static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
				 int page_start, int page_end)
{
	flush_cache_vunmap(
T
Tejun Heo 已提交
128 129
		pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
		pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150
}

static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
{
	unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
}

/**
 * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
 * @chunk: chunk of interest
 * @pages: pages array which can be used to pass information to free
 * @page_start: page index of the first page to unmap
 * @page_end: page index of the last page to unmap + 1
 *
 * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
 * Corresponding elements in @pages were cleared by the caller and can
 * be used to carry information to pcpu_free_pages() which will be
 * called after all unmaps are finished.  The caller should call
 * proper pre/post flush functions.
 */
static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
151
			     struct page **pages, int page_start, int page_end)
152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185
{
	unsigned int cpu;
	int i;

	for_each_possible_cpu(cpu) {
		for (i = page_start; i < page_end; i++) {
			struct page *page;

			page = pcpu_chunk_page(chunk, cpu, i);
			WARN_ON(!page);
			pages[pcpu_page_idx(cpu, i)] = page;
		}
		__pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
				   page_end - page_start);
	}
}

/**
 * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
 * @chunk: pcpu_chunk the regions to be flushed belong to
 * @page_start: page index of the first page to be flushed
 * @page_end: page index of the last page to be flushed + 1
 *
 * Pages [@page_start,@page_end) of @chunk have been unmapped.  Flush
 * TLB for the regions.  This can be skipped if the area is to be
 * returned to vmalloc as vmalloc will handle TLB flushing lazily.
 *
 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
 * for the whole region.
 */
static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
				      int page_start, int page_end)
{
	flush_tlb_kernel_range(
T
Tejun Heo 已提交
186 187
		pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
		pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
}

static int __pcpu_map_pages(unsigned long addr, struct page **pages,
			    int nr_pages)
{
	return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
					PAGE_KERNEL, pages);
}

/**
 * pcpu_map_pages - map pages into a pcpu_chunk
 * @chunk: chunk of interest
 * @pages: pages array containing pages to be mapped
 * @page_start: page index of the first page to map
 * @page_end: page index of the last page to map + 1
 *
 * For each cpu, map pages [@page_start,@page_end) into @chunk.  The
 * caller is responsible for calling pcpu_post_map_flush() after all
 * mappings are complete.
 *
208 209
 * This function is responsible for setting up whatever is necessary for
 * reverse lookup (addr -> chunk).
210 211
 */
static int pcpu_map_pages(struct pcpu_chunk *chunk,
212
			  struct page **pages, int page_start, int page_end)
213 214 215 216 217 218 219 220 221 222 223
{
	unsigned int cpu, tcpu;
	int i, err;

	for_each_possible_cpu(cpu) {
		err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
				       &pages[pcpu_page_idx(cpu, page_start)],
				       page_end - page_start);
		if (err < 0)
			goto err;

224
		for (i = page_start; i < page_end; i++)
225 226 227 228 229 230 231 232 233 234 235
			pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
					    chunk);
	}
	return 0;
err:
	for_each_possible_cpu(tcpu) {
		if (tcpu == cpu)
			break;
		__pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
				   page_end - page_start);
	}
236
	pcpu_post_unmap_tlb_flush(chunk, page_start, page_end);
237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255
	return err;
}

/**
 * pcpu_post_map_flush - flush cache after mapping
 * @chunk: pcpu_chunk the regions to be flushed belong to
 * @page_start: page index of the first page to be flushed
 * @page_end: page index of the last page to be flushed + 1
 *
 * Pages [@page_start,@page_end) of @chunk have been mapped.  Flush
 * cache.
 *
 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
 * for the whole region.
 */
static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
				int page_start, int page_end)
{
	flush_cache_vmap(
T
Tejun Heo 已提交
256 257
		pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
		pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
258 259 260 261 262
}

/**
 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
 * @chunk: chunk of interest
263 264
 * @page_start: the start page
 * @page_end: the end page
265 266
 *
 * For each cpu, populate and map pages [@page_start,@page_end) into
267
 * @chunk.
268 269 270 271
 *
 * CONTEXT:
 * pcpu_alloc_mutex, does GFP_KERNEL allocation.
 */
272 273
static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
			       int page_start, int page_end)
274 275 276
{
	struct page **pages;

277
	pages = pcpu_get_pages();
278 279 280
	if (!pages)
		return -ENOMEM;

281 282
	if (pcpu_alloc_pages(chunk, pages, page_start, page_end))
		return -ENOMEM;
283

284 285 286
	if (pcpu_map_pages(chunk, pages, page_start, page_end)) {
		pcpu_free_pages(chunk, pages, page_start, page_end);
		return -ENOMEM;
287 288 289 290 291 292 293 294 295
	}
	pcpu_post_map_flush(chunk, page_start, page_end);

	return 0;
}

/**
 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
 * @chunk: chunk to depopulate
296 297
 * @page_start: the start page
 * @page_end: the end page
298 299
 *
 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
300
 * from @chunk.
301 302 303 304
 *
 * CONTEXT:
 * pcpu_alloc_mutex.
 */
305 306
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
				  int page_start, int page_end)
307 308 309 310 311 312 313 314
{
	struct page **pages;

	/*
	 * If control reaches here, there must have been at least one
	 * successful population attempt so the temp pages array must
	 * be available now.
	 */
315
	pages = pcpu_get_pages();
316 317 318 319 320
	BUG_ON(!pages);

	/* unmap and free */
	pcpu_pre_unmap_flush(chunk, page_start, page_end);

321
	pcpu_unmap_pages(chunk, pages, page_start, page_end);
322 323 324

	/* no need to flush tlb, vmalloc will handle it lazily */

325
	pcpu_free_pages(chunk, pages, page_start, page_end);
326 327 328 329 330 331 332 333 334 335 336 337
}

static struct pcpu_chunk *pcpu_create_chunk(void)
{
	struct pcpu_chunk *chunk;
	struct vm_struct **vms;

	chunk = pcpu_alloc_chunk();
	if (!chunk)
		return NULL;

	vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
338
				pcpu_nr_groups, pcpu_atom_size);
339 340 341 342 343 344 345
	if (!vms) {
		pcpu_free_chunk(chunk);
		return NULL;
	}

	chunk->data = vms;
	chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0];
346 347

	pcpu_stats_chunk_alloc();
348
	trace_percpu_create_chunk(chunk->base_addr);
349

350 351 352 353 354
	return chunk;
}

static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
{
355 356 357
	if (!chunk)
		return;

358
	pcpu_stats_chunk_dealloc();
359
	trace_percpu_destroy_chunk(chunk->base_addr);
360

361
	if (chunk->data)
362 363 364 365 366 367 368 369 370 371 372 373 374 375
		pcpu_free_vm_areas(chunk->data, pcpu_nr_groups);
	pcpu_free_chunk(chunk);
}

static struct page *pcpu_addr_to_page(void *addr)
{
	return vmalloc_to_page(addr);
}

static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
{
	/* no extra restriction */
	return 0;
}