sparse.c 25.5 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
A
Andy Whitcroft 已提交
2 3 4 5
/*
 * sparse memory mappings.
 */
#include <linux/mm.h>
6
#include <linux/slab.h>
A
Andy Whitcroft 已提交
7
#include <linux/mmzone.h>
8
#include <linux/memblock.h>
9
#include <linux/compiler.h>
10
#include <linux/highmem.h>
11
#include <linux/export.h>
12
#include <linux/spinlock.h>
13
#include <linux/vmalloc.h>
14 15
#include <linux/swap.h>
#include <linux/swapops.h>
16

17
#include "internal.h"
A
Andy Whitcroft 已提交
18
#include <asm/dma.h>
19 20
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
A
Andy Whitcroft 已提交
21 22 23 24 25 26

/*
 * Permanent SPARSEMEM data:
 *
 * 1) mem_section	- memory sections, mem_map's for valid memory
 */
27
#ifdef CONFIG_SPARSEMEM_EXTREME
28
struct mem_section **mem_section;
29 30
#else
struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
31
	____cacheline_internodealigned_in_smp;
32 33 34
#endif
EXPORT_SYMBOL(mem_section);

35 36 37 38 39 40 41 42 43 44 45 46
#ifdef NODE_NOT_IN_PAGE_FLAGS
/*
 * If we did not store the node number in the page then we have to
 * do a lookup in the section_to_node_table in order to find which
 * node the page belongs to.
 */
#if MAX_NUMNODES <= 256
static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#else
static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#endif

I
Ian Campbell 已提交
47
int page_to_nid(const struct page *page)
48 49 50 51
{
	return section_to_node_table[page_to_section(page)];
}
EXPORT_SYMBOL(page_to_nid);
52 53 54 55 56 57 58 59 60

/* Store @nid as the node id for section @section_nr in section_to_node_table. */
static void set_section_nid(unsigned long section_nr, int nid)
{
	section_to_node_table[section_nr] = nid;
}
#else /* !NODE_NOT_IN_PAGE_FLAGS */
/*
 * No-op variant: section_to_node_table only exists when
 * NODE_NOT_IN_PAGE_FLAGS is defined, so there is nothing to record here.
 */
static inline void set_section_nid(unsigned long section_nr, int nid)
{
}
61 62
#endif

63
#ifdef CONFIG_SPARSEMEM_EXTREME
64
/*
 * Allocate one zeroed root array of SECTIONS_PER_ROOT mem_section entries
 * on node @nid.
 *
 * Returns the new array. When the slab allocator is available the result
 * may be NULL on failure; on the memblock path a failure panics instead.
 */
static noinline struct mem_section __ref *sparse_index_alloc(int nid)
{
	unsigned long array_size = SECTIONS_PER_ROOT *
				   sizeof(struct mem_section);
	struct mem_section *root;

	/* Slab is up: let the caller deal with an allocation failure. */
	if (slab_is_available())
		return kzalloc_node(array_size, GFP_KERNEL, nid);

	/* No slab yet: allocate from memblock, where failure is fatal. */
	root = memblock_alloc_node(array_size, SMP_CACHE_BYTES, nid);
	if (!root)
		panic("%s: Failed to allocate %lu bytes nid=%d\n",
		      __func__, array_size, nid);

	return root;
}
B
Bob Picco 已提交
82

83
/*
 * Make sure the mem_section root that holds @section_nr is allocated.
 *
 * Returns 0 if the root already existed or was installed now, -ENOMEM if
 * sparse_index_alloc() returned NULL.
 */
static int __meminit sparse_index_init(unsigned long section_nr, int nid)
{
	unsigned long root_nr = SECTION_NR_TO_ROOT(section_nr);

	/*
	 * An existing section is possible in the sub-section hotplug
	 * case. First hot-add instantiates, follow-on hot-add reuses
	 * the existing section.
	 *
	 * The mem_hotplug_lock resolves the apparent race below.
	 */
	if (!mem_section[root_nr]) {
		struct mem_section *new_root = sparse_index_alloc(nid);

		if (!new_root)
			return -ENOMEM;

		mem_section[root_nr] = new_root;
	}

	return 0;
}
#else /* !SPARSEMEM_EXTREME */
/* !SPARSEMEM_EXTREME variant: nothing is allocated here, always returns 0. */
static inline int sparse_index_init(unsigned long section_nr, int nid)
{
	return 0;
}
111 112
#endif

113
#ifdef CONFIG_SPARSEMEM_EXTREME
114
unsigned long __section_nr(struct mem_section *ms)
115 116
{
	unsigned long root_nr;
117
	struct mem_section *root = NULL;
118

119 120
	for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) {
		root = __nr_to_section(root_nr * SECTIONS_PER_ROOT);
121 122 123 124 125 126 127
		if (!root)
			continue;

		if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
		     break;
	}

128
	VM_BUG_ON(!root);
129

130 131
	return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
}
132
#else
133
unsigned long __section_nr(struct mem_section *ms)
134
{
135
	return (unsigned long)(ms - mem_section[0]);
136 137
}
#endif
138

139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154
/*
 * During early boot, before section_mem_map is used for an actual
 * mem_map, we use section_mem_map to store the section's NUMA
 * node.  This keeps us from having to use another data structure.  The
 * node information is cleared just before we store the real mem_map.
 *
 * The node id is widened to unsigned long before shifting so that the
 * shift is performed in the width of section_mem_map rather than in int.
 */
static inline unsigned long sparse_encode_early_nid(int nid)
{
	return ((unsigned long)nid << SECTION_NID_SHIFT);
}

/* Recover the node id that was stored in the upper bits of section_mem_map. */
static inline int sparse_early_nid(struct mem_section *section)
{
	unsigned long encoded = section->section_mem_map;

	return encoded >> SECTION_NID_SHIFT;
}

155 156 157
/*
 * Validate the physical addressing limitations of the model.
 *
 * Clamps *@start_pfn and *@end_pfn to 1UL << (MAX_PHYSMEM_BITS - PAGE_SHIFT),
 * warning once whenever a value had to be reduced.
 */
void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
						unsigned long *end_pfn)
{
	unsigned long pfn_limit = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);

	/*
	 * Sanity checks - do not allow an architecture to pass
	 * in larger pfns than the maximum scope of sparsemem:
	 */
	if (*start_pfn > pfn_limit) {
		/* The whole range is out of scope: collapse it to empty. */
		mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
			"Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
			*start_pfn, *end_pfn, pfn_limit);
		WARN_ON_ONCE(1);
		*start_pfn = pfn_limit;
		*end_pfn = pfn_limit;
		return;
	}

	if (*end_pfn > pfn_limit) {
		/* Only the tail is out of scope: truncate the range. */
		mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
			"End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
			*start_pfn, *end_pfn, pfn_limit);
		WARN_ON_ONCE(1);
		*end_pfn = pfn_limit;
	}
}

181 182 183 184 185 186 187 188 189
/*
 * There are a number of times that we loop over NR_MEM_SECTIONS,
 * looking for section_present() on each.  But, when we have very
 * large physical address spaces, NR_MEM_SECTIONS can also be
 * very large which makes the loops quite long.
 *
 * Keeping track of this gives us an easy way to break out of
 * those loops early.
 */
190
unsigned long __highest_present_section_nr;
191 192
static void section_mark_present(struct mem_section *ms)
{
193
	unsigned long section_nr = __section_nr(ms);
194 195 196 197 198 199 200 201 202

	if (section_nr > __highest_present_section_nr)
		__highest_present_section_nr = section_nr;

	ms->section_mem_map |= SECTION_MARKED_PRESENT;
}

/*
 * Iterate @section_nr over the present sections, beginning with the first
 * present section number >= @start and stopping once
 * next_present_section_nr() returns -1 or the number exceeds
 * __highest_present_section_nr.
 *
 * Both arguments are parenthesised so that arbitrary expressions can be
 * passed without precedence surprises. @start is evaluated once.
 */
#define for_each_present_section_nr(start, section_nr)		\
	for ((section_nr) = next_present_section_nr((start) - 1);	\
	     (((section_nr) != -1) &&				\
	      ((section_nr) <= __highest_present_section_nr));	\
	     (section_nr) = next_present_section_nr(section_nr))

207 208 209 210 211
/*
 * Return the first present section number, obtained by asking
 * next_present_section_nr() for the section following -1.
 * NOTE(review): presumably yields -1 when no section is present, as
 * for_each_present_section_nr() treats -1 as its end marker - confirm.
 */
static inline unsigned long first_present_section_nr(void)
{
	return next_present_section_nr(-1);
}

Y
Yi Wang 已提交
212
/*
 * Set in @map the bits from subsection_map_index(@pfn) through
 * subsection_map_index(@pfn + @nr_pages - 1), inclusive.
 */
static void subsection_mask_set(unsigned long *map, unsigned long pfn,
		unsigned long nr_pages)
{
	int first = subsection_map_index(pfn);
	int last = subsection_map_index(pfn + nr_pages - 1);
	int nbits = last - first + 1;

	bitmap_set(map, first, nbits);
}

/*
 * Mark the subsections backing [@pfn, @pfn + @nr_pages) as populated in
 * the usage map of every section the range touches.
 *
 * The range is processed one section at a time; each step handles the
 * pages from the current pfn up to the end of its section (or the end of
 * the range, whichever comes first). Does nothing when @nr_pages is 0.
 */
void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
{
	unsigned long nr, start_sec, end_sec;

	if (!nr_pages)
		return;

	/*
	 * Section numbers are unsigned long throughout this file; keep the
	 * bounds in that type so the loop comparison is not mixed-sign.
	 */
	start_sec = pfn_to_section_nr(pfn);
	end_sec = pfn_to_section_nr(pfn + nr_pages - 1);

	for (nr = start_sec; nr <= end_sec; nr++) {
		struct mem_section *ms;
		unsigned long pfns;

		/* Pages left in this section, capped by what remains overall. */
		pfns = min(nr_pages, PAGES_PER_SECTION
				- (pfn & ~PAGE_SECTION_MASK));
		ms = __nr_to_section(nr);
		subsection_mask_set(ms->usage->subsection_map, pfn, pfns);

		pr_debug("%s: sec: %lu pfns: %lu set(%d, %d)\n", __func__, nr,
				pfns, subsection_map_index(pfn),
				subsection_map_index(pfn + pfns - 1));

		pfn += pfns;
		nr_pages -= pfns;
	}
}

247 248 249 250
/*
 * Record a memory area against a node.
 *
 * Every section overlapping [@start, @end) gets its root allocated, its
 * node id recorded and, if its section_mem_map is still zero, is
 * initialised with the early node encoding and marked online and present.
 */
void __init memory_present(int nid, unsigned long start, unsigned long end)
{
	unsigned long pfn;

#ifdef CONFIG_SPARSEMEM_EXTREME
	/* First call: allocate the array of root pointers itself. */
	if (unlikely(!mem_section)) {
		unsigned long size, align;

		size = sizeof(struct mem_section*) * NR_SECTION_ROOTS;
		align = 1 << (INTERNODE_CACHE_SHIFT);
		mem_section = memblock_alloc(size, align);
		if (!mem_section)
			panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
			      __func__, size, align);
	}
#endif

	/* Round the start down to a section boundary, then clamp the range. */
	start &= PAGE_SECTION_MASK;
	mminit_validate_memmodel_limits(&start, &end);

	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
		unsigned long section = pfn_to_section_nr(pfn);
		struct mem_section *ms;

		sparse_index_init(section, nid);
		set_section_nid(section, nid);

		ms = __nr_to_section(section);
		/* Sections that already carry state are left untouched. */
		if (ms->section_mem_map)
			continue;

		ms->section_mem_map = sparse_encode_early_nid(nid) |
						SECTION_IS_ONLINE;
		section_mark_present(ms);
	}
}

283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298
/*
 * Mark all memblocks as present using memory_present(). This is a
 * convenience function that is useful for a number of arches
 * to mark all of the system's memory as present during initialization.
 */
void __init memblocks_present(void)
{
	struct memblock_region *reg;

	for_each_memblock(memory, reg) {
		int nid = memblock_get_region_node(reg);
		unsigned long base_pfn = memblock_region_memory_base_pfn(reg);
		unsigned long end_pfn = memblock_region_memory_end_pfn(reg);

		memory_present(nid, base_pfn, end_pfn);
	}
}

A
Andy Whitcroft 已提交
299 300 301 302 303 304 305
/*
 * Subtle, we encode the real pfn into the mem_map such that
 * the identity pfn - section_mem_map will return the actual
 * physical page frame number.
 */
static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
{
306 307 308 309 310
	unsigned long coded_mem_map =
		(unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
	BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT));
	BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
	return coded_mem_map;
A
Andy Whitcroft 已提交
311 312 313
}

/*
314
 * Decode mem_map from the coded memmap
A
Andy Whitcroft 已提交
315 316 317
 */
struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
{
318 319
	/* mask off the extra low bits of information */
	coded_mem_map &= SECTION_MAP_MASK;
A
Andy Whitcroft 已提交
320 321 322
	return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
}

323
/*
 * Install @mem_map and @usage into @ms.
 *
 * The bits covered by SECTION_MAP_MASK are replaced with the encoded
 * @mem_map; SECTION_HAS_MEM_MAP and the caller-supplied @flags are ORed in.
 */
static void __meminit sparse_init_one_section(struct mem_section *ms,
		unsigned long pnum, struct page *mem_map,
		struct mem_section_usage *usage, unsigned long flags)
{
	unsigned long state = ms->section_mem_map & ~SECTION_MAP_MASK;

	state |= sparse_encode_mem_map(mem_map, pnum)
		| SECTION_HAS_MEM_MAP | flags;

	ms->section_mem_map = state;
	ms->usage = usage;
}

333
static unsigned long usemap_size(void)
334
{
335
	return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
336 337
}

338
size_t mem_section_usage_size(void)
339
{
340
	return sizeof(struct mem_section_usage) + usemap_size();
341 342
}

343
#ifdef CONFIG_MEMORY_HOTREMOVE
344
static struct mem_section_usage * __init
345
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
346
					 unsigned long size)
347
{
348
	struct mem_section_usage *usage;
349 350
	unsigned long goal, limit;
	int nid;
351 352 353
	/*
	 * A page may contain usemaps for other sections preventing the
	 * page being freed and making a section unremovable while
L
Li Zhong 已提交
354
	 * other sections referencing the usemap remain active. Similarly,
355 356 357 358 359 360
	 * a pgdat can prevent a section being removed. If section A
	 * contains a pgdat and section B contains the usemap, both
	 * sections become inter-dependent. This allocates usemaps
	 * from the same section as the pgdat where possible to avoid
	 * this problem.
	 */
361
	goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
362 363 364
	limit = goal + (1UL << PA_SECTION_SHIFT);
	nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
365 366
	usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
	if (!usage && limit) {
367 368 369
		limit = 0;
		goto again;
	}
370
	return usage;
371 372
}

373 374
/*
 * Report when the usage map of node @nid was not allocated in the same
 * memory section as the node's pgdat.
 *
 * Nothing is printed when both live in the same section, or when the
 * (usemap section, pgdat section) pair equals the pair reported by the
 * previous call.
 */
static void __init check_usemap_section_nr(int nid,
		struct mem_section_usage *usage)
{
	unsigned long usemap_snr, pgdat_snr;
	static unsigned long old_usemap_snr;
	static unsigned long old_pgdat_snr;
	struct pglist_data *pgdat = NODE_DATA(nid);
	int usemap_nid;

	/* First call */
	if (!old_usemap_snr) {
		old_usemap_snr = NR_MEM_SECTIONS;
		old_pgdat_snr = NR_MEM_SECTIONS;
	}

	usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
	pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
	if (usemap_snr == pgdat_snr)
		return;

	if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr)
		/* skip redundant message */
		return;

	old_usemap_snr = usemap_snr;
	old_pgdat_snr = pgdat_snr;

	usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
	if (usemap_nid != nid) {
		/* Section numbers are unsigned long, hence %lu. */
		pr_info("node %d must be removed before remove section %lu\n",
			nid, usemap_snr);
		return;
	}
	/*
	 * There is a circular dependency.
	 * Some platforms allow un-removable section because they will just
	 * gather other removable sections for dynamic partitioning.
	 * Just notify un-removable section's number here.
	 */
	pr_info("Section %lu and %lu (node %d) have a circular dependency on usemap and pgdat allocations\n",
		usemap_snr, pgdat_snr, nid);
}
#else
416
static struct mem_section_usage * __init
417
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
418
					 unsigned long size)
419
{
420
	return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id);
421 422
}

423 424
/* !CONFIG_MEMORY_HOTREMOVE variant: no placement check is performed. */
static void __init check_usemap_section_nr(int nid,
		struct mem_section_usage *usage)
{
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

429
#ifdef CONFIG_SPARSEMEM_VMEMMAP
430
/* Bytes of struct page needed for one section, rounded up to PMD_SIZE. */
static unsigned long __init section_map_size(void)
{
	unsigned long memmap_bytes = sizeof(struct page) * PAGES_PER_SECTION;

	return ALIGN(memmap_bytes, PMD_SIZE);
}

#else
436
/* Bytes of struct page needed for one section, rounded up to a whole page. */
static unsigned long __init section_map_size(void)
{
	unsigned long memmap_bytes = sizeof(struct page) * PAGES_PER_SECTION;

	return PAGE_ALIGN(memmap_bytes);
}

441 442
/*
 * Obtain the memmap for one section (!CONFIG_SPARSEMEM_VMEMMAP variant).
 *
 * The pre-allocated sparse buffer is tried first; if it cannot satisfy the
 * request, a size-aligned block is taken straight from memblock on @nid,
 * starting the search at __pa(MAX_DMA_ADDRESS). Failure of that fallback
 * is fatal. @pfn, @nr_pages and @altmap are not used by this variant.
 */
struct page __init *__populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
	unsigned long size = section_map_size();
	struct page *map = sparse_buffer_alloc(size);
	phys_addr_t addr = __pa(MAX_DMA_ADDRESS);

	if (map)
		return map;

	map = memblock_alloc_try_nid_raw(size, size, addr,
					  MEMBLOCK_ALLOC_ACCESSIBLE, nid);
	if (!map)
		/* Report the alignment actually requested above (size). */
		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
		      __func__, size, size, nid, &addr);

	return map;
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

461 462 463
static void *sparsemap_buf __meminitdata;
static void *sparsemap_buf_end __meminitdata;

464 465 466 467 468 469
/*
 * Return @size bytes starting at the current sparsemap_buf to memblock.
 * Warns when there is no buffer or @size is zero, but does not bail out:
 * the free call is still issued.
 */
static inline void __meminit sparse_buffer_free(unsigned long size)
{
	WARN_ON(!sparsemap_buf || size == 0);
	memblock_free_early(__pa(sparsemap_buf), size);
}

470
/*
 * Pre-allocate @size bytes on node @nid and publish the region through
 * sparsemap_buf / sparsemap_buf_end. The allocation result is not checked
 * here; sparse_buffer_alloc() returns NULL when sparsemap_buf is NULL.
 */
static void __init sparse_buffer_init(unsigned long size, int nid)
{
	phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
	void *buf;

	WARN_ON(sparsemap_buf);	/* forgot to call sparse_buffer_fini()? */
	/*
	 * Pre-allocated buffer is mainly used by __populate_section_memmap
	 * and we want it to be properly aligned to the section size - this is
	 * especially the case for VMEMMAP which maps memmap to PMDs
	 */
	buf = memblock_alloc_exact_nid_raw(size, section_map_size(),
					addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
	sparsemap_buf = buf;
	sparsemap_buf_end = buf + size;
}

484
/*
 * Give back whatever is left of the pre-allocated buffer and reset
 * sparsemap_buf to NULL.
 */
static void __init sparse_buffer_fini(void)
{
	unsigned long remaining = sparsemap_buf_end - sparsemap_buf;

	if (sparsemap_buf && remaining != 0)
		sparse_buffer_free(remaining);

	sparsemap_buf = NULL;
}

/*
 * Carve @size bytes, aligned to a multiple of @size, out of the
 * pre-allocated sparse buffer.
 *
 * Returns NULL when there is no buffer or the aligned request does not
 * fit. Any gap skipped to reach the alignment is freed.
 */
void * __meminit sparse_buffer_alloc(unsigned long size)
{
	unsigned long gap;
	void *ptr;

	if (!sparsemap_buf)
		return NULL;

	ptr = (void *) roundup((unsigned long)sparsemap_buf, size);
	if (ptr + size > sparsemap_buf_end)
		return NULL;

	/* Free redundant aligned space */
	gap = (unsigned long)(ptr - sparsemap_buf);
	if (gap > 0)
		sparse_buffer_free(gap);

	sparsemap_buf = ptr + size;

	return ptr;
}

511
/* Weak default that does nothing; a non-weak definition overrides it. */
void __weak __meminit vmemmap_populate_print_last(void)
{
}
514

515 516 517 518 519 520 521 522
/*
 * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
 * and the number of present sections in this node is map_count.
 *
 * One usage-map block is allocated for all @map_count sections and handed
 * out slice by slice; the memmaps come from __populate_section_memmap().
 * If either allocation fails, the sections from the failing one up to
 * @pnum_end have their section_mem_map cleared.
 */
static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
				   unsigned long pnum_end,
				   unsigned long map_count)
{
	struct mem_section_usage *usage;
	unsigned long pnum;
	struct page *map;

	usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
			mem_section_usage_size() * map_count);
	if (!usage) {
		pr_err("%s: node[%d] usemap allocation failed\n", __func__, nid);
		goto failed;
	}
	sparse_buffer_init(map_count * section_map_size(), nid);
	for_each_present_section_nr(pnum_begin, pnum) {
		unsigned long pfn = section_nr_to_pfn(pnum);

		if (pnum >= pnum_end)
			break;

		map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
				nid, NULL);
		if (!map) {
			pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.\n",
			       __func__, nid);
			pnum_begin = pnum;
			/*
			 * Release the unused part of the buffer and reset
			 * sparsemap_buf; otherwise it is leaked and the next
			 * sparse_buffer_init() trips its WARN_ON().
			 */
			sparse_buffer_fini();
			goto failed;
		}
		check_usemap_section_nr(nid, usage);
		sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage,
				SECTION_IS_EARLY);
		/* Advance to the next section's slice of the usage block. */
		usage = (void *) usage + mem_section_usage_size();
	}
	sparse_buffer_fini();
	return;
failed:
	/* We failed to allocate, mark all the following pnums as not present */
	for_each_present_section_nr(pnum_begin, pnum) {
		struct mem_section *ms;

		if (pnum >= pnum_end)
			break;
		ms = __nr_to_section(pnum);
		ms->section_mem_map = 0;
	}
}

/*
 * Allocate the accumulated non-linear sections, allocate a mem_map
 * for each and record the physical to section mapping.
 */
571
void __init sparse_init(void)
572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597
{
	unsigned long pnum_begin = first_present_section_nr();
	int nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
	unsigned long pnum_end, map_count = 1;

	/* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
	set_pageblock_order();

	for_each_present_section_nr(pnum_begin + 1, pnum_end) {
		int nid = sparse_early_nid(__nr_to_section(pnum_end));

		if (nid == nid_begin) {
			map_count++;
			continue;
		}
		/* Init node with sections in range [pnum_begin, pnum_end) */
		sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
		nid_begin = nid;
		pnum_begin = pnum_end;
		map_count = 1;
	}
	/* cover the last node */
	sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
	vmemmap_populate_print_last();
}

598
#ifdef CONFIG_MEMORY_HOTPLUG
599 600 601 602 603 604 605

/* Mark all memory sections within the pfn range as online */
void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
606
		unsigned long section_nr = pfn_to_section_nr(pfn);
607 608 609 610 611 612 613 614 615 616 617 618
		struct mem_section *ms;

		/* onlining code should never touch invalid ranges */
		if (WARN_ON(!valid_section_nr(section_nr)))
			continue;

		ms = __nr_to_section(section_nr);
		ms->section_mem_map |= SECTION_IS_ONLINE;
	}
}

#ifdef CONFIG_MEMORY_HOTREMOVE
619
/* Mark all memory sections within the pfn range as offline */
620 621 622 623 624
void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
625
		unsigned long section_nr = pfn_to_section_nr(pfn);
626 627 628 629 630 631 632 633 634 635 636 637 638 639 640
		struct mem_section *ms;

		/*
		 * TODO this needs some double checking. Offlining code makes
		 * sure to check pfn_valid but those checks might be just bogus
		 */
		if (WARN_ON(!valid_section_nr(section_nr)))
			continue;

		ms = __nr_to_section(section_nr);
		ms->section_mem_map &= ~SECTION_IS_ONLINE;
	}
}
#endif

641
#ifdef CONFIG_SPARSEMEM_VMEMMAP
642
/* VMEMMAP variant: simply forwards to __populate_section_memmap(). */
static struct page * __meminit populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
	struct page *memmap = __populate_section_memmap(pfn, nr_pages, nid,
							altmap);

	return memmap;
}
647 648

static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
649
		struct vmem_altmap *altmap)
650
{
651 652
	unsigned long start = (unsigned long) pfn_to_page(pfn);
	unsigned long end = start + nr_pages * sizeof(struct page);
653

654
	vmemmap_free(start, end, altmap);
655
}
656
static void free_map_bootmem(struct page *memmap)
657
{
658
	unsigned long start = (unsigned long)memmap;
659
	unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
660

661
	vmemmap_free(start, end, NULL);
662
}
663
#else
664
/*
 * !VMEMMAP variant: allocate the memmap for one full section on @nid.
 *
 * Always sizes the allocation for PAGES_PER_SECTION struct pages; @pfn,
 * @nr_pages and @altmap are not used. Returns NULL on failure.
 *
 * File-local like the VMEMMAP variant and the other helpers in this group.
 */
static struct page * __meminit populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
	return kvmalloc_node(array_size(sizeof(struct page),
					PAGES_PER_SECTION), GFP_KERNEL, nid);
}

671
/*
 * !VMEMMAP variant: free the memmap that starts at the struct page of
 * @pfn with kvfree(). @nr_pages and @altmap are not used.
 */
static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
		struct vmem_altmap *altmap)
{
	struct page *memmap = pfn_to_page(pfn);

	kvfree(memmap);
}
676

677
static void free_map_bootmem(struct page *memmap)
678 679
{
	unsigned long maps_section_nr, removing_section_nr, i;
680
	unsigned long magic, nr_pages;
681
	struct page *page = virt_to_page(memmap);
682

683 684 685
	nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
		>> PAGE_SHIFT;

686
	for (i = 0; i < nr_pages; i++, page++) {
687
		magic = (unsigned long) page->freelist;
688 689 690 691

		BUG_ON(magic == NODE_INFO);

		maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
692
		removing_section_nr = page_private(page);
693 694 695 696 697 698 699 700 701 702 703 704 705

		/*
		 * When this function is called, the removing section is
		 * logical offlined state. This means all pages are isolated
		 * from page allocator. If removing section's memmap is placed
		 * on the same section, it must not be freed.
		 * If it is freed, page allocator may allocate it which will
		 * be removed physically soon.
		 */
		if (maps_section_nr != removing_section_nr)
			put_page_bootmem(page);
	}
}
706
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
707

708 709 710 711 712 713 714 715
static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
		struct vmem_altmap *altmap)
{
	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
	DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
	struct mem_section *ms = __pfn_to_section(pfn);
	bool section_is_early = early_section(ms);
	struct page *memmap = NULL;
716
	bool empty;
717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746
	unsigned long *subsection_map = ms->usage
		? &ms->usage->subsection_map[0] : NULL;

	subsection_mask_set(map, pfn, nr_pages);
	if (subsection_map)
		bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);

	if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
				"section already deactivated (%#lx + %ld)\n",
				pfn, nr_pages))
		return;

	/*
	 * There are 3 cases to handle across two configurations
	 * (SPARSEMEM_VMEMMAP={y,n}):
	 *
	 * 1/ deactivation of a partial hot-added section (only possible
	 * in the SPARSEMEM_VMEMMAP=y case).
	 *    a/ section was present at memory init
	 *    b/ section was hot-added post memory init
	 * 2/ deactivation of a complete hot-added section
	 * 3/ deactivation of a complete section from memory init
	 *
	 * For 1/, when subsection_map does not empty we will not be
	 * freeing the usage map, but still need to free the vmemmap
	 * range.
	 *
	 * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified
	 */
	bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
747 748
	empty = bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION);
	if (empty) {
749 750
		unsigned long section_nr = pfn_to_section_nr(pfn);

751 752 753 754 755 756 757 758
		/*
		 * When removing an early section, the usage map is kept (as the
		 * usage maps of other sections fall into the same page). It
		 * will be re-used when re-adding the section - which is then no
		 * longer an early section. If the usage map is PageReserved, it
		 * was allocated during boot.
		 */
		if (!PageReserved(virt_to_page(ms->usage))) {
759 760 761 762
			kfree(ms->usage);
			ms->usage = NULL;
		}
		memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
763 764 765 766 767 768
		/*
		 * Mark the section invalid so that valid_section()
		 * return false. This prevents code from dereferencing
		 * ms->usage array.
		 */
		ms->section_mem_map &= ~SECTION_HAS_MEM_MAP;
769 770 771 772 773 774
	}

	if (section_is_early && memmap)
		free_map_bootmem(memmap);
	else
		depopulate_section_memmap(pfn, nr_pages, altmap);
775 776 777

	if (empty)
		ms->section_mem_map = (unsigned long)NULL;
778 779
}

780
static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
781 782
{
	struct mem_section *ms = __pfn_to_section(pfn);
783
	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
784 785 786 787 788 789 790 791 792 793 794 795 796 797 798
	unsigned long *subsection_map;
	int rc = 0;

	subsection_mask_set(map, pfn, nr_pages);

	subsection_map = &ms->usage->subsection_map[0];

	if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
		rc = -EINVAL;
	else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
		rc = -EEXIST;
	else
		bitmap_or(subsection_map, map, subsection_map,
				SUBSECTIONS_PER_SECTION);

799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817
	return rc;
}

/*
 * Activate the subsections covering [@pfn, @pfn + @nr_pages) and make sure
 * a memmap backs them.
 *
 * Returns the memmap for the range, or an ERR_PTR(): -ENOMEM when the
 * usage map or memmap cannot be allocated, or the error reported by
 * fill_subsection_map().
 */
static struct page * __meminit section_activate(int nid, unsigned long pfn,
		unsigned long nr_pages, struct vmem_altmap *altmap)
{
	struct mem_section *ms = __pfn_to_section(pfn);
	struct mem_section_usage *new_usage = NULL;
	struct page *memmap;
	int rc;

	/* Allocate a usage map only if the section does not have one yet. */
	if (!ms->usage) {
		new_usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
		if (!new_usage)
			return ERR_PTR(-ENOMEM);
		ms->usage = new_usage;
	}

	rc = fill_subsection_map(pfn, nr_pages);
	if (rc) {
		/* Undo only what this call installed. */
		if (new_usage) {
			ms->usage = NULL;
			kfree(new_usage);
		}
		return ERR_PTR(rc);
	}

	/*
	 * The early init code does not consider partially populated
	 * initial sections, it simply assumes that memory will never be
	 * referenced.  If we hot-add memory into such a section then we
	 * do not need to populate the memmap and can simply reuse what
	 * is already there.
	 */
	if (nr_pages < PAGES_PER_SECTION && early_section(ms))
		return pfn_to_page(pfn);

	memmap = populate_section_memmap(pfn, nr_pages, nid, altmap);
	if (memmap)
		return memmap;

	section_deactivate(pfn, nr_pages, altmap);
	return ERR_PTR(-ENOMEM);
}

844
/**
845
 * sparse_add_section - add a memory section, or populate an existing one
846 847
 * @nid: The node to add section on
 * @start_pfn: start pfn of the memory range
848
 * @nr_pages: number of pfns to add in the section
849 850 851 852 853 854 855 856
 * @altmap: device page map
 *
 * This is only intended for hotplug.
 *
 * Return:
 * * 0		- On success.
 * * -EEXIST	- Section has been present.
 * * -ENOMEM	- Out of memory.
A
Andy Whitcroft 已提交
857
 */
858 859
int __meminit sparse_add_section(int nid, unsigned long start_pfn,
		unsigned long nr_pages, struct vmem_altmap *altmap)
A
Andy Whitcroft 已提交
860
{
861 862 863 864
	unsigned long section_nr = pfn_to_section_nr(start_pfn);
	struct mem_section *ms;
	struct page *memmap;
	int ret;
A
Andy Whitcroft 已提交
865

866
	ret = sparse_index_init(section_nr, nid);
867
	if (ret < 0)
868
		return ret;
869

870 871 872
	memmap = section_activate(nid, start_pfn, nr_pages, altmap);
	if (IS_ERR(memmap))
		return PTR_ERR(memmap);
873

874 875 876 877
	/*
	 * Poison uninitialized struct pages in order to catch invalid flags
	 * combinations.
	 */
878
	page_init_poison(memmap, sizeof(struct page) * nr_pages);
879

880
	ms = __nr_to_section(section_nr);
881
	set_section_nid(section_nr, nid);
882
	section_mark_present(ms);
883

884 885
	/* Align memmap to section boundary in the subsection case */
	if (section_nr_to_pfn(section_nr) != start_pfn)
886
		memmap = pfn_to_page(section_nr_to_pfn(section_nr));
887 888 889
	sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0);

	return 0;
A
Andy Whitcroft 已提交
890
}
891

892 893 894 895 896
#ifdef CONFIG_MEMORY_FAILURE
static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
{
	int i;

897 898 899 900 901 902 903 904 905
	/*
	 * A further optimization is to have per section refcounted
	 * num_poisoned_pages.  But that would need more space per memmap, so
	 * for now just do a quick global check to speed up this routine in the
	 * absence of bad pages.
	 */
	if (atomic_long_read(&num_poisoned_pages) == 0)
		return;

906
	for (i = 0; i < nr_pages; i++) {
907
		if (PageHWPoison(&memmap[i])) {
908
			num_poisoned_pages_dec();
909 910 911 912 913 914 915 916 917 918
			ClearPageHWPoison(&memmap[i]);
		}
	}
}
#else
/* No-op variant used when CONFIG_MEMORY_FAILURE is not set. */
static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
{
}
#endif

919
/*
 * Remove the range [@pfn, @pfn + @nr_pages) from its section.
 *
 * HWPoison state is cleared on the struct pages from @map_offset onwards
 * before the range is deactivated. @ms is not used by this function.
 */
void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
		unsigned long nr_pages, unsigned long map_offset,
		struct vmem_altmap *altmap)
{
	struct page *first = pfn_to_page(pfn) + map_offset;
	unsigned long count = nr_pages - map_offset;

	clear_hwpoisoned_pages(first, count);
	section_deactivate(pfn, nr_pages, altmap);
}
927
#endif /* CONFIG_MEMORY_HOTPLUG */