// SPDX-License-Identifier: GPL-2.0
/*
 * sparse memory mappings.
 */
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/mmzone.h>
#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>

#include "internal.h"
#include <asm/dma.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>

/*
 * Permanent SPARSEMEM data:
 *
 * 1) mem_section	- memory sections, mem_map's for valid memory
 */
#ifdef CONFIG_SPARSEMEM_EXTREME
struct mem_section **mem_section;
#else
struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
	____cacheline_internodealigned_in_smp;
#endif
EXPORT_SYMBOL(mem_section);
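
/*
 * Note: with SPARSEMEM_EXTREME the table above is two-level,
 * mem_section[root][offset], and each root is allocated on demand by
 * sparse_index_init() below; without it the whole table is one static
 * two-dimensional array.  Either way a section number nr is looked up
 * (roughly) as mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK],
 * using the helper macros from include/linux/mmzone.h.
 */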

#ifdef NODE_NOT_IN_PAGE_FLAGS
/*
 * If we did not store the node number in the page then we have to
 * do a lookup in the section_to_node_table in order to find which
 * node the page belongs to.
 */
#if MAX_NUMNODES <= 256
static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#else
static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#endif

int page_to_nid(const struct page *page)
{
	return section_to_node_table[page_to_section(page)];
}
EXPORT_SYMBOL(page_to_nid);

static void set_section_nid(unsigned long section_nr, int nid)
{
	section_to_node_table[section_nr] = nid;
}
#else /* !NODE_NOT_IN_PAGE_FLAGS */
static inline void set_section_nid(unsigned long section_nr, int nid)
{
}
#endif

#ifdef CONFIG_SPARSEMEM_EXTREME
static noinline struct mem_section __ref *sparse_index_alloc(int nid)
{
	struct mem_section *section = NULL;
	unsigned long array_size = SECTIONS_PER_ROOT *
				   sizeof(struct mem_section);

	if (slab_is_available()) {
		section = kzalloc_node(array_size, GFP_KERNEL, nid);
	} else {
		section = memblock_alloc_node(array_size, SMP_CACHE_BYTES,
					      nid);
		if (!section)
			panic("%s: Failed to allocate %lu bytes nid=%d\n",
			      __func__, array_size, nid);
	}

	return section;
}

static int __meminit sparse_index_init(unsigned long section_nr, int nid)
{
	unsigned long root = SECTION_NR_TO_ROOT(section_nr);
	struct mem_section *section;

	/*
	 * An existing section is possible in the sub-section hotplug
	 * case. First hot-add instantiates, follow-on hot-add reuses
	 * the existing section.
	 *
	 * The mem_hotplug_lock resolves the apparent race below.
	 */
	if (mem_section[root])
		return 0;

	section = sparse_index_alloc(nid);
	if (!section)
		return -ENOMEM;

	mem_section[root] = section;

	return 0;
}
#else /* !SPARSEMEM_EXTREME */
static inline int sparse_index_init(unsigned long section_nr, int nid)
{
	return 0;
}
#endif

#ifdef CONFIG_SPARSEMEM_EXTREME
unsigned long __section_nr(struct mem_section *ms)
{
	unsigned long root_nr;
	struct mem_section *root = NULL;

	for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) {
		root = __nr_to_section(root_nr * SECTIONS_PER_ROOT);
		if (!root)
			continue;

		if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
		     break;
	}

	VM_BUG_ON(!root);

	return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
}
#else
unsigned long __section_nr(struct mem_section *ms)
{
	return (unsigned long)(ms - mem_section[0]);
}
#endif

/*
 * During early boot, before section_mem_map is used for an actual
 * mem_map, we use section_mem_map to store the section's NUMA
 * node.  This keeps us from having to use another data structure.  The
 * node information is cleared just before we store the real mem_map.
 */
static inline unsigned long sparse_encode_early_nid(int nid)
{
	return (nid << SECTION_NID_SHIFT);
}

static inline int sparse_early_nid(struct mem_section *section)
{
	return (section->section_mem_map >> SECTION_NID_SHIFT);
}
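
/*
 * Worked example (illustrative): for a section that sits on node 2, the
 * early encoding stores 2 << SECTION_NID_SHIFT in section_mem_map and
 * sparse_early_nid() simply shifts the value back down to recover 2.
 * The encoded nid only has to survive until sparse_init_one_section()
 * installs the real mem_map, which overwrites this field.
 */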

/* Validate the physical addressing limitations of the model */
void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
						unsigned long *end_pfn)
{
	unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);

	/*
	 * Sanity checks - do not allow an architecture to pass
	 * in larger pfns than the maximum scope of sparsemem:
	 */
	if (*start_pfn > max_sparsemem_pfn) {
		mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
			"Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
			*start_pfn, *end_pfn, max_sparsemem_pfn);
		WARN_ON_ONCE(1);
		*start_pfn = max_sparsemem_pfn;
		*end_pfn = max_sparsemem_pfn;
	} else if (*end_pfn > max_sparsemem_pfn) {
		mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
			"End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
			*start_pfn, *end_pfn, max_sparsemem_pfn);
		WARN_ON_ONCE(1);
		*end_pfn = max_sparsemem_pfn;
	}
}

/*
 * There are a number of times that we loop over NR_MEM_SECTIONS,
 * looking for section_present() on each.  But, when we have very
 * large physical address spaces, NR_MEM_SECTIONS can also be
 * very large which makes the loops quite long.
 *
 * Keeping track of this gives us an easy way to break out of
 * those loops early.
 */
unsigned long __highest_present_section_nr;
static void section_mark_present(struct mem_section *ms)
{
	unsigned long section_nr = __section_nr(ms);

	if (section_nr > __highest_present_section_nr)
		__highest_present_section_nr = section_nr;

	ms->section_mem_map |= SECTION_MARKED_PRESENT;
}

static inline unsigned long next_present_section_nr(unsigned long section_nr)
{
	do {
		section_nr++;
		if (present_section_nr(section_nr))
			return section_nr;
	} while ((section_nr <= __highest_present_section_nr));

	return -1;
}
#define for_each_present_section_nr(start, section_nr)		\
	for (section_nr = next_present_section_nr(start-1);	\
	     ((section_nr != -1) &&				\
	      (section_nr <= __highest_present_section_nr));	\
	     section_nr = next_present_section_nr(section_nr))

static inline unsigned long first_present_section_nr(void)
{
	return next_present_section_nr(-1);
}

void subsection_mask_set(unsigned long *map, unsigned long pfn,
		unsigned long nr_pages)
{
	int idx = subsection_map_index(pfn);
	int end = subsection_map_index(pfn + nr_pages - 1);

	bitmap_set(map, idx, end - idx + 1);
}
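
/*
 * Illustrative example: if the pfn range covers subsections 3 to 5 of a
 * section, idx == 3 and end == 5, so the call above is equivalent to
 * bitmap_set(map, 3, 3) and bits 3, 4 and 5 end up set in the map.
 */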

void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
{
	int end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
	unsigned long nr, start_sec = pfn_to_section_nr(pfn);

	if (!nr_pages)
		return;

	for (nr = start_sec; nr <= end_sec; nr++) {
		struct mem_section *ms;
		unsigned long pfns;

		pfns = min(nr_pages, PAGES_PER_SECTION
				- (pfn & ~PAGE_SECTION_MASK));
		ms = __nr_to_section(nr);
		subsection_mask_set(ms->usage->subsection_map, pfn, pfns);

		pr_debug("%s: sec: %lu pfns: %lu set(%d, %d)\n", __func__, nr,
				pfns, subsection_map_index(pfn),
				subsection_map_index(pfn + pfns - 1));

		pfn += pfns;
		nr_pages -= pfns;
	}
}

/* Record a memory area against a node. */
void __init memory_present(int nid, unsigned long start, unsigned long end)
{
	unsigned long pfn;

#ifdef CONFIG_SPARSEMEM_EXTREME
	if (unlikely(!mem_section)) {
		unsigned long size, align;

		size = sizeof(struct mem_section*) * NR_SECTION_ROOTS;
		align = 1 << (INTERNODE_CACHE_SHIFT);
		mem_section = memblock_alloc(size, align);
		if (!mem_section)
			panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
			      __func__, size, align);
	}
#endif

	start &= PAGE_SECTION_MASK;
	mminit_validate_memmodel_limits(&start, &end);
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
		unsigned long section = pfn_to_section_nr(pfn);
		struct mem_section *ms;

		sparse_index_init(section, nid);
		set_section_nid(section, nid);

		ms = __nr_to_section(section);
		if (!ms->section_mem_map) {
			ms->section_mem_map = sparse_encode_early_nid(nid) |
							SECTION_IS_ONLINE;
			section_mark_present(ms);
		}
	}
}

/*
 * Mark all memblocks as present using memory_present(). This is a
 * convenience function that is useful for a number of arches
 * to mark all of the system's memory as present during initialization.
 */
void __init memblocks_present(void)
{
	struct memblock_region *reg;

	for_each_memblock(memory, reg) {
		memory_present(memblock_get_region_node(reg),
			       memblock_region_memory_base_pfn(reg),
			       memblock_region_memory_end_pfn(reg));
	}
}

/*
 * Subtle, we encode the real pfn into the mem_map such that
 * the identity pfn - section_mem_map will return the actual
 * physical page frame number.
 */
static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
{
	unsigned long coded_mem_map =
		(unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
	BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT));
	BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
	return coded_mem_map;
}

/*
 * Decode mem_map from the coded memmap
 */
struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
{
	/* mask off the extra low bits of information */
	coded_mem_map &= SECTION_MAP_MASK;
	return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
}
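
/*
 * Illustrative round trip: the encoded value is mem_map biased down by the
 * section's first pfn (pointer arithmetic in units of struct page), and
 * decoding adds that pfn back.  With this bias, coded_mem_map + pfn is the
 * struct page for pfn and page - coded_mem_map is the pfn, which is the
 * "identity" that the comment above sparse_encode_mem_map() refers to.
 */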

static void __meminit sparse_init_one_section(struct mem_section *ms,
		unsigned long pnum, struct page *mem_map,
		struct mem_section_usage *usage, unsigned long flags)
{
	ms->section_mem_map &= ~SECTION_MAP_MASK;
	ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum)
		| SECTION_HAS_MEM_MAP | flags;
	ms->usage = usage;
}

static unsigned long usemap_size(void)
{
	return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
}

size_t mem_section_usage_size(void)
{
	return sizeof(struct mem_section_usage) + usemap_size();
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static struct mem_section_usage * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
					 unsigned long size)
{
	struct mem_section_usage *usage;
	unsigned long goal, limit;
	int nid;
	/*
	 * A page may contain usemaps for other sections preventing the
	 * page being freed and making a section unremovable while
	 * other sections referencing the usemap remain active. Similarly,
	 * a pgdat can prevent a section being removed. If section A
	 * contains a pgdat and section B contains the usemap, both
	 * sections become inter-dependent. This allocates usemaps
	 * from the same section as the pgdat where possible to avoid
	 * this problem.
	 */
	goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
	limit = goal + (1UL << PA_SECTION_SHIFT);
	nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
	usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
	if (!usage && limit) {
		limit = 0;
		goto again;
	}
	return usage;
}

static void __init check_usemap_section_nr(int nid,
		struct mem_section_usage *usage)
{
	unsigned long usemap_snr, pgdat_snr;
	static unsigned long old_usemap_snr;
	static unsigned long old_pgdat_snr;
	struct pglist_data *pgdat = NODE_DATA(nid);
	int usemap_nid;

	/* First call */
	if (!old_usemap_snr) {
		old_usemap_snr = NR_MEM_SECTIONS;
		old_pgdat_snr = NR_MEM_SECTIONS;
	}

	usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
	pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
	if (usemap_snr == pgdat_snr)
		return;

	if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr)
		/* skip redundant message */
		return;

	old_usemap_snr = usemap_snr;
	old_pgdat_snr = pgdat_snr;

	usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
	if (usemap_nid != nid) {
		pr_info("node %d must be removed before removing section %ld\n",
			nid, usemap_snr);
		return;
	}
	/*
	 * There is a circular dependency.
	 * Some platforms allow un-removable sections because they will just
	 * gather other removable sections for dynamic partitioning.
	 * Just report the un-removable section's number here.
	 */
	pr_info("Section %ld and %ld (node %d) have a circular dependency on usemap and pgdat allocations\n",
		usemap_snr, pgdat_snr, nid);
}
#else
static struct mem_section_usage * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
					 unsigned long size)
{
	return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id);
}

static void __init check_usemap_section_nr(int nid,
		struct mem_section_usage *usage)
{
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static unsigned long __init section_map_size(void)
{
	return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE);
}

#else
static unsigned long __init section_map_size(void)
{
	return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
}

struct page __init *__populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
	unsigned long size = section_map_size();
	struct page *map = sparse_buffer_alloc(size);
	phys_addr_t addr = __pa(MAX_DMA_ADDRESS);

	if (map)
		return map;

	map = memblock_alloc_try_nid(size,
					  PAGE_SIZE, addr,
					  MEMBLOCK_ALLOC_ACCESSIBLE, nid);
	if (!map)
		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
		      __func__, size, PAGE_SIZE, nid, &addr);

	return map;
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

static void *sparsemap_buf __meminitdata;
static void *sparsemap_buf_end __meminitdata;

static inline void __meminit sparse_buffer_free(unsigned long size)
{
	WARN_ON(!sparsemap_buf || size == 0);
	memblock_free_early(__pa(sparsemap_buf), size);
}

static void __init sparse_buffer_init(unsigned long size, int nid)
{
	phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
	WARN_ON(sparsemap_buf);	/* forgot to call sparse_buffer_fini()? */
	sparsemap_buf =
		memblock_alloc_try_nid_raw(size, PAGE_SIZE,
						addr,
						MEMBLOCK_ALLOC_ACCESSIBLE, nid);
	sparsemap_buf_end = sparsemap_buf + size;
}

static void __init sparse_buffer_fini(void)
{
	unsigned long size = sparsemap_buf_end - sparsemap_buf;

	if (sparsemap_buf && size > 0)
		sparse_buffer_free(size);
	sparsemap_buf = NULL;
}

void * __meminit sparse_buffer_alloc(unsigned long size)
{
	void *ptr = NULL;

	if (sparsemap_buf) {
		ptr = (void *) roundup((unsigned long)sparsemap_buf, size);
		if (ptr + size > sparsemap_buf_end)
			ptr = NULL;
		else {
			/* Free redundant aligned space */
			if ((unsigned long)(ptr - sparsemap_buf) > 0)
				sparse_buffer_free((unsigned long)(ptr - sparsemap_buf));
			sparsemap_buf = ptr + size;
		}
	}
	return ptr;
}
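
/*
 * Note (illustrative): sparse_buffer_alloc() is a simple bump allocator
 * over the single memblock region reserved by sparse_buffer_init().  The
 * cursor is rounded up to a multiple of the requested size, any skipped
 * alignment gap is handed back to memblock via sparse_buffer_free(), and a
 * NULL return tells callers such as __populate_section_memmap() to fall
 * back to their own memblock allocation.
 */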

void __weak __meminit vmemmap_populate_print_last(void)
{
}

/*
 * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end),
 * and the number of present sections in this node is map_count.
 */
static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
				   unsigned long pnum_end,
				   unsigned long map_count)
{
	struct mem_section_usage *usage;
	unsigned long pnum;
	struct page *map;

	usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
			mem_section_usage_size() * map_count);
	if (!usage) {
		pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
		goto failed;
	}
	sparse_buffer_init(map_count * section_map_size(), nid);
	for_each_present_section_nr(pnum_begin, pnum) {
		unsigned long pfn = section_nr_to_pfn(pnum);

		if (pnum >= pnum_end)
			break;

		map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
				nid, NULL);
		if (!map) {
			pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
			       __func__, nid);
			pnum_begin = pnum;
			goto failed;
		}
		check_usemap_section_nr(nid, usage);
		sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage,
				SECTION_IS_EARLY);
		usage = (void *) usage + mem_section_usage_size();
	}
	sparse_buffer_fini();
	return;
failed:
	/* We failed to allocate, mark all the following pnums as not present */
	for_each_present_section_nr(pnum_begin, pnum) {
		struct mem_section *ms;

		if (pnum >= pnum_end)
			break;
		ms = __nr_to_section(pnum);
		ms->section_mem_map = 0;
	}
}

/*
 * Allocate the accumulated non-linear sections, allocate a mem_map
 * for each and record the physical to section mapping.
 */
void __init sparse_init(void)
{
	unsigned long pnum_begin = first_present_section_nr();
	int nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
	unsigned long pnum_end, map_count = 1;

	/* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
	set_pageblock_order();

	for_each_present_section_nr(pnum_begin + 1, pnum_end) {
		int nid = sparse_early_nid(__nr_to_section(pnum_end));

		if (nid == nid_begin) {
			map_count++;
			continue;
		}
		/* Init node with sections in range [pnum_begin, pnum_end) */
		sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
		nid_begin = nid;
		pnum_begin = pnum_end;
		map_count = 1;
	}
	/* cover the last node */
	sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
	vmemmap_populate_print_last();
}
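
/*
 * Note (illustrative): the loop above batches consecutive present sections
 * that live on the same node, so sparse_init_nid() can make one usemap
 * allocation and one memmap buffer per node instead of one allocation per
 * section.
 */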

#ifdef CONFIG_MEMORY_HOTPLUG

/* Mark all memory sections within the pfn range as online */
void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		unsigned long section_nr = pfn_to_section_nr(pfn);
		struct mem_section *ms;

		/* onlining code should never touch invalid ranges */
		if (WARN_ON(!valid_section_nr(section_nr)))
			continue;

		ms = __nr_to_section(section_nr);
		ms->section_mem_map |= SECTION_IS_ONLINE;
	}
}

#ifdef CONFIG_MEMORY_HOTREMOVE
/* Mark all memory sections within the pfn range as offline */
void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		unsigned long section_nr = pfn_to_section_nr(pfn);
		struct mem_section *ms;

		/*
		 * TODO this needs some double checking. Offlining code makes
		 * sure to check pfn_valid but those checks might be just bogus
		 */
		if (WARN_ON(!valid_section_nr(section_nr)))
			continue;

		ms = __nr_to_section(section_nr);
		ms->section_mem_map &= ~SECTION_IS_ONLINE;
	}
}
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static struct page *populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
	return __populate_section_memmap(pfn, nr_pages, nid, altmap);
}

static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
		struct vmem_altmap *altmap)
{
	unsigned long start = (unsigned long) pfn_to_page(pfn);
	unsigned long end = start + nr_pages * sizeof(struct page);

	vmemmap_free(start, end, altmap);
}
static void free_map_bootmem(struct page *memmap)
{
	unsigned long start = (unsigned long)memmap;
	unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);

	vmemmap_free(start, end, NULL);
}
#else
struct page *populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
	struct page *page, *ret;
	unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION;

	page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
	if (page)
		goto got_map_page;

	ret = vmalloc(memmap_size);
	if (ret)
		goto got_map_ptr;

	return NULL;
got_map_page:
	ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
got_map_ptr:

	return ret;
}
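
/*
 * Note (illustrative): without SPARSEMEM_VMEMMAP a hot-added memmap is an
 * ordinary kernel allocation: physically contiguous pages are tried first
 * (__GFP_NOWARN keeps the expected failure quiet), then vmalloc() is the
 * fallback.  depopulate_section_memmap() below mirrors this by using
 * is_vmalloc_addr() to pick the matching free routine.
 */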

static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
		struct vmem_altmap *altmap)
{
	struct page *memmap = pfn_to_page(pfn);

	if (is_vmalloc_addr(memmap))
		vfree(memmap);
	else
		free_pages((unsigned long)memmap,
			   get_order(sizeof(struct page) * PAGES_PER_SECTION));
}

static void free_map_bootmem(struct page *memmap)
{
	unsigned long maps_section_nr, removing_section_nr, i;
	unsigned long magic, nr_pages;
	struct page *page = virt_to_page(memmap);

	nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
		>> PAGE_SHIFT;

	for (i = 0; i < nr_pages; i++, page++) {
		magic = (unsigned long) page->freelist;

		BUG_ON(magic == NODE_INFO);

		maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
		removing_section_nr = page_private(page);

		/*
		 * When this function is called, the section being removed is in a
		 * logically offlined state, meaning all of its pages are isolated
		 * from the page allocator. If the removed section's memmap is placed
		 * on the same section, it must not be freed: if it were, the page
		 * allocator could hand it out again even though that memory is
		 * about to be removed physically.
		 */
		if (maps_section_nr != removing_section_nr)
			put_page_bootmem(page);
	}
}
#endif /* CONFIG_SPARSEMEM_VMEMMAP */

static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
		struct vmem_altmap *altmap)
{
	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
	DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
	struct mem_section *ms = __pfn_to_section(pfn);
	bool section_is_early = early_section(ms);
	struct page *memmap = NULL;
	unsigned long *subsection_map = ms->usage
		? &ms->usage->subsection_map[0] : NULL;

	subsection_mask_set(map, pfn, nr_pages);
	if (subsection_map)
		bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);

	if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
				"section already deactivated (%#lx + %ld)\n",
				pfn, nr_pages))
		return;

	/*
	 * There are 3 cases to handle across two configurations
	 * (SPARSEMEM_VMEMMAP={y,n}):
	 *
	 * 1/ deactivation of a partial hot-added section (only possible
	 * in the SPARSEMEM_VMEMMAP=y case).
	 *    a/ section was present at memory init
	 *    b/ section was hot-added post memory init
	 * 2/ deactivation of a complete hot-added section
	 * 3/ deactivation of a complete section from memory init
	 *
	 * For 1/, when subsection_map does not empty we will not be
	 * freeing the usage map, but still need to free the vmemmap
	 * range.
	 *
	 * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified
	 */
	bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
	if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) {
		unsigned long section_nr = pfn_to_section_nr(pfn);

		if (!section_is_early) {
			kfree(ms->usage);
			ms->usage = NULL;
		}
		memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
		ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
	}

	if (section_is_early && memmap)
		free_map_bootmem(memmap);
	else
		depopulate_section_memmap(pfn, nr_pages, altmap);
}

static struct page * __meminit section_activate(int nid, unsigned long pfn,
		unsigned long nr_pages, struct vmem_altmap *altmap)
{
	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
	struct mem_section *ms = __pfn_to_section(pfn);
	struct mem_section_usage *usage = NULL;
	unsigned long *subsection_map;
	struct page *memmap;
	int rc = 0;

	subsection_mask_set(map, pfn, nr_pages);

	if (!ms->usage) {
		usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
		if (!usage)
			return ERR_PTR(-ENOMEM);
		ms->usage = usage;
	}
	subsection_map = &ms->usage->subsection_map[0];

	if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
		rc = -EINVAL;
	else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
		rc = -EEXIST;
	else
		bitmap_or(subsection_map, map, subsection_map,
				SUBSECTIONS_PER_SECTION);

	if (rc) {
		if (usage)
			ms->usage = NULL;
		kfree(usage);
		return ERR_PTR(rc);
	}

	/*
	 * The early init code does not consider partially populated
	 * initial sections, it simply assumes that memory will never be
	 * referenced.  If we hot-add memory into such a section then we
	 * do not need to populate the memmap and can simply reuse what
	 * is already there.
	 */
	if (nr_pages < PAGES_PER_SECTION && early_section(ms))
		return pfn_to_page(pfn);

	memmap = populate_section_memmap(pfn, nr_pages, nid, altmap);
	if (!memmap) {
		section_deactivate(pfn, nr_pages, altmap);
		return ERR_PTR(-ENOMEM);
	}

	return memmap;
}
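
/*
 * Note (illustrative): section_activate() and section_deactivate() are
 * symmetric around ms->usage->subsection_map.  Activation ORs the new
 * subsection bits in and rejects overlaps with -EEXIST; deactivation clears
 * them again and, once the bitmap is completely empty, resets the section's
 * encoded mem_map (also freeing ms->usage when the section was not created
 * at early init).  The comment inside section_deactivate() walks through
 * the individual cases.
 */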

/**
 * sparse_add_section - add a memory section, or populate an existing one
 * @nid: The node to add section on
 * @start_pfn: start pfn of the memory range
 * @nr_pages: number of pfns to add in the section
 * @altmap: device page map
 *
 * This is only intended for hotplug.
 *
 * Return:
 * * 0		- On success.
 * * -EEXIST	- Section is already present.
 * * -ENOMEM	- Out of memory.
 */
int __meminit sparse_add_section(int nid, unsigned long start_pfn,
		unsigned long nr_pages, struct vmem_altmap *altmap)
{
	unsigned long section_nr = pfn_to_section_nr(start_pfn);
	struct mem_section *ms;
	struct page *memmap;
	int ret;

	ret = sparse_index_init(section_nr, nid);
	if (ret < 0)
		return ret;

	memmap = section_activate(nid, start_pfn, nr_pages, altmap);
	if (IS_ERR(memmap))
		return PTR_ERR(memmap);

	/*
	 * Poison uninitialized struct pages in order to catch invalid flags
	 * combinations.
	 */
	page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);

	ms = __nr_to_section(section_nr);
	set_section_nid(section_nr, nid);
	section_mark_present(ms);

	/* Align memmap to section boundary in the subsection case */
	if (section_nr_to_pfn(section_nr) != start_pfn)
		memmap = pfn_to_kaddr(section_nr_to_pfn(section_nr));
	sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0);

	return 0;
}
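
/*
 * Note (illustrative): the hotplug path above mirrors early init:
 * sparse_index_init() makes sure the mem_section root exists,
 * section_activate() claims the subsection bits and provides a memmap, and
 * sparse_init_one_section() publishes the section.  The flags argument is 0
 * rather than SECTION_IS_EARLY, so a later section_deactivate() will free
 * the memmap via depopulate_section_memmap() instead of free_map_bootmem().
 */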

#ifdef CONFIG_MEMORY_FAILURE
static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
{
	int i;

	if (!memmap)
		return;

	/*
	 * A further optimization is to have per section refcounted
	 * num_poisoned_pages.  But that would need more space per memmap, so
	 * for now just do a quick global check to speed up this routine in the
	 * absence of bad pages.
	 */
	if (atomic_long_read(&num_poisoned_pages) == 0)
		return;

	for (i = 0; i < nr_pages; i++) {
		if (PageHWPoison(&memmap[i])) {
			atomic_long_sub(1, &num_poisoned_pages);
			ClearPageHWPoison(&memmap[i]);
		}
	}
}
#else
static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
{
}
#endif

void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
		unsigned long nr_pages, unsigned long map_offset,
		struct vmem_altmap *altmap)
{
	clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset,
			nr_pages - map_offset);
	section_deactivate(pfn, nr_pages, altmap);
}
#endif /* CONFIG_MEMORY_HOTPLUG */