// SPDX-License-Identifier: GPL-2.0
/*
 * sparse memory mappings.
 */
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/mmzone.h>
#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>

#include "internal.h"
#include <asm/dma.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>

/*
 * Permanent SPARSEMEM data:
 *
 * 1) mem_section	- memory sections, mem_map's for valid memory
 */
#ifdef CONFIG_SPARSEMEM_EXTREME
struct mem_section **mem_section;
#else
struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
	____cacheline_internodealigned_in_smp;
#endif
EXPORT_SYMBOL(mem_section);

#ifdef NODE_NOT_IN_PAGE_FLAGS
/*
 * If we did not store the node number in the page then we have to
 * do a lookup in the section_to_node_table in order to find which
 * node the page belongs to.
 */
#if MAX_NUMNODES <= 256
static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#else
static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#endif

int page_to_nid(const struct page *page)
{
	return section_to_node_table[page_to_section(page)];
}
EXPORT_SYMBOL(page_to_nid);

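/*
 * Record which node a section belongs to, so page_to_nid() can look it up
 * when the node is not stored in page->flags.
 */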
static void set_section_nid(unsigned long section_nr, int nid)
{
	section_to_node_table[section_nr] = nid;
}
#else /* !NODE_NOT_IN_PAGE_FLAGS */
static inline void set_section_nid(unsigned long section_nr, int nid)
{
}
#endif

#ifdef CONFIG_SPARSEMEM_EXTREME
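/*
 * Allocate one root's worth of mem_section entries for @nid, from the slab
 * allocator once it is available or from memblock during early boot.
 */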
static noinline struct mem_section __ref *sparse_index_alloc(int nid)
{
	struct mem_section *section = NULL;
	unsigned long array_size = SECTIONS_PER_ROOT *
				   sizeof(struct mem_section);

	if (slab_is_available()) {
		section = kzalloc_node(array_size, GFP_KERNEL, nid);
	} else {
		section = memblock_alloc_node(array_size, SMP_CACHE_BYTES,
					      nid);
		if (!section)
			panic("%s: Failed to allocate %lu bytes nid=%d\n",
			      __func__, array_size, nid);
	}

	return section;
}

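/*
 * Make sure the root array entry covering @section_nr exists, allocating it
 * on @nid if necessary.
 */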
static int __meminit sparse_index_init(unsigned long section_nr, int nid)
{
	unsigned long root = SECTION_NR_TO_ROOT(section_nr);
	struct mem_section *section;

	if (mem_section[root])
		return -EEXIST;

	section = sparse_index_alloc(nid);
	if (!section)
		return -ENOMEM;

	mem_section[root] = section;

	return 0;
}
#else /* !SPARSEMEM_EXTREME */
static inline int sparse_index_init(unsigned long section_nr, int nid)
{
	return 0;
}
#endif

#ifdef CONFIG_SPARSEMEM_EXTREME
unsigned long __section_nr(struct mem_section *ms)
{
	unsigned long root_nr;
	struct mem_section *root = NULL;

	for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) {
		root = __nr_to_section(root_nr * SECTIONS_PER_ROOT);
		if (!root)
			continue;

		if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
			break;
	}

	VM_BUG_ON(!root);

	return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
}
#else
unsigned long __section_nr(struct mem_section *ms)
{
	return (unsigned long)(ms - mem_section[0]);
}
#endif

/*
 * During early boot, before section_mem_map is used for an actual
 * mem_map, we use section_mem_map to store the section's NUMA
 * node.  This keeps us from having to use another data structure.  The
 * node information is cleared just before we store the real mem_map.
 */
static inline unsigned long sparse_encode_early_nid(int nid)
{
	return (nid << SECTION_NID_SHIFT);
}

static inline int sparse_early_nid(struct mem_section *section)
{
	return (section->section_mem_map >> SECTION_NID_SHIFT);
}

/* Validate the physical addressing limitations of the model */
void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
						unsigned long *end_pfn)
{
	unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);

	/*
	 * Sanity checks - do not allow an architecture to pass
	 * in larger pfns than the maximum scope of sparsemem:
	 */
	if (*start_pfn > max_sparsemem_pfn) {
		mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
			"Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
			*start_pfn, *end_pfn, max_sparsemem_pfn);
		WARN_ON_ONCE(1);
		*start_pfn = max_sparsemem_pfn;
		*end_pfn = max_sparsemem_pfn;
	} else if (*end_pfn > max_sparsemem_pfn) {
		mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
			"End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
			*start_pfn, *end_pfn, max_sparsemem_pfn);
		WARN_ON_ONCE(1);
		*end_pfn = max_sparsemem_pfn;
	}
}

/*
 * There are a number of times that we loop over NR_MEM_SECTIONS,
 * looking for section_present() on each.  But, when we have very
 * large physical address spaces, NR_MEM_SECTIONS can also be
 * very large which makes the loops quite long.
 *
 * Keeping track of this gives us an easy way to break out of
 * those loops early.
 */
unsigned long __highest_present_section_nr;
static void section_mark_present(struct mem_section *ms)
{
	unsigned long section_nr = __section_nr(ms);

	if (section_nr > __highest_present_section_nr)
		__highest_present_section_nr = section_nr;

	ms->section_mem_map |= SECTION_MARKED_PRESENT;
}

static inline unsigned long next_present_section_nr(unsigned long section_nr)
{
	do {
		section_nr++;
		if (present_section_nr(section_nr))
			return section_nr;
	} while (section_nr <= __highest_present_section_nr);

	return -1;
}
#define for_each_present_section_nr(start, section_nr)		\
	for (section_nr = next_present_section_nr(start-1);	\
	     ((section_nr != -1) &&				\
	      (section_nr <= __highest_present_section_nr));	\
	     section_nr = next_present_section_nr(section_nr))

static inline unsigned long first_present_section_nr(void)
{
	return next_present_section_nr(-1);
}

void subsection_mask_set(unsigned long *map, unsigned long pfn,
		unsigned long nr_pages)
{
	int idx = subsection_map_index(pfn);
	int end = subsection_map_index(pfn + nr_pages - 1);

	bitmap_set(map, idx, end - idx + 1);
}

void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
{
	int end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
	int i, start_sec = pfn_to_section_nr(pfn);

	if (!nr_pages)
		return;

	for (i = start_sec; i <= end_sec; i++) {
		struct mem_section *ms;
		unsigned long pfns;

		pfns = min(nr_pages, PAGES_PER_SECTION
				- (pfn & ~PAGE_SECTION_MASK));
		ms = __nr_to_section(i);
		subsection_mask_set(ms->usage->subsection_map, pfn, pfns);

		pr_debug("%s: sec: %d pfns: %ld set(%d, %d)\n", __func__, i,
				pfns, subsection_map_index(pfn),
				subsection_map_index(pfn + pfns - 1));

		pfn += pfns;
		nr_pages -= pfns;
	}
}

/* Record a memory area against a node. */
void __init memory_present(int nid, unsigned long start, unsigned long end)
{
	unsigned long pfn;

#ifdef CONFIG_SPARSEMEM_EXTREME
	if (unlikely(!mem_section)) {
		unsigned long size, align;

		size = sizeof(struct mem_section*) * NR_SECTION_ROOTS;
		align = 1 << (INTERNODE_CACHE_SHIFT);
		mem_section = memblock_alloc(size, align);
		if (!mem_section)
			panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
			      __func__, size, align);
	}
#endif

	start &= PAGE_SECTION_MASK;
	mminit_validate_memmodel_limits(&start, &end);
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
		unsigned long section = pfn_to_section_nr(pfn);
		struct mem_section *ms;

		sparse_index_init(section, nid);
		set_section_nid(section, nid);

		ms = __nr_to_section(section);
		if (!ms->section_mem_map) {
			ms->section_mem_map = sparse_encode_early_nid(nid) |
							SECTION_IS_ONLINE;
			section_mark_present(ms);
		}
	}
}

/*
 * Mark all memblocks as present using memory_present(). This is a
 * convenience function that is useful for a number of arches
 * to mark all of the system's memory as present during initialization.
 */
void __init memblocks_present(void)
{
	struct memblock_region *reg;

	for_each_memblock(memory, reg) {
		memory_present(memblock_get_region_node(reg),
			       memblock_region_memory_base_pfn(reg),
			       memblock_region_memory_end_pfn(reg));
	}
}

/*
 * Subtle: we encode the real pfn into the mem_map such that
 * the identity pfn - section_mem_map will return the actual
 * physical page frame number.
 */
static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
{
	unsigned long coded_mem_map =
		(unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
	BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT));
	BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
	return coded_mem_map;
}

/*
 * Decode mem_map from the coded memmap
 */
struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
{
	/* mask off the extra low bits of information */
	coded_mem_map &= SECTION_MAP_MASK;
	return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
}

static void __meminit sparse_init_one_section(struct mem_section *ms,
		unsigned long pnum, struct page *mem_map,
		struct mem_section_usage *usage, unsigned long flags)
{
	ms->section_mem_map &= ~SECTION_MAP_MASK;
	ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum)
		| SECTION_HAS_MEM_MAP | flags;
	ms->usage = usage;
}

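/* Size, in bytes, of one section's pageblock flags bitmap. */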
static unsigned long usemap_size(void)
{
	return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
}

size_t mem_section_usage_size(void)
{
	return sizeof(struct mem_section_usage) + usemap_size();
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static struct mem_section_usage * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
					 unsigned long size)
{
	struct mem_section_usage *usage;
	unsigned long goal, limit;
	int nid;
	/*
	 * A page may contain usemaps for other sections, preventing the
	 * page from being freed and making a section unremovable while
	 * other sections referencing the usemap remain active. Similarly,
	 * a pgdat can prevent a section being removed. If section A
	 * contains a pgdat and section B contains the usemap, both
	 * sections become inter-dependent. This allocates usemaps
	 * from the same section as the pgdat where possible to avoid
	 * this problem.
	 */
	goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
	limit = goal + (1UL << PA_SECTION_SHIFT);
	nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
	usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
	if (!usage && limit) {
		limit = 0;
		goto again;
	}
	return usage;
}

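/*
 * Report (once per combination) when a node's usemap and pgdat end up in
 * different sections, i.e. the cross-section dependency described in
 * sparse_early_usemaps_alloc_pgdat_section() could not be avoided.
 */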
static void __init check_usemap_section_nr(int nid,
		struct mem_section_usage *usage)
{
	unsigned long usemap_snr, pgdat_snr;
	static unsigned long old_usemap_snr;
	static unsigned long old_pgdat_snr;
	struct pglist_data *pgdat = NODE_DATA(nid);
	int usemap_nid;

	/* First call */
	if (!old_usemap_snr) {
		old_usemap_snr = NR_MEM_SECTIONS;
		old_pgdat_snr = NR_MEM_SECTIONS;
	}

	usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
	pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
	if (usemap_snr == pgdat_snr)
		return;

	if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr)
		/* skip redundant message */
		return;

	old_usemap_snr = usemap_snr;
	old_pgdat_snr = pgdat_snr;

	usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
	if (usemap_nid != nid) {
		pr_info("node %d must be removed before removing section %ld\n",
			nid, usemap_snr);
		return;
	}
	/*
	 * There is a circular dependency.
	 * Some platforms allow un-removable sections because they will just
	 * gather other removable sections for dynamic partitioning.
	 * Just report the un-removable section's number here.
	 */
	pr_info("Section %ld and %ld (node %d) have a circular dependency on usemap and pgdat allocations\n",
		usemap_snr, pgdat_snr, nid);
}
#else
static struct mem_section_usage * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
					 unsigned long size)
{
	return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id);
}

static void __init check_usemap_section_nr(int nid,
		struct mem_section_usage *usage)
{
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

#ifdef CONFIG_SPARSEMEM_VMEMMAP
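/*
 * With vmemmap, round each section's memmap up to PMD_SIZE so it can be
 * backed by huge mappings where the architecture supports them.
 */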
static unsigned long __init section_map_size(void)
{
	return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE);
}

#else
static unsigned long __init section_map_size(void)
{
	return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
}

struct page __init *__populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
	unsigned long size = section_map_size();
	struct page *map = sparse_buffer_alloc(size);
	phys_addr_t addr = __pa(MAX_DMA_ADDRESS);

	if (map)
		return map;

	map = memblock_alloc_try_nid(size,
					  PAGE_SIZE, addr,
					  MEMBLOCK_ALLOC_ACCESSIBLE, nid);
	if (!map)
		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
		      __func__, size, PAGE_SIZE, nid, &addr);

	return map;
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

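/*
 * Boot-time scratch buffer: sparse_buffer_init() makes one large per-node
 * memblock allocation, sparse_buffer_alloc() hands it out piecewise, and
 * sparse_buffer_fini() returns whatever is left to memblock.
 */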
static void *sparsemap_buf __meminitdata;
static void *sparsemap_buf_end __meminitdata;

static void __init sparse_buffer_init(unsigned long size, int nid)
{
	phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
	WARN_ON(sparsemap_buf);	/* forgot to call sparse_buffer_fini()? */
	sparsemap_buf =
		memblock_alloc_try_nid_raw(size, PAGE_SIZE,
						addr,
						MEMBLOCK_ALLOC_ACCESSIBLE, nid);
	sparsemap_buf_end = sparsemap_buf + size;
}

static void __init sparse_buffer_fini(void)
{
	unsigned long size = sparsemap_buf_end - sparsemap_buf;

	if (sparsemap_buf && size > 0)
		memblock_free_early(__pa(sparsemap_buf), size);
	sparsemap_buf = NULL;
}

void * __meminit sparse_buffer_alloc(unsigned long size)
{
	void *ptr = NULL;

	if (sparsemap_buf) {
		ptr = PTR_ALIGN(sparsemap_buf, size);
		if (ptr + size > sparsemap_buf_end)
			ptr = NULL;
		else
			sparsemap_buf = ptr + size;
	}
	return ptr;
}

void __weak __meminit vmemmap_populate_print_last(void)
{
}

/*
 * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
 * and the number of present sections in this node is map_count.
 */
static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
				   unsigned long pnum_end,
				   unsigned long map_count)
{
	struct mem_section_usage *usage;
	unsigned long pnum;
	struct page *map;

	usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
			mem_section_usage_size() * map_count);
	if (!usage) {
		pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
		goto failed;
	}
	sparse_buffer_init(map_count * section_map_size(), nid);
	for_each_present_section_nr(pnum_begin, pnum) {
		unsigned long pfn = section_nr_to_pfn(pnum);

		if (pnum >= pnum_end)
			break;

		map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
				nid, NULL);
		if (!map) {
			pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
			       __func__, nid);
			pnum_begin = pnum;
			goto failed;
		}
		check_usemap_section_nr(nid, usage);
		sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage,
				SECTION_IS_EARLY);
		usage = (void *) usage + mem_section_usage_size();
	}
	sparse_buffer_fini();
	return;
failed:
	/* We failed to allocate, mark all the following pnums as not present */
	for_each_present_section_nr(pnum_begin, pnum) {
		struct mem_section *ms;

		if (pnum >= pnum_end)
			break;
		ms = __nr_to_section(pnum);
		ms->section_mem_map = 0;
	}
}

/*
 * Allocate the accumulated non-linear sections, allocate a mem_map
 * for each and record the physical to section mapping.
 */
void __init sparse_init(void)
{
	unsigned long pnum_begin = first_present_section_nr();
	int nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
	unsigned long pnum_end, map_count = 1;

	/* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
	set_pageblock_order();

	for_each_present_section_nr(pnum_begin + 1, pnum_end) {
		int nid = sparse_early_nid(__nr_to_section(pnum_end));

		if (nid == nid_begin) {
			map_count++;
			continue;
		}
		/* Init node with sections in range [pnum_begin, pnum_end) */
		sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
		nid_begin = nid;
		pnum_begin = pnum_end;
		map_count = 1;
	}
	/* cover the last node */
	sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
	vmemmap_populate_print_last();
}

#ifdef CONFIG_MEMORY_HOTPLUG

/* Mark all memory sections within the pfn range as online */
void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		unsigned long section_nr = pfn_to_section_nr(pfn);
		struct mem_section *ms;

		/* onlining code should never touch invalid ranges */
		if (WARN_ON(!valid_section_nr(section_nr)))
			continue;

		ms = __nr_to_section(section_nr);
		ms->section_mem_map |= SECTION_IS_ONLINE;
	}
}

#ifdef CONFIG_MEMORY_HOTREMOVE
/* Mark all memory sections within the pfn range as offline */
void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		unsigned long section_nr = pfn_to_section_nr(pfn);
		struct mem_section *ms;

		/*
		 * TODO this needs some double checking. Offlining code makes
		 * sure to check pfn_valid but those checks might be just bogus
		 */
		if (WARN_ON(!valid_section_nr(section_nr)))
			continue;

		ms = __nr_to_section(section_nr);
		ms->section_mem_map &= ~SECTION_IS_ONLINE;
	}
}
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static struct page *populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
	return __populate_section_memmap(pfn, nr_pages, nid, altmap);
}

static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
		struct vmem_altmap *altmap)
{
	unsigned long start = (unsigned long) pfn_to_page(pfn);
	unsigned long end = start + nr_pages * sizeof(struct page);

	vmemmap_free(start, end, altmap);
}
static void free_map_bootmem(struct page *memmap)
{
	unsigned long start = (unsigned long)memmap;
	unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);

	vmemmap_free(start, end, NULL);
}
#else
struct page *populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
	struct page *page, *ret;
	unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION;

	page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
	if (page)
		goto got_map_page;

	ret = vmalloc(memmap_size);
	if (ret)
		goto got_map_ptr;

	return NULL;
got_map_page:
	ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
got_map_ptr:

	return ret;
}

static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
		struct vmem_altmap *altmap)
{
	struct page *memmap = pfn_to_page(pfn);

	if (is_vmalloc_addr(memmap))
		vfree(memmap);
	else
		free_pages((unsigned long)memmap,
			   get_order(sizeof(struct page) * PAGES_PER_SECTION));
}

static void free_map_bootmem(struct page *memmap)
{
	unsigned long maps_section_nr, removing_section_nr, i;
	unsigned long magic, nr_pages;
	struct page *page = virt_to_page(memmap);

	nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
		>> PAGE_SHIFT;

	for (i = 0; i < nr_pages; i++, page++) {
		magic = (unsigned long) page->freelist;

		BUG_ON(magic == NODE_INFO);

		maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
		removing_section_nr = page_private(page);

		/*
		 * When this function is called, the section being removed is
		 * in the logically offlined state, so all of its pages are
		 * isolated from the page allocator. If the removing section's
		 * memmap is placed on the same section, it must not be freed:
		 * if it were, the page allocator could hand those pages out
		 * again even though they are about to be removed physically.
		 */
		if (maps_section_nr != removing_section_nr)
			put_page_bootmem(page);
	}
}
#endif /* CONFIG_SPARSEMEM_VMEMMAP */

/**
 * sparse_add_one_section - add a memory section
 * @nid: The node to add section on
 * @start_pfn: start pfn of the memory range
 * @altmap: device page map
 *
 * This is only intended for hotplug.
 *
 * Return:
 * * 0		- On success.
 * * -EEXIST	- Section is already present.
 * * -ENOMEM	- Out of memory.
 */
int __meminit sparse_add_one_section(int nid, unsigned long start_pfn,
				     struct vmem_altmap *altmap)
{
	unsigned long section_nr = pfn_to_section_nr(start_pfn);
	struct mem_section_usage *usage;
	struct mem_section *ms;
	struct page *memmap;
	int ret;

	/*
	 * No locking is taken here: sparse_index_init() handles that
	 * itself, and it may kmalloc.
	 */
	ret = sparse_index_init(section_nr, nid);
	if (ret < 0 && ret != -EEXIST)
		return ret;
	ret = 0;
	memmap = populate_section_memmap(start_pfn, PAGES_PER_SECTION, nid,
			altmap);
	if (!memmap)
		return -ENOMEM;
	usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
	if (!usage) {
		depopulate_section_memmap(start_pfn, PAGES_PER_SECTION, altmap);
		return -ENOMEM;
	}

	ms = __pfn_to_section(start_pfn);
	if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
		ret = -EEXIST;
		goto out;
	}

	/*
	 * Poison uninitialized struct pages in order to catch invalid flags
	 * combinations.
	 */
	page_init_poison(memmap, sizeof(struct page) * PAGES_PER_SECTION);

	set_section_nid(section_nr, nid);
	section_mark_present(ms);
	sparse_init_one_section(ms, section_nr, memmap, usage, 0);

out:
	if (ret < 0) {
		kfree(usage);
		depopulate_section_memmap(start_pfn, PAGES_PER_SECTION, altmap);
	}
	return ret;
}

#ifdef CONFIG_MEMORY_FAILURE
static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
{
	int i;

	if (!memmap)
		return;

	/*
	 * A further optimization is to have per section refcounted
	 * num_poisoned_pages.  But that would need more space per memmap, so
	 * for now just do a quick global check to speed up this routine in the
	 * absence of bad pages.
	 */
	if (atomic_long_read(&num_poisoned_pages) == 0)
		return;

	for (i = 0; i < nr_pages; i++) {
		if (PageHWPoison(&memmap[i])) {
			atomic_long_sub(1, &num_poisoned_pages);
			ClearPageHWPoison(&memmap[i]);
		}
	}
}
#else
static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
{
}
#endif

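/*
 * Release a section's usage block and memmap, distinguishing hot-added
 * sections (kfree + depopulate) from early sections whose memmap came
 * from bootmem.
 */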
static void free_section_usage(struct mem_section *ms, struct page *memmap,
		struct mem_section_usage *usage, unsigned long pfn,
		unsigned long nr_pages, struct vmem_altmap *altmap)
{
	if (!usage)
		return;

	/*
	 * Check to see if allocation came from hot-plug-add
	 */
	if (!early_section(ms)) {
		kfree(usage);
		if (memmap)
			depopulate_section_memmap(pfn, nr_pages, altmap);
		return;
	}

	/*
	 * The usemap came from bootmem. This is packed with other usemaps
	 * on the section which has pgdat at boot time. Just keep it as is now.
	 */

	if (memmap)
		free_map_bootmem(memmap);
}

void sparse_remove_one_section(struct mem_section *ms, unsigned long map_offset,
			       struct vmem_altmap *altmap)
{
	struct page *memmap = NULL;
	struct mem_section_usage *usage = NULL;

	if (ms->section_mem_map) {
		usage = ms->usage;
		memmap = sparse_decode_mem_map(ms->section_mem_map,
						__section_nr(ms));
		ms->section_mem_map = 0;
		ms->usage = NULL;
	}

	clear_hwpoisoned_pages(memmap + map_offset,
			PAGES_PER_SECTION - map_offset);
	free_section_usage(ms, memmap, usage,
			section_nr_to_pfn(__section_nr(ms)),
			PAGES_PER_SECTION, altmap);
}
#endif /* CONFIG_MEMORY_HOTPLUG */