sparse.c 25.4 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
A
Andy Whitcroft 已提交
2 3 4 5
/*
 * sparse memory mappings.
 */
#include <linux/mm.h>
6
#include <linux/slab.h>
A
Andy Whitcroft 已提交
7
#include <linux/mmzone.h>
8
#include <linux/memblock.h>
9
#include <linux/compiler.h>
10
#include <linux/highmem.h>
11
#include <linux/export.h>
12
#include <linux/spinlock.h>
13
#include <linux/vmalloc.h>
14 15
#include <linux/swap.h>
#include <linux/swapops.h>
16

17
#include "internal.h"
A
Andy Whitcroft 已提交
18
#include <asm/dma.h>
19 20
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
A
Andy Whitcroft 已提交
21 22 23 24 25 26

/*
 * Permanent SPARSEMEM data:
 *
 * 1) mem_section	- memory sections, mem_map's for valid memory
 */
27
#ifdef CONFIG_SPARSEMEM_EXTREME
28
struct mem_section **mem_section;
29 30
#else
struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
31
	____cacheline_internodealigned_in_smp;
32 33 34
#endif
EXPORT_SYMBOL(mem_section);

35 36 37 38 39 40 41 42 43 44 45 46
#ifdef NODE_NOT_IN_PAGE_FLAGS
/*
 * If we did not store the node number in the page then we have to
 * do a lookup in the section_to_node_table in order to find which
 * node the page belongs to.
 */
/*
 * One entry per memory section holding the owning node id.  The element
 * type is narrowed to u8 when MAX_NUMNODES fits in 256 values.
 */
#if MAX_NUMNODES <= 256
static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#else
static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#endif

I
Ian Campbell 已提交
47
int page_to_nid(const struct page *page)
48 49 50 51
{
	return section_to_node_table[page_to_section(page)];
}
EXPORT_SYMBOL(page_to_nid);
52 53 54 55 56 57 58 59 60

/* Record @nid as the node owning section @section_nr. */
static void set_section_nid(unsigned long section_nr, int nid)
{
	section_to_node_table[section_nr] = nid;
}
#else /* !NODE_NOT_IN_PAGE_FLAGS */
/* No section_to_node_table exists in this configuration: nothing to record. */
static inline void set_section_nid(unsigned long section_nr, int nid)
{
}
61 62
#endif

63
#ifdef CONFIG_SPARSEMEM_EXTREME
64
/*
 * Allocate one root's worth of mem_section entries on node @nid.
 *
 * Uses the slab allocator when it is available (may return NULL), otherwise
 * falls back to memblock and panics if that allocation fails.
 */
static noinline struct mem_section __ref *sparse_index_alloc(int nid)
{
	unsigned long array_size = SECTIONS_PER_ROOT *
				   sizeof(struct mem_section);
	struct mem_section *section;

	if (slab_is_available())
		return kzalloc_node(array_size, GFP_KERNEL, nid);

	section = memblock_alloc_node(array_size, SMP_CACHE_BYTES, nid);
	if (!section)
		panic("%s: Failed to allocate %lu bytes nid=%d\n",
		      __func__, array_size, nid);

	return section;
}
B
Bob Picco 已提交
82

83
/*
 * Make sure the root covering @section_nr has a section array.
 * Returns 0 on success (including "already present") or -ENOMEM.
 */
static int __meminit sparse_index_init(unsigned long section_nr, int nid)
{
	unsigned long root = SECTION_NR_TO_ROOT(section_nr);
	struct mem_section *section;

	/*
	 * An existing section is possible in the sub-section hotplug
	 * case. First hot-add instantiates, follow-on hot-add reuses
	 * the existing section.
	 *
	 * The mem_hotplug_lock resolves the apparent race below.
	 */
	if (!mem_section[root]) {
		section = sparse_index_alloc(nid);
		if (!section)
			return -ENOMEM;

		mem_section[root] = section;
	}

	return 0;
}
#else /* !SPARSEMEM_EXTREME */
/* Nothing to allocate when mem_section is a static array: always succeeds. */
static inline int sparse_index_init(unsigned long section_nr, int nid)
{
	return 0;
}
111 112
#endif

113
#ifdef CONFIG_SPARSEMEM_EXTREME
114
unsigned long __section_nr(struct mem_section *ms)
115 116
{
	unsigned long root_nr;
117
	struct mem_section *root = NULL;
118

119 120
	for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) {
		root = __nr_to_section(root_nr * SECTIONS_PER_ROOT);
121 122 123 124 125 126 127
		if (!root)
			continue;

		if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
		     break;
	}

128
	VM_BUG_ON(!root);
129

130 131
	return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
}
132
#else
133
unsigned long __section_nr(struct mem_section *ms)
134
{
135
	return (unsigned long)(ms - mem_section[0]);
136 137
}
#endif
138

139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154
/*
 * During early boot, before section_mem_map is used for an actual
 * mem_map, we use section_mem_map to store the section's NUMA
 * node.  This keeps us from having to use another data structure.  The
 * node information is cleared just before we store the real mem_map.
 */
/*
 * Encode @nid into the bits of section_mem_map above SECTION_NID_SHIFT.
 *
 * The node id is widened to unsigned long before shifting so the shift is
 * performed in the width of the result and never on a signed int.
 */
static inline unsigned long sparse_encode_early_nid(int nid)
{
	return ((unsigned long)nid << SECTION_NID_SHIFT);
}

/* Recover the node id stored in section_mem_map by sparse_encode_early_nid(). */
static inline int sparse_early_nid(struct mem_section *section)
{
	unsigned long encoded = section->section_mem_map;

	return encoded >> SECTION_NID_SHIFT;
}

155 156 157
/* Validate the physical addressing limitations of the model */
void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
						unsigned long *end_pfn)
{
	unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);

	/*
	 * Sanity checks - do not allow an architecture to pass
	 * in larger pfns than the maximum scope of sparsemem:
	 */
	if (*start_pfn > max_sparsemem_pfn) {
		/* The whole range is out of scope: collapse it to empty. */
		mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
			"Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
			*start_pfn, *end_pfn, max_sparsemem_pfn);
		WARN_ON_ONCE(1);
		*start_pfn = max_sparsemem_pfn;
		*end_pfn = max_sparsemem_pfn;
		return;
	}

	if (*end_pfn > max_sparsemem_pfn) {
		/* Only the tail is out of scope: clamp the end. */
		mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
			"End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
			*start_pfn, *end_pfn, max_sparsemem_pfn);
		WARN_ON_ONCE(1);
		*end_pfn = max_sparsemem_pfn;
	}
}

181 182 183 184 185 186 187 188 189
/*
 * There are a number of times that we loop over NR_MEM_SECTIONS,
 * looking for section_present() on each.  But, when we have very
 * large physical address spaces, NR_MEM_SECTIONS can also be
 * very large which makes the loops quite long.
 *
 * Keeping track of this gives us an easy way to break out of
 * those loops early.
 */
190
unsigned long __highest_present_section_nr;
191 192
static void section_mark_present(struct mem_section *ms)
{
193
	unsigned long section_nr = __section_nr(ms);
194 195 196 197 198 199 200

	if (section_nr > __highest_present_section_nr)
		__highest_present_section_nr = section_nr;

	ms->section_mem_map |= SECTION_MARKED_PRESENT;
}

201
static inline unsigned long next_present_section_nr(unsigned long section_nr)
202 203 204 205 206
{
	do {
		section_nr++;
		if (present_section_nr(section_nr))
			return section_nr;
207
	} while ((section_nr <= __highest_present_section_nr));
208 209 210 211 212

	return -1;
}
/*
 * Iterate @section_nr over every present section number >= @start.
 * Both arguments are parenthesized so that arbitrary expressions can be
 * passed without operator-precedence surprises.
 */
#define for_each_present_section_nr(start, section_nr)			\
	for ((section_nr) = next_present_section_nr((start) - 1);	\
	     (((section_nr) != -1) &&					\
	      ((section_nr) <= __highest_present_section_nr));		\
	     (section_nr) = next_present_section_nr(section_nr))

217 218 219 220 221
/*
 * Return the lowest present section number.  Passing -1 makes
 * next_present_section_nr() start its search at section 0.
 */
static inline unsigned long first_present_section_nr(void)
{
	return next_present_section_nr(-1);
}

Y
Yi Wang 已提交
222
/* Set the bits in @map for every subsection touched by [pfn, pfn + nr_pages). */
static void subsection_mask_set(unsigned long *map, unsigned long pfn,
		unsigned long nr_pages)
{
	int first = subsection_map_index(pfn);
	int last = subsection_map_index(pfn + nr_pages - 1);
	int count = last - first + 1;

	bitmap_set(map, first, count);
}

/*
 * Mark the subsections covered by [pfn, pfn + nr_pages) in the
 * subsection_map of every section the range touches.
 *
 * Section numbers are kept in unsigned long throughout so they are neither
 * truncated nor compared across signedness, and the end section is only
 * computed once the range is known to be non-empty.
 */
void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
{
	unsigned long nr, start_sec, end_sec;

	if (!nr_pages)
		return;

	start_sec = pfn_to_section_nr(pfn);
	end_sec = pfn_to_section_nr(pfn + nr_pages - 1);

	for (nr = start_sec; nr <= end_sec; nr++) {
		struct mem_section *ms;
		unsigned long pfns;

		/* Pages of the range that fall inside the current section. */
		pfns = min(nr_pages, PAGES_PER_SECTION
				- (pfn & ~PAGE_SECTION_MASK));
		ms = __nr_to_section(nr);
		subsection_mask_set(ms->usage->subsection_map, pfn, pfns);

		pr_debug("%s: sec: %lu pfns: %lu set(%d, %d)\n", __func__, nr,
				pfns, subsection_map_index(pfn),
				subsection_map_index(pfn + pfns - 1));

		pfn += pfns;
		nr_pages -= pfns;
	}
}

257 258 259 260
/* Record a memory area against a node. */
void __init memory_present(int nid, unsigned long start, unsigned long end)
{
	unsigned long pfn;

#ifdef CONFIG_SPARSEMEM_EXTREME
	/* The root pointer table is allocated on first use. */
	if (unlikely(!mem_section)) {
		unsigned long size, align;

		size = sizeof(struct mem_section*) * NR_SECTION_ROOTS;
		align = 1 << (INTERNODE_CACHE_SHIFT);
		mem_section = memblock_alloc(size, align);
		if (!mem_section)
			panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
			      __func__, size, align);
	}
#endif

	/* Walk the range one section at a time, starting section-aligned. */
	start &= PAGE_SECTION_MASK;
	mminit_validate_memmodel_limits(&start, &end);
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
		unsigned long section = pfn_to_section_nr(pfn);
		struct mem_section *ms;

		sparse_index_init(section, nid);
		set_section_nid(section, nid);

		ms = __nr_to_section(section);
		if (ms->section_mem_map)
			continue;	/* already recorded */

		/* Stash the node id until the real mem_map is installed. */
		ms->section_mem_map = sparse_encode_early_nid(nid) |
						SECTION_IS_ONLINE;
		section_mark_present(ms);
	}
}

293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308
/*
 * Mark all memblocks as present using memory_present(). This is a
 * convenience function that is useful for a number of arches
 * to mark all of the system's memory as present during initialization.
 */
void __init memblocks_present(void)
{
	struct memblock_region *reg;

	for_each_memblock(memory, reg) {
		int nid = memblock_get_region_node(reg);
		unsigned long start_pfn = memblock_region_memory_base_pfn(reg);
		unsigned long end_pfn = memblock_region_memory_end_pfn(reg);

		memory_present(nid, start_pfn, end_pfn);
	}
}

A
Andy Whitcroft 已提交
309 310 311 312 313 314 315
/*
 * Subtle, we encode the real pfn into the mem_map such that
 * the identity pfn - section_mem_map will return the actual
 * physical page frame number.
 */
static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
{
316 317 318 319 320
	unsigned long coded_mem_map =
		(unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
	BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT));
	BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
	return coded_mem_map;
A
Andy Whitcroft 已提交
321 322 323
}

/*
324
 * Decode mem_map from the coded memmap
A
Andy Whitcroft 已提交
325 326 327
 */
struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
{
328 329
	/* mask off the extra low bits of information */
	coded_mem_map &= SECTION_MAP_MASK;
A
Andy Whitcroft 已提交
330 331 332
	return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
}

333
/*
 * Install @mem_map and @usage into @ms: the bits covered by
 * SECTION_MAP_MASK are replaced by the encoded map, and
 * SECTION_HAS_MEM_MAP plus @flags are OR-ed in.
 */
static void __meminit sparse_init_one_section(struct mem_section *ms,
		unsigned long pnum, struct page *mem_map,
		struct mem_section_usage *usage, unsigned long flags)
{
	unsigned long coded = sparse_encode_mem_map(mem_map, pnum);

	ms->section_mem_map &= ~SECTION_MAP_MASK;
	ms->section_mem_map |= coded | SECTION_HAS_MEM_MAP | flags;
	ms->usage = usage;
}

343
static unsigned long usemap_size(void)
344
{
345
	return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
346 347
}

348
size_t mem_section_usage_size(void)
349
{
350
	return sizeof(struct mem_section_usage) + usemap_size();
351 352
}

353
#ifdef CONFIG_MEMORY_HOTREMOVE
354
static struct mem_section_usage * __init
355
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
356
					 unsigned long size)
357
{
358
	struct mem_section_usage *usage;
359 360
	unsigned long goal, limit;
	int nid;
361 362 363
	/*
	 * A page may contain usemaps for other sections preventing the
	 * page being freed and making a section unremovable while
L
Li Zhong 已提交
364
	 * other sections referencing the usemap remain active. Similarly,
365 366 367 368 369 370
	 * a pgdat can prevent a section being removed. If section A
	 * contains a pgdat and section B contains the usemap, both
	 * sections become inter-dependent. This allocates usemaps
	 * from the same section as the pgdat where possible to avoid
	 * this problem.
	 */
371
	goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
372 373 374
	limit = goal + (1UL << PA_SECTION_SHIFT);
	nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
375 376
	usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
	if (!usage && limit) {
377 378 379
		limit = 0;
		goto again;
	}
380
	return usage;
381 382
}

383 384
/*
 * Report when the usemap and the pgdat of node @nid were placed in
 * different sections.  Repeated reports for the same pair of sections
 * are suppressed.
 */
static void __init check_usemap_section_nr(int nid,
		struct mem_section_usage *usage)
{
	static unsigned long old_usemap_snr;
	static unsigned long old_pgdat_snr;
	struct pglist_data *pgdat = NODE_DATA(nid);
	unsigned long usemap_snr, pgdat_snr;
	int usemap_nid;

	/* First call */
	if (!old_usemap_snr) {
		old_usemap_snr = NR_MEM_SECTIONS;
		old_pgdat_snr = NR_MEM_SECTIONS;
	}

	usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
	pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);

	/* Same section: no dependency to report. */
	if (usemap_snr == pgdat_snr)
		return;

	/* skip redundant message */
	if (usemap_snr == old_usemap_snr && pgdat_snr == old_pgdat_snr)
		return;

	old_usemap_snr = usemap_snr;
	old_pgdat_snr = pgdat_snr;

	usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
	if (usemap_nid == nid) {
		/*
		 * There is a circular dependency.
		 * Some platforms allow un-removable section because they will
		 * just gather other removable sections for dynamic
		 * partitioning.  Just notify un-removable section's number
		 * here.
		 */
		pr_info("Section %ld and %ld (node %d) have a circular dependency on usemap and pgdat allocations\n",
			usemap_snr, pgdat_snr, nid);
		return;
	}

	pr_info("node %d must be removed before remove section %ld\n",
		nid, usemap_snr);
}
#else
426
static struct mem_section_usage * __init
427
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
428
					 unsigned long size)
429
{
430
	return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id);
431 432
}

433 434
/* Without CONFIG_MEMORY_HOTREMOVE there is nothing to check. */
static void __init check_usemap_section_nr(int nid,
		struct mem_section_usage *usage)
{
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

439
#ifdef CONFIG_SPARSEMEM_VMEMMAP
440
/* Bytes of memmap per section, rounded up to a multiple of PMD_SIZE. */
static unsigned long __init section_map_size(void)
{
	unsigned long raw = sizeof(struct page) * PAGES_PER_SECTION;

	return ALIGN(raw, PMD_SIZE);
}

#else
446
/* Bytes of memmap per section, rounded up to whole pages. */
static unsigned long __init section_map_size(void)
{
	unsigned long raw = sizeof(struct page) * PAGES_PER_SECTION;

	return PAGE_ALIGN(raw);
}

451 452
/*
 * Allocate the memmap for one section.
 *
 * The pre-allocated sparse buffer is tried first; if it cannot satisfy the
 * request the memmap is allocated from memblock on node @nid, aligned to
 * its own size.  Panics if that allocation fails.
 *
 * @pfn, @nr_pages and @altmap are not used by this implementation.
 */
struct page __init *__populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
	unsigned long size = section_map_size();
	struct page *map = sparse_buffer_alloc(size);
	phys_addr_t addr = __pa(MAX_DMA_ADDRESS);

	if (map)
		return map;

	map = memblock_alloc_try_nid_raw(size, size, addr,
					  MEMBLOCK_ALLOC_ACCESSIBLE, nid);
	if (!map)
		/* Report the alignment that was actually requested. */
		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
		      __func__, size, size, nid, &addr);

	return map;
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

471 472 473
/*
 * Bounds of the pre-allocated memmap buffer: sparsemap_buf is the next
 * unused byte, sparsemap_buf_end is one past the end of the buffer.
 */
static void *sparsemap_buf __meminitdata;
static void *sparsemap_buf_end __meminitdata;

474 475 476 477 478 479
/* Give @size bytes starting at sparsemap_buf back to memblock. */
static inline void __meminit sparse_buffer_free(unsigned long size)
{
	phys_addr_t base;

	WARN_ON(!(sparsemap_buf && size != 0));
	base = __pa(sparsemap_buf);
	memblock_free_early(base, size);
}

480
/* Pre-allocate a @size byte buffer on node @nid for section memmaps. */
static void __init sparse_buffer_init(unsigned long size, int nid)
{
	phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
	unsigned long align = section_map_size();

	WARN_ON(sparsemap_buf);	/* forgot to call sparse_buffer_fini()? */

	/*
	 * Pre-allocated buffer is mainly used by __populate_section_memmap
	 * and we want it to be properly aligned to the section size - this is
	 * especially the case for VMEMMAP which maps memmap to PMDs
	 */
	sparsemap_buf = memblock_alloc_try_nid_raw(size, align, addr,
					MEMBLOCK_ALLOC_ACCESSIBLE, nid);
	sparsemap_buf_end = sparsemap_buf + size;
}

494
/* Release whatever is left of the memmap buffer and mark it as gone. */
static void __init sparse_buffer_fini(void)
{
	if (sparsemap_buf) {
		unsigned long remaining = sparsemap_buf_end - sparsemap_buf;

		if (remaining > 0)
			sparse_buffer_free(remaining);
	}
	sparsemap_buf = NULL;
}

/*
 * Carve @size bytes, aligned to @size, out of the memmap buffer.
 * Returns NULL when there is no buffer or not enough room left.
 */
void * __meminit sparse_buffer_alloc(unsigned long size)
{
	unsigned long slack;
	void *ptr;

	if (!sparsemap_buf)
		return NULL;

	ptr = (void *) roundup((unsigned long)sparsemap_buf, size);
	if (ptr + size > sparsemap_buf_end)
		return NULL;

	/* Free redundant aligned space */
	slack = (unsigned long)(ptr - sparsemap_buf);
	if (slack > 0)
		sparse_buffer_free(slack);

	sparsemap_buf = ptr + size;
	return ptr;
}

521
/* Weak default that does nothing; a non-weak definition replaces it. */
void __weak __meminit vmemmap_populate_print_last(void)
{
}
524

525 526 527 528 529 530 531 532
/*
 * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
 * And number of present sections in this node is map_count.
 */
/*
 * Set up the usage maps and memmaps of all present sections of node @nid
 * in [pnum_begin, pnum_end); @map_count is the number of such sections.
 *
 * On allocation failure the remaining sections of the range are marked as
 * not present.  The memmap buffer is always released before returning so
 * that the next node starts with a clean sparse_buffer_init().
 */
static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
				   unsigned long pnum_end,
				   unsigned long map_count)
{
	struct mem_section_usage *usage;
	unsigned long pnum;
	struct page *map;

	usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
			mem_section_usage_size() * map_count);
	if (!usage) {
		pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
		goto failed;
	}
	sparse_buffer_init(map_count * section_map_size(), nid);
	for_each_present_section_nr(pnum_begin, pnum) {
		unsigned long pfn = section_nr_to_pfn(pnum);

		if (pnum >= pnum_end)
			break;

		map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
				nid, NULL);
		if (!map) {
			pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
			       __func__, nid);
			pnum_begin = pnum;
			/* Do not leak the rest of the memmap buffer. */
			sparse_buffer_fini();
			goto failed;
		}
		check_usemap_section_nr(nid, usage);
		sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage,
				SECTION_IS_EARLY);
		/* Step to the usage slot of the next section. */
		usage = (void *) usage + mem_section_usage_size();
	}
	sparse_buffer_fini();
	return;
failed:
	/* We failed to allocate, mark all the following pnums as not present */
	for_each_present_section_nr(pnum_begin, pnum) {
		struct mem_section *ms;

		if (pnum >= pnum_end)
			break;
		ms = __nr_to_section(pnum);
		ms->section_mem_map = 0;
	}
}

/*
 * Allocate the accumulated non-linear sections, allocate a mem_map
 * for each and record the physical to section mapping.
 */
581
void __init sparse_init(void)
{
	unsigned long pnum_begin = first_present_section_nr();
	int nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
	unsigned long pnum_end, map_count = 1;

	/* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
	set_pageblock_order();

	/* Group consecutive present sections by node and init each group. */
	for_each_present_section_nr(pnum_begin + 1, pnum_end) {
		int nid = sparse_early_nid(__nr_to_section(pnum_end));

		if (nid != nid_begin) {
			/* Init node with sections in range [pnum_begin, pnum_end) */
			sparse_init_nid(nid_begin, pnum_begin, pnum_end,
					map_count);
			nid_begin = nid;
			pnum_begin = pnum_end;
			map_count = 1;
		} else {
			map_count++;
		}
	}
	/* cover the last node */
	sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
	vmemmap_populate_print_last();
}

608
#ifdef CONFIG_MEMORY_HOTPLUG
609 610 611 612 613 614 615

/* Mark all memory sections within the pfn range as online */
void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
616
		unsigned long section_nr = pfn_to_section_nr(pfn);
617 618 619 620 621 622 623 624 625 626 627 628
		struct mem_section *ms;

		/* onlining code should never touch invalid ranges */
		if (WARN_ON(!valid_section_nr(section_nr)))
			continue;

		ms = __nr_to_section(section_nr);
		ms->section_mem_map |= SECTION_IS_ONLINE;
	}
}

#ifdef CONFIG_MEMORY_HOTREMOVE
629
/* Mark all memory sections within the pfn range as offline */
630 631 632 633 634
void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
635
		unsigned long section_nr = pfn_to_section_nr(pfn);
636 637 638 639 640 641 642 643 644 645 646 647 648 649 650
		struct mem_section *ms;

		/*
		 * TODO this needs some double checking. Offlining code makes
		 * sure to check pfn_valid but those checks might be just bogus
		 */
		if (WARN_ON(!valid_section_nr(section_nr)))
			continue;

		ms = __nr_to_section(section_nr);
		ms->section_mem_map &= ~SECTION_IS_ONLINE;
	}
}
#endif

651
#ifdef CONFIG_SPARSEMEM_VMEMMAP
652
/* Hotplug entry point: forwards to __populate_section_memmap(). */
static struct page * __meminit populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
	struct page *memmap;

	memmap = __populate_section_memmap(pfn, nr_pages, nid, altmap);
	return memmap;
}
657 658

static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
659
		struct vmem_altmap *altmap)
660
{
661 662
	unsigned long start = (unsigned long) pfn_to_page(pfn);
	unsigned long end = start + nr_pages * sizeof(struct page);
663

664
	vmemmap_free(start, end, altmap);
665
}
666
static void free_map_bootmem(struct page *memmap)
667
{
668
	unsigned long start = (unsigned long)memmap;
669
	unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
670

671
	vmemmap_free(start, end, NULL);
672
}
673
#else
674
/*
 * Allocate a memmap for one section: try the page allocator first and
 * fall back to vmalloc().  Returns NULL if both fail.
 */
struct page * __meminit populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
	unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION;
	struct page *page;

	page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
	if (page)
		return (struct page *)pfn_to_kaddr(page_to_pfn(page));

	return vmalloc(memmap_size);
}

696
static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
697
		struct vmem_altmap *altmap)
698
{
699
	struct page *memmap = pfn_to_page(pfn);
700

701
	if (is_vmalloc_addr(memmap))
702 703 704
		vfree(memmap);
	else
		free_pages((unsigned long)memmap,
705
			   get_order(sizeof(struct page) * PAGES_PER_SECTION));
706
}
707

708
static void free_map_bootmem(struct page *memmap)
709 710
{
	unsigned long maps_section_nr, removing_section_nr, i;
711
	unsigned long magic, nr_pages;
712
	struct page *page = virt_to_page(memmap);
713

714 715 716
	nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
		>> PAGE_SHIFT;

717
	for (i = 0; i < nr_pages; i++, page++) {
718
		magic = (unsigned long) page->freelist;
719 720 721 722

		BUG_ON(magic == NODE_INFO);

		maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
723
		removing_section_nr = page_private(page);
724 725 726 727 728 729 730 731 732 733 734 735 736

		/*
		 * When this function is called, the removing section is
		 * logical offlined state. This means all pages are isolated
		 * from page allocator. If removing section's memmap is placed
		 * on the same section, it must not be freed.
		 * If it is freed, page allocator may allocate it which will
		 * be removed physically soon.
		 */
		if (maps_section_nr != removing_section_nr)
			put_page_bootmem(page);
	}
}
737
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
738

739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847
static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
		struct vmem_altmap *altmap)
{
	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
	DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
	struct mem_section *ms = __pfn_to_section(pfn);
	bool section_is_early = early_section(ms);
	struct page *memmap = NULL;
	unsigned long *subsection_map = ms->usage
		? &ms->usage->subsection_map[0] : NULL;

	subsection_mask_set(map, pfn, nr_pages);
	if (subsection_map)
		bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);

	if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
				"section already deactivated (%#lx + %ld)\n",
				pfn, nr_pages))
		return;

	/*
	 * There are 3 cases to handle across two configurations
	 * (SPARSEMEM_VMEMMAP={y,n}):
	 *
	 * 1/ deactivation of a partial hot-added section (only possible
	 * in the SPARSEMEM_VMEMMAP=y case).
	 *    a/ section was present at memory init
	 *    b/ section was hot-added post memory init
	 * 2/ deactivation of a complete hot-added section
	 * 3/ deactivation of a complete section from memory init
	 *
	 * For 1/, when subsection_map does not empty we will not be
	 * freeing the usage map, but still need to free the vmemmap
	 * range.
	 *
	 * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified
	 */
	bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
	if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) {
		unsigned long section_nr = pfn_to_section_nr(pfn);

		if (!section_is_early) {
			kfree(ms->usage);
			ms->usage = NULL;
		}
		memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
		ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
	}

	if (section_is_early && memmap)
		free_map_bootmem(memmap);
	else
		depopulate_section_memmap(pfn, nr_pages, altmap);
}

/*
 * Activate the subsections of [pfn, pfn + nr_pages) in their section and
 * return the memmap to use for them.
 *
 * Returns a struct page pointer on success, or an ERR_PTR():
 *   -ENOMEM  the usage map or the memmap could not be allocated
 *   -EINVAL  the range selects no subsection
 *   -EEXIST  one of the subsections is already active
 */
static struct page * __meminit section_activate(int nid, unsigned long pfn,
		unsigned long nr_pages, struct vmem_altmap *altmap)
{
	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
	struct mem_section *ms = __pfn_to_section(pfn);
	struct mem_section_usage *usage = NULL;
	unsigned long *subsection_map;
	struct page *memmap;
	int rc = 0;

	subsection_mask_set(map, pfn, nr_pages);

	/* 'usage' stays NULL unless this call allocates the usage map. */
	if (!ms->usage) {
		usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
		if (!usage)
			return ERR_PTR(-ENOMEM);
		ms->usage = usage;
	}
	subsection_map = &ms->usage->subsection_map[0];

	if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
		rc = -EINVAL;
	else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
		rc = -EEXIST;
	else
		bitmap_or(subsection_map, map, subsection_map,
				SUBSECTIONS_PER_SECTION);

	if (rc) {
		/* Undo only what this call installed; kfree(NULL) is a no-op. */
		if (usage)
			ms->usage = NULL;
		kfree(usage);
		return ERR_PTR(rc);
	}

	/*
	 * The early init code does not consider partially populated
	 * initial sections, it simply assumes that memory will never be
	 * referenced.  If we hot-add memory into such a section then we
	 * do not need to populate the memmap and can simply reuse what
	 * is already there.
	 */
	if (nr_pages < PAGES_PER_SECTION && early_section(ms))
		return pfn_to_page(pfn);

	memmap = populate_section_memmap(pfn, nr_pages, nid, altmap);
	if (!memmap) {
		/* Roll back the subsection bits set above. */
		section_deactivate(pfn, nr_pages, altmap);
		return ERR_PTR(-ENOMEM);
	}

	return memmap;
}

848
/**
849
 * sparse_add_section - add a memory section, or populate an existing one
850 851
 * @nid: The node to add section on
 * @start_pfn: start pfn of the memory range
852
 * @nr_pages: number of pfns to add in the section
853 854 855 856 857 858 859 860
 * @altmap: device page map
 *
 * This is only intended for hotplug.
 *
 * Return:
 * * 0		- On success.
 * * -EEXIST	- Section has been present.
 * * -ENOMEM	- Out of memory.
A
Andy Whitcroft 已提交
861
 */
862 863
int __meminit sparse_add_section(int nid, unsigned long start_pfn,
		unsigned long nr_pages, struct vmem_altmap *altmap)
A
Andy Whitcroft 已提交
864
{
865 866 867 868
	unsigned long section_nr = pfn_to_section_nr(start_pfn);
	struct mem_section *ms;
	struct page *memmap;
	int ret;
A
Andy Whitcroft 已提交
869

870
	ret = sparse_index_init(section_nr, nid);
871
	if (ret < 0)
872
		return ret;
873

874 875 876
	memmap = section_activate(nid, start_pfn, nr_pages, altmap);
	if (IS_ERR(memmap))
		return PTR_ERR(memmap);
877

878 879 880 881
	/*
	 * Poison uninitialized struct pages in order to catch invalid flags
	 * combinations.
	 */
882
	page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);
883

884
	ms = __nr_to_section(section_nr);
885
	set_section_nid(section_nr, nid);
886
	section_mark_present(ms);
887

888 889 890 891 892 893
	/* Align memmap to section boundary in the subsection case */
	if (section_nr_to_pfn(section_nr) != start_pfn)
		memmap = pfn_to_kaddr(section_nr_to_pfn(section_nr));
	sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0);

	return 0;
A
Andy Whitcroft 已提交
894
}
895

896 897 898 899 900
#ifdef CONFIG_MEMORY_FAILURE
static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
{
	int i;

901 902 903 904 905 906 907 908 909
	/*
	 * A further optimization is to have per section refcounted
	 * num_poisoned_pages.  But that would need more space per memmap, so
	 * for now just do a quick global check to speed up this routine in the
	 * absence of bad pages.
	 */
	if (atomic_long_read(&num_poisoned_pages) == 0)
		return;

910
	for (i = 0; i < nr_pages; i++) {
911
		if (PageHWPoison(&memmap[i])) {
912
			num_poisoned_pages_dec();
913 914 915 916 917 918 919 920 921 922
			ClearPageHWPoison(&memmap[i]);
		}
	}
}
#else
/* Without CONFIG_MEMORY_FAILURE this does nothing. */
static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
{
}
#endif

923
/*
 * Remove [pfn, pfn + nr_pages): clear HWPoison on the pages from
 * @map_offset onwards, then deactivate the range.  @ms is not used here.
 */
void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
		unsigned long nr_pages, unsigned long map_offset,
		struct vmem_altmap *altmap)
{
	struct page *first = pfn_to_page(pfn) + map_offset;

	clear_hwpoisoned_pages(first, nr_pages - map_offset);
	section_deactivate(pfn, nr_pages, altmap);
}
931
#endif /* CONFIG_MEMORY_HOTPLUG */