// SPDX-License-Identifier: GPL-2.0
/*
 * sparse memory mappings.
 */
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/mmzone.h>
#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/bootmem_info.h>

#include "internal.h"
#include <asm/dma.h>

/*
 * Permanent SPARSEMEM data:
 *
 * 1) mem_section	- memory sections, mem_map's for valid memory
 */
#ifdef CONFIG_SPARSEMEM_EXTREME
struct mem_section **mem_section;
#else
struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
	____cacheline_internodealigned_in_smp;
#endif
EXPORT_SYMBOL(mem_section);

#ifdef NODE_NOT_IN_PAGE_FLAGS
/*
 * If we did not store the node number in the page then we have to
 * do a lookup in the section_to_node_table in order to find which
 * node the page belongs to.
 */
#if MAX_NUMNODES <= 256
static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#else
static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#endif

int page_to_nid(const struct page *page)
{
	return section_to_node_table[page_to_section(page)];
}
EXPORT_SYMBOL(page_to_nid);

static void set_section_nid(unsigned long section_nr, int nid)
{
	section_to_node_table[section_nr] = nid;
}
#else /* !NODE_NOT_IN_PAGE_FLAGS */
static inline void set_section_nid(unsigned long section_nr, int nid)
{
}
#endif

#ifdef CONFIG_SPARSEMEM_EXTREME
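/*
 * Allocate one root's worth of mem_section entries for @nid: from the slab
 * once it is available, or from memblock during early boot.
 */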
static noinline struct mem_section __ref *sparse_index_alloc(int nid)
{
	struct mem_section *section = NULL;
	unsigned long array_size = SECTIONS_PER_ROOT *
				   sizeof(struct mem_section);

	if (slab_is_available()) {
		section = kzalloc_node(array_size, GFP_KERNEL, nid);
	} else {
		section = memblock_alloc_node(array_size, SMP_CACHE_BYTES,
					      nid);
		if (!section)
			panic("%s: Failed to allocate %lu bytes nid=%d\n",
			      __func__, array_size, nid);
	}

	return section;
}

static int __meminit sparse_index_init(unsigned long section_nr, int nid)
{
	unsigned long root = SECTION_NR_TO_ROOT(section_nr);
	struct mem_section *section;

	/*
	 * An existing section is possible in the sub-section hotplug
	 * case. First hot-add instantiates, follow-on hot-add reuses
	 * the existing section.
	 *
	 * The mem_hotplug_lock resolves the apparent race below.
	 */
	if (mem_section[root])
		return 0;

	section = sparse_index_alloc(nid);
	if (!section)
		return -ENOMEM;

	mem_section[root] = section;

	return 0;
}
#else /* !SPARSEMEM_EXTREME */
static inline int sparse_index_init(unsigned long section_nr, int nid)
{
	return 0;
}
#endif

#ifdef CONFIG_SPARSEMEM_EXTREME
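/*
 * Convert a mem_section pointer back into its section number by scanning
 * the section roots for the root that contains it.
 */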
unsigned long __section_nr(struct mem_section *ms)
{
	unsigned long root_nr;
	struct mem_section *root = NULL;

	for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) {
		root = __nr_to_section(root_nr * SECTIONS_PER_ROOT);
		if (!root)
			continue;

		if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
			break;
	}

	VM_BUG_ON(!root);

	return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
}
#else
unsigned long __section_nr(struct mem_section *ms)
{
	return (unsigned long)(ms - mem_section[0]);
}
#endif

/*
 * During early boot, before section_mem_map is used for an actual
 * mem_map, we use section_mem_map to store the section's NUMA
 * node.  This keeps us from having to use another data structure.  The
 * node information is cleared just before we store the real mem_map.
 */
static inline unsigned long sparse_encode_early_nid(int nid)
{
	return (nid << SECTION_NID_SHIFT);
}

static inline int sparse_early_nid(struct mem_section *section)
{
	return (section->section_mem_map >> SECTION_NID_SHIFT);
}

/* Validate the physical addressing limitations of the model */
void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
						unsigned long *end_pfn)
{
	unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);

	/*
	 * Sanity checks - do not allow an architecture to pass
	 * in larger pfns than the maximum scope of sparsemem:
	 */
	if (*start_pfn > max_sparsemem_pfn) {
		mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
			"Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
			*start_pfn, *end_pfn, max_sparsemem_pfn);
		WARN_ON_ONCE(1);
		*start_pfn = max_sparsemem_pfn;
		*end_pfn = max_sparsemem_pfn;
	} else if (*end_pfn > max_sparsemem_pfn) {
		mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
			"End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
			*start_pfn, *end_pfn, max_sparsemem_pfn);
		WARN_ON_ONCE(1);
		*end_pfn = max_sparsemem_pfn;
	}
}

/*
 * There are a number of times that we loop over NR_MEM_SECTIONS,
 * looking for section_present() on each.  But, when we have very
 * large physical address spaces, NR_MEM_SECTIONS can also be
 * very large which makes the loops quite long.
 *
 * Keeping track of this gives us an easy way to break out of
 * those loops early.
 */
unsigned long __highest_present_section_nr;
static void section_mark_present(struct mem_section *ms)
{
	unsigned long section_nr = __section_nr(ms);

	if (section_nr > __highest_present_section_nr)
		__highest_present_section_nr = section_nr;

	ms->section_mem_map |= SECTION_MARKED_PRESENT;
}

#define for_each_present_section_nr(start, section_nr)		\
	for (section_nr = next_present_section_nr(start-1);	\
	     ((section_nr != -1) &&				\
	      (section_nr <= __highest_present_section_nr));	\
	     section_nr = next_present_section_nr(section_nr))

static inline unsigned long first_present_section_nr(void)
{
	return next_present_section_nr(-1);
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
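/*
 * Set the bits in @map covering every subsection spanned by the range
 * [@pfn, @pfn + @nr_pages).
 */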
static void subsection_mask_set(unsigned long *map, unsigned long pfn,
		unsigned long nr_pages)
{
	int idx = subsection_map_index(pfn);
	int end = subsection_map_index(pfn + nr_pages - 1);

	bitmap_set(map, idx, end - idx + 1);
}

void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
{
	int end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
	unsigned long nr, start_sec = pfn_to_section_nr(pfn);

	if (!nr_pages)
		return;

	for (nr = start_sec; nr <= end_sec; nr++) {
		struct mem_section *ms;
		unsigned long pfns;

		pfns = min(nr_pages, PAGES_PER_SECTION
				- (pfn & ~PAGE_SECTION_MASK));
		ms = __nr_to_section(nr);
		subsection_mask_set(ms->usage->subsection_map, pfn, pfns);

		pr_debug("%s: sec: %lu pfns: %lu set(%d, %d)\n", __func__, nr,
				pfns, subsection_map_index(pfn),
				subsection_map_index(pfn + pfns - 1));

		pfn += pfns;
		nr_pages -= pfns;
	}
}
#else
void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
{
}
#endif

/* Record a memory area against a node. */
static void __init memory_present(int nid, unsigned long start, unsigned long end)
{
	unsigned long pfn;

#ifdef CONFIG_SPARSEMEM_EXTREME
	if (unlikely(!mem_section)) {
		unsigned long size, align;

		size = sizeof(struct mem_section *) * NR_SECTION_ROOTS;
		align = 1 << (INTERNODE_CACHE_SHIFT);
		mem_section = memblock_alloc(size, align);
		if (!mem_section)
			panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
			      __func__, size, align);
	}
#endif

	start &= PAGE_SECTION_MASK;
	mminit_validate_memmodel_limits(&start, &end);
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
		unsigned long section = pfn_to_section_nr(pfn);
		struct mem_section *ms;

		sparse_index_init(section, nid);
		set_section_nid(section, nid);

		ms = __nr_to_section(section);
		if (!ms->section_mem_map) {
			ms->section_mem_map = sparse_encode_early_nid(nid) |
							SECTION_IS_ONLINE;
			section_mark_present(ms);
		}
	}
}

/*
 * Mark all memblocks as present using memory_present().
 * This is a convenience function that is useful to mark all of the systems
 * memory as present during initialization.
 */
static void __init memblocks_present(void)
{
	unsigned long start, end;
	int i, nid;

	for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid)
		memory_present(nid, start, end);
}

/*
 * Subtle, we encode the real pfn into the mem_map such that
 * the identity pfn - section_mem_map will return the actual
 * physical page frame number.
 */
static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
{
	unsigned long coded_mem_map =
		(unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
	BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT));
	BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
	return coded_mem_map;
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Decode mem_map from the coded memmap
 */
struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
{
	/* mask off the extra low bits of information */
	coded_mem_map &= SECTION_MAP_MASK;
	return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
}
#endif /* CONFIG_MEMORY_HOTPLUG */

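/*
 * Wire a section up to its mem_map and usage structure.  The encoded memmap
 * pointer, SECTION_HAS_MEM_MAP and the caller-supplied flags all live in
 * section_mem_map.
 */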
static void __meminit sparse_init_one_section(struct mem_section *ms,
		unsigned long pnum, struct page *mem_map,
		struct mem_section_usage *usage, unsigned long flags)
{
	ms->section_mem_map &= ~SECTION_MAP_MASK;
	ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum)
		| SECTION_HAS_MEM_MAP | flags;
	ms->usage = usage;
}

static unsigned long usemap_size(void)
{
	return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
}

size_t mem_section_usage_size(void)
{
	return sizeof(struct mem_section_usage) + usemap_size();
}

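/*
 * Translate a pgdat pointer to a physical address.  Without NUMA the node
 * data is a statically allocated kernel symbol, so __pa_symbol() has to be
 * used instead of __pa().
 */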
static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat)
{
#ifndef CONFIG_NUMA
	return __pa_symbol(pgdat);
#else
	return __pa(pgdat);
#endif
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static struct mem_section_usage * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
					 unsigned long size)
{
	struct mem_section_usage *usage;
	unsigned long goal, limit;
	int nid;
	/*
	 * A page may contain usemaps for other sections preventing the
	 * page being freed and making a section unremovable while
	 * other sections referencing the usemap remain active. Similarly,
	 * a pgdat can prevent a section being removed. If section A
	 * contains a pgdat and section B contains the usemap, both
	 * sections become inter-dependent. This allocates usemaps
	 * from the same section as the pgdat where possible to avoid
	 * this problem.
	 */
	goal = pgdat_to_phys(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
	limit = goal + (1UL << PA_SECTION_SHIFT);
	nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
	usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
	if (!usage && limit) {
		limit = 0;
		goto again;
	}
	return usage;
}

static void __init check_usemap_section_nr(int nid,
		struct mem_section_usage *usage)
{
	unsigned long usemap_snr, pgdat_snr;
	static unsigned long old_usemap_snr;
	static unsigned long old_pgdat_snr;
	struct pglist_data *pgdat = NODE_DATA(nid);
	int usemap_nid;

	/* First call */
	if (!old_usemap_snr) {
		old_usemap_snr = NR_MEM_SECTIONS;
		old_pgdat_snr = NR_MEM_SECTIONS;
	}

	usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
	pgdat_snr = pfn_to_section_nr(pgdat_to_phys(pgdat) >> PAGE_SHIFT);
	if (usemap_snr == pgdat_snr)
		return;

	if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr)
		/* skip redundant message */
		return;

	old_usemap_snr = usemap_snr;
	old_pgdat_snr = pgdat_snr;

	usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
	if (usemap_nid != nid) {
		pr_info("node %d must be removed before removing section %ld\n",
			nid, usemap_snr);
		return;
	}
	/*
	 * There is a circular dependency.
	 * Some platforms allow un-removable sections because they will just
	 * gather other removable sections for dynamic partitioning.
	 * Just notify the un-removable section's number here.
	 */
	pr_info("Section %ld and %ld (node %d) have a circular dependency on usemap and pgdat allocations\n",
		usemap_snr, pgdat_snr, nid);
}
#else
static struct mem_section_usage * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
					 unsigned long size)
{
	return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id);
}

static void __init check_usemap_section_nr(int nid,
		struct mem_section_usage *usage)
{
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static unsigned long __init section_map_size(void)
{
	return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE);
}

#else
static unsigned long __init section_map_size(void)
{
	return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
}

struct page __init *__populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
	unsigned long size = section_map_size();
	struct page *map = sparse_buffer_alloc(size);
	phys_addr_t addr = __pa(MAX_DMA_ADDRESS);

	if (map)
		return map;

	map = memblock_alloc_try_nid_raw(size, size, addr,
					  MEMBLOCK_ALLOC_ACCESSIBLE, nid);
	if (!map)
		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
		      __func__, size, PAGE_SIZE, nid, &addr);

	return map;
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

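/*
 * Scratch buffer that sparse_buffer_alloc() carves section memmaps out of;
 * set up per node by sparse_buffer_init() and torn down by
 * sparse_buffer_fini().
 */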
static void *sparsemap_buf __meminitdata;
static void *sparsemap_buf_end __meminitdata;

static inline void __meminit sparse_buffer_free(unsigned long size)
{
	WARN_ON(!sparsemap_buf || size == 0);
	memblock_free_early(__pa(sparsemap_buf), size);
}

static void __init sparse_buffer_init(unsigned long size, int nid)
{
	phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
	WARN_ON(sparsemap_buf);	/* forgot to call sparse_buffer_fini()? */
	/*
	 * Pre-allocated buffer is mainly used by __populate_section_memmap
	 * and we want it to be properly aligned to the section size - this is
	 * especially the case for VMEMMAP which maps memmap to PMDs
	 */
	sparsemap_buf = memblock_alloc_exact_nid_raw(size, section_map_size(),
					addr, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
	sparsemap_buf_end = sparsemap_buf + size;
}

static void __init sparse_buffer_fini(void)
{
	unsigned long size = sparsemap_buf_end - sparsemap_buf;

	if (sparsemap_buf && size > 0)
		sparse_buffer_free(size);
	sparsemap_buf = NULL;
}

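/*
 * Hand out a @size-aligned chunk of the pre-allocated buffer, returning any
 * skipped alignment padding back to memblock.  Returns NULL once the buffer
 * is exhausted so that callers can fall back to a regular allocation.
 */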
void * __meminit sparse_buffer_alloc(unsigned long size)
{
	void *ptr = NULL;

	if (sparsemap_buf) {
		ptr = (void *) roundup((unsigned long)sparsemap_buf, size);
		if (ptr + size > sparsemap_buf_end)
			ptr = NULL;
		else {
			/* Free redundant aligned space */
			if ((unsigned long)(ptr - sparsemap_buf) > 0)
				sparse_buffer_free((unsigned long)(ptr - sparsemap_buf));
			sparsemap_buf = ptr + size;
		}
	}
	return ptr;
}

void __weak __meminit vmemmap_populate_print_last(void)
{
}

/*
 * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
 * And number of present sections in this node is map_count.
 */
static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
				   unsigned long pnum_end,
				   unsigned long map_count)
{
	struct mem_section_usage *usage;
	unsigned long pnum;
	struct page *map;

	usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
			mem_section_usage_size() * map_count);
	if (!usage) {
		pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
		goto failed;
	}
	sparse_buffer_init(map_count * section_map_size(), nid);
	for_each_present_section_nr(pnum_begin, pnum) {
		unsigned long pfn = section_nr_to_pfn(pnum);

		if (pnum >= pnum_end)
			break;

		map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
				nid, NULL);
		if (!map) {
			pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
			       __func__, nid);
			pnum_begin = pnum;
			sparse_buffer_fini();
			goto failed;
		}
		check_usemap_section_nr(nid, usage);
		sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage,
				SECTION_IS_EARLY);
		usage = (void *) usage + mem_section_usage_size();
	}
	sparse_buffer_fini();
	return;
failed:
	/* We failed to allocate, mark all the following pnums as not present */
	for_each_present_section_nr(pnum_begin, pnum) {
		struct mem_section *ms;

		if (pnum >= pnum_end)
			break;
		ms = __nr_to_section(pnum);
		ms->section_mem_map = 0;
	}
}

/*
 * Allocate the accumulated non-linear sections, allocate a mem_map
 * for each and record the physical to section mapping.
 */
void __init sparse_init(void)
{
	unsigned long pnum_end, pnum_begin, map_count = 1;
	int nid_begin;

	memblocks_present();

	pnum_begin = first_present_section_nr();
	nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));

	/* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
	set_pageblock_order();

	for_each_present_section_nr(pnum_begin + 1, pnum_end) {
		int nid = sparse_early_nid(__nr_to_section(pnum_end));

		if (nid == nid_begin) {
			map_count++;
			continue;
		}
		/* Init node with sections in range [pnum_begin, pnum_end) */
		sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
		nid_begin = nid;
		pnum_begin = pnum_end;
		map_count = 1;
	}
	/* cover the last node */
	sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
	vmemmap_populate_print_last();
}

#ifdef CONFIG_MEMORY_HOTPLUG

/* Mark all memory sections within the pfn range as online */
void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		unsigned long section_nr = pfn_to_section_nr(pfn);
		struct mem_section *ms;

		/* onlining code should never touch invalid ranges */
		if (WARN_ON(!valid_section_nr(section_nr)))
			continue;

		ms = __nr_to_section(section_nr);
		ms->section_mem_map |= SECTION_IS_ONLINE;
	}
}

/* Mark all memory sections within the pfn range as offline */
void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		unsigned long section_nr = pfn_to_section_nr(pfn);
		struct mem_section *ms;

		/*
		 * TODO this needs some double checking. Offlining code makes
		 * sure to check pfn_valid but those checks might be just bogus
		 */
		if (WARN_ON(!valid_section_nr(section_nr)))
			continue;

		ms = __nr_to_section(section_nr);
		ms->section_mem_map &= ~SECTION_IS_ONLINE;
	}
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static struct page * __meminit populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
	return __populate_section_memmap(pfn, nr_pages, nid, altmap);
}

static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
		struct vmem_altmap *altmap)
{
	unsigned long start = (unsigned long) pfn_to_page(pfn);
	unsigned long end = start + nr_pages * sizeof(struct page);

	vmemmap_free(start, end, altmap);
}
static void free_map_bootmem(struct page *memmap)
{
	unsigned long start = (unsigned long)memmap;
	unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);

	vmemmap_free(start, end, NULL);
}

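/*
 * Clear the subsection_map bits for [@pfn, @pfn + @nr_pages).  Returns
 * -EINVAL (with a warning) if part of the range was never populated, i.e.
 * the section has already been deactivated.
 */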
static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages)
{
	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
	DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
	struct mem_section *ms = __pfn_to_section(pfn);
	unsigned long *subsection_map = ms->usage
		? &ms->usage->subsection_map[0] : NULL;

	subsection_mask_set(map, pfn, nr_pages);
	if (subsection_map)
		bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);

	if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
				"section already deactivated (%#lx + %ld)\n",
				pfn, nr_pages))
		return -EINVAL;

	bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
	return 0;
}

static bool is_subsection_map_empty(struct mem_section *ms)
{
	return bitmap_empty(&ms->usage->subsection_map[0],
			    SUBSECTIONS_PER_SECTION);
}

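/*
 * Set the subsection_map bits for [@pfn, @pfn + @nr_pages).  Returns -EINVAL
 * for an empty range and -EEXIST if any part of the range is already
 * populated.
 */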
static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
{
	struct mem_section *ms = __pfn_to_section(pfn);
	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
	unsigned long *subsection_map;
	int rc = 0;

	subsection_mask_set(map, pfn, nr_pages);

	subsection_map = &ms->usage->subsection_map[0];

	if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
		rc = -EINVAL;
	else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
		rc = -EEXIST;
	else
		bitmap_or(subsection_map, map, subsection_map,
				SUBSECTIONS_PER_SECTION);

	return rc;
}
#else
struct page * __meminit populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
	return kvmalloc_node(array_size(sizeof(struct page),
					PAGES_PER_SECTION), GFP_KERNEL, nid);
}

static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
		struct vmem_altmap *altmap)
{
	kvfree(pfn_to_page(pfn));
}

static void free_map_bootmem(struct page *memmap)
{
	unsigned long maps_section_nr, removing_section_nr, i;
	unsigned long magic, nr_pages;
	struct page *page = virt_to_page(memmap);

	nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
		>> PAGE_SHIFT;

	for (i = 0; i < nr_pages; i++, page++) {
		magic = (unsigned long) page->freelist;

		BUG_ON(magic == NODE_INFO);

		maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
		removing_section_nr = page_private(page);

		/*
		 * When this function is called, the section being removed is
		 * in a logically offlined state: all of its pages have been
		 * isolated from the page allocator. If the memmap of the
		 * section being removed lives in that same section, it must
		 * not be freed; otherwise the page allocator could hand it
		 * out again even though it is about to be removed physically.
		 */
		if (maps_section_nr != removing_section_nr)
			put_page_bootmem(page);
	}
}

static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages)
{
	return 0;
}

static bool is_subsection_map_empty(struct mem_section *ms)
{
	return true;
}

static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
{
	return 0;
}
#endif /* CONFIG_SPARSEMEM_VMEMMAP */

/*
 * To deactivate a memory region, there are 3 cases to handle across
 * two configurations (SPARSEMEM_VMEMMAP={y,n}):
 *
 * 1. deactivation of a partial hot-added section (only possible in
 *    the SPARSEMEM_VMEMMAP=y case).
 *      a) section was present at memory init.
 *      b) section was hot-added post memory init.
 * 2. deactivation of a complete hot-added section.
 * 3. deactivation of a complete section from memory init.
 *
 * For 1, when the subsection_map is not yet empty we will not free the
 * usage map, but we still need to free the vmemmap range.
 *
 * For 2 and 3, the SPARSEMEM_VMEMMAP={y,n} cases are unified
 */
static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
		struct vmem_altmap *altmap)
{
	struct mem_section *ms = __pfn_to_section(pfn);
	bool section_is_early = early_section(ms);
	struct page *memmap = NULL;
	bool empty;

	if (clear_subsection_map(pfn, nr_pages))
		return;

	empty = is_subsection_map_empty(ms);
	if (empty) {
		unsigned long section_nr = pfn_to_section_nr(pfn);

		/*
		 * When removing an early section, the usage map is kept (as the
		 * usage maps of other sections fall into the same page). It
		 * will be re-used when re-adding the section - which is then no
		 * longer an early section. If the usage map is PageReserved, it
		 * was allocated during boot.
		 */
		if (!PageReserved(virt_to_page(ms->usage))) {
			kfree(ms->usage);
			ms->usage = NULL;
		}
		memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
		/*
		 * Mark the section invalid so that valid_section()
		 * return false. This prevents code from dereferencing
		 * ms->usage array.
		 */
		ms->section_mem_map &= ~SECTION_HAS_MEM_MAP;
	}

	/*
	 * The memmap of early sections is always fully populated. See
	 * section_activate() and pfn_valid() .
	 */
	if (!section_is_early)
		depopulate_section_memmap(pfn, nr_pages, altmap);
	else if (memmap)
		free_map_bootmem(memmap);

	if (empty)
		ms->section_mem_map = (unsigned long)NULL;
}

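/*
 * Allocate the usage map if the section does not have one yet, mark the
 * subsections being added, and populate the memmap.  A partially populated
 * early section keeps its fully populated boot-time memmap and reuses it.
 */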
static struct page * __meminit section_activate(int nid, unsigned long pfn,
		unsigned long nr_pages, struct vmem_altmap *altmap)
{
	struct mem_section *ms = __pfn_to_section(pfn);
	struct mem_section_usage *usage = NULL;
	struct page *memmap;
	int rc = 0;

	if (!ms->usage) {
		usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
		if (!usage)
			return ERR_PTR(-ENOMEM);
		ms->usage = usage;
	}

	rc = fill_subsection_map(pfn, nr_pages);
	if (rc) {
		if (usage)
			ms->usage = NULL;
		kfree(usage);
		return ERR_PTR(rc);
	}

	/*
	 * The early init code does not consider partially populated
	 * initial sections, it simply assumes that memory will never be
	 * referenced.  If we hot-add memory into such a section then we
	 * do not need to populate the memmap and can simply reuse what
	 * is already there.
	 */
	if (nr_pages < PAGES_PER_SECTION && early_section(ms))
		return pfn_to_page(pfn);

	memmap = populate_section_memmap(pfn, nr_pages, nid, altmap);
	if (!memmap) {
		section_deactivate(pfn, nr_pages, altmap);
		return ERR_PTR(-ENOMEM);
	}

	return memmap;
}

/**
 * sparse_add_section - add a memory section, or populate an existing one
 * @nid: The node to add section on
 * @start_pfn: start pfn of the memory range
 * @nr_pages: number of pfns to add in the section
 * @altmap: device page map
 *
 * This is only intended for hotplug.
 *
 * Note that only VMEMMAP supports sub-section aligned hotplug,
 * the proper alignment and size are gated by check_pfn_span().
 *
 * Return:
 * * 0		- On success.
 * * -EEXIST	- Section has been present.
 * * -ENOMEM	- Out of memory.
 */
int __meminit sparse_add_section(int nid, unsigned long start_pfn,
		unsigned long nr_pages, struct vmem_altmap *altmap)
{
	unsigned long section_nr = pfn_to_section_nr(start_pfn);
	struct mem_section *ms;
	struct page *memmap;
	int ret;

	ret = sparse_index_init(section_nr, nid);
	if (ret < 0)
		return ret;

	memmap = section_activate(nid, start_pfn, nr_pages, altmap);
	if (IS_ERR(memmap))
		return PTR_ERR(memmap);

	/*
	 * Poison uninitialized struct pages in order to catch invalid flags
	 * combinations.
	 */
	page_init_poison(memmap, sizeof(struct page) * nr_pages);

	ms = __nr_to_section(section_nr);
	set_section_nid(section_nr, nid);
	section_mark_present(ms);

	/* Align memmap to section boundary in the subsection case */
	if (section_nr_to_pfn(section_nr) != start_pfn)
		memmap = pfn_to_page(section_nr_to_pfn(section_nr));
	sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0);

	return 0;
}

#ifdef CONFIG_MEMORY_FAILURE
static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
{
	int i;

	/*
	 * A further optimization is to have per section refcounted
	 * num_poisoned_pages.  But that would need more space per memmap, so
	 * for now just do a quick global check to speed up this routine in the
	 * absence of bad pages.
	 */
	if (atomic_long_read(&num_poisoned_pages) == 0)
		return;

	for (i = 0; i < nr_pages; i++) {
		if (PageHWPoison(&memmap[i])) {
			num_poisoned_pages_dec();
			ClearPageHWPoison(&memmap[i]);
		}
	}
}
#else
static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
{
}
#endif

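/*
 * Tear down [@pfn, @pfn + @nr_pages): drop any HWPoison accounting for the
 * part of the range past @map_offset and deactivate the mapping.
 */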
void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
		unsigned long nr_pages, unsigned long map_offset,
		struct vmem_altmap *altmap)
{
	clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset,
			nr_pages - map_offset);
	section_deactivate(pfn, nr_pages, altmap);
}
#endif /* CONFIG_MEMORY_HOTPLUG */