sparse.c 26.4 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
A
Andy Whitcroft 已提交
2 3 4 5
/*
 * sparse memory mappings.
 */
#include <linux/mm.h>
6
#include <linux/slab.h>
A
Andy Whitcroft 已提交
7
#include <linux/mmzone.h>
8
#include <linux/memblock.h>
9
#include <linux/compiler.h>
10
#include <linux/highmem.h>
11
#include <linux/export.h>
12
#include <linux/spinlock.h>
13
#include <linux/vmalloc.h>
14 15
#include <linux/swap.h>
#include <linux/swapops.h>
16

17
#include "internal.h"
A
Andy Whitcroft 已提交
18 19 20 21 22 23 24
#include <asm/dma.h>

/*
 * Permanent SPARSEMEM data:
 *
 * 1) mem_section	- memory sections, mem_map's for valid memory
 */
25
#ifdef CONFIG_SPARSEMEM_EXTREME
26
struct mem_section **mem_section;
27 28
#else
struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
29
	____cacheline_internodealigned_in_smp;
30 31 32
#endif
EXPORT_SYMBOL(mem_section);

33 34 35 36 37 38 39 40 41 42 43 44
#ifdef NODE_NOT_IN_PAGE_FLAGS
/*
 * If we did not store the node number in the page then we have to
 * do a lookup in the section_to_node_table in order to find which
 * node the page belongs to.
 */
#if MAX_NUMNODES <= 256
static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#else
static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#endif

I
Ian Campbell 已提交
45
int page_to_nid(const struct page *page)
46 47 48 49
{
	return section_to_node_table[page_to_section(page)];
}
EXPORT_SYMBOL(page_to_nid);
50 51 52 53 54 55 56 57 58

/* Record @nid as the node of section @section_nr in section_to_node_table. */
static void set_section_nid(unsigned long section_nr, int nid)
{
	section_to_node_table[section_nr] = nid;
}
#else /* !NODE_NOT_IN_PAGE_FLAGS */
/* No section-to-node table in this configuration, so there is nothing to record. */
static inline void set_section_nid(unsigned long section_nr, int nid)
{
}
59 60
#endif

61
#ifdef CONFIG_SPARSEMEM_EXTREME
62
/*
 * Allocate an array of SECTIONS_PER_ROOT mem_section entries on node @nid.
 *
 * The slab allocator is used once it is available; before that the array
 * comes from memblock, and a failed memblock allocation panics.  A NULL
 * return is therefore only possible on the slab path.
 */
static noinline struct mem_section __ref *sparse_index_alloc(int nid)
{
	unsigned long array_size = SECTIONS_PER_ROOT *
				   sizeof(struct mem_section);
	struct mem_section *section;

	if (slab_is_available())
		return kzalloc_node(array_size, GFP_KERNEL, nid);

	section = memblock_alloc_node(array_size, SMP_CACHE_BYTES, nid);
	if (!section)
		panic("%s: Failed to allocate %lu bytes nid=%d\n",
		      __func__, array_size, nid);

	return section;
}
B
Bob Picco 已提交
80

81
/*
 * Make sure the root that covers @section_nr has a mem_section array,
 * allocating one on @nid if the root slot is still empty.
 *
 * Returns 0 on success (including when the root already exists) and
 * -ENOMEM when the allocation fails.
 */
static int __meminit sparse_index_init(unsigned long section_nr, int nid)
{
	unsigned long root = SECTION_NR_TO_ROOT(section_nr);
	struct mem_section *section;

	/*
	 * An existing section is possible in the sub-section hotplug
	 * case. First hot-add instantiates, follow-on hot-add reuses
	 * the existing section.
	 *
	 * The mem_hotplug_lock resolves the apparent race below.
	 */
	if (!mem_section[root]) {
		section = sparse_index_alloc(nid);
		if (!section)
			return -ENOMEM;

		mem_section[root] = section;
	}

	return 0;
}
#else /* !SPARSEMEM_EXTREME */
static inline int sparse_index_init(unsigned long section_nr, int nid)
{
	return 0;
B
Bob Picco 已提交
108
}
109 110
#endif

111
#ifdef CONFIG_SPARSEMEM_EXTREME
112
unsigned long __section_nr(struct mem_section *ms)
113 114
{
	unsigned long root_nr;
115
	struct mem_section *root = NULL;
116

117 118
	for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) {
		root = __nr_to_section(root_nr * SECTIONS_PER_ROOT);
119 120 121 122 123 124 125
		if (!root)
			continue;

		if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
		     break;
	}

126
	VM_BUG_ON(!root);
127

128 129
	return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
}
130
#else
131
unsigned long __section_nr(struct mem_section *ms)
132
{
133
	return (unsigned long)(ms - mem_section[0]);
134 135
}
#endif
136

137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
/*
 * During early boot, before section_mem_map is used for an actual
 * mem_map, we use section_mem_map to store the section's NUMA
 * node.  This keeps us from having to use another data structure.  The
 * node information is cleared just before we store the real mem_map.
 */
/*
 * Encode @nid into the form stored in section_mem_map during early boot.
 *
 * The node id is widened to unsigned long before shifting so that the
 * shift is performed at the width of the returned value rather than on
 * an int, where high bits could be lost or overflow into the sign bit.
 */
static inline unsigned long sparse_encode_early_nid(int nid)
{
	return ((unsigned long)nid << SECTION_NID_SHIFT);
}

/* Recover the node id stored in @section's section_mem_map during early boot. */
static inline int sparse_early_nid(struct mem_section *section)
{
	unsigned long encoded = section->section_mem_map;

	return (encoded >> SECTION_NID_SHIFT);
}

153 154 155
/* Validate the physical addressing limitations of the model */
void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
						unsigned long *end_pfn)
{
	unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);

	/*
	 * Sanity checks - do not allow an architecture to pass
	 * in larger pfns than the maximum scope of sparsemem:
	 */
	if (*start_pfn > max_sparsemem_pfn) {
		/* The whole range is out of reach: collapse it to empty. */
		mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
			"Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
			*start_pfn, *end_pfn, max_sparsemem_pfn);
		WARN_ON_ONCE(1);
		*start_pfn = max_sparsemem_pfn;
		*end_pfn = max_sparsemem_pfn;
		return;
	}

	if (*end_pfn <= max_sparsemem_pfn)
		return;

	/* Only the tail is out of reach: clamp the end. */
	mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
		"End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
		*start_pfn, *end_pfn, max_sparsemem_pfn);
	WARN_ON_ONCE(1);
	*end_pfn = max_sparsemem_pfn;
}

179 180 181 182 183 184 185 186 187
/*
 * There are a number of times that we loop over NR_MEM_SECTIONS,
 * looking for section_present() on each.  But, when we have very
 * large physical address spaces, NR_MEM_SECTIONS can also be
 * very large which makes the loops quite long.
 *
 * Keeping track of this gives us an easy way to break out of
 * those loops early.
 */
188
unsigned long __highest_present_section_nr;
189 190
static void section_mark_present(struct mem_section *ms)
{
191
	unsigned long section_nr = __section_nr(ms);
192 193 194 195 196 197 198 199 200

	if (section_nr > __highest_present_section_nr)
		__highest_present_section_nr = section_nr;

	ms->section_mem_map |= SECTION_MARKED_PRESENT;
}

/*
 * Walk the present sections beginning at @start, storing each section
 * number in @section_nr.  The walk stops when next_present_section_nr()
 * yields -1 or a number above __highest_present_section_nr.
 *
 * Both arguments are parenthesized so that expression arguments such as
 * "a + b" or "cond ? x : y" expand with the intended precedence.
 */
#define for_each_present_section_nr(start, section_nr)			\
	for ((section_nr) = next_present_section_nr((start) - 1);	\
	     (((section_nr) != -1) &&					\
	      ((section_nr) <= __highest_present_section_nr));		\
	     (section_nr) = next_present_section_nr(section_nr))

205 206 207 208 209
/*
 * Return what next_present_section_nr() reports as the successor of -1,
 * i.e. the starting point for a walk over all present sections.
 */
static inline unsigned long first_present_section_nr(void)
{
	return next_present_section_nr(-1);
}

210
#ifdef CONFIG_SPARSEMEM_VMEMMAP
Y
Yi Wang 已提交
211
/*
 * Set, in @map, the bits for every subsection index touched by the pfn
 * range [@pfn, @pfn + @nr_pages).
 */
static void subsection_mask_set(unsigned long *map, unsigned long pfn,
		unsigned long nr_pages)
{
	int first = subsection_map_index(pfn);
	int last = subsection_map_index(pfn + nr_pages - 1);
	int nbits = last - first + 1;

	bitmap_set(map, first, nbits);
}

/*
 * Mark the subsections covered by [@pfn, @pfn + @nr_pages) in the
 * subsection map of every section the range touches.
 *
 * The range is consumed one section at a time: each step handles the
 * pages up to the end of the current section (or the end of the range).
 */
void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
{
	unsigned long nr, start_sec, end_sec;

	/* Nothing to do; also avoids computing "pfn + 0 - 1" below. */
	if (!nr_pages)
		return;

	/*
	 * Section numbers are unsigned long everywhere else; keep the
	 * loop bound the same type so it is neither truncated nor
	 * compared across signedness.
	 */
	start_sec = pfn_to_section_nr(pfn);
	end_sec = pfn_to_section_nr(pfn + nr_pages - 1);

	for (nr = start_sec; nr <= end_sec; nr++) {
		struct mem_section *ms;
		unsigned long pfns;

		/* Pages left in this section, capped by what remains. */
		pfns = min(nr_pages, PAGES_PER_SECTION
				- (pfn & ~PAGE_SECTION_MASK));
		ms = __nr_to_section(nr);
		subsection_mask_set(ms->usage->subsection_map, pfn, pfns);

		pr_debug("%s: sec: %lu pfns: %lu set(%d, %d)\n", __func__, nr,
				pfns, subsection_map_index(pfn),
				subsection_map_index(pfn + pfns - 1));

		pfn += pfns;
		nr_pages -= pfns;
	}
}
245 246 247 248 249
#else
/* Without CONFIG_SPARSEMEM_VMEMMAP there is no subsection map to set up. */
void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
{
}
#endif
250

251 252 253 254
/* Record a memory area against a node. */
void __init memory_present(int nid, unsigned long start, unsigned long end)
{
	unsigned long pfn;

#ifdef CONFIG_SPARSEMEM_EXTREME
	/* First call: allocate the array of root pointers itself. */
	if (unlikely(!mem_section)) {
		unsigned long size = sizeof(struct mem_section*) * NR_SECTION_ROOTS;
		unsigned long align = 1 << (INTERNODE_CACHE_SHIFT);

		mem_section = memblock_alloc(size, align);
		if (!mem_section)
			panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
			      __func__, size, align);
	}
#endif

	/* Round the start down to a section boundary, then clamp the range. */
	start &= PAGE_SECTION_MASK;
	mminit_validate_memmodel_limits(&start, &end);

	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
		unsigned long section = pfn_to_section_nr(pfn);
		struct mem_section *ms;

		sparse_index_init(section, nid);
		set_section_nid(section, nid);

		ms = __nr_to_section(section);
		/* Sections that already carry state are left untouched. */
		if (ms->section_mem_map)
			continue;

		ms->section_mem_map = sparse_encode_early_nid(nid) |
						SECTION_IS_ONLINE;
		section_mark_present(ms);
	}
}

287 288
/*
 * Mark all memblocks as present using memory_present(). This is a
289
 * convenience function that is useful for a number of arches
290 291 292 293 294 295 296 297 298 299 300 301 302
 * to mark all of the system's memory as present during initialization.
 */
void __init memblocks_present(void)
{
	struct memblock_region *reg;

	/* Feed every memblock memory region to memory_present(). */
	for_each_memblock(memory, reg) {
		int nid = memblock_get_region_node(reg);
		unsigned long start_pfn = memblock_region_memory_base_pfn(reg);
		unsigned long end_pfn = memblock_region_memory_end_pfn(reg);

		memory_present(nid, start_pfn, end_pfn);
	}
}

A
Andy Whitcroft 已提交
303 304 305 306 307 308 309
/*
 * Subtle, we encode the real pfn into the mem_map such that
 * the identity pfn - section_mem_map will return the actual
 * physical page frame number.
 */
static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
{
310 311 312 313 314
	unsigned long coded_mem_map =
		(unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
	BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT));
	BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
	return coded_mem_map;
A
Andy Whitcroft 已提交
315 316 317
}

/*
318
 * Decode mem_map from the coded memmap
A
Andy Whitcroft 已提交
319 320 321
 */
struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
{
322 323
	/* mask off the extra low bits of information */
	coded_mem_map &= SECTION_MAP_MASK;
A
Andy Whitcroft 已提交
324 325 326
	return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
}

327
/*
 * Install @mem_map and @usage into @ms: the bits covered by
 * SECTION_MAP_MASK are replaced by the encoded map, and
 * SECTION_HAS_MEM_MAP plus the caller's @flags are set.
 */
static void __meminit sparse_init_one_section(struct mem_section *ms,
		unsigned long pnum, struct page *mem_map,
		struct mem_section_usage *usage, unsigned long flags)
{
	unsigned long new_bits;

	ms->section_mem_map &= ~SECTION_MAP_MASK;

	new_bits = sparse_encode_mem_map(mem_map, pnum);
	new_bits |= SECTION_HAS_MEM_MAP | flags;
	ms->section_mem_map |= new_bits;

	ms->usage = usage;
}

337
static unsigned long usemap_size(void)
338
{
339
	return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
340 341
}

342
size_t mem_section_usage_size(void)
343
{
344
	return sizeof(struct mem_section_usage) + usemap_size();
345 346
}

347
#ifdef CONFIG_MEMORY_HOTREMOVE
348
static struct mem_section_usage * __init
349
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
350
					 unsigned long size)
351
{
352
	struct mem_section_usage *usage;
353 354
	unsigned long goal, limit;
	int nid;
355 356 357
	/*
	 * A page may contain usemaps for other sections preventing the
	 * page being freed and making a section unremovable while
L
Li Zhong 已提交
358
	 * other sections referencing the usemap remain active. Similarly,
359 360 361 362 363 364
	 * a pgdat can prevent a section being removed. If section A
	 * contains a pgdat and section B contains the usemap, both
	 * sections become inter-dependent. This allocates usemaps
	 * from the same section as the pgdat where possible to avoid
	 * this problem.
	 */
365
	goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
366 367 368
	limit = goal + (1UL << PA_SECTION_SHIFT);
	nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
369 370
	usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
	if (!usage && limit) {
371 372 373
		limit = 0;
		goto again;
	}
374
	return usage;
375 376
}

377 378
/*
 * Report when the usemap @usage of node @nid does not live in the same
 * section as the node's pgdat.  The last reported pair is remembered in
 * static variables so the same message is not printed repeatedly.
 */
static void __init check_usemap_section_nr(int nid,
		struct mem_section_usage *usage)
{
	static unsigned long old_usemap_snr;
	static unsigned long old_pgdat_snr;
	struct pglist_data *pgdat = NODE_DATA(nid);
	unsigned long usemap_snr, pgdat_snr;
	int usemap_nid;

	/* First call */
	if (!old_usemap_snr) {
		old_usemap_snr = NR_MEM_SECTIONS;
		old_pgdat_snr = NR_MEM_SECTIONS;
	}

	usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
	pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);

	/* Same section: no dependency to report. */
	if (usemap_snr == pgdat_snr)
		return;

	/* skip redundant message */
	if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr)
		return;

	old_usemap_snr = usemap_snr;
	old_pgdat_snr = pgdat_snr;

	usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
	if (usemap_nid != nid) {
		pr_info("node %d must be removed before remove section %ld\n",
			nid, usemap_snr);
	} else {
		/*
		 * There is a circular dependency.
		 * Some platforms allow un-removable section because they will just
		 * gather other removable sections for dynamic partitioning.
		 * Just notify un-removable section's number here.
		 */
		pr_info("Section %ld and %ld (node %d) have a circular dependency on usemap and pgdat allocations\n",
			usemap_snr, pgdat_snr, nid);
	}
}
#else
420
static struct mem_section_usage * __init
421
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
422
					 unsigned long size)
423
{
424
	return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id);
425 426
}

427 428
static void __init check_usemap_section_nr(int nid,
		struct mem_section_usage *usage)
429 430 431 432
{
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

433
#ifdef CONFIG_SPARSEMEM_VMEMMAP
434
/* Bytes of struct page for one section, rounded up to PMD_SIZE. */
static unsigned long __init section_map_size(void)
{
	unsigned long bytes = sizeof(struct page) * PAGES_PER_SECTION;

	return ALIGN(bytes, PMD_SIZE);
}

#else
440
/* Bytes of struct page for one section, rounded up to a whole page. */
static unsigned long __init section_map_size(void)
{
	unsigned long bytes = sizeof(struct page) * PAGES_PER_SECTION;

	return PAGE_ALIGN(bytes);
}

445 446
/*
 * Obtain the memmap for one section: take it from the pre-allocated
 * sparse buffer when possible, otherwise allocate it from memblock on
 * @nid, starting the search at MAX_DMA_ADDRESS.  @pfn, @nr_pages and
 * @altmap are not used by this variant.
 *
 * Panics when the memblock allocation fails, so it never returns NULL.
 */
struct page __init *__populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
	unsigned long size = section_map_size();
	struct page *map = sparse_buffer_alloc(size);
	phys_addr_t addr = __pa(MAX_DMA_ADDRESS);

	if (map)
		return map;

	map = memblock_alloc_try_nid_raw(size, size, addr,
					  MEMBLOCK_ALLOC_ACCESSIBLE, nid);
	/* Report the alignment that was actually requested (== size). */
	if (!map)
		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
		      __func__, size, size, nid, &addr);

	return map;
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

465 466 467
static void *sparsemap_buf __meminitdata;
static void *sparsemap_buf_end __meminitdata;

468 469 470 471 472 473
/* Give @size bytes starting at sparsemap_buf back to memblock. */
static inline void __meminit sparse_buffer_free(unsigned long size)
{
	phys_addr_t base;

	WARN_ON(!sparsemap_buf || size == 0);
	base = __pa(sparsemap_buf);
	memblock_free_early(base, size);
}

474
/*
 * Pre-allocate @size bytes on @nid and record the buffer bounds in
 * sparsemap_buf / sparsemap_buf_end.
 */
static void __init sparse_buffer_init(unsigned long size, int nid)
{
	phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
	unsigned long align = section_map_size();

	WARN_ON(sparsemap_buf);	/* forgot to call sparse_buffer_fini()? */
	/*
	 * Pre-allocated buffer is mainly used by __populate_section_memmap
	 * and we want it to be properly aligned to the section size - this is
	 * especially the case for VMEMMAP which maps memmap to PMDs
	 */
	sparsemap_buf = memblock_alloc_exact_nid_raw(size, align, addr,
					MEMBLOCK_ALLOC_ACCESSIBLE, nid);
	sparsemap_buf_end = sparsemap_buf + size;
}

488
/* Free whatever is left of the sparse buffer and mark it as gone. */
static void __init sparse_buffer_fini(void)
{
	unsigned long remaining = sparsemap_buf_end - sparsemap_buf;

	if (sparsemap_buf && remaining > 0)
		sparse_buffer_free(remaining);

	sparsemap_buf = NULL;
}

/*
 * Carve @size bytes out of the sparse buffer, with the start rounded up
 * to a multiple of @size.  Returns NULL when there is no buffer or the
 * rounded-up chunk does not fit.
 */
void * __meminit sparse_buffer_alloc(unsigned long size)
{
	unsigned long gap;
	void *ptr;

	if (!sparsemap_buf)
		return NULL;

	ptr = (void *) roundup((unsigned long)sparsemap_buf, size);
	if (ptr + size > sparsemap_buf_end)
		return NULL;

	/* Free redundant aligned space */
	gap = (unsigned long)(ptr - sparsemap_buf);
	if (gap > 0)
		sparse_buffer_free(gap);

	sparsemap_buf = ptr + size;

	return ptr;
}

515
void __weak __meminit vmemmap_populate_print_last(void)
516 517
{
}
518

519 520 521 522 523 524 525 526
/*
 * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
 * And number of present sections in this node is map_count.
 */
static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
				   unsigned long pnum_end,
				   unsigned long map_count)
{
	struct mem_section_usage *usage;
	unsigned long pnum;
	struct page *map;

	/* One usage structure per present section, allocated in one go. */
	usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
			mem_section_usage_size() * map_count);
	if (!usage) {
		pr_err("%s: node[%d] usemap allocation failed\n", __func__, nid);
		goto failed;
	}
	sparse_buffer_init(map_count * section_map_size(), nid);
	for_each_present_section_nr(pnum_begin, pnum) {
		unsigned long pfn = section_nr_to_pfn(pnum);

		if (pnum >= pnum_end)
			break;

		map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
				nid, NULL);
		if (!map) {
			pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.\n",
			       __func__, nid);
			/* Sections before pnum are already set up; keep them. */
			pnum_begin = pnum;
			/*
			 * Release what is left of the pre-allocated buffer.
			 * Otherwise it is leaked and the next call to
			 * sparse_buffer_init() trips WARN_ON(sparsemap_buf).
			 */
			sparse_buffer_fini();
			goto failed;
		}
		check_usemap_section_nr(nid, usage);
		sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage,
				SECTION_IS_EARLY);
		/* Advance to the next section's slot in the usage block. */
		usage = (void *) usage + mem_section_usage_size();
	}
	sparse_buffer_fini();
	return;
failed:
	/* We failed to allocate, mark all the following pnums as not present */
	for_each_present_section_nr(pnum_begin, pnum) {
		struct mem_section *ms;

		if (pnum >= pnum_end)
			break;
		ms = __nr_to_section(pnum);
		ms->section_mem_map = 0;
	}
}

/*
 * Allocate the accumulated non-linear sections, allocate a mem_map
 * for each and record the physical to section mapping.
 */
575
void __init sparse_init(void)
{
	unsigned long pnum_begin = first_present_section_nr();
	int nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
	unsigned long pnum_end, map_count = 1;

	/* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
	set_pageblock_order();

	/*
	 * Group consecutive present sections by node: count sections while
	 * the node stays the same and flush the group when it changes.
	 */
	for_each_present_section_nr(pnum_begin + 1, pnum_end) {
		int nid = sparse_early_nid(__nr_to_section(pnum_end));

		if (nid != nid_begin) {
			/* Init node with sections in range [pnum_begin, pnum_end) */
			sparse_init_nid(nid_begin, pnum_begin, pnum_end,
					map_count);
			nid_begin = nid;
			pnum_begin = pnum_end;
			map_count = 1;
		} else {
			map_count++;
		}
	}
	/* cover the last node */
	sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
	vmemmap_populate_print_last();
}

602
#ifdef CONFIG_MEMORY_HOTPLUG
603 604 605 606 607 608 609

/* Mark all memory sections within the pfn range as online */
void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
610
		unsigned long section_nr = pfn_to_section_nr(pfn);
611 612 613 614 615 616 617 618 619 620 621 622
		struct mem_section *ms;

		/* onlining code should never touch invalid ranges */
		if (WARN_ON(!valid_section_nr(section_nr)))
			continue;

		ms = __nr_to_section(section_nr);
		ms->section_mem_map |= SECTION_IS_ONLINE;
	}
}

#ifdef CONFIG_MEMORY_HOTREMOVE
623
/* Mark all memory sections within the pfn range as offline */
624 625 626 627 628
void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
629
		unsigned long section_nr = pfn_to_section_nr(pfn);
630 631 632 633 634 635 636 637 638 639 640 641 642 643 644
		struct mem_section *ms;

		/*
		 * TODO this needs some double checking. Offlining code makes
		 * sure to check pfn_valid but those checks might be just bogus
		 */
		if (WARN_ON(!valid_section_nr(section_nr)))
			continue;

		ms = __nr_to_section(section_nr);
		ms->section_mem_map &= ~SECTION_IS_ONLINE;
	}
}
#endif

645
#ifdef CONFIG_SPARSEMEM_VMEMMAP
646
/* Hotplug wrapper: forwards unchanged to __populate_section_memmap(). */
static struct page * __meminit populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
	struct page *memmap;

	memmap = __populate_section_memmap(pfn, nr_pages, nid, altmap);
	return memmap;
}
651 652

static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
653
		struct vmem_altmap *altmap)
654
{
655 656
	unsigned long start = (unsigned long) pfn_to_page(pfn);
	unsigned long end = start + nr_pages * sizeof(struct page);
657

658
	vmemmap_free(start, end, altmap);
659
}
660
static void free_map_bootmem(struct page *memmap)
661
{
662
	unsigned long start = (unsigned long)memmap;
663
	unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
664

665
	vmemmap_free(start, end, NULL);
666
}
667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715

/*
 * Clear the subsection bits for [@pfn, @pfn + @nr_pages) in the owning
 * section's map.  Returns -EINVAL (with a warning) when the section has
 * no usage map or some of the bits are not currently set, 0 otherwise.
 */
static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages)
{
	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
	DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
	struct mem_section *ms = __pfn_to_section(pfn);
	unsigned long *subsection_map = NULL;

	if (ms->usage)
		subsection_map = &ms->usage->subsection_map[0];

	/* map: bits to clear; tmp: those of them that are actually set. */
	subsection_mask_set(map, pfn, nr_pages);
	if (subsection_map)
		bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);

	if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
				"section already deactivated (%#lx + %ld)\n",
				pfn, nr_pages))
		return -EINVAL;

	/* All requested bits are set, so xor clears exactly those bits. */
	bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
	return 0;
}

/* True when no subsection bit is set in @ms's usage map. */
static bool is_subsection_map_empty(struct mem_section *ms)
{
	unsigned long *subsection_map = &ms->usage->subsection_map[0];

	return bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION);
}

/*
 * Set the subsection bits for [@pfn, @pfn + @nr_pages) in the owning
 * section's map.  Returns -EINVAL when the range yields no bits,
 * -EEXIST when any of the bits is already set, 0 on success.
 */
static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
{
	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
	struct mem_section *ms = __pfn_to_section(pfn);
	unsigned long *subsection_map;

	subsection_mask_set(map, pfn, nr_pages);

	subsection_map = &ms->usage->subsection_map[0];

	if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
		return -EINVAL;

	if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
		return -EEXIST;

	bitmap_or(subsection_map, map, subsection_map,
			SUBSECTIONS_PER_SECTION);
	return 0;
}
716
#else
717
/*
 * Allocate a full section's worth of struct page on @nid; @pfn,
 * @nr_pages and @altmap are not used by this variant.
 */
struct page * __meminit populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
	size_t bytes = array_size(sizeof(struct page), PAGES_PER_SECTION);

	return kvmalloc_node(bytes, GFP_KERNEL, nid);
}

724
/* Free the memmap whose first entry is the struct page of @pfn. */
static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
		struct vmem_altmap *altmap)
{
	struct page *memmap = pfn_to_page(pfn);

	kvfree(memmap);
}
729

730
static void free_map_bootmem(struct page *memmap)
731 732
{
	unsigned long maps_section_nr, removing_section_nr, i;
733
	unsigned long magic, nr_pages;
734
	struct page *page = virt_to_page(memmap);
735

736 737 738
	nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
		>> PAGE_SHIFT;

739
	for (i = 0; i < nr_pages; i++, page++) {
740
		magic = (unsigned long) page->freelist;
741 742 743 744

		BUG_ON(magic == NODE_INFO);

		maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
745
		removing_section_nr = page_private(page);
746 747 748 749 750 751 752 753 754 755 756 757 758

		/*
		 * When this function is called, the removing section is
		 * logical offlined state. This means all pages are isolated
		 * from page allocator. If removing section's memmap is placed
		 * on the same section, it must not be freed.
		 * If it is freed, page allocator may allocate it which will
		 * be removed physically soon.
		 */
		if (maps_section_nr != removing_section_nr)
			put_page_bootmem(page);
	}
}
759

760
static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages)
761
{
762 763 764 765 766
	return 0;
}

static bool is_subsection_map_empty(struct mem_section *ms)
{
767
	return true;
768 769
}

770
static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
771
{
772
	return 0;
773
}
774
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
775

776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791
/*
 * To deactivate a memory region, there are 3 cases to handle across
 * two configurations (SPARSEMEM_VMEMMAP={y,n}):
 *
 * 1. deactivation of a partial hot-added section (only possible in
 *    the SPARSEMEM_VMEMMAP=y case).
 *      a) section was present at memory init.
 *      b) section was hot-added post memory init.
 * 2. deactivation of a complete hot-added section.
 * 3. deactivation of a complete section from memory init.
 *
 * For 1, when subsection_map is not empty we will not be freeing the
 * usage map, but still need to free the vmemmap range.
 *
 * For 2 and 3, the SPARSEMEM_VMEMMAP={y,n} cases are unified
 */
792 793 794 795 796 797 798 799 800 801
static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
		struct vmem_altmap *altmap)
{
	struct mem_section *ms = __pfn_to_section(pfn);
	bool section_is_early = early_section(ms);
	struct page *memmap = NULL;
	bool empty;

	if (clear_subsection_map(pfn, nr_pages))
		return;
802

803
	empty = is_subsection_map_empty(ms);
804
	if (empty) {
805 806
		unsigned long section_nr = pfn_to_section_nr(pfn);

807 808 809 810 811 812 813 814
		/*
		 * When removing an early section, the usage map is kept (as the
		 * usage maps of other sections fall into the same page). It
		 * will be re-used when re-adding the section - which is then no
		 * longer an early section. If the usage map is PageReserved, it
		 * was allocated during boot.
		 */
		if (!PageReserved(virt_to_page(ms->usage))) {
815 816 817 818
			kfree(ms->usage);
			ms->usage = NULL;
		}
		memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
819 820 821 822 823 824
		/*
		 * Mark the section invalid so that valid_section()
		 * return false. This prevents code from dereferencing
		 * ms->usage array.
		 */
		ms->section_mem_map &= ~SECTION_HAS_MEM_MAP;
825 826
	}

827 828 829 830 831
	/*
	 * The memmap of early sections is always fully populated. See
	 * section_activate() and pfn_valid() .
	 */
	if (!section_is_early)
832
		depopulate_section_memmap(pfn, nr_pages, altmap);
833 834
	else if (memmap)
		free_map_bootmem(memmap);
835 836 837

	if (empty)
		ms->section_mem_map = (unsigned long)NULL;
838 839
}

840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855
/*
 * Activate [@pfn, @pfn + @nr_pages) within its section: make sure the
 * section has a usage map, mark the subsections, and provide a memmap.
 *
 * Returns the memmap on success or an ERR_PTR() carrying -ENOMEM or the
 * error reported by fill_subsection_map().
 */
static struct page * __meminit section_activate(int nid, unsigned long pfn,
		unsigned long nr_pages, struct vmem_altmap *altmap)
{
	struct mem_section *ms = __pfn_to_section(pfn);
	struct mem_section_usage *new_usage = NULL;
	struct page *memmap;
	int rc;

	if (!ms->usage) {
		new_usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
		if (!new_usage)
			return ERR_PTR(-ENOMEM);
		ms->usage = new_usage;
	}

	rc = fill_subsection_map(pfn, nr_pages);
	if (rc) {
		/* Undo only a usage map that was installed just above. */
		if (new_usage) {
			ms->usage = NULL;
			kfree(new_usage);
		}
		return ERR_PTR(rc);
	}

	/*
	 * The early init code does not consider partially populated
	 * initial sections, it simply assumes that memory will never be
	 * referenced.  If we hot-add memory into such a section then we
	 * do not need to populate the memmap and can simply reuse what
	 * is already there.
	 */
	if (nr_pages < PAGES_PER_SECTION && early_section(ms))
		return pfn_to_page(pfn);

	memmap = populate_section_memmap(pfn, nr_pages, nid, altmap);
	if (memmap)
		return memmap;

	section_deactivate(pfn, nr_pages, altmap);
	return ERR_PTR(-ENOMEM);
}

882
/**
883
 * sparse_add_section - add a memory section, or populate an existing one
884 885
 * @nid: The node to add section on
 * @start_pfn: start pfn of the memory range
886
 * @nr_pages: number of pfns to add in the section
887 888 889 890
 * @altmap: device page map
 *
 * This is only intended for hotplug.
 *
891 892 893 894
 * Note that only VMEMMAP supports sub-section aligned hotplug,
 * the proper alignment and size are gated by check_pfn_span().
 *
 *
895 896 897 898
 * Return:
 * * 0		- On success.
 * * -EEXIST	- Section has been present.
 * * -ENOMEM	- Out of memory.
A
Andy Whitcroft 已提交
899
 */
900 901
int __meminit sparse_add_section(int nid, unsigned long start_pfn,
		unsigned long nr_pages, struct vmem_altmap *altmap)
{
	unsigned long section_nr = pfn_to_section_nr(start_pfn);
	unsigned long section_start_pfn;
	struct mem_section *ms;
	struct page *memmap;
	int ret;

	ret = sparse_index_init(section_nr, nid);
	if (ret < 0)
		return ret;

	memmap = section_activate(nid, start_pfn, nr_pages, altmap);
	if (IS_ERR(memmap))
		return PTR_ERR(memmap);

	/*
	 * Poison uninitialized struct pages in order to catch invalid flags
	 * combinations.
	 */
	page_init_poison(memmap, sizeof(struct page) * nr_pages);

	ms = __nr_to_section(section_nr);
	set_section_nid(section_nr, nid);
	section_mark_present(ms);

	/* Align memmap to section boundary in the subsection case */
	section_start_pfn = section_nr_to_pfn(section_nr);
	if (section_start_pfn != start_pfn)
		memmap = pfn_to_page(section_start_pfn);

	sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0);

	return 0;
}
933

934 935 936 937 938
#ifdef CONFIG_MEMORY_FAILURE
static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
{
	int i;

939 940 941 942 943 944 945 946 947
	/*
	 * A further optimization is to have per section refcounted
	 * num_poisoned_pages.  But that would need more space per memmap, so
	 * for now just do a quick global check to speed up this routine in the
	 * absence of bad pages.
	 */
	if (atomic_long_read(&num_poisoned_pages) == 0)
		return;

948
	for (i = 0; i < nr_pages; i++) {
949
		if (PageHWPoison(&memmap[i])) {
950
			num_poisoned_pages_dec();
951 952 953 954 955 956 957 958 959 960
			ClearPageHWPoison(&memmap[i]);
		}
	}
}
#else
static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
{
}
#endif

961
/*
 * Remove [@pfn, @pfn + @nr_pages): clear HWPoison state on the pages
 * from @map_offset onwards, then deactivate the range.  @ms is not used
 * here.
 */
void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
		unsigned long nr_pages, unsigned long map_offset,
		struct vmem_altmap *altmap)
{
	struct page *first = pfn_to_page(pfn) + map_offset;

	clear_hwpoisoned_pages(first, nr_pages - map_offset);
	section_deactivate(pfn, nr_pages, altmap);
}
969
#endif /* CONFIG_MEMORY_HOTPLUG */