// SPDX-License-Identifier: GPL-2.0
/*
 * sparse memory mappings.
 */
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/mmzone.h>
#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/swapops.h>

#include "internal.h"
#include <asm/dma.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>

/*
 * Permanent SPARSEMEM data:
 *
 * 1) mem_section	- memory sections, mem_map's for valid memory
 */
#ifdef CONFIG_SPARSEMEM_EXTREME
struct mem_section **mem_section;
#else
struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
	____cacheline_internodealigned_in_smp;
#endif
EXPORT_SYMBOL(mem_section);

#ifdef NODE_NOT_IN_PAGE_FLAGS
/*
 * If we did not store the node number in the page then we have to
 * do a lookup in the section_to_node_table in order to find which
 * node the page belongs to.
 */
#if MAX_NUMNODES <= 256
static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#else
static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#endif

int page_to_nid(const struct page *page)
{
	return section_to_node_table[page_to_section(page)];
}
EXPORT_SYMBOL(page_to_nid);

static void set_section_nid(unsigned long section_nr, int nid)
{
	section_to_node_table[section_nr] = nid;
}
#else /* !NODE_NOT_IN_PAGE_FLAGS */
static inline void set_section_nid(unsigned long section_nr, int nid)
{
}
#endif

#ifdef CONFIG_SPARSEMEM_EXTREME
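/*
 * Allocate one root's worth of mem_section structures for
 * SPARSEMEM_EXTREME: from the slab once it is available, or from
 * memblock during early boot.
 */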
static noinline struct mem_section __ref *sparse_index_alloc(int nid)
{
	struct mem_section *section = NULL;
	unsigned long array_size = SECTIONS_PER_ROOT *
				   sizeof(struct mem_section);

	if (slab_is_available()) {
		section = kzalloc_node(array_size, GFP_KERNEL, nid);
	} else {
		section = memblock_alloc_node(array_size, SMP_CACHE_BYTES,
					      nid);
		if (!section)
			panic("%s: Failed to allocate %lu bytes nid=%d\n",
			      __func__, array_size, nid);
	}

	return section;
}

static int __meminit sparse_index_init(unsigned long section_nr, int nid)
{
	unsigned long root = SECTION_NR_TO_ROOT(section_nr);
	struct mem_section *section;

	/*
	 * An existing section is possible in the sub-section hotplug
	 * case. First hot-add instantiates, follow-on hot-add reuses
	 * the existing section.
	 *
	 * The mem_hotplug_lock resolves the apparent race below.
	 */
	if (mem_section[root])
		return 0;

	section = sparse_index_alloc(nid);
	if (!section)
		return -ENOMEM;

	mem_section[root] = section;

	return 0;
}
#else /* !SPARSEMEM_EXTREME */
static inline int sparse_index_init(unsigned long section_nr, int nid)
{
	return 0;
}
#endif

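/*
 * Translate a mem_section pointer back to its section number: scan the
 * root array under SPARSEMEM_EXTREME, or use plain pointer arithmetic
 * on the static mem_section[] array otherwise.
 */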
#ifdef CONFIG_SPARSEMEM_EXTREME
unsigned long __section_nr(struct mem_section *ms)
{
	unsigned long root_nr;
	struct mem_section *root = NULL;

	for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) {
		root = __nr_to_section(root_nr * SECTIONS_PER_ROOT);
		if (!root)
			continue;

		if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
			break;
	}

	VM_BUG_ON(!root);

	return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
}
#else
unsigned long __section_nr(struct mem_section *ms)
{
	return (unsigned long)(ms - mem_section[0]);
}
#endif

/*
 * During early boot, before section_mem_map is used for an actual
 * mem_map, we use section_mem_map to store the section's NUMA
 * node.  This keeps us from having to use another data structure.  The
 * node information is cleared just before we store the real mem_map.
 */
static inline unsigned long sparse_encode_early_nid(int nid)
{
	return (nid << SECTION_NID_SHIFT);
}

static inline int sparse_early_nid(struct mem_section *section)
{
	return (section->section_mem_map >> SECTION_NID_SHIFT);
}

/* Validate the physical addressing limitations of the model */
void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
						unsigned long *end_pfn)
{
	unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);

	/*
	 * Sanity checks - do not allow an architecture to pass
	 * in larger pfns than the maximum scope of sparsemem:
	 */
	if (*start_pfn > max_sparsemem_pfn) {
		mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
			"Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
			*start_pfn, *end_pfn, max_sparsemem_pfn);
		WARN_ON_ONCE(1);
		*start_pfn = max_sparsemem_pfn;
		*end_pfn = max_sparsemem_pfn;
	} else if (*end_pfn > max_sparsemem_pfn) {
		mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
			"End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
			*start_pfn, *end_pfn, max_sparsemem_pfn);
		WARN_ON_ONCE(1);
		*end_pfn = max_sparsemem_pfn;
	}
}

/*
 * There are a number of times that we loop over NR_MEM_SECTIONS,
 * looking for section_present() on each.  But, when we have very
 * large physical address spaces, NR_MEM_SECTIONS can also be
 * very large which makes the loops quite long.
 *
 * Keeping track of this gives us an easy way to break out of
 * those loops early.
 */
unsigned long __highest_present_section_nr;
static void section_mark_present(struct mem_section *ms)
{
	unsigned long section_nr = __section_nr(ms);

	if (section_nr > __highest_present_section_nr)
		__highest_present_section_nr = section_nr;

	ms->section_mem_map |= SECTION_MARKED_PRESENT;
}

static inline unsigned long next_present_section_nr(unsigned long section_nr)
{
	do {
		section_nr++;
		if (present_section_nr(section_nr))
			return section_nr;
	} while (section_nr <= __highest_present_section_nr);

	return -1;
}
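/*
 * Iterate over the present section numbers starting at @start;
 * __highest_present_section_nr bounds the walk.
 */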
#define for_each_present_section_nr(start, section_nr)		\
	for (section_nr = next_present_section_nr(start-1);	\
	     ((section_nr != -1) &&				\
	      (section_nr <= __highest_present_section_nr));	\
	     section_nr = next_present_section_nr(section_nr))

static inline unsigned long first_present_section_nr(void)
{
	return next_present_section_nr(-1);
}

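/*
 * Set the bits in @map covering the subsections spanned by
 * [pfn, pfn + nr_pages).
 */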
static void subsection_mask_set(unsigned long *map, unsigned long pfn,
		unsigned long nr_pages)
{
	int idx = subsection_map_index(pfn);
	int end = subsection_map_index(pfn + nr_pages - 1);

	bitmap_set(map, idx, end - idx + 1);
}

void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
{
	int end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
	unsigned long nr, start_sec = pfn_to_section_nr(pfn);

	if (!nr_pages)
		return;

	for (nr = start_sec; nr <= end_sec; nr++) {
		struct mem_section *ms;
		unsigned long pfns;

		pfns = min(nr_pages, PAGES_PER_SECTION
				- (pfn & ~PAGE_SECTION_MASK));
		ms = __nr_to_section(nr);
		subsection_mask_set(ms->usage->subsection_map, pfn, pfns);

		pr_debug("%s: sec: %lu pfns: %lu set(%d, %d)\n", __func__, nr,
				pfns, subsection_map_index(pfn),
				subsection_map_index(pfn + pfns - 1));

		pfn += pfns;
		nr_pages -= pfns;
	}
}

/* Record a memory area against a node. */
void __init memory_present(int nid, unsigned long start, unsigned long end)
{
	unsigned long pfn;

#ifdef CONFIG_SPARSEMEM_EXTREME
	if (unlikely(!mem_section)) {
		unsigned long size, align;

		size = sizeof(struct mem_section*) * NR_SECTION_ROOTS;
		align = 1 << (INTERNODE_CACHE_SHIFT);
		mem_section = memblock_alloc(size, align);
		if (!mem_section)
			panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
			      __func__, size, align);
	}
#endif

	start &= PAGE_SECTION_MASK;
	mminit_validate_memmodel_limits(&start, &end);
	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
		unsigned long section = pfn_to_section_nr(pfn);
		struct mem_section *ms;

		sparse_index_init(section, nid);
		set_section_nid(section, nid);

		ms = __nr_to_section(section);
		if (!ms->section_mem_map) {
			ms->section_mem_map = sparse_encode_early_nid(nid) |
							SECTION_IS_ONLINE;
			section_mark_present(ms);
		}
	}
}

/*
 * Mark all memblocks as present using memory_present(). This is a
 * convenience function that is useful for a number of arches
 * to mark all of the system's memory as present during initialization.
 */
void __init memblocks_present(void)
{
	struct memblock_region *reg;

	for_each_memblock(memory, reg) {
		memory_present(memblock_get_region_node(reg),
			       memblock_region_memory_base_pfn(reg),
			       memblock_region_memory_end_pfn(reg));
	}
}

/*
 * Subtle, we encode the real pfn into the mem_map such that
 * the identity pfn - section_mem_map will return the actual
 * physical page frame number.
 */
static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
{
	unsigned long coded_mem_map =
		(unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
	BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT));
	BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
	return coded_mem_map;
}

/*
 * Decode mem_map from the coded memmap
 */
struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
{
	/* mask off the extra low bits of information */
	coded_mem_map &= SECTION_MAP_MASK;
	return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
}

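/*
 * Hook a section up to its memmap: encode the memmap pointer together with
 * SECTION_HAS_MEM_MAP and the caller-supplied flags into section_mem_map,
 * and attach the usage structure (pageblock flags plus subsection map).
 */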
static void __meminit sparse_init_one_section(struct mem_section *ms,
		unsigned long pnum, struct page *mem_map,
		struct mem_section_usage *usage, unsigned long flags)
{
	ms->section_mem_map &= ~SECTION_MAP_MASK;
	ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum)
		| SECTION_HAS_MEM_MAP | flags;
	ms->usage = usage;
}

static unsigned long usemap_size(void)
{
	return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
}

size_t mem_section_usage_size(void)
{
	return sizeof(struct mem_section_usage) + usemap_size();
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static struct mem_section_usage * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
					 unsigned long size)
{
	struct mem_section_usage *usage;
	unsigned long goal, limit;
	int nid;
	/*
	 * A page may contain usemaps for other sections preventing the
	 * page being freed and making a section unremovable while
	 * other sections referencing the usemap remain active. Similarly,
	 * a pgdat can prevent a section being removed. If section A
	 * contains a pgdat and section B contains the usemap, both
	 * sections become inter-dependent. This allocates usemaps
	 * from the same section as the pgdat where possible to avoid
	 * this problem.
	 */
	goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
	limit = goal + (1UL << PA_SECTION_SHIFT);
	nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
	usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
	if (!usage && limit) {
		limit = 0;
		goto again;
	}
	return usage;
}

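/*
 * Report when a node's usemap was not allocated in the same section as its
 * pgdat: that cross-section dependency can get in the way of memory
 * hot-remove. Redundant reports for the same pair are suppressed.
 */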
static void __init check_usemap_section_nr(int nid,
		struct mem_section_usage *usage)
{
	unsigned long usemap_snr, pgdat_snr;
	static unsigned long old_usemap_snr;
	static unsigned long old_pgdat_snr;
	struct pglist_data *pgdat = NODE_DATA(nid);
	int usemap_nid;

	/* First call */
	if (!old_usemap_snr) {
		old_usemap_snr = NR_MEM_SECTIONS;
		old_pgdat_snr = NR_MEM_SECTIONS;
	}

	usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
	pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
	if (usemap_snr == pgdat_snr)
		return;

	if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr)
		/* skip redundant message */
		return;

	old_usemap_snr = usemap_snr;
	old_pgdat_snr = pgdat_snr;

	usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
	if (usemap_nid != nid) {
		pr_info("node %d must be removed before remove section %ld\n",
			nid, usemap_snr);
		return;
	}
	/*
	 * There is a circular dependency.
	 * Some platforms allow un-removable sections because they will just
	 * gather other removable sections for dynamic partitioning.
	 * Just report the un-removable section's number here.
	 */
	pr_info("Section %ld and %ld (node %d) have a circular dependency on usemap and pgdat allocations\n",
		usemap_snr, pgdat_snr, nid);
}
#else
static struct mem_section_usage * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
					 unsigned long size)
{
	return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id);
}

static void __init check_usemap_section_nr(int nid,
		struct mem_section_usage *usage)
{
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static unsigned long __init section_map_size(void)
{
	return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE);
}

#else
static unsigned long __init section_map_size(void)
{
	return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
}

struct page __init *__populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
	unsigned long size = section_map_size();
	struct page *map = sparse_buffer_alloc(size);
	phys_addr_t addr = __pa(MAX_DMA_ADDRESS);

	if (map)
		return map;

	map = memblock_alloc_try_nid_raw(size,
					  PAGE_SIZE, addr,
					  MEMBLOCK_ALLOC_ACCESSIBLE, nid);
	if (!map)
		panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
		      __func__, size, PAGE_SIZE, nid, &addr);

	return map;
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

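/*
 * Bootstrap buffer for memmap allocations during sparse_init(): a per-node
 * chunk is reserved up front by sparse_buffer_init(), handed out in aligned
 * pieces by sparse_buffer_alloc(), and any unused remainder is returned to
 * memblock by sparse_buffer_fini().
 */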
static void *sparsemap_buf __meminitdata;
static void *sparsemap_buf_end __meminitdata;

static inline void __meminit sparse_buffer_free(unsigned long size)
{
	WARN_ON(!sparsemap_buf || size == 0);
	memblock_free_early(__pa(sparsemap_buf), size);
}

static void __init sparse_buffer_init(unsigned long size, int nid)
{
	phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
	WARN_ON(sparsemap_buf);	/* forgot to call sparse_buffer_fini()? */
	sparsemap_buf =
		memblock_alloc_try_nid_raw(size, PAGE_SIZE,
						addr,
						MEMBLOCK_ALLOC_ACCESSIBLE, nid);
	sparsemap_buf_end = sparsemap_buf + size;
}

static void __init sparse_buffer_fini(void)
{
	unsigned long size = sparsemap_buf_end - sparsemap_buf;

	if (sparsemap_buf && size > 0)
		sparse_buffer_free(size);
	sparsemap_buf = NULL;
}

void * __meminit sparse_buffer_alloc(unsigned long size)
{
	void *ptr = NULL;

	if (sparsemap_buf) {
		ptr = (void *) roundup((unsigned long)sparsemap_buf, size);
		if (ptr + size > sparsemap_buf_end)
			ptr = NULL;
		else {
			/* Free redundant aligned space */
			if ((unsigned long)(ptr - sparsemap_buf) > 0)
				sparse_buffer_free((unsigned long)(ptr - sparsemap_buf));
			sparsemap_buf = ptr + size;
		}
	}
	return ptr;
}

void __weak __meminit vmemmap_populate_print_last(void)
{
}

/*
 * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
 * And number of present sections in this node is map_count.
 */
static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
				   unsigned long pnum_end,
				   unsigned long map_count)
{
	struct mem_section_usage *usage;
	unsigned long pnum;
	struct page *map;

	usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
			mem_section_usage_size() * map_count);
	if (!usage) {
		pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
		goto failed;
	}
	sparse_buffer_init(map_count * section_map_size(), nid);
	for_each_present_section_nr(pnum_begin, pnum) {
		unsigned long pfn = section_nr_to_pfn(pnum);

		if (pnum >= pnum_end)
			break;

		map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
				nid, NULL);
		if (!map) {
			pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
			       __func__, nid);
			pnum_begin = pnum;
			goto failed;
		}
		check_usemap_section_nr(nid, usage);
		sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage,
				SECTION_IS_EARLY);
		usage = (void *) usage + mem_section_usage_size();
	}
	sparse_buffer_fini();
	return;
failed:
	/* We failed to allocate, mark all the following pnums as not present */
	for_each_present_section_nr(pnum_begin, pnum) {
		struct mem_section *ms;

		if (pnum >= pnum_end)
			break;
		ms = __nr_to_section(pnum);
		ms->section_mem_map = 0;
	}
}

/*
 * Allocate the accumulated non-linear sections, allocate a mem_map
 * for each and record the physical to section mapping.
 */
void __init sparse_init(void)
{
	unsigned long pnum_begin = first_present_section_nr();
	int nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
	unsigned long pnum_end, map_count = 1;

	/* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
	set_pageblock_order();

	for_each_present_section_nr(pnum_begin + 1, pnum_end) {
		int nid = sparse_early_nid(__nr_to_section(pnum_end));

		if (nid == nid_begin) {
			map_count++;
			continue;
		}
		/* Init node with sections in range [pnum_begin, pnum_end) */
		sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
		nid_begin = nid;
		pnum_begin = pnum_end;
		map_count = 1;
	}
	/* cover the last node */
	sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
	vmemmap_populate_print_last();
}

#ifdef CONFIG_MEMORY_HOTPLUG

/* Mark all memory sections within the pfn range as online */
void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		unsigned long section_nr = pfn_to_section_nr(pfn);
		struct mem_section *ms;

		/* onlining code should never touch invalid ranges */
		if (WARN_ON(!valid_section_nr(section_nr)))
			continue;

		ms = __nr_to_section(section_nr);
		ms->section_mem_map |= SECTION_IS_ONLINE;
	}
}

#ifdef CONFIG_MEMORY_HOTREMOVE
/* Mark all memory sections within the pfn range as offline */
void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		unsigned long section_nr = pfn_to_section_nr(pfn);
		struct mem_section *ms;

		/*
		 * TODO this needs some double checking. Offlining code makes
		 * sure to check pfn_valid but those checks might be just bogus
		 */
		if (WARN_ON(!valid_section_nr(section_nr)))
			continue;

		ms = __nr_to_section(section_nr);
		ms->section_mem_map &= ~SECTION_IS_ONLINE;
	}
}
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static struct page * __meminit populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
	return __populate_section_memmap(pfn, nr_pages, nid, altmap);
}

static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
		struct vmem_altmap *altmap)
{
	unsigned long start = (unsigned long) pfn_to_page(pfn);
	unsigned long end = start + nr_pages * sizeof(struct page);

	vmemmap_free(start, end, altmap);
}
static void free_map_bootmem(struct page *memmap)
{
	unsigned long start = (unsigned long)memmap;
	unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);

	vmemmap_free(start, end, NULL);
}
#else
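/*
 * Without SPARSEMEM_VMEMMAP, the memmap for a hot-added section is taken
 * from the page allocator, falling back to vmalloc() if that fails.
 */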
struct page * __meminit populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
	struct page *page, *ret;
	unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION;

	page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
	if (page)
		goto got_map_page;

	ret = vmalloc(memmap_size);
	if (ret)
		goto got_map_ptr;

	return NULL;
got_map_page:
	ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
got_map_ptr:

	return ret;
}

static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
		struct vmem_altmap *altmap)
{
	struct page *memmap = pfn_to_page(pfn);

	if (is_vmalloc_addr(memmap))
		vfree(memmap);
	else
		free_pages((unsigned long)memmap,
			   get_order(sizeof(struct page) * PAGES_PER_SECTION));
}

static void free_map_bootmem(struct page *memmap)
{
	unsigned long maps_section_nr, removing_section_nr, i;
	unsigned long magic, nr_pages;
	struct page *page = virt_to_page(memmap);

	nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
		>> PAGE_SHIFT;

	for (i = 0; i < nr_pages; i++, page++) {
		magic = (unsigned long) page->freelist;

		BUG_ON(magic == NODE_INFO);

		maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
		removing_section_nr = page_private(page);

		/*
		 * When this function is called, the section being removed is
		 * in the logically offlined state, i.e. all of its pages are
		 * isolated from the page allocator. If the removed section's
		 * memmap is placed on that same section, it must not be freed:
		 * the page allocator could hand that memory out again even
		 * though it will be removed physically soon.
		 */
		if (maps_section_nr != removing_section_nr)
			put_page_bootmem(page);
	}
}
#endif /* CONFIG_SPARSEMEM_VMEMMAP */

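/*
 * Clear the subsection map bits for [pfn, pfn + nr_pages) and release the
 * backing memmap; once the whole section is unused, its usage structure is
 * freed as well (see the case analysis in the comment below).
 */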
static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
		struct vmem_altmap *altmap)
{
	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
	DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
	struct mem_section *ms = __pfn_to_section(pfn);
	bool section_is_early = early_section(ms);
	struct page *memmap = NULL;
	unsigned long *subsection_map = ms->usage
		? &ms->usage->subsection_map[0] : NULL;

	subsection_mask_set(map, pfn, nr_pages);
	if (subsection_map)
		bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);

	if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
				"section already deactivated (%#lx + %ld)\n",
				pfn, nr_pages))
		return;

	/*
	 * There are 3 cases to handle across two configurations
	 * (SPARSEMEM_VMEMMAP={y,n}):
	 *
	 * 1/ deactivation of a partial hot-added section (only possible
	 * in the SPARSEMEM_VMEMMAP=y case).
	 *    a/ section was present at memory init
	 *    b/ section was hot-added post memory init
	 * 2/ deactivation of a complete hot-added section
	 * 3/ deactivation of a complete section from memory init
	 *
	 * For 1/, when the subsection_map is not empty we will not be
	 * freeing the usage map, but still need to free the vmemmap
	 * range.
	 *
	 * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified
	 */
	bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
	if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) {
		unsigned long section_nr = pfn_to_section_nr(pfn);

		if (!section_is_early) {
			kfree(ms->usage);
			ms->usage = NULL;
		}
		memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
		ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
	}

	if (section_is_early && memmap)
		free_map_bootmem(memmap);
	else
		depopulate_section_memmap(pfn, nr_pages, altmap);
}

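/*
 * Mark the subsections covering [pfn, pfn + nr_pages) as in use, allocating
 * the usage structure on first use, and return the (possibly newly
 * populated) memmap for the range.
 */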
static struct page * __meminit section_activate(int nid, unsigned long pfn,
		unsigned long nr_pages, struct vmem_altmap *altmap)
{
	DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
	struct mem_section *ms = __pfn_to_section(pfn);
	struct mem_section_usage *usage = NULL;
	unsigned long *subsection_map;
	struct page *memmap;
	int rc = 0;

	subsection_mask_set(map, pfn, nr_pages);

	if (!ms->usage) {
		usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
		if (!usage)
			return ERR_PTR(-ENOMEM);
		ms->usage = usage;
	}
	subsection_map = &ms->usage->subsection_map[0];

	if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
		rc = -EINVAL;
	else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
		rc = -EEXIST;
	else
		bitmap_or(subsection_map, map, subsection_map,
				SUBSECTIONS_PER_SECTION);

	if (rc) {
		if (usage)
			ms->usage = NULL;
		kfree(usage);
		return ERR_PTR(rc);
	}

	/*
	 * The early init code does not consider partially populated
	 * initial sections, it simply assumes that memory will never be
	 * referenced.  If we hot-add memory into such a section then we
	 * do not need to populate the memmap and can simply reuse what
	 * is already there.
	 */
	if (nr_pages < PAGES_PER_SECTION && early_section(ms))
		return pfn_to_page(pfn);

	memmap = populate_section_memmap(pfn, nr_pages, nid, altmap);
	if (!memmap) {
		section_deactivate(pfn, nr_pages, altmap);
		return ERR_PTR(-ENOMEM);
	}

	return memmap;
}

/**
 * sparse_add_section - add a memory section, or populate an existing one
 * @nid: The node to add section on
 * @start_pfn: start pfn of the memory range
 * @nr_pages: number of pfns to add in the section
 * @altmap: device page map
 *
 * This is only intended for hotplug.
 *
 * Return:
 * * 0		- On success.
 * * -EEXIST	- Section is already present.
 * * -ENOMEM	- Out of memory.
 */
int __meminit sparse_add_section(int nid, unsigned long start_pfn,
		unsigned long nr_pages, struct vmem_altmap *altmap)
{
	unsigned long section_nr = pfn_to_section_nr(start_pfn);
	struct mem_section *ms;
	struct page *memmap;
	int ret;

	ret = sparse_index_init(section_nr, nid);
	if (ret < 0)
		return ret;

	memmap = section_activate(nid, start_pfn, nr_pages, altmap);
	if (IS_ERR(memmap))
		return PTR_ERR(memmap);

	/*
	 * Poison uninitialized struct pages in order to catch invalid flags
	 * combinations.
	 */
	page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);

	ms = __nr_to_section(section_nr);
	set_section_nid(section_nr, nid);
	section_mark_present(ms);

	/* Align memmap to section boundary in the subsection case */
	if (section_nr_to_pfn(section_nr) != start_pfn)
		memmap = pfn_to_kaddr(section_nr_to_pfn(section_nr));
	sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0);

	return 0;
}

#ifdef CONFIG_MEMORY_FAILURE
static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
{
	int i;

	/*
	 * A further optimization is to have per section refcounted
	 * num_poisoned_pages.  But that would need more space per memmap, so
	 * for now just do a quick global check to speed up this routine in the
	 * absence of bad pages.
	 */
	if (atomic_long_read(&num_poisoned_pages) == 0)
		return;

	for (i = 0; i < nr_pages; i++) {
		if (PageHWPoison(&memmap[i])) {
			num_poisoned_pages_dec();
			ClearPageHWPoison(&memmap[i]);
		}
	}
}
#else
static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
{
}
#endif

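/*
 * Tear down a (sub)section during hot-remove: drop any HWPoison accounting
 * for the affected pages, then deactivate the range.
 */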
void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
		unsigned long nr_pages, unsigned long map_offset,
		struct vmem_altmap *altmap)
{
	clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset,
			nr_pages - map_offset);
	section_deactivate(pfn, nr_pages, altmap);
}
#endif /* CONFIG_MEMORY_HOTPLUG */