sparse.c 21.0 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
A
Andy Whitcroft 已提交
2 3 4 5
/*
 * sparse memory mappings.
 */
#include <linux/mm.h>
6
#include <linux/slab.h>
A
Andy Whitcroft 已提交
7
#include <linux/mmzone.h>
8
#include <linux/memblock.h>
9
#include <linux/compiler.h>
10
#include <linux/highmem.h>
11
#include <linux/export.h>
12
#include <linux/spinlock.h>
13
#include <linux/vmalloc.h>
14

15
#include "internal.h"
A
Andy Whitcroft 已提交
16
#include <asm/dma.h>
17 18
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
A
Andy Whitcroft 已提交
19 20 21 22 23 24

/*
 * Permanent SPARSEMEM data:
 *
 * 1) mem_section	- memory sections, mem_map's for valid memory
 */
25
#ifdef CONFIG_SPARSEMEM_EXTREME
26
struct mem_section **mem_section;
27 28
#else
struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
29
	____cacheline_internodealigned_in_smp;
30 31 32
#endif
EXPORT_SYMBOL(mem_section);

33 34 35 36 37 38 39 40 41 42 43 44
#ifdef NODE_NOT_IN_PAGE_FLAGS
/*
 * If we did not store the node number in the page then we have to
 * do a lookup in the section_to_node_table in order to find which
 * node the page belongs to.
 */
#if MAX_NUMNODES <= 256
static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#else
static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
#endif

I
Ian Campbell 已提交
45
int page_to_nid(const struct page *page)
46 47 48 49
{
	return section_to_node_table[page_to_section(page)];
}
EXPORT_SYMBOL(page_to_nid);
50 51 52 53 54 55 56 57 58

/* Record @nid in section_to_node_table as the node of section @section_nr. */
static void set_section_nid(unsigned long section_nr, int nid)
{
	section_to_node_table[section_nr] = nid;
}
#else /* !NODE_NOT_IN_PAGE_FLAGS */
/* No-op variant used when NODE_NOT_IN_PAGE_FLAGS is not defined. */
static inline void set_section_nid(unsigned long section_nr, int nid)
{
}
59 60
#endif

61
#ifdef CONFIG_SPARSEMEM_EXTREME
62
/*
 * Allocate one root's worth of mem_section entries (SECTIONS_PER_ROOT of
 * them) for node @nid.  Once the slab allocator is available it is used
 * (zeroed, GFP_KERNEL); before that the memory comes from memblock.
 * Returns whatever the chosen allocator returned.
 */
static noinline struct mem_section __ref *sparse_index_alloc(int nid)
{
	const unsigned long bytes = SECTIONS_PER_ROOT *
				    sizeof(struct mem_section);

	if (!slab_is_available())
		return memblock_alloc_node(bytes, SMP_CACHE_BYTES, nid);

	return kzalloc_node(bytes, GFP_KERNEL, nid);
}
B
Bob Picco 已提交
76

77
/*
 * Make sure the root that holds @section_nr has a mem_section array.
 *
 * Returns 0 when a new array was installed, -EEXIST when the root was
 * already populated and -ENOMEM when the allocation failed.
 */
static int __meminit sparse_index_init(unsigned long section_nr, int nid)
{
	struct mem_section **slot = &mem_section[SECTION_NR_TO_ROOT(section_nr)];
	struct mem_section *fresh;

	if (*slot)
		return -EEXIST;

	fresh = sparse_index_alloc(nid);
	if (!fresh)
		return -ENOMEM;

	*slot = fresh;
	return 0;
}
#else /* !SPARSEMEM_EXTREME */
/* !SPARSEMEM_EXTREME variant: nothing to set up, always reports success. */
static inline int sparse_index_init(unsigned long section_nr, int nid)
{
	return 0;
}
98 99
#endif

100
#ifdef CONFIG_SPARSEMEM_EXTREME
101 102 103
int __section_nr(struct mem_section* ms)
{
	unsigned long root_nr;
104
	struct mem_section *root = NULL;
105

106 107
	for (root_nr = 0; root_nr < NR_SECTION_ROOTS; root_nr++) {
		root = __nr_to_section(root_nr * SECTIONS_PER_ROOT);
108 109 110 111 112 113 114
		if (!root)
			continue;

		if ((ms >= root) && (ms < (root + SECTIONS_PER_ROOT)))
		     break;
	}

115
	VM_BUG_ON(!root);
116

117 118
	return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
}
119 120 121 122 123 124
#else
int __section_nr(struct mem_section* ms)
{
	return (int)(ms - mem_section[0]);
}
#endif
125

126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141
/*
 * During early boot, before section_mem_map is used for an actual
 * mem_map, we use section_mem_map to store the section's NUMA
 * node.  This keeps us from having to use another data structure.  The
 * node information is cleared just before we store the real mem_map.
 */
/*
 * Encode @nid for temporary storage in section_mem_map.
 *
 * The value is widened to unsigned long before shifting so the shift is
 * carried out in the width of the result instead of in int.
 */
static inline unsigned long sparse_encode_early_nid(int nid)
{
	return ((unsigned long)nid << SECTION_NID_SHIFT);
}

/* Recover the node id that was shifted into @section's section_mem_map. */
static inline int sparse_early_nid(struct mem_section *section)
{
	unsigned long encoded = section->section_mem_map;

	return (encoded >> SECTION_NID_SHIFT);
}

142 143 144
/* Validate the physical addressing limitations of the model */
void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
						unsigned long *end_pfn)
{
	const unsigned long pfn_limit = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);

	/*
	 * An architecture must not hand in pfns beyond what sparsemem can
	 * describe.  Offending values are reported and clamped to the limit.
	 */
	if (*start_pfn > pfn_limit) {
		/* the whole range is out of reach: collapse it at the limit */
		mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
			"Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
			*start_pfn, *end_pfn, pfn_limit);
		WARN_ON_ONCE(1);
		*start_pfn = pfn_limit;
		*end_pfn = pfn_limit;
		return;
	}

	if (*end_pfn > pfn_limit) {
		/* only the tail is out of reach: trim the end */
		mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
			"End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
			*start_pfn, *end_pfn, pfn_limit);
		WARN_ON_ONCE(1);
		*end_pfn = pfn_limit;
	}
}

168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193
/*
 * There are a number of times that we loop over NR_MEM_SECTIONS,
 * looking for section_present() on each.  But, when we have very
 * large physical address spaces, NR_MEM_SECTIONS can also be
 * very large which makes the loops quite long.
 *
 * Keeping track of this gives us an easy way to break out of
 * those loops early.
 */
int __highest_present_section_nr;
/*
 * Flag @ms as present and raise __highest_present_section_nr when this
 * section's number is above the current maximum.
 */
static void section_mark_present(struct mem_section *ms)
{
	const int nr = __section_nr(ms);

	ms->section_mem_map |= SECTION_MARKED_PRESENT;

	if (__highest_present_section_nr < nr)
		__highest_present_section_nr = nr;
}

/*
 * Return the number of the first present section after @section_nr, or -1
 * when there is none up to __highest_present_section_nr.
 *
 * The bound is tested before a candidate is probed, so
 * present_section_nr() is never asked about a section number beyond
 * __highest_present_section_nr.
 */
static inline int next_present_section_nr(int section_nr)
{
	while (++section_nr <= __highest_present_section_nr) {
		if (present_section_nr(section_nr))
			return section_nr;
	}

	return -1;
}
/*
 * Iterate @section_nr over the present section numbers, beginning with the
 * first present section at or after @start.  The walk ends once
 * next_present_section_nr() reports -1 or the number goes past
 * __highest_present_section_nr.
 */
#define for_each_present_section_nr(start, section_nr)		\
	for (section_nr = next_present_section_nr(start-1);	\
	     ((section_nr >= 0) &&				\
	      (section_nr <= __highest_present_section_nr));	\
	     section_nr = next_present_section_nr(section_nr))

204 205 206 207 208
/*
 * Number of the lowest present section.  When next_present_section_nr()
 * finds none, its -1 is returned converted to unsigned long.
 */
static inline unsigned long first_present_section_nr(void)
{
	return next_present_section_nr(-1);
}

209 210 211 212
/*
 * Record the pfn range [start, end) as present memory belonging to node
 * @nid.  @start is rounded down to a section boundary and the range is
 * clamped by mminit_validate_memmodel_limits() before it is walked one
 * section at a time.
 */
void __init memory_present(int nid, unsigned long start, unsigned long end)
{
	unsigned long pfn;

#ifdef CONFIG_SPARSEMEM_EXTREME
	/* The array of root pointers is allocated on first use. */
	if (unlikely(!mem_section)) {
		unsigned long size, align;

		size = sizeof(struct mem_section*) * NR_SECTION_ROOTS;
		align = 1 << (INTERNODE_CACHE_SHIFT);
		mem_section = memblock_alloc(size, align);
	}
#endif

	start &= PAGE_SECTION_MASK;
	mminit_validate_memmodel_limits(&start, &end);

	for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
		unsigned long snr = pfn_to_section_nr(pfn);
		struct mem_section *ms;

		sparse_index_init(snr, nid);
		set_section_nid(snr, nid);

		ms = __nr_to_section(snr);
		/* a section that already carries state is left untouched */
		if (ms->section_mem_map)
			continue;

		ms->section_mem_map = sparse_encode_early_nid(nid) |
						SECTION_IS_ONLINE;
		section_mark_present(ms);
	}
}

242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257
/*
 * Mark all memblocks as present using memory_present(). This is a
 * convenience function that is useful for a number of arches
 * to mark all of the system's memory as present during initialization.
 */
void __init memblocks_present(void)
{
	struct memblock_region *reg;

	for_each_memblock(memory, reg) {
		int nid = memblock_get_region_node(reg);
		unsigned long first_pfn = memblock_region_memory_base_pfn(reg);
		unsigned long last_pfn = memblock_region_memory_end_pfn(reg);

		memory_present(nid, first_pfn, last_pfn);
	}
}

A
Andy Whitcroft 已提交
258 259 260 261 262 263 264
/*
 * Subtle, we encode the real pfn into the mem_map such that
 * the identity pfn - section_mem_map will return the actual
 * physical page frame number.
 */
static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
{
265 266 267 268 269
	unsigned long coded_mem_map =
		(unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
	BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT));
	BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
	return coded_mem_map;
A
Andy Whitcroft 已提交
270 271 272
}

/*
273
 * Decode mem_map from the coded memmap
A
Andy Whitcroft 已提交
274 275 276
 */
struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
{
277 278
	/* mask off the extra low bits of information */
	coded_mem_map &= SECTION_MAP_MASK;
A
Andy Whitcroft 已提交
279 280 281
	return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
}

282
/*
 * Install @mem_map and @pageblock_bitmap into section @ms (number @pnum).
 * Bits of section_mem_map covered by SECTION_MAP_MASK are replaced by the
 * encoded map; the remaining bits are kept and SECTION_HAS_MEM_MAP is set.
 */
static void __meminit sparse_init_one_section(struct mem_section *ms,
		unsigned long pnum, struct page *mem_map,
		unsigned long *pageblock_bitmap)
{
	unsigned long coded;

	ms->section_mem_map &= ~SECTION_MAP_MASK;

	coded = sparse_encode_mem_map(mem_map, pnum);
	ms->section_mem_map |= coded | SECTION_HAS_MEM_MAP;

	ms->pageblock_flags = pageblock_bitmap;
}

292
unsigned long usemap_size(void)
293
{
294
	return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
295 296 297 298 299 300 301 302 303
}

#ifdef CONFIG_MEMORY_HOTPLUG
/* Allocate one usemap (usemap_size() bytes) with kmalloc(GFP_KERNEL). */
static unsigned long *__kmalloc_section_usemap(void)
{
	return kmalloc(usemap_size(), GFP_KERNEL);
}
#endif /* CONFIG_MEMORY_HOTPLUG */

304 305
#ifdef CONFIG_MEMORY_HOTREMOVE
static unsigned long * __init
306
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
307
					 unsigned long size)
308
{
309 310 311
	unsigned long goal, limit;
	unsigned long *p;
	int nid;
312 313 314
	/*
	 * A page may contain usemaps for other sections preventing the
	 * page being freed and making a section unremovable while
L
Li Zhong 已提交
315
	 * other sections referencing the usemap remain active. Similarly,
316 317 318 319 320 321
	 * a pgdat can prevent a section being removed. If section A
	 * contains a pgdat and section B contains the usemap, both
	 * sections become inter-dependent. This allocates usemaps
	 * from the same section as the pgdat where possible to avoid
	 * this problem.
	 */
322
	goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
323 324 325
	limit = goal + (1UL << PA_SECTION_SHIFT);
	nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
326
	p = memblock_alloc_try_nid_nopanic(size,
327 328
						SMP_CACHE_BYTES, goal, limit,
						nid);
329 330 331 332 333
	if (!p && limit) {
		limit = 0;
		goto again;
	}
	return p;
334 335 336 337 338
}

/*
 * Report when @usemap lives in a different section than node @nid's
 * pgdat.  Consecutive calls that would report the same section pair are
 * silenced.
 */
static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
{
	static unsigned long old_usemap_snr;
	static unsigned long old_pgdat_snr;
	struct pglist_data *pgdat = NODE_DATA(nid);
	unsigned long usemap_snr, pgdat_snr;

	/* First call */
	if (!old_usemap_snr) {
		old_usemap_snr = NR_MEM_SECTIONS;
		old_pgdat_snr = NR_MEM_SECTIONS;
	}

	usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT);
	pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);

	/* same section: nothing to report */
	if (usemap_snr == pgdat_snr)
		return;

	/* skip redundant message */
	if (usemap_snr == old_usemap_snr && pgdat_snr == old_pgdat_snr)
		return;

	old_usemap_snr = usemap_snr;
	old_pgdat_snr = pgdat_snr;

	if (sparse_early_nid(__nr_to_section(usemap_snr)) != nid) {
		pr_info("node %d must be removed before remove section %ld\n",
			nid, usemap_snr);
		return;
	}

	/*
	 * There is a circular dependency.
	 * Some platforms allow un-removable section because they will just
	 * gather other removable sections for dynamic partitioning.
	 * Just notify un-removable section's number here.
	 */
	pr_info("Section %ld and %ld (node %d) have a circular dependency on usemap and pgdat allocations\n",
		usemap_snr, pgdat_snr, nid);
}
#else
static unsigned long * __init
379
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
380
					 unsigned long size)
381
{
382
	return memblock_alloc_node_nopanic(size, pgdat->node_id);
383 384 385 386 387 388 389
}

/* No-op variant used when CONFIG_MEMORY_HOTREMOVE is not set. */
static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
{
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

390
#ifdef CONFIG_SPARSEMEM_VMEMMAP
391
/* Bytes of struct page needed for one section, rounded up to PMD_SIZE. */
static unsigned long __init section_map_size(void)
{
	unsigned long raw = sizeof(struct page) * PAGES_PER_SECTION;

	return ALIGN(raw, PMD_SIZE);
}

#else
397
/* Bytes of struct page needed for one section, rounded up to whole pages. */
static unsigned long __init section_map_size(void)
{
	unsigned long raw = sizeof(struct page) * PAGES_PER_SECTION;

	return PAGE_ALIGN(raw);
}

402 403
/*
 * Obtain the memmap for one section on node @nid: first from the
 * pre-allocated sparsemap buffer, otherwise directly from memblock.
 * @pnum and @altmap are not used by this variant.
 */
struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid,
		struct vmem_altmap *altmap)
{
	unsigned long size = section_map_size();
	struct page *map;

	map = sparse_buffer_alloc(size);
	if (!map)
		map = memblock_alloc_try_nid(size,
					  PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
					  MEMBLOCK_ALLOC_ACCESSIBLE, nid);
	return map;
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

418 419 420
static void *sparsemap_buf __meminitdata;
static void *sparsemap_buf_end __meminitdata;

421
/*
 * Reserve a @size byte buffer on node @nid from which
 * sparse_buffer_alloc() hands out memmaps.
 */
static void __init sparse_buffer_init(unsigned long size, int nid)
{
	void *buf;

	WARN_ON(sparsemap_buf);	/* forgot to call sparse_buffer_fini()? */

	buf = memblock_alloc_try_nid_raw(size, PAGE_SIZE,
					 __pa(MAX_DMA_ADDRESS),
					 MEMBLOCK_ALLOC_ACCESSIBLE, nid);
	sparsemap_buf = buf;
	sparsemap_buf_end = buf + size;
}

431
/*
 * Return the unused tail of the sparsemap buffer to memblock and mark the
 * buffer as gone.
 */
static void __init sparse_buffer_fini(void)
{
	if (sparsemap_buf) {
		unsigned long leftover = sparsemap_buf_end - sparsemap_buf;

		if (leftover)
			memblock_free_early(__pa(sparsemap_buf), leftover);
	}
	sparsemap_buf = NULL;
}

/*
 * Carve @size bytes out of the sparsemap buffer, placed at an address that
 * is a multiple of @size.
 *
 * roundup() is used rather than PTR_ALIGN() because @size is not
 * guaranteed to be a power of two, which the mask-based PTR_ALIGN()
 * requires.
 *
 * Returns NULL when no buffer is set up, when @size is 0, or when the
 * request does not fit in what is left of the buffer.
 */
void * __meminit sparse_buffer_alloc(unsigned long size)
{
	void *ptr;

	/* roundup() divides by @size, so reject 0 up front */
	if (!sparsemap_buf || !size)
		return NULL;

	ptr = (void *)roundup((unsigned long)sparsemap_buf, size);
	if (ptr + size > sparsemap_buf_end)
		return NULL;

	sparsemap_buf = ptr + size;
	return ptr;
}

454
/* Weak default that does nothing. */
void __weak __meminit vmemmap_populate_print_last(void)
{
}
457

458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510
/*
 * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
 * And number of present sections in this node is map_count.
 *
 * One usemap area and one memmap buffer sized for @map_count sections are
 * allocated up front and then handed out section by section.  If either
 * allocation fails, section_mem_map is cleared for every remaining present
 * section of the node so that it is no longer treated as present.
 */
static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
				   unsigned long pnum_end,
				   unsigned long map_count)
{
	unsigned long pnum, usemap_longs, *usemap;
	struct page *map;

	usemap_longs = BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS);
	usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
							  usemap_size() *
							  map_count);
	if (!usemap) {
		pr_err("%s: node[%d] usemap allocation failed\n", __func__, nid);
		goto failed;
	}
	sparse_buffer_init(map_count * section_map_size(), nid);
	for_each_present_section_nr(pnum_begin, pnum) {
		if (pnum >= pnum_end)
			break;

		map = sparse_mem_map_populate(pnum, nid, NULL);
		if (!map) {
			pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.\n",
			       __func__, nid);
			pnum_begin = pnum;
			/*
			 * Give back what is left of the memmap buffer.  Without
			 * this the remainder is never freed and the next
			 * sparse_buffer_init() trips over a stale sparsemap_buf.
			 */
			sparse_buffer_fini();
			goto failed;
		}
		check_usemap_section_nr(nid, usemap);
		sparse_init_one_section(__nr_to_section(pnum), pnum, map, usemap);
		usemap += usemap_longs;
	}
	sparse_buffer_fini();
	return;
failed:
	/* We failed to allocate, mark all the following pnums as not present */
	for_each_present_section_nr(pnum_begin, pnum) {
		struct mem_section *ms;

		if (pnum >= pnum_end)
			break;
		ms = __nr_to_section(pnum);
		ms->section_mem_map = 0;
	}
}

/*
 * Allocate the accumulated non-linear sections, allocate a mem_map
 * for each and record the physical to section mapping.
 */
511
void __init sparse_init(void)
{
	unsigned long run_start = first_present_section_nr();
	int run_nid = sparse_early_nid(__nr_to_section(run_start));
	unsigned long pnum, nr_present = 1;

	/* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
	set_pageblock_order();

	/*
	 * Walk the present sections and group consecutive ones that share
	 * an early node id; each finished group is initialized as a unit.
	 */
	for_each_present_section_nr(run_start + 1, pnum) {
		int nid = sparse_early_nid(__nr_to_section(pnum));

		if (nid != run_nid) {
			/* Init node with sections in range [run_start, pnum) */
			sparse_init_nid(run_nid, run_start, pnum, nr_present);
			run_nid = nid;
			run_start = pnum;
			nr_present = 1;
		} else {
			nr_present++;
		}
	}
	/* cover the last node */
	sparse_init_nid(run_nid, run_start, pnum, nr_present);
	vmemmap_populate_print_last();
}

538
#ifdef CONFIG_MEMORY_HOTPLUG
539 540 541 542 543 544 545

/* Mark all memory sections within the pfn range as online */
void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
546
		unsigned long section_nr = pfn_to_section_nr(pfn);
547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564
		struct mem_section *ms;

		/* onlining code should never touch invalid ranges */
		if (WARN_ON(!valid_section_nr(section_nr)))
			continue;

		ms = __nr_to_section(section_nr);
		ms->section_mem_map |= SECTION_IS_ONLINE;
	}
}

#ifdef CONFIG_MEMORY_HOTREMOVE
/* Mark all memory sections within the pfn range as offline */
void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
565
		unsigned long section_nr = pfn_to_section_nr(pfn);
566 567 568 569 570 571 572 573 574 575 576 577 578 579 580
		struct mem_section *ms;

		/*
		 * TODO this needs some double checking. Offlining code makes
		 * sure to check pfn_valid but those checks might be just bogus
		 */
		if (WARN_ON(!valid_section_nr(section_nr)))
			continue;

		ms = __nr_to_section(section_nr);
		ms->section_mem_map &= ~SECTION_IS_ONLINE;
	}
}
#endif

581
#ifdef CONFIG_SPARSEMEM_VMEMMAP
582 583
/* VMEMMAP variant: the memmap comes from sparse_mem_map_populate(). */
static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
		struct vmem_altmap *altmap)
{
	/* This will make the necessary allocations eventually. */
	struct page *map = sparse_mem_map_populate(pnum, nid, altmap);

	return map;
}
588 589
static void __kfree_section_memmap(struct page *memmap,
		struct vmem_altmap *altmap)
590
{
591
	unsigned long start = (unsigned long)memmap;
592
	unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
593

594
	vmemmap_free(start, end, altmap);
595
}
596
#ifdef CONFIG_MEMORY_HOTREMOVE
597
static void free_map_bootmem(struct page *memmap)
598
{
599
	unsigned long start = (unsigned long)memmap;
600
	unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
601

602
	vmemmap_free(start, end, NULL);
603
}
604
#endif /* CONFIG_MEMORY_HOTREMOVE */
605
#else
606
static struct page *__kmalloc_section_memmap(void)
607 608
{
	struct page *page, *ret;
609
	unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION;
610

611
	page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
612 613 614 615 616 617 618 619 620 621 622 623 624 625 626
	if (page)
		goto got_map_page;

	ret = vmalloc(memmap_size);
	if (ret)
		goto got_map_ptr;

	return NULL;
got_map_page:
	ret = (struct page *)pfn_to_kaddr(page_to_pfn(page));
got_map_ptr:

	return ret;
}

627 628
/* !VMEMMAP variant: the arguments are unused, the memmap is simply allocated. */
static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
		struct vmem_altmap *altmap)
{
	struct page *map = __kmalloc_section_memmap();

	return map;
}

633 634
static void __kfree_section_memmap(struct page *memmap,
		struct vmem_altmap *altmap)
635
{
636
	if (is_vmalloc_addr(memmap))
637 638 639
		vfree(memmap);
	else
		free_pages((unsigned long)memmap,
640
			   get_order(sizeof(struct page) * PAGES_PER_SECTION));
641
}
642

643
#ifdef CONFIG_MEMORY_HOTREMOVE
644
static void free_map_bootmem(struct page *memmap)
645 646
{
	unsigned long maps_section_nr, removing_section_nr, i;
647
	unsigned long magic, nr_pages;
648
	struct page *page = virt_to_page(memmap);
649

650 651 652
	nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
		>> PAGE_SHIFT;

653
	for (i = 0; i < nr_pages; i++, page++) {
654
		magic = (unsigned long) page->freelist;
655 656 657 658

		BUG_ON(magic == NODE_INFO);

		maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
659
		removing_section_nr = page_private(page);
660 661 662 663 664 665 666 667 668 669 670 671 672

		/*
		 * When this function is called, the removing section is
		 * logical offlined state. This means all pages are isolated
		 * from page allocator. If removing section's memmap is placed
		 * on the same section, it must not be freed.
		 * If it is freed, page allocator may allocate it which will
		 * be removed physically soon.
		 */
		if (maps_section_nr != removing_section_nr)
			put_page_bootmem(page);
	}
}
673
#endif /* CONFIG_MEMORY_HOTREMOVE */
674
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
675

A
Andy Whitcroft 已提交
676 677 678 679 680
/*
 * returns the number of sections whose mem_maps were properly
 * set.  If this is <=0, then that means that the passed-in
 * map was not consumed and must be freed.
 */
681 682
int __meminit sparse_add_one_section(struct pglist_data *pgdat,
		unsigned long start_pfn, struct vmem_altmap *altmap)
A
Andy Whitcroft 已提交
683
{
684 685 686
	unsigned long section_nr = pfn_to_section_nr(start_pfn);
	struct mem_section *ms;
	struct page *memmap;
687
	unsigned long *usemap;
688
	int ret;
A
Andy Whitcroft 已提交
689

690 691 692 693
	/*
	 * no locking for this, because it does its own
	 * plus, it does a kmalloc
	 */
694 695 696
	ret = sparse_index_init(section_nr, pgdat->node_id);
	if (ret < 0 && ret != -EEXIST)
		return ret;
697
	ret = 0;
698
	memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, altmap);
699 700
	if (!memmap)
		return -ENOMEM;
701
	usemap = __kmalloc_section_usemap();
702
	if (!usemap) {
703
		__kfree_section_memmap(memmap, altmap);
704 705
		return -ENOMEM;
	}
706 707 708 709 710 711

	ms = __pfn_to_section(start_pfn);
	if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
		ret = -EEXIST;
		goto out;
	}
712

713 714 715 716
	/*
	 * Poison uninitialized struct pages in order to catch invalid flags
	 * combinations.
	 */
717
	page_init_poison(memmap, sizeof(struct page) * PAGES_PER_SECTION);
718

719
	section_mark_present(ms);
720
	sparse_init_one_section(ms, section_nr, memmap, usemap);
721 722

out:
723
	if (ret < 0) {
724
		kfree(usemap);
725
		__kfree_section_memmap(memmap, altmap);
726
	}
727
	return ret;
A
Andy Whitcroft 已提交
728
}
729

730
#ifdef CONFIG_MEMORY_HOTREMOVE
731 732 733 734 735 736 737 738
#ifdef CONFIG_MEMORY_FAILURE
static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
{
	int i;

	if (!memmap)
		return;

739 740 741 742 743 744 745 746 747
	/*
	 * A further optimization is to have per section refcounted
	 * num_poisoned_pages.  But that would need more space per memmap, so
	 * for now just do a quick global check to speed up this routine in the
	 * absence of bad pages.
	 */
	if (atomic_long_read(&num_poisoned_pages) == 0)
		return;

748
	for (i = 0; i < nr_pages; i++) {
749
		if (PageHWPoison(&memmap[i])) {
750
			atomic_long_sub(1, &num_poisoned_pages);
751 752 753 754 755 756 757 758 759 760
			ClearPageHWPoison(&memmap[i]);
		}
	}
}
#else
/* No-op variant used when CONFIG_MEMORY_FAILURE is not set. */
static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
{
}
#endif

761 762
/*
 * Release a section's usemap and memmap.  Nothing happens when @usemap is
 * NULL.  A usemap whose page is a slab or compound page is freed with
 * kfree() together with the memmap; otherwise the usemap is kept and only
 * the memmap is released through free_map_bootmem().
 */
static void free_section_usemap(struct page *memmap, unsigned long *usemap,
		struct vmem_altmap *altmap)
{
	struct page *usemap_page;

	if (!usemap)
		return;

	usemap_page = virt_to_page(usemap);

	if (!PageSlab(usemap_page) && !PageCompound(usemap_page)) {
		/*
		 * The usemap came from bootmem. This is packed with other
		 * usemaps on the section which has pgdat at boot time. Just
		 * keep it as is now.
		 */
		if (memmap)
			free_map_bootmem(memmap);
		return;
	}

	/* The allocation came from hot-plug-add. */
	kfree(usemap);
	if (memmap)
		__kfree_section_memmap(memmap, altmap);
}

789
void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
790
		unsigned long map_offset, struct vmem_altmap *altmap)
791 792
{
	struct page *memmap = NULL;
793
	unsigned long *usemap = NULL;
794 795 796 797 798 799 800 801 802

	if (ms->section_mem_map) {
		usemap = ms->pageblock_flags;
		memmap = sparse_decode_mem_map(ms->section_mem_map,
						__section_nr(ms));
		ms->section_mem_map = 0;
		ms->pageblock_flags = NULL;
	}

803 804
	clear_hwpoisoned_pages(memmap + map_offset,
			PAGES_PER_SECTION - map_offset);
805
	free_section_usemap(memmap, usemap, altmap);
806
}
807 808
#endif /* CONFIG_MEMORY_HOTREMOVE */
#endif /* CONFIG_MEMORY_HOTPLUG */