// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/memory_hotplug.c
 *
 *  Copyright (C)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memremap.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/mm_inline.h>
#include <linux/firmware-map.h>
#include <linux/stop_machine.h>
#include <linux/hugetlb.h>
#include <linux/memblock.h>
#include <linux/compaction.h>
#include <linux/rmap.h>

#include <asm/tlbflush.h>

#include "internal.h"
#include "shuffle.h"

/*
 * online_page_callback contains a pointer to the current page onlining
 * function. Initially it is generic_online_page(). If required, it can be
 * changed by calling set_online_page_callback() to register a callback and
 * restore_online_page_callback() to restore the generic callback.
 */

static online_page_callback_t online_page_callback = generic_online_page;
static DEFINE_MUTEX(online_page_callback_lock);

DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock);

void get_online_mems(void)
{
	percpu_down_read(&mem_hotplug_lock);
}

void put_online_mems(void)
{
	percpu_up_read(&mem_hotplug_lock);
}

bool movable_node_enabled = false;

#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
bool memhp_auto_online;
#else
bool memhp_auto_online = true;
#endif
EXPORT_SYMBOL_GPL(memhp_auto_online);

static int __init setup_memhp_default_state(char *str)
{
	if (!strcmp(str, "online"))
		memhp_auto_online = true;
	else if (!strcmp(str, "offline"))
		memhp_auto_online = false;

	return 1;
}
__setup("memhp_default_state=", setup_memhp_default_state);

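/*
 * mem_hotplug_begin()/mem_hotplug_done() bracket memory hotplug operations:
 * cpus_read_lock() keeps CPU hotplug out of the way while mem_hotplug_lock
 * is held for write, which also blocks get_online_mems() readers.
 */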
void mem_hotplug_begin(void)
{
	cpus_read_lock();
	percpu_down_write(&mem_hotplug_lock);
}

void mem_hotplug_done(void)
{
	percpu_up_write(&mem_hotplug_lock);
	cpus_read_unlock();
}

u64 max_mem_size = U64_MAX;

/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
	struct resource *res;
	unsigned long flags =  IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
	char *resource_name = "System RAM";

	if (start + size > max_mem_size)
		return ERR_PTR(-E2BIG);

	/*
	 * Request ownership of the new memory range.  This might be
	 * a child of an existing resource that was present but
	 * not marked as busy.
	 */
	res = __request_region(&iomem_resource, start, size,
			       resource_name, flags);

	if (!res) {
		pr_debug("Unable to reserve System RAM region: %016llx->%016llx\n",
				start, start + size);
		return ERR_PTR(-EEXIST);
	}
	return res;
}

static void release_memory_resource(struct resource *res)
{
	if (!res)
		return;
	release_resource(res);
	kfree(res);
}

#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
void get_page_bootmem(unsigned long info,  struct page *page,
		      unsigned long type)
{
	page->freelist = (void *)type;
	SetPagePrivate(page);
	set_page_private(page, info);
	page_ref_inc(page);
}

void put_page_bootmem(struct page *page)
{
	unsigned long type;

	type = (unsigned long) page->freelist;
	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
	       type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);

	if (page_ref_dec_return(page) == 1) {
		page->freelist = NULL;
		ClearPagePrivate(page);
		set_page_private(page, 0);
		INIT_LIST_HEAD(&page->lru);
		free_reserved_page(page);
	}
}

#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
#ifndef CONFIG_SPARSEMEM_VMEMMAP
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;
	struct mem_section_usage *usage;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	/* Get section's memmap address */
	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	/*
	 * Get page for the memmap's phys address
	 * XXX: need more consideration for sparse_vmemmap...
	 */
	page = virt_to_page(memmap);
	mapsize = sizeof(struct page) * PAGES_PER_SECTION;
	mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;

	/* remember memmap's page */
	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, SECTION_INFO);

	usage = ms->usage;
	page = virt_to_page(usage);

	mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);

}
#else /* CONFIG_SPARSEMEM_VMEMMAP */
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;
	struct mem_section_usage *usage;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);

	usage = ms->usage;
	page = virt_to_page(usage);

	mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
{
	unsigned long i, pfn, end_pfn, nr_pages;
	int node = pgdat->node_id;
	struct page *page;

	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
	page = virt_to_page(pgdat);

	for (i = 0; i < nr_pages; i++, page++)
		get_page_bootmem(node, page, NODE_INFO);

	pfn = pgdat->node_start_pfn;
	end_pfn = pgdat_end_pfn(pgdat);

	/* register section info */
	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		/*
		 * Some platforms can assign the same pfn to multiple nodes - on
		 * node0 as well as nodeN.  To avoid registering a pfn against
		 * multiple nodes we check that this pfn does not already
		 * reside in some other node.
		 */
		if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
			register_page_bootmem_info_section(pfn);
	}
}
#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */

static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
		const char *reason)
{
	/*
	 * Disallow all operations smaller than a sub-section and only
	 * allow operations smaller than a section for
	 * SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range()
	 * enforces a larger memory_block_size_bytes() granularity for
	 * memory that will be marked online, so this check should only
	 * fire for direct arch_{add,remove}_memory() users outside of
	 * add_memory_resource().
	 */
	unsigned long min_align;

	if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
		min_align = PAGES_PER_SUBSECTION;
	else
		min_align = PAGES_PER_SECTION;
	if (!IS_ALIGNED(pfn, min_align)
			|| !IS_ALIGNED(nr_pages, min_align)) {
		WARN(1, "Misaligned __%s_pages start: %#lx end: %#lx\n",
				reason, pfn, pfn + nr_pages - 1);
		return -EINVAL;
	}
	return 0;
}

static int check_hotplug_memory_addressable(unsigned long pfn,
					    unsigned long nr_pages)
{
	const u64 max_addr = PFN_PHYS(pfn + nr_pages) - 1;

	if (max_addr >> MAX_PHYSMEM_BITS) {
		const u64 max_allowed = (1ull << (MAX_PHYSMEM_BITS + 1)) - 1;
		WARN(1,
		     "Hotplugged memory exceeds maximum addressable address, range=%#llx-%#llx, maximum=%#llx\n",
		     (u64)PFN_PHYS(pfn), max_addr, max_allowed);
		return -E2BIG;
	}

	return 0;
}

/*
 * Reasonably generic function for adding memory.  It is
 * expected that archs that support memory hotplug will
 * call this function after deciding the zone to which to
 * add the new pages.
 */
int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
		struct mhp_restrictions *restrictions)
{
	int err;
	unsigned long nr, start_sec, end_sec;
	struct vmem_altmap *altmap = restrictions->altmap;

	err = check_hotplug_memory_addressable(pfn, nr_pages);
	if (err)
		return err;

	if (altmap) {
		/*
		 * Validate altmap is within bounds of the total request
		 */
		if (altmap->base_pfn != pfn
				|| vmem_altmap_offset(altmap) > nr_pages) {
			pr_warn_once("memory add fail, invalid altmap\n");
			return -EINVAL;
		}
		altmap->alloc = 0;
	}

	err = check_pfn_span(pfn, nr_pages, "add");
	if (err)
		return err;

	start_sec = pfn_to_section_nr(pfn);
	end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
	for (nr = start_sec; nr <= end_sec; nr++) {
		unsigned long pfns;

		pfns = min(nr_pages, PAGES_PER_SECTION
				- (pfn & ~PAGE_SECTION_MASK));
		err = sparse_add_section(nid, pfn, pfns, altmap);
		if (err)
			break;
		pfn += pfns;
		nr_pages -= pfns;
		cond_resched();
	}
	vmemmap_populate_print_last();
	return err;
}

/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
				     unsigned long start_pfn,
				     unsigned long end_pfn)
{
	for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) {
		if (unlikely(!pfn_to_online_page(start_pfn)))
			continue;

		if (unlikely(pfn_to_nid(start_pfn) != nid))
			continue;

		if (zone != page_zone(pfn_to_page(start_pfn)))
			continue;

		return start_pfn;
	}

	return 0;
}

/* find the biggest valid pfn in the range [start_pfn, end_pfn). */
static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
				    unsigned long start_pfn,
				    unsigned long end_pfn)
{
	unsigned long pfn;

	/* pfn is the end pfn of a memory section. */
	pfn = end_pfn - 1;
	for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) {
		if (unlikely(!pfn_to_online_page(pfn)))
			continue;

		if (unlikely(pfn_to_nid(pfn) != nid))
			continue;

		if (zone != page_zone(pfn_to_page(pfn)))
			continue;

		return pfn;
	}

	return 0;
}

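/*
 * Try to shrink the zone span after [start_pfn, end_pfn) was offlined or
 * removed. Only a range at the start or the end of the zone can actually
 * shrink it; a hole in the middle leaves the span untouched.
 */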
static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
			     unsigned long end_pfn)
{
	unsigned long pfn;
	int nid = zone_to_nid(zone);

	zone_span_writelock(zone);
	if (zone->zone_start_pfn == start_pfn) {
		/*
		 * If the removed section was the smallest one in the zone,
		 * we need to shrink zone->zone_start_pfn and
		 * zone->spanned_pages. In this case, find the next smallest
		 * valid pfn and use it as the new zone start.
		 */
		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
						zone_end_pfn(zone));
		if (pfn) {
			zone->spanned_pages = zone_end_pfn(zone) - pfn;
			zone->zone_start_pfn = pfn;
		} else {
			zone->zone_start_pfn = 0;
			zone->spanned_pages = 0;
		}
	} else if (zone_end_pfn(zone) == end_pfn) {
		/*
		 * If the removed section was the biggest one in the zone,
		 * we need to shrink zone->spanned_pages. In this case, find
		 * the next biggest valid pfn and shrink the zone down to it.
		 */
		pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
					       start_pfn);
		if (pfn)
			zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
		else {
			zone->zone_start_pfn = 0;
			zone->spanned_pages = 0;
		}
	}
	zone_span_writeunlock(zone);
}

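/* Recompute the node span from the spans of its populated zones. */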
static void update_pgdat_span(struct pglist_data *pgdat)
{
	unsigned long node_start_pfn = 0, node_end_pfn = 0;
	struct zone *zone;

	for (zone = pgdat->node_zones;
	     zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
		unsigned long zone_end_pfn = zone->zone_start_pfn +
					     zone->spanned_pages;

		/* No need to lock the zones, they can't change. */
		if (!zone->spanned_pages)
			continue;
		if (!node_end_pfn) {
			node_start_pfn = zone->zone_start_pfn;
			node_end_pfn = zone_end_pfn;
			continue;
		}

		if (zone_end_pfn > node_end_pfn)
			node_end_pfn = zone_end_pfn;
		if (zone->zone_start_pfn < node_start_pfn)
			node_start_pfn = zone->zone_start_pfn;
	}

	pgdat->node_start_pfn = node_start_pfn;
	pgdat->node_spanned_pages = node_end_pfn - node_start_pfn;
}

void __ref remove_pfn_range_from_zone(struct zone *zone,
				      unsigned long start_pfn,
				      unsigned long nr_pages)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long flags;

	/* Poison struct pages because they are now uninitialized again. */
	page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);

#ifdef CONFIG_ZONE_DEVICE
	/*
	 * Zone shrinking code cannot properly deal with ZONE_DEVICE. So
	 * we will not try to shrink the zones - which is okay as
	 * set_zone_contiguous() cannot deal with ZONE_DEVICE either way.
	 */
	if (zone_idx(zone) == ZONE_DEVICE)
		return;
#endif

	clear_zone_contiguous(zone);

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
	update_pgdat_span(pgdat);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);

	set_zone_contiguous(zone);
}

static void __remove_section(unsigned long pfn, unsigned long nr_pages,
			     unsigned long map_offset,
			     struct vmem_altmap *altmap)
{
	struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn));

	if (WARN_ON_ONCE(!valid_section(ms)))
		return;

	sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap);
}

/**
 * __remove_pages() - remove sections of pages
 * @pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be multiple of section size)
 * @altmap: alternative device page map or %NULL if default memmap is used
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing. Caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
 */
void __remove_pages(unsigned long pfn, unsigned long nr_pages,
		    struct vmem_altmap *altmap)
{
	const unsigned long end_pfn = pfn + nr_pages;
	unsigned long cur_nr_pages;
	unsigned long map_offset = 0;

	map_offset = vmem_altmap_offset(altmap);

	if (check_pfn_span(pfn, nr_pages, "remove"))
		return;

	for (; pfn < end_pfn; pfn += cur_nr_pages) {
		cond_resched();
		/* Select all remaining pages up to the next section boundary */
		cur_nr_pages = min(end_pfn - pfn, -(pfn | PAGE_SECTION_MASK));
		__remove_section(pfn, cur_nr_pages, map_offset, altmap);
		map_offset = 0;
	}
}

int set_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	get_online_mems();
	mutex_lock(&online_page_callback_lock);

	if (online_page_callback == generic_online_page) {
		online_page_callback = callback;
		rc = 0;
	}

	mutex_unlock(&online_page_callback_lock);
	put_online_mems();

	return rc;
}
EXPORT_SYMBOL_GPL(set_online_page_callback);

int restore_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	get_online_mems();
	mutex_lock(&online_page_callback_lock);

	if (online_page_callback == callback) {
		online_page_callback = generic_online_page;
		rc = 0;
	}

	mutex_unlock(&online_page_callback_lock);
	put_online_mems();

	return rc;
}
EXPORT_SYMBOL_GPL(restore_online_page_callback);

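/* Default onlining callback: hand the pages over to the buddy allocator. */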
void generic_online_page(struct page *page, unsigned int order)
{
	kernel_map_pages(page, 1 << order, 1);
	__free_pages_core(page, order);
	totalram_pages_add(1UL << order);
#ifdef CONFIG_HIGHMEM
	if (PageHighMem(page))
		totalhigh_pages_add(1UL << order);
#endif
}
EXPORT_SYMBOL_GPL(generic_online_page);

static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
			void *arg)
{
	const unsigned long end_pfn = start_pfn + nr_pages;
	unsigned long pfn;
	int order;

	/*
	 * Online the pages. The callback might decide to keep some pages
	 * PG_reserved (to add them to the buddy later), but we still account
	 * them as being online/belonging to this zone ("present").
	 */
	for (pfn = start_pfn; pfn < end_pfn; pfn += 1ul << order) {
		order = min(MAX_ORDER - 1, get_order(PFN_PHYS(end_pfn - pfn)));
		/* __free_pages_core() wants pfns to be aligned to the order */
		if (WARN_ON_ONCE(!IS_ALIGNED(pfn, 1ul << order)))
			order = 0;
		(*online_page_callback)(pfn_to_page(pfn), order);
	}

	/* mark all involved sections as online */
	online_mem_sections(start_pfn, end_pfn);

	*(unsigned long *)arg += nr_pages;
	return 0;
}

/* check which state of node_states will be changed when online memory */
static void node_states_check_changes_online(unsigned long nr_pages,
	struct zone *zone, struct memory_notify *arg)
{
	int nid = zone_to_nid(zone);

	arg->status_change_nid = NUMA_NO_NODE;
	arg->status_change_nid_normal = NUMA_NO_NODE;
	arg->status_change_nid_high = NUMA_NO_NODE;

	if (!node_state(nid, N_MEMORY))
		arg->status_change_nid = nid;
	if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
		arg->status_change_nid_normal = nid;
#ifdef CONFIG_HIGHMEM
	if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY))
		arg->status_change_nid_high = nid;
#endif
}

static void node_states_set_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_set_state(node, N_NORMAL_MEMORY);

	if (arg->status_change_nid_high >= 0)
		node_set_state(node, N_HIGH_MEMORY);

	if (arg->status_change_nid >= 0)
		node_set_state(node, N_MEMORY);
}

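/*
 * Grow the zone span to cover [start_pfn, start_pfn + nr_pages); the span
 * is only ever extended here, never shrunk.
 */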
static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
		unsigned long nr_pages)
{
	unsigned long old_end_pfn = zone_end_pfn(zone);

	if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
		zone->zone_start_pfn = start_pfn;

	zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
}

static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
                                     unsigned long nr_pages)
{
	unsigned long old_end_pfn = pgdat_end_pfn(pgdat);

	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
		pgdat->node_start_pfn = start_pfn;

	pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;

}
/*
 * Associate the pfn range with the given zone, initializing the memmaps
 * and resizing the pgdat/zone data to span the added pages. After this
 * call, all affected pages are PG_reserved.
 */
void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
		unsigned long nr_pages, struct vmem_altmap *altmap)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nid = pgdat->node_id;
	unsigned long flags;

	clear_zone_contiguous(zone);

	/* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */
	pgdat_resize_lock(pgdat, &flags);
	zone_span_writelock(zone);
	if (zone_is_empty(zone))
		init_currently_empty_zone(zone, start_pfn, nr_pages);
	resize_zone_range(zone, start_pfn, nr_pages);
	zone_span_writeunlock(zone);
	resize_pgdat_range(pgdat, start_pfn, nr_pages);
	pgdat_resize_unlock(pgdat, &flags);

	/*
	 * TODO now we have a visible range of pages which are not associated
	 * with their zone properly. Not nice but set_pfnblock_flags_mask
	 * expects the zone spans the pfn range. All the pages in the range
	 * are reserved so nobody should be touching them so we should be safe
	 */
	memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn,
			MEMMAP_HOTPLUG, altmap);

	set_zone_contiguous(zone);
}

/*
 * Returns a default kernel memory zone for the given pfn range.
 * If no kernel zone covers this pfn range it will automatically fall
 * back to ZONE_NORMAL.
 */
static struct zone *default_kernel_zone_for_pfn(int nid, unsigned long start_pfn,
		unsigned long nr_pages)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	int zid;

	for (zid = 0; zid <= ZONE_NORMAL; zid++) {
		struct zone *zone = &pgdat->node_zones[zid];

		if (zone_intersects(zone, start_pfn, nr_pages))
			return zone;
	}

	return &pgdat->node_zones[ZONE_NORMAL];
}

static inline struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn,
		unsigned long nr_pages)
{
	struct zone *kernel_zone = default_kernel_zone_for_pfn(nid, start_pfn,
			nr_pages);
	struct zone *movable_zone = &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];
	bool in_kernel = zone_intersects(kernel_zone, start_pfn, nr_pages);
	bool in_movable = zone_intersects(movable_zone, start_pfn, nr_pages);

	/*
	 * We inherit the existing zone in a simple case where zones do not
	 * overlap in the given range
	 */
	if (in_kernel ^ in_movable)
		return (in_kernel) ? kernel_zone : movable_zone;

	/*
	 * If the range doesn't belong to any zone or two zones overlap in the
	 * given range then we use movable zone only if movable_node is
	 * enabled because we always online to a kernel zone by default.
	 */
	return movable_node_enabled ? movable_zone : kernel_zone;
}

struct zone *zone_for_pfn_range(int online_type, int nid, unsigned long start_pfn,
		unsigned long nr_pages)
{
	if (online_type == MMOP_ONLINE_KERNEL)
		return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages);

	if (online_type == MMOP_ONLINE_MOVABLE)
		return &NODE_DATA(nid)->node_zones[ZONE_MOVABLE];

	return default_zone_for_pfn(nid, start_pfn, nr_pages);
}

int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
		       int online_type, int nid)
{
	unsigned long flags;
	unsigned long onlined_pages = 0;
	struct zone *zone;
	int need_zonelists_rebuild = 0;
	int ret;
	struct memory_notify arg;

	mem_hotplug_begin();

	/* associate pfn range with the zone */
	zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL);

	arg.start_pfn = pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_online(nr_pages, zone, &arg);

	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto failed_addition;

	/*
	 * If this zone is not populated, then it is not in zonelist.
	 * This means the page allocator ignores this zone.
	 * So, zonelist must be updated after online.
	 */
	if (!populated_zone(zone)) {
		need_zonelists_rebuild = 1;
		setup_zone_pageset(zone);
	}

	ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
		online_pages_range);
	if (ret) {
		/* not a single memory resource was applicable */
		if (need_zonelists_rebuild)
			zone_pcp_reset(zone);
		goto failed_addition;
	}

	zone->present_pages += onlined_pages;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	zone->zone_pgdat->node_present_pages += onlined_pages;
	pgdat_resize_unlock(zone->zone_pgdat, &flags);

	shuffle_zone(zone);

	node_states_set_node(nid, &arg);
	if (need_zonelists_rebuild)
		build_all_zonelists(NULL);
	else
		zone_pcp_update(zone);

	init_per_zone_wmark_min();

	kswapd_run(nid);
	kcompactd_run(nid);

	vm_total_pages = nr_free_pagecache_pages();

	writeback_set_ratelimit();

	memory_notify(MEM_ONLINE, &arg);
	mem_hotplug_done();
	return 0;

failed_addition:
	pr_debug("online_pages [mem %#010llx-%#010llx] failed\n",
		 (unsigned long long) pfn << PAGE_SHIFT,
		 (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
	memory_notify(MEM_CANCEL_ONLINE, &arg);
	remove_pfn_range_from_zone(zone, pfn, nr_pages);
	mem_hotplug_done();
	return ret;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */

static void reset_node_present_pages(pg_data_t *pgdat)
{
	struct zone *z;

	for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
		z->present_pages = 0;

	pgdat->node_present_pages = 0;
}

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
	struct pglist_data *pgdat;
	unsigned long start_pfn = PFN_DOWN(start);

	pgdat = NODE_DATA(nid);
	if (!pgdat) {
		pgdat = arch_alloc_nodedata(nid);
		if (!pgdat)
			return NULL;

		pgdat->per_cpu_nodestats =
			alloc_percpu(struct per_cpu_nodestat);
		arch_refresh_nodedata(nid, pgdat);
	} else {
		int cpu;
		/*
		 * Reset the nr_zones, order and classzone_idx before reuse.
		 * Note that kswapd will init kswapd_classzone_idx properly
		 * when it starts in the near future.
		 */
		pgdat->nr_zones = 0;
		pgdat->kswapd_order = 0;
		pgdat->kswapd_classzone_idx = 0;
		for_each_online_cpu(cpu) {
			struct per_cpu_nodestat *p;

			p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
			memset(p, 0, sizeof(*p));
		}
	}

	/* we can use NODE_DATA(nid) from here */

	pgdat->node_id = nid;
	pgdat->node_start_pfn = start_pfn;

	/* init node's zones as empty zones, we don't have any present pages.*/
	free_area_init_core_hotplug(nid);

	/*
	 * The node we allocated has no zone fallback lists. To avoid
	 * accessing an uninitialized zonelist, build it here.
	 */
	build_all_zonelists(pgdat);

	/*
	 * When memory is hot-added, all the memory is in offline state. So
	 * clear all zones' present_pages because they will be updated in
	 * online_pages() and offline_pages().
	 */
	reset_node_managed_pages(pgdat);
	reset_node_present_pages(pgdat);

	return pgdat;
}

static void rollback_node_hotadd(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);

	arch_refresh_nodedata(nid, NULL);
	free_percpu(pgdat->per_cpu_nodestats);
	arch_free_nodedata(pgdat);
}


/**
 * try_online_node - online a node if offlined
 * @nid: the node ID
 * @start: start addr of the node
 * @set_node_online: Whether we want to online the node
 * called by cpu_up() to online a node without onlined memory.
 *
 * Returns:
 * 1 -> a new node has been allocated
 * 0 -> the node is already online
 * -ENOMEM -> the node could not be allocated
932
 */
static int __try_online_node(int nid, u64 start, bool set_node_online)
{
	pg_data_t *pgdat;
	int ret = 1;

	if (node_online(nid))
		return 0;

	pgdat = hotadd_new_pgdat(nid, start);
	if (!pgdat) {
		pr_err("Cannot online node %d due to NULL pgdat\n", nid);
		ret = -ENOMEM;
		goto out;
	}

	if (set_node_online) {
		node_set_online(nid);
		ret = register_one_node(nid);
		BUG_ON(ret);
	}
out:
	return ret;
}

/*
 * Users of this function always want to online/register the node
 */
int try_online_node(int nid)
{
	int ret;

	mem_hotplug_begin();
	ret =  __try_online_node(nid, 0, true);
	mem_hotplug_done();
	return ret;
}

static int check_hotplug_memory_range(u64 start, u64 size)
{
	/* memory range must be block size aligned */
	if (!size || !IS_ALIGNED(start, memory_block_size_bytes()) ||
	    !IS_ALIGNED(size, memory_block_size_bytes())) {
		pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx",
		       memory_block_size_bytes(), start, size);
		return -EINVAL;
	}

	return 0;
}

static int online_memory_block(struct memory_block *mem, void *arg)
{
	return device_online(&mem->dev);
}

/*
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations (triggered e.g. by sysfs).
 *
 * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
 */
int __ref add_memory_resource(int nid, struct resource *res)
{
	struct mhp_restrictions restrictions = {};
	u64 start, size;
	bool new_node = false;
	int ret;

	start = res->start;
	size = resource_size(res);

	ret = check_hotplug_memory_range(start, size);
	if (ret)
		return ret;

	mem_hotplug_begin();

	/*
	 * Add new range to memblock so that when hotadd_new_pgdat() is called
	 * to allocate new pgdat, get_pfn_range_for_nid() will be able to find
	 * this new range and calculate total pages correctly.  The range will
	 * be removed at hot-remove time.
	 */
	memblock_add_node(start, size, nid);

	ret = __try_online_node(nid, start, false);
	if (ret < 0)
		goto error;
	new_node = ret;

	/* call arch's memory hotadd */
	ret = arch_add_memory(nid, start, size, &restrictions);
	if (ret < 0)
		goto error;

	/* create memory block devices after memory was added */
	ret = create_memory_block_devices(start, size);
	if (ret) {
		arch_remove_memory(nid, start, size, NULL);
		goto error;
	}

	if (new_node) {
		/* If sysfs file of new node can't be created, cpu on the node
		 * can't be hot-added. There is no rollback way now.
		 * So, check by BUG_ON() to catch it reluctantly..
		 * We online node here. We can't roll back from here.
		 */
		node_set_online(nid);
		ret = __register_one_node(nid);
		BUG_ON(ret);
	}

	/* link memory sections under this node.*/
	ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1));
	BUG_ON(ret);

	/* create new memmap entry */
	firmware_map_add_hotplug(start, start + size, "System RAM");

	/* device_online() will take the lock when calling online_pages() */
	mem_hotplug_done();

	/* online pages if requested */
	if (memhp_auto_online)
		walk_memory_blocks(start, size, NULL, online_memory_block);

	return ret;
error:
	/* rollback pgdat allocation and others */
	if (new_node)
		rollback_node_hotadd(nid);
	memblock_remove(start, size);
	mem_hotplug_done();
	return ret;
}

/* requires device_hotplug_lock, see add_memory_resource() */
int __ref __add_memory(int nid, u64 start, u64 size)
{
	struct resource *res;
	int ret;

	res = register_memory_resource(start, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = add_memory_resource(nid, res);
	if (ret < 0)
		release_memory_resource(res);
	return ret;
}

int add_memory(int nid, u64 start, u64 size)
{
	int rc;

	lock_device_hotplug();
	rc = __add_memory(nid, start, size);
	unlock_device_hotplug();

	return rc;
}
EXPORT_SYMBOL_GPL(add_memory);

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
 * set and the size of the free page is given by page_order(). Using this,
 * the function determines if the pageblock contains only free pages.
 * Due to buddy constraints, a free page at least the size of a pageblock will
 * be located at the start of the pageblock.
 */
static inline int pageblock_free(struct page *page)
{
	return PageBuddy(page) && page_order(page) >= pageblock_order;
}

/* Return the pfn of the start of the next active pageblock after a given pfn */
static unsigned long next_active_pageblock(unsigned long pfn)
{
	struct page *page = pfn_to_page(pfn);

	/* Ensure the starting page is pageblock-aligned */
	BUG_ON(pfn & (pageblock_nr_pages - 1));

	/* If the entire pageblock is free, move to the end of free page */
	if (pageblock_free(page)) {
		int order;
		/* be careful. we don't have locks, page_order can be changed.*/
		order = page_order(page);
		if ((order < MAX_ORDER) && (order >= pageblock_order))
			return pfn + (1 << order);
	}

	return pfn + pageblock_nr_pages;
}

static bool is_pageblock_removable_nolock(unsigned long pfn)
{
	struct page *page = pfn_to_page(pfn);
	struct zone *zone;

	/*
	 * We have to be careful here because we are iterating over memory
	 * sections which are not zone aware so we might end up outside of
	 * the zone but still within the section.
	 * We have to take care about the node as well. If the node is offline
	 * its NODE_DATA will be NULL - see page_zone.
	 */
	if (!node_online(page_to_nid(page)))
		return false;

	zone = page_zone(page);
	pfn = page_to_pfn(page);
	if (!zone_spans_pfn(zone, pfn))
		return false;

	return !has_unmovable_pages(zone, page, MIGRATE_MOVABLE,
				    MEMORY_OFFLINE);
}

/* Checks if this range of memory is likely to be hot-removable. */
bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
	unsigned long end_pfn, pfn;

	end_pfn = min(start_pfn + nr_pages,
			zone_end_pfn(page_zone(pfn_to_page(start_pfn))));

	/* Check the starting page of each pageblock within the range */
	for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) {
		if (!is_pageblock_removable_nolock(pfn))
			return false;
		cond_resched();
	}

	/* All pageblocks in the memory block are likely to be hot-removable */
	return true;
}

/*
 * Confirm all pages in a range [start, end) belong to the same zone (skipping
 * memory holes). When true, return the zone.
 */
struct zone *test_pages_in_a_zone(unsigned long start_pfn,
				  unsigned long end_pfn)
{
	unsigned long pfn, sec_end_pfn;
	struct zone *zone = NULL;
	struct page *page;
	int i;
	for (pfn = start_pfn, sec_end_pfn = SECTION_ALIGN_UP(start_pfn + 1);
	     pfn < end_pfn;
	     pfn = sec_end_pfn, sec_end_pfn += PAGES_PER_SECTION) {
		/* Make sure the memory section is present first */
		if (!present_section_nr(pfn_to_section_nr(pfn)))
			continue;
		for (; pfn < sec_end_pfn && pfn < end_pfn;
		     pfn += MAX_ORDER_NR_PAGES) {
			i = 0;
			/* This is just a CONFIG_HOLES_IN_ZONE check.*/
			while ((i < MAX_ORDER_NR_PAGES) &&
				!pfn_valid_within(pfn + i))
				i++;
			if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
				continue;
			/* Check if we got outside of the zone */
			if (zone && !zone_spans_pfn(zone, pfn + i))
				return NULL;
			page = pfn_to_page(pfn + i);
			if (zone && page_zone(page) != zone)
				return NULL;
			zone = page_zone(page);
		}
	}

	return zone;
}

/*
 * Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
 * non-lru movable pages and hugepages). We scan pfn because it's much
 * easier than scanning over linked list. This function returns the pfn
 * of the first found movable page if it's found, otherwise 0.
 */
static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
{
	unsigned long pfn;

	for (pfn = start; pfn < end; pfn++) {
		struct page *page, *head;
		unsigned long skip;

		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);
		if (PageLRU(page))
			return pfn;
		if (__PageMovable(page))
			return pfn;

		if (!PageHuge(page))
			continue;
		head = compound_head(page);
		if (page_huge_active(head))
			return pfn;
		skip = compound_nr(head) - (page - head);
		pfn += skip - 1;
	}
	return 0;
}

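/*
 * Allocation callback for migration: prefer any node other than the one
 * that is being offlined, falling back to it only if no other node has
 * memory.
 */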
static struct page *new_node_page(struct page *page, unsigned long private)
{
	int nid = page_to_nid(page);
	nodemask_t nmask = node_states[N_MEMORY];

	/*
	 * try to allocate from a different node but reuse this node if there
	 * are no other online nodes to be used (e.g. we are offlining a part
	 * of the only existing node)
	 */
	node_clear(nid, nmask);
	if (nodes_empty(nmask))
		node_set(nid, nmask);

	return new_page_nodemask(page, nid, &nmask);
}

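/*
 * Isolate and migrate all movable pages (LRU, non-LRU movable and hugetlb
 * pages) in [start_pfn, end_pfn) away from the range being offlined.
 */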
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct page *page;
	int ret = 0;
	LIST_HEAD(source);

	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);

		if (PageHuge(page)) {
			struct page *head = compound_head(page);
			pfn = page_to_pfn(head) + compound_nr(head) - 1;
			isolate_huge_page(head, &source);
			continue;
		} else if (PageTransHuge(page))
			pfn = page_to_pfn(compound_head(page))
				+ hpage_nr_pages(page) - 1;

		/*
		 * HWPoison pages have elevated reference counts so the migration would
		 * fail on them. It also doesn't make any sense to migrate them in the
		 * first place. Still try to unmap such a page in case it is still mapped
		 * (e.g. current hwpoison implementation doesn't unmap KSM pages but keep
		 * the unmap as the catch all safety net).
		 */
		if (PageHWPoison(page)) {
			if (WARN_ON(PageLRU(page)))
				isolate_lru_page(page);
			if (page_mapped(page))
				try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
			continue;
		}

		if (!get_page_unless_zero(page))
			continue;
		/*
		 * We can skip free pages. And we can deal with pages on
		 * LRU and non-lru movable pages.
		 */
		if (PageLRU(page))
			ret = isolate_lru_page(page);
		else
			ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
		if (!ret) { /* Success */
			list_add_tail(&page->lru, &source);
			if (!__PageMovable(page))
				inc_node_page_state(page, NR_ISOLATED_ANON +
						    page_is_file_cache(page));

		} else {
			pr_warn("failed to isolate pfn %lx\n", pfn);
			dump_page(page, "isolation failed");
		}
		put_page(page);
	}
	if (!list_empty(&source)) {
		/* Allocate a new page from the nearest neighbor node */
		ret = migrate_pages(&source, new_node_page, NULL, 0,
					MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
		if (ret) {
			list_for_each_entry(page, &source, lru) {
				pr_warn("migrating pfn %lx failed ret:%d ",
				       page_to_pfn(page), ret);
				dump_page(page, "migration failure");
			}
			putback_movable_pages(&source);
		}
	}

	return ret;
}

/* Mark all sections offline and remove all free pages from the buddy. */
static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
			void *data)
{
	unsigned long *offlined_pages = (unsigned long *)data;

	*offlined_pages += __offline_isolated_pages(start, start + nr_pages);
	return 0;
}

/*
 * Check that all pages in the range, recorded as a memory resource, are isolated.
 */
static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
			void *data)
{
	return test_pages_isolated(start_pfn, start_pfn + nr_pages,
				   MEMORY_OFFLINE);
}

static int __init cmdline_parse_movable_node(char *p)
{
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
	movable_node_enabled = true;
#else
	pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n");
#endif
	return 0;
}
early_param("movable_node", cmdline_parse_movable_node);

/* check which state of node_states will be changed when offline memory */
static void node_states_check_changes_offline(unsigned long nr_pages,
		struct zone *zone, struct memory_notify *arg)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt;

	arg->status_change_nid = NUMA_NO_NODE;
	arg->status_change_nid_normal = NUMA_NO_NODE;
	arg->status_change_nid_high = NUMA_NO_NODE;

	/*
	 * Check whether node_states[N_NORMAL_MEMORY] will be changed.
	 * If the memory to be offline is within the range
	 * [0..ZONE_NORMAL], and it is the last present memory there,
	 * the zones in that range will become empty after the offlining,
	 * thus we can determine that we need to clear the node from
	 * node_states[N_NORMAL_MEMORY].
	 */
	for (zt = 0; zt <= ZONE_NORMAL; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
		arg->status_change_nid_normal = zone_to_nid(zone);

#ifdef CONFIG_HIGHMEM
	/*
	 * node_states[N_HIGH_MEMORY] contains nodes which
	 * have normal memory or high memory.
	 * Here we add the present_pages belonging to ZONE_HIGHMEM.
	 * If the zone is within the range of [0..ZONE_HIGHMEM), and
	 * we determine that the zones in that range become empty,
	 * we need to clear the node for N_HIGH_MEMORY.
	 */
	present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages;
	if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages)
		arg->status_change_nid_high = zone_to_nid(zone);
#endif

	/*
	 * We have accounted the pages from [0..ZONE_NORMAL), and
	 * in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM
	 * as well.
	 * Here we count the possible pages from ZONE_MOVABLE.
	 * If after having accounted all the pages, we see that the nr_pages
	 * to be offlined is over or equal to the accounted pages,
	 * we know that the node will become empty, and so, we can clear
	 * it for N_MEMORY as well.
	 */
	present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages;

	if (nr_pages >= present_pages)
		arg->status_change_nid = zone_to_nid(zone);
}

static void node_states_clear_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_clear_state(node, N_NORMAL_MEMORY);

	if (arg->status_change_nid_high >= 0)
		node_clear_state(node, N_HIGH_MEMORY);

	if (arg->status_change_nid >= 0)
		node_clear_state(node, N_MEMORY);
}

static int count_system_ram_pages_cb(unsigned long start_pfn,
				     unsigned long nr_pages, void *data)
{
	unsigned long *nr_system_ram_pages = data;

	*nr_system_ram_pages += nr_pages;
	return 0;
}

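/*
 * Offline [start_pfn, end_pfn): isolate the range, migrate everything
 * movable away, remove the remaining free pages from the buddy allocator
 * and mark the sections offline.
 */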
static int __ref __offline_pages(unsigned long start_pfn,
		  unsigned long end_pfn)
{
	unsigned long pfn, nr_pages = 0;
	unsigned long offlined_pages = 0;
	int ret, node, nr_isolate_pageblock;
	unsigned long flags;
	struct zone *zone;
	struct memory_notify arg;
	char *reason;

	mem_hotplug_begin();

	/*
	 * Don't allow to offline memory blocks that contain holes.
	 * Consequently, memory blocks with holes can never get onlined
	 * via the hotplug path - online_pages() - as hotplugged memory has
	 * no holes. This way, we e.g., don't have to worry about marking
	 * memory holes PG_reserved, don't need pfn_valid() checks, and can
	 * avoid using walk_system_ram_range() later.
	 */
	walk_system_ram_range(start_pfn, end_pfn - start_pfn, &nr_pages,
			      count_system_ram_pages_cb);
	if (nr_pages != end_pfn - start_pfn) {
		ret = -EINVAL;
		reason = "memory holes";
		goto failed_removal;
	}

	/* This makes hotplug much easier...and readable.
	   We assume this for now. */
	zone = test_pages_in_a_zone(start_pfn, end_pfn);
	if (!zone) {
		ret = -EINVAL;
		reason = "multizone range";
		goto failed_removal;
	}
	node = zone_to_nid(zone);

	/* set above range as isolated */
	ret = start_isolate_page_range(start_pfn, end_pfn,
				       MIGRATE_MOVABLE,
				       MEMORY_OFFLINE | REPORT_FAILURE);
	if (ret < 0) {
		reason = "failure to isolate range";
		goto failed_removal;
	}
	nr_isolate_pageblock = ret;

	arg.start_pfn = start_pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_offline(nr_pages, zone, &arg);

	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret) {
		reason = "notifier failure";
		goto failed_removal_isolated;
	}

	do {
		for (pfn = start_pfn; pfn;) {
			if (signal_pending(current)) {
				ret = -EINTR;
				reason = "signal backoff";
				goto failed_removal_isolated;
			}

			cond_resched();
			lru_add_drain_all();

			pfn = scan_movable_pages(pfn, end_pfn);
			if (pfn) {
				/*
				 * TODO: fatal migration failures should bail
				 * out
				 */
				do_migrate_range(pfn, end_pfn);
			}
		}

		/*
		 * Dissolve free hugepages in the memory block before doing
		 * offlining actually in order to make hugetlbfs's object
		 * counting consistent.
		 */
		ret = dissolve_free_huge_pages(start_pfn, end_pfn);
		if (ret) {
			reason = "failure to dissolve huge pages";
			goto failed_removal_isolated;
		}
		/* check again */
		ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
					    NULL, check_pages_isolated_cb);
	} while (ret);

	/* Ok, all of our target is isolated.
	   We cannot do rollback at this point. */
	walk_system_ram_range(start_pfn, end_pfn - start_pfn,
			      &offlined_pages, offline_isolated_pages_cb);
	pr_info("Offlined Pages %ld\n", offlined_pages);
	/*
	 * Onlining will reset pagetype flags and makes migrate type
	 * MOVABLE, so just need to decrease the number of isolated
	 * pageblocks zone counter here.
	 */
	spin_lock_irqsave(&zone->lock, flags);
	zone->nr_isolate_pageblock -= nr_isolate_pageblock;
	spin_unlock_irqrestore(&zone->lock, flags);

	/* removal success */
	adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
	zone->present_pages -= offlined_pages;

	pgdat_resize_lock(zone->zone_pgdat, &flags);
	zone->zone_pgdat->node_present_pages -= offlined_pages;
	pgdat_resize_unlock(zone->zone_pgdat, &flags);

	init_per_zone_wmark_min();

	if (!populated_zone(zone)) {
		zone_pcp_reset(zone);
		build_all_zonelists(NULL);
	} else
		zone_pcp_update(zone);

	node_states_clear_node(node, &arg);
	if (arg.status_change_nid >= 0) {
		kswapd_stop(node);
		kcompactd_stop(node);
	}

	vm_total_pages = nr_free_pagecache_pages();
	writeback_set_ratelimit();

	memory_notify(MEM_OFFLINE, &arg);
	remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
	mem_hotplug_done();
	return 0;

failed_removal_isolated:
	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
	memory_notify(MEM_CANCEL_OFFLINE, &arg);
failed_removal:
	pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n",
		 (unsigned long long) start_pfn << PAGE_SHIFT,
		 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
		 reason);
	/* pushback to free area */
	mem_hotplug_done();
	return ret;
}

int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
	return __offline_pages(start_pfn, start_pfn + nr_pages);
}

static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
	int ret = !is_memblock_offlined(mem);

	if (unlikely(ret)) {
		phys_addr_t beginpa, endpa;

		beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
		endpa = beginpa + memory_block_size_bytes() - 1;
		pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
			&beginpa, &endpa);

		return -EBUSY;
	}
	return 0;
}

static int check_cpu_on_node(pg_data_t *pgdat)
{
	int cpu;

	for_each_present_cpu(cpu) {
		if (cpu_to_node(cpu) == pgdat->node_id)
			/*
			 * the cpu on this node isn't removed, and we can't
			 * offline this node.
			 */
			return -EBUSY;
	}

	return 0;
}

static int check_no_memblock_for_node_cb(struct memory_block *mem, void *arg)
{
	int nid = *(int *)arg;

	/*
	 * If a memory block belongs to multiple nodes, the stored nid is not
	 * reliable. However, such blocks are always online (e.g., cannot get
	 * offlined) and, therefore, are still spanned by the node.
	 */
	return mem->nid == nid ? -EEXIST : 0;
}

/**
 * try_offline_node
 * @nid: the node ID
 *
 * Offline a node if all memory sections and cpus of the node are removed.
 *
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations before this call.
 */
void try_offline_node(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	int rc;

	/*
	 * If the node still spans pages (especially ZONE_DEVICE), don't
	 * offline it. A node spans memory after move_pfn_range_to_zone(),
	 * e.g., after the memory block was onlined.
	 */
	if (pgdat->node_spanned_pages)
		return;

	/*
	 * Especially offline memory blocks might not be spanned by the
	 * node. They will get spanned by the node once they get onlined.
	 * However, they link to the node in sysfs and can get onlined later.
	 */
	rc = for_each_memory_block(&nid, check_no_memblock_for_node_cb);
	if (rc)
		return;

	if (check_cpu_on_node(pgdat))
		return;

	/*
	 * all memory/cpu of this node are removed, we can offline this
	 * node now.
	 */
	node_set_offline(nid);
	unregister_one_node(nid);
}
EXPORT_SYMBOL(try_offline_node);

static void __release_memory_resource(resource_size_t start,
				      resource_size_t size)
{
	int ret;

	/*
	 * When removing memory in the same granularity as it was added,
	 * this function never fails. It might only fail if resources
	 * have to be adjusted or split. We'll ignore the error, as
	 * removing of memory cannot fail.
	 */
	ret = release_mem_region_adjustable(&iomem_resource, start, size);
	if (ret) {
		resource_size_t endres = start + size - 1;

		pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
			&start, &endres, ret);
	}
}

static int __ref try_remove_memory(int nid, u64 start, u64 size)
{
	int rc = 0;

	BUG_ON(check_hotplug_memory_range(start, size));

	/*
	 * All memory blocks must be offlined before removing memory.  Check
	 * whether all memory blocks in question are offline and return error
	 * if this is not the case.
	 */
	rc = walk_memory_blocks(start, size, NULL, check_memblock_offlined_cb);
	if (rc)
		goto done;

	/* remove memmap entry */
	firmware_map_remove(start, start + size, "System RAM");

	/*
	 * Memory block device removal under the device_hotplug_lock is
	 * a barrier against racing online attempts.
	 */
	remove_memory_block_devices(start, size);

	mem_hotplug_begin();

	arch_remove_memory(nid, start, size, NULL);
	memblock_free(start, size);
	memblock_remove(start, size);
	__release_memory_resource(start, size);

	try_offline_node(nid);

done:
	mem_hotplug_done();
	return rc;
}

/**
 * remove_memory
 * @nid: the node ID
 * @start: physical address of the region to remove
 * @size: size of the region to remove
 *
 * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
 * and online/offline operations before this call, as required by
 * try_offline_node().
 */
void __remove_memory(int nid, u64 start, u64 size)
{

	/*
	 * trigger BUG() if some memory is not offlined prior to calling this
	 * function
	 */
	if (try_remove_memory(nid, start, size))
		BUG();
}

/*
 * Remove memory if every memory block is offline, otherwise return -EBUSY is
 * some memory is not offline
 */
int remove_memory(int nid, u64 start, u64 size)
{
	int rc;

	lock_device_hotplug();
	rc  = try_remove_memory(nid, start, size);
	unlock_device_hotplug();

	return rc;
}
EXPORT_SYMBOL_GPL(remove_memory);
#endif /* CONFIG_MEMORY_HOTREMOVE */