/*
 *  linux/mm/memory_hotplug.c
 *
 *  Copyright (C)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
#include <linux/mm_inline.h>
#include <linux/firmware-map.h>

#include <asm/tlbflush.h>

#include "internal.h"

/*
 * online_page_callback contains a pointer to the current page onlining
 * function. Initially it is generic_online_page(). If required, it can be
 * changed by calling set_online_page_callback() to register a new callback
 * and restore_online_page_callback() to restore the generic one.
 */

static void generic_online_page(struct page *page);

static online_page_callback_t online_page_callback = generic_online_page;
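
/*
 * Illustrative sketch, not part of this file: a driver (for example a memory
 * balloon) could take over page onlining roughly like this. The names
 * my_online_page() and my_driver_wants_page() are hypothetical.
 *
 *	static void my_online_page(struct page *page)
 *	{
 *		__online_page_set_limits(page);
 *		if (my_driver_wants_page(page)) {
 *			my_driver_claim_page(page);
 *		} else {
 *			__online_page_increment_counters(page);
 *			__online_page_free(page);
 *		}
 *	}
 *
 *	rc = set_online_page_callback(&my_online_page);
 *	...
 *	rc = restore_online_page_callback(&my_online_page);
 */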

DEFINE_MUTEX(mem_hotplug_mutex);

void lock_memory_hotplug(void)
{
	mutex_lock(&mem_hotplug_mutex);

	/* for exclusive hibernation if CONFIG_HIBERNATION=y */
	lock_system_sleep();
}

void unlock_memory_hotplug(void)
{
	unlock_system_sleep();
	mutex_unlock(&mem_hotplug_mutex);
}
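
/*
 * Illustrative usage sketch: code elsewhere that must not race with memory
 * hot-add/remove brackets its work with this pair, e.g.
 *
 *	lock_memory_hotplug();
 *	... inspect or update hotplug-sensitive state ...
 *	unlock_memory_hotplug();
 */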


/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
	struct resource *res;
	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
	BUG_ON(!res);

	res->name = "System RAM";
	res->start = start;
	res->end = start + size - 1;
	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
	if (request_resource(&iomem_resource, res) < 0) {
		printk("System RAM resource %pR cannot be added\n", res);
		kfree(res);
		res = NULL;
	}
	return res;
}

static void release_memory_resource(struct resource *res)
{
	if (!res)
		return;
	release_resource(res);
	kfree(res);
	return;
}

#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
void get_page_bootmem(unsigned long info,  struct page *page,
		      unsigned long type)
{
	page->lru.next = (struct list_head *) type;
	SetPagePrivate(page);
	set_page_private(page, info);
	atomic_inc(&page->_count);
}

/* reference to __meminit __free_pages_bootmem is valid
 * so use __ref to tell modpost not to generate a warning */
void __ref put_page_bootmem(struct page *page)
{
	unsigned long type;
	static DEFINE_MUTEX(ppb_lock);

	type = (unsigned long) page->lru.next;
	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
	       type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);

	if (atomic_dec_return(&page->_count) == 1) {
		ClearPagePrivate(page);
		set_page_private(page, 0);
		INIT_LIST_HEAD(&page->lru);

		/*
		 * Please refer to comment for __free_pages_bootmem()
		 * for why we serialize here.
		 */
		mutex_lock(&ppb_lock);
		__free_pages_bootmem(page, 0);
		mutex_unlock(&ppb_lock);
	}

}

#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
#ifndef CONFIG_SPARSEMEM_VMEMMAP
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	/* Get section's memmap address */
	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	/*
	 * Get page for the memmap's phys address
	 * XXX: need more consideration for sparse_vmemmap...
	 */
	page = virt_to_page(memmap);
	mapsize = sizeof(struct page) * PAGES_PER_SECTION;
	mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;

	/* remember memmap's page */
	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, SECTION_INFO);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);

}
#else /* CONFIG_SPARSEMEM_VMEMMAP */
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
	unsigned long *usemap, mapsize, section_nr, i;
	struct mem_section *ms;
	struct page *page, *memmap;

	if (!pfn_valid(start_pfn))
		return;

	section_nr = pfn_to_section_nr(start_pfn);
	ms = __nr_to_section(section_nr);

	memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);

	register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);

	usemap = __nr_to_section(section_nr)->pageblock_flags;
	page = virt_to_page(usemap);

	mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;

	for (i = 0; i < mapsize; i++, page++)
		get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */

void register_page_bootmem_info_node(struct pglist_data *pgdat)
{
	unsigned long i, pfn, end_pfn, nr_pages;
	int node = pgdat->node_id;
	struct page *page;
	struct zone *zone;

	nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
	page = virt_to_page(pgdat);

	for (i = 0; i < nr_pages; i++, page++)
		get_page_bootmem(node, page, NODE_INFO);

	zone = &pgdat->node_zones[0];
	for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
		if (zone->wait_table) {
			nr_pages = zone->wait_table_hash_nr_entries
				* sizeof(wait_queue_head_t);
			nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
			page = virt_to_page(zone->wait_table);

			for (i = 0; i < nr_pages; i++, page++)
				get_page_bootmem(node, page, NODE_INFO);
		}
	}

	pfn = pgdat->node_start_pfn;
	end_pfn = pfn + pgdat->node_spanned_pages;

	/* register_section info */
	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		/*
		 * Some platforms can assign the same pfn to multiple nodes - on
		 * node0 as well as nodeN.  To avoid registering a pfn against
		 * multiple nodes we check that this pfn does not already
		 * reside in some other node.
		 */
		if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
			register_page_bootmem_info_section(pfn);
	}
}
#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */

static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
			   unsigned long end_pfn)
{
	unsigned long old_zone_end_pfn;

	zone_span_writelock(zone);

	old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
	if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn)
		zone->zone_start_pfn = start_pfn;

	zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
				zone->zone_start_pfn;

	zone_span_writeunlock(zone);
}

static void resize_zone(struct zone *zone, unsigned long start_pfn,
		unsigned long end_pfn)
{
	zone_span_writelock(zone);

	if (end_pfn - start_pfn) {
		zone->zone_start_pfn = start_pfn;
		zone->spanned_pages = end_pfn - start_pfn;
	} else {
		/*
		 * make it consistent with free_area_init_core():
		 * if spanned_pages is 0, then keep start_pfn at 0
		 */
		zone->zone_start_pfn = 0;
		zone->spanned_pages = 0;
	}

	zone_span_writeunlock(zone);
}

static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
		unsigned long end_pfn)
{
	enum zone_type zid = zone_idx(zone);
	int nid = zone->zone_pgdat->node_id;
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn++)
		set_page_links(pfn_to_page(pfn), zid, nid, pfn);
}

static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
		unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;
	unsigned long flags;
	unsigned long z1_start_pfn;

	if (!z1->wait_table) {
		ret = init_currently_empty_zone(z1, start_pfn,
			end_pfn - start_pfn, MEMMAP_HOTPLUG);
		if (ret)
			return ret;
	}

	pgdat_resize_lock(z1->zone_pgdat, &flags);

	/* can't move pfns which are higher than @z2 */
	if (end_pfn > z2->zone_start_pfn + z2->spanned_pages)
		goto out_fail;
	/* the moved-out part must be at the leftmost of @z2 */
	if (start_pfn > z2->zone_start_pfn)
		goto out_fail;
	/* must include/overlap */
	if (end_pfn <= z2->zone_start_pfn)
		goto out_fail;

	/* use start_pfn for z1's start_pfn if z1 is empty */
	if (z1->spanned_pages)
		z1_start_pfn = z1->zone_start_pfn;
	else
		z1_start_pfn = start_pfn;

	resize_zone(z1, z1_start_pfn, end_pfn);
	resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages);

	pgdat_resize_unlock(z1->zone_pgdat, &flags);

	fix_zone_id(z1, start_pfn, end_pfn);

	return 0;
out_fail:
	pgdat_resize_unlock(z1->zone_pgdat, &flags);
	return -1;
}

static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
		unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;
	unsigned long flags;
	unsigned long z2_end_pfn;

	if (!z2->wait_table) {
		ret = init_currently_empty_zone(z2, start_pfn,
			end_pfn - start_pfn, MEMMAP_HOTPLUG);
		if (ret)
			return ret;
	}

	pgdat_resize_lock(z1->zone_pgdat, &flags);

	/* can't move pfns which are lower than @z1 */
	if (z1->zone_start_pfn > start_pfn)
		goto out_fail;
	/* the moved-out part must be at the rightmost of @z1 */
	if (z1->zone_start_pfn + z1->spanned_pages >  end_pfn)
		goto out_fail;
	/* must include/overlap */
	if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages)
		goto out_fail;

	/* use end_pfn for z2's end_pfn if z2 is empty */
	if (z2->spanned_pages)
		z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages;
	else
		z2_end_pfn = end_pfn;

	resize_zone(z1, z1->zone_start_pfn, start_pfn);
	resize_zone(z2, start_pfn, z2_end_pfn);

	pgdat_resize_unlock(z1->zone_pgdat, &flags);

	fix_zone_id(z2, start_pfn, end_pfn);

	return 0;
out_fail:
	pgdat_resize_unlock(z1->zone_pgdat, &flags);
	return -1;
}

static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
			    unsigned long end_pfn)
{
	unsigned long old_pgdat_end_pfn =
		pgdat->node_start_pfn + pgdat->node_spanned_pages;

	if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
		pgdat->node_start_pfn = start_pfn;

	pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
					pgdat->node_start_pfn;
}

static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int nr_pages = PAGES_PER_SECTION;
	int nid = pgdat->node_id;
	int zone_type;
	unsigned long flags;

	zone_type = zone - pgdat->node_zones;
	if (!zone->wait_table) {
		int ret;

		ret = init_currently_empty_zone(zone, phys_start_pfn,
						nr_pages, MEMMAP_HOTPLUG);
		if (ret)
			return ret;
	}
	pgdat_resize_lock(zone->zone_pgdat, &flags);
	grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
	grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
			phys_start_pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);
	memmap_init_zone(nr_pages, nid, zone_type,
			 phys_start_pfn, MEMMAP_HOTPLUG);
	return 0;
}

static int __meminit __add_section(int nid, struct zone *zone,
					unsigned long phys_start_pfn)
{
	int nr_pages = PAGES_PER_SECTION;
	int ret;

	if (pfn_valid(phys_start_pfn))
		return -EEXIST;

	ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);

	if (ret < 0)
		return ret;

	ret = __add_zone(zone, phys_start_pfn);

	if (ret < 0)
		return ret;

	return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
static int __remove_section(struct zone *zone, struct mem_section *ms)
{
	/*
	 * XXX: Freeing memmap with vmemmap is not implemented yet.
	 *      This should be removed later.
	 */
	return -EBUSY;
}
#else
static int __remove_section(struct zone *zone, struct mem_section *ms)
{
	int ret = -EINVAL;

	if (!valid_section(ms))
		return ret;

	ret = unregister_memory_section(ms);
	if (ret)
		return ret;

	sparse_remove_one_section(zone, ms);
	return 0;
}
#endif

/*
 * Reasonably generic function for adding memory.  It is
 * expected that archs that support memory hotplug will
 * call this function after deciding the zone to which to
 * add the new pages.
 */
int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
			unsigned long nr_pages)
{
	unsigned long i;
	int err = 0;
	int start_sec, end_sec;
	/* during mem_map initialization, align the hot-added range to sections */
	start_sec = pfn_to_section_nr(phys_start_pfn);
	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);

	for (i = start_sec; i <= end_sec; i++) {
		err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);

		/*
		 * EEXIST is finally dealt with by the ioresource collision
		 * check. See add_memory() => register_memory_resource().
		 * A warning will be printed if there is a collision.
		 */
		if (err && (err != -EEXIST))
			break;
		err = 0;
	}

	return err;
}
EXPORT_SYMBOL_GPL(__add_pages);
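
/*
 * Illustrative sketch of the arch side, assuming an architecture whose
 * hot-added memory always lands in ZONE_NORMAL; real implementations also
 * set up the kernel mapping for the new range first (arch-specific):
 *
 *	int arch_add_memory(int nid, u64 start, u64 size)
 *	{
 *		struct pglist_data *pgdat = NODE_DATA(nid);
 *		struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
 *		unsigned long start_pfn = start >> PAGE_SHIFT;
 *		unsigned long nr_pages = size >> PAGE_SHIFT;
 *
 *		return __add_pages(nid, zone, start_pfn, nr_pages);
 *	}
 */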

/**
 * __remove_pages() - remove sections of pages from a zone
 * @zone: zone from which pages need to be removed
 * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be multiple of section size)
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing. Caller needs to make
 * sure that pages are marked reserved and zones are adjust properly by
 * calling offline_pages().
 */
int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
		 unsigned long nr_pages)
{
	unsigned long i, ret = 0;
	int sections_to_remove;

	/*
	 * We can only remove entire sections
	 */
	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
	BUG_ON(nr_pages % PAGES_PER_SECTION);

	release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE);

	sections_to_remove = nr_pages / PAGES_PER_SECTION;
	for (i = 0; i < sections_to_remove; i++) {
		unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
		ret = __remove_section(zone, __pfn_to_section(pfn));
		if (ret)
			break;
	}
	return ret;
}
EXPORT_SYMBOL_GPL(__remove_pages);
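
/*
 * Illustrative sketch of a matching arch_remove_memory(), assuming the whole
 * range was onlined into a single zone; not taken from any particular
 * architecture:
 *
 *	int arch_remove_memory(u64 start, u64 size)
 *	{
 *		unsigned long start_pfn = start >> PAGE_SHIFT;
 *		unsigned long nr_pages = size >> PAGE_SHIFT;
 *		struct zone *zone = page_zone(pfn_to_page(start_pfn));
 *
 *		return __remove_pages(zone, start_pfn, nr_pages);
 *	}
 */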

int set_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	lock_memory_hotplug();

	if (online_page_callback == generic_online_page) {
		online_page_callback = callback;
		rc = 0;
	}

	unlock_memory_hotplug();

	return rc;
}
EXPORT_SYMBOL_GPL(set_online_page_callback);

int restore_online_page_callback(online_page_callback_t callback)
{
	int rc = -EINVAL;

	lock_memory_hotplug();

	if (online_page_callback == callback) {
		online_page_callback = generic_online_page;
		rc = 0;
	}

	unlock_memory_hotplug();

	return rc;
}
EXPORT_SYMBOL_GPL(restore_online_page_callback);

void __online_page_set_limits(struct page *page)
{
	unsigned long pfn = page_to_pfn(page);

	if (pfn >= num_physpages)
		num_physpages = pfn + 1;
}
EXPORT_SYMBOL_GPL(__online_page_set_limits);

void __online_page_increment_counters(struct page *page)
{
	totalram_pages++;

#ifdef CONFIG_HIGHMEM
	if (PageHighMem(page))
		totalhigh_pages++;
#endif
}
EXPORT_SYMBOL_GPL(__online_page_increment_counters);

void __online_page_free(struct page *page)
{
	ClearPageReserved(page);
	init_page_count(page);
	__free_page(page);
}
EXPORT_SYMBOL_GPL(__online_page_free);

static void generic_online_page(struct page *page)
{
	__online_page_set_limits(page);
	__online_page_increment_counters(page);
	__online_page_free(page);
}

static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
			void *arg)
{
	unsigned long i;
	unsigned long onlined_pages = *(unsigned long *)arg;
	struct page *page;
	if (PageReserved(pfn_to_page(start_pfn)))
		for (i = 0; i < nr_pages; i++) {
			page = pfn_to_page(start_pfn + i);
			(*online_page_callback)(page);
			onlined_pages++;
		}
	*(unsigned long *)arg = onlined_pages;
	return 0;
}

#ifdef CONFIG_MOVABLE_NODE
/*
 * When CONFIG_MOVABLE_NODE is set, we permit onlining of a node which doesn't
 * have normal memory.
 */
static bool can_online_high_movable(struct zone *zone)
{
	return true;
}
#else /* CONFIG_MOVABLE_NODE */
/* ensure every online node has NORMAL memory */
static bool can_online_high_movable(struct zone *zone)
{
	return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
}
#endif /* CONFIG_MOVABLE_NODE */

/* check which state of node_states will be changed when online memory */
static void node_states_check_changes_online(unsigned long nr_pages,
	struct zone *zone, struct memory_notify *arg)
{
	int nid = zone_to_nid(zone);
	enum zone_type zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_NORMAL,
	 * set zone_last to ZONE_NORMAL.
	 *
	 * If we don't have HIGHMEM nor movable node,
	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
	 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
	 */
	if (N_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * if the memory to be online is in a zone of 0...zone_last, and
	 * the zones of 0...zone_last don't have memory before online, we will
	 * need to set the node to node_states[N_NORMAL_MEMORY] after
	 * the memory is online.
	 */
	if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
		arg->status_change_nid_normal = nid;
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
		arg->status_change_nid_high = nid;
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * If the node doesn't have memory before online, we will need to
	 * set the node to node_states[N_MEMORY] after the memory
	 * is onlined.
	 */
	if (!node_state(nid, N_MEMORY))
		arg->status_change_nid = nid;
	else
		arg->status_change_nid = -1;
}

static void node_states_set_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_set_state(node, N_NORMAL_MEMORY);

	if (arg->status_change_nid_high >= 0)
		node_set_state(node, N_HIGH_MEMORY);

	node_set_state(node, N_MEMORY);
}


int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
	unsigned long onlined_pages = 0;
	struct zone *zone;
	int need_zonelists_rebuild = 0;
	int nid;
	int ret;
	struct memory_notify arg;

	lock_memory_hotplug();
	/*
	 * This doesn't need a lock to do pfn_to_page().
	 * The section can't be removed here because of the
	 * memory_block->state_mutex.
	 */
	zone = page_zone(pfn_to_page(pfn));

	if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
	    !can_online_high_movable(zone)) {
		unlock_memory_hotplug();
		return -1;
	}

	if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
		if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
			unlock_memory_hotplug();
			return -1;
		}
	}
	if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
		if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
			unlock_memory_hotplug();
			return -1;
		}
	}

	/* Previous code may changed the zone of the pfn range */
	zone = page_zone(pfn_to_page(pfn));

	arg.start_pfn = pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_online(nr_pages, zone, &arg);

	nid = page_to_nid(pfn_to_page(pfn));

	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret) {
		memory_notify(MEM_CANCEL_ONLINE, &arg);
		unlock_memory_hotplug();
		return ret;
	}
	/*
	 * If this zone is not populated, then it is not in zonelist.
	 * This means the page allocator ignores this zone.
	 * So, zonelist must be updated after online.
	 */
	mutex_lock(&zonelists_mutex);
	if (!populated_zone(zone)) {
		need_zonelists_rebuild = 1;
		build_all_zonelists(NULL, zone);
	}

	ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
		online_pages_range);
	if (ret) {
		if (need_zonelists_rebuild)
			zone_pcp_reset(zone);
		mutex_unlock(&zonelists_mutex);
		printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
		       (unsigned long long) pfn << PAGE_SHIFT,
		       (((unsigned long long) pfn + nr_pages)
			    << PAGE_SHIFT) - 1);
		memory_notify(MEM_CANCEL_ONLINE, &arg);
		unlock_memory_hotplug();
		return ret;
	}

	zone->managed_pages += onlined_pages;
	zone->present_pages += onlined_pages;
	zone->zone_pgdat->node_present_pages += onlined_pages;
	if (onlined_pages) {
		node_states_set_node(zone_to_nid(zone), &arg);
		if (need_zonelists_rebuild)
			build_all_zonelists(NULL, NULL);
		else
			zone_pcp_update(zone);
	}

	mutex_unlock(&zonelists_mutex);

	init_per_zone_wmark_min();

	if (onlined_pages)
		kswapd_run(zone_to_nid(zone));

	vm_total_pages = nr_free_pagecache_pages();

	writeback_set_ratelimit();

	if (onlined_pages)
		memory_notify(MEM_ONLINE, &arg);
	unlock_memory_hotplug();

	return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
{
	struct pglist_data *pgdat;
	unsigned long zones_size[MAX_NR_ZONES] = {0};
	unsigned long zholes_size[MAX_NR_ZONES] = {0};
	unsigned long start_pfn = start >> PAGE_SHIFT;

	pgdat = arch_alloc_nodedata(nid);
	if (!pgdat)
		return NULL;

	arch_refresh_nodedata(nid, pgdat);

	/* we can use NODE_DATA(nid) from here */

	/* init node's zones as empty zones, we don't have any present pages.*/
	free_area_init_node(nid, zones_size, start_pfn, zholes_size);

	/*
	 * The node we allocated has no zone fallback lists. To avoid
	 * accessing a not-yet-initialized zonelist, build it here.
	 */
	mutex_lock(&zonelists_mutex);
	build_all_zonelists(pgdat, NULL);
	mutex_unlock(&zonelists_mutex);

	return pgdat;
}

static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
{
	arch_refresh_nodedata(nid, NULL);
	arch_free_nodedata(pgdat);
	return;
}


/*
 * called by cpu_up() to online a node without onlined memory.
 */
int mem_online_node(int nid)
{
	pg_data_t	*pgdat;
	int	ret;

	lock_memory_hotplug();
	pgdat = hotadd_new_pgdat(nid, 0);
	if (!pgdat) {
		ret = -ENOMEM;
		goto out;
	}
	node_set_online(nid);
	ret = register_one_node(nid);
	BUG_ON(ret);

out:
	unlock_memory_hotplug();
	return ret;
}

/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
int __ref add_memory(int nid, u64 start, u64 size)
{
	pg_data_t *pgdat = NULL;
	int new_pgdat = 0;
	struct resource *res;
	int ret;

	lock_memory_hotplug();

	res = register_memory_resource(start, size);
	ret = -EEXIST;
	if (!res)
		goto out;

	if (!node_online(nid)) {
		pgdat = hotadd_new_pgdat(nid, start);
		ret = -ENOMEM;
		if (!pgdat)
			goto error;
		new_pgdat = 1;
	}

	/* call arch's memory hotadd */
	ret = arch_add_memory(nid, start, size);

	if (ret < 0)
		goto error;

	/* we online node here. we can't roll back from here. */
	node_set_online(nid);

	if (new_pgdat) {
		ret = register_one_node(nid);
		/*
		 * If the sysfs file of the new node can't be created, cpus on
		 * the node can't be hot-added. There is no rollback now,
		 * so check it with BUG_ON() to catch it reluctantly.
		 */
		BUG_ON(ret);
	}

	/* create new memmap entry */
	firmware_map_add_hotplug(start, start + size, "System RAM");

	goto out;

error:
	/* rollback pgdat allocation and others */
	if (new_pgdat)
		rollback_node_hotadd(nid, pgdat);
	release_memory_resource(res);

out:
	unlock_memory_hotplug();
	return ret;
}
EXPORT_SYMBOL_GPL(add_memory);
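
/*
 * Illustrative sketch of a caller: a hotplug driver that has discovered a
 * new physical range (start/size here are made-up values) would do
 * something like:
 *
 *	int nid = memory_add_physaddr_to_nid(start);
 *	int rc = add_memory(nid, start, size);
 *
 *	if (rc && rc != -EEXIST)
 *		return rc;
 */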

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
 * set and the size of the free page is given by page_order(). Using this,
 * the function determines if the pageblock contains only free pages.
 * Due to buddy constraints, a free page at least the size of a pageblock will
 * be located at the start of the pageblock.
 */
static inline int pageblock_free(struct page *page)
{
	return PageBuddy(page) && page_order(page) >= pageblock_order;
}

/* Return the start of the next active pageblock after a given page */
static struct page *next_active_pageblock(struct page *page)
{
	/* Ensure the starting page is pageblock-aligned */
	BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));

	/* If the entire pageblock is free, move to the end of free page */
	if (pageblock_free(page)) {
		int order;
		/* be careful. we don't have locks, page_order can be changed.*/
		order = page_order(page);
		if ((order < MAX_ORDER) && (order >= pageblock_order))
			return page + (1 << order);
	}

	return page + pageblock_nr_pages;
}

/* Checks if this range of memory is likely to be hot-removable. */
int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
	struct page *page = pfn_to_page(start_pfn);
	struct page *end_page = page + nr_pages;

	/* Check the starting page of each pageblock within the range */
	for (; page < end_page; page = next_active_pageblock(page)) {
		if (!is_pageblock_removable_nolock(page))
			return 0;
		cond_resched();
	}

	/* All pageblocks in the memory block are likely to be hot-removable */
	return 1;
}

/*
 * Confirm that all pages in a range [start, end) belong to the same zone.
 */
static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct zone *zone = NULL;
	struct page *page;
	int i;
	for (pfn = start_pfn;
	     pfn < end_pfn;
	     pfn += MAX_ORDER_NR_PAGES) {
		i = 0;
		/* This is just a CONFIG_HOLES_IN_ZONE check.*/
		while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
			i++;
		if (i == MAX_ORDER_NR_PAGES)
			continue;
		page = pfn_to_page(pfn + i);
		if (zone && page_zone(page) != zone)
			return 0;
		zone = page_zone(page);
	}
	return 1;
}

/*
 * Scanning pfn is much easier than scanning lru list.
 * Scan pfns from start to end and find the first LRU page.
 */
static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
{
	unsigned long pfn;
	struct page *page;
	for (pfn = start; pfn < end; pfn++) {
		if (pfn_valid(pfn)) {
			page = pfn_to_page(pfn);
			if (PageLRU(page))
				return pfn;
		}
	}
	return 0;
}

#define NR_OFFLINE_AT_ONCE_PAGES	(256)
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct page *page;
	int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
	int not_managed = 0;
	int ret = 0;
	LIST_HEAD(source);

	for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);
		if (!get_page_unless_zero(page))
			continue;
		/*
		 * We can skip free pages. And we can only deal with pages on
		 * LRU.
		 */
		ret = isolate_lru_page(page);
		if (!ret) { /* Success */
			put_page(page);
			list_add_tail(&page->lru, &source);
			move_pages--;
			inc_zone_page_state(page, NR_ISOLATED_ANON +
					    page_is_file_cache(page));

		} else {
#ifdef CONFIG_DEBUG_VM
			printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
			       pfn);
			dump_page(page);
#endif
			put_page(page);
			/* Because we don't hold the big zone->lock, we should
			   check this again here. */
			if (page_count(page)) {
				not_managed++;
				ret = -EBUSY;
				break;
			}
		}
	}
	if (!list_empty(&source)) {
		if (not_managed) {
			putback_lru_pages(&source);
			goto out;
		}

		/*
		 * alloc_migrate_target should be improooooved!!
		 * migrate_pages returns # of failed pages.
		 */
		ret = migrate_pages(&source, alloc_migrate_target, 0,
							true, MIGRATE_SYNC,
							MR_MEMORY_HOTPLUG);
		if (ret)
			putback_lru_pages(&source);
	}
out:
	return ret;
}

/*
 * remove from free_area[] and mark all as Reserved.
 */
static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
			void *data)
{
	__offline_isolated_pages(start, start + nr_pages);
	return 0;
}

static void
offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
	walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
				offline_isolated_pages_cb);
}

/*
 * Check that all pages in the range, recorded as a memory resource, are isolated.
 */
static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
			void *data)
{
	int ret;
	long offlined = *(long *)data;
	ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
	offlined = nr_pages;
	if (!ret)
		*(long *)data += offlined;
	return ret;
}

static long
check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
{
	long offlined = 0;
	int ret;

	ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
			check_pages_isolated_cb);
	if (ret < 0)
		offlined = (long)ret;
	return offlined;
}

#ifdef CONFIG_MOVABLE_NODE
/*
 * When CONFIG_MOVABLE_NODE is set, we permit offlining of a node which doesn't
 * have normal memory.
 */
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
	return true;
}
#else /* CONFIG_MOVABLE_NODE */
/* ensure the node has NORMAL memory if it is still online */
static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt;

	for (zt = 0; zt <= ZONE_NORMAL; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;

	if (present_pages > nr_pages)
		return true;

	present_pages = 0;
	for (; zt <= ZONE_MOVABLE; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;

	/*
	 * we can't offline the last normal memory until all
	 * higher memory is offlined.
	 */
	return present_pages == 0;
}
#endif /* CONFIG_MOVABLE_NODE */

/* check which state of node_states will be changed when offline memory */
static void node_states_check_changes_offline(unsigned long nr_pages,
		struct zone *zone, struct memory_notify *arg)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	unsigned long present_pages = 0;
	enum zone_type zt, zone_last = ZONE_NORMAL;

	/*
	 * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_NORMAL,
	 * set zone_last to ZONE_NORMAL.
	 *
	 * If we don't have HIGHMEM nor movable node,
	 * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
	 * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
	 */
	if (N_MEMORY == N_NORMAL_MEMORY)
		zone_last = ZONE_MOVABLE;

	/*
	 * check whether node_states[N_NORMAL_MEMORY] will be changed.
	 * If the memory to be offline is in a zone of 0...zone_last,
	 * and it is the last present memory, 0...zone_last will
	 * become empty after offline, thus we can determine we will
	 * need to clear the node from node_states[N_NORMAL_MEMORY].
	 */
	for (zt = 0; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
		arg->status_change_nid_normal = zone_to_nid(zone);
	else
		arg->status_change_nid_normal = -1;

#ifdef CONFIG_HIGHMEM
	/*
	 * If we have movable node, node_states[N_HIGH_MEMORY]
	 * contains nodes which have zones of 0...ZONE_HIGHMEM,
	 * set zone_last to ZONE_HIGHMEM.
	 *
	 * If we don't have movable node, node_states[N_NORMAL_MEMORY]
	 * contains nodes which have zones of 0...ZONE_MOVABLE,
	 * set zone_last to ZONE_MOVABLE.
	 */
	zone_last = ZONE_HIGHMEM;
	if (N_MEMORY == N_HIGH_MEMORY)
		zone_last = ZONE_MOVABLE;

	for (; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
		arg->status_change_nid_high = zone_to_nid(zone);
	else
		arg->status_change_nid_high = -1;
#else
	arg->status_change_nid_high = arg->status_change_nid_normal;
#endif

	/*
	 * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
	 */
	zone_last = ZONE_MOVABLE;

	/*
	 * check whether node_states[N_HIGH_MEMORY] will be changed.
	 * If we try to offline the last present @nr_pages from the node,
	 * we can determine we will need to clear the node from
	 * node_states[N_HIGH_MEMORY].
	 */
	for (; zt <= zone_last; zt++)
		present_pages += pgdat->node_zones[zt].present_pages;
	if (nr_pages >= present_pages)
		arg->status_change_nid = zone_to_nid(zone);
	else
		arg->status_change_nid = -1;
}

static void node_states_clear_node(int node, struct memory_notify *arg)
{
	if (arg->status_change_nid_normal >= 0)
		node_clear_state(node, N_NORMAL_MEMORY);

	if ((N_MEMORY != N_NORMAL_MEMORY) &&
	    (arg->status_change_nid_high >= 0))
		node_clear_state(node, N_HIGH_MEMORY);

	if ((N_MEMORY != N_HIGH_MEMORY) &&
	    (arg->status_change_nid >= 0))
		node_clear_state(node, N_MEMORY);
}

static int __ref __offline_pages(unsigned long start_pfn,
		  unsigned long end_pfn, unsigned long timeout)
{
	unsigned long pfn, nr_pages, expire;
	long offlined_pages;
	int ret, drain, retry_max, node;
	struct zone *zone;
	struct memory_notify arg;

	BUG_ON(start_pfn >= end_pfn);
	/* at least, alignment against pageblock is necessary */
	if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
		return -EINVAL;
	if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
		return -EINVAL;
	/* This makes hotplug much easier...and readable.
	   We assume this for now. */
	if (!test_pages_in_a_zone(start_pfn, end_pfn))
		return -EINVAL;

	lock_memory_hotplug();

	zone = page_zone(pfn_to_page(start_pfn));
	node = zone_to_nid(zone);
	nr_pages = end_pfn - start_pfn;

	ret = -EINVAL;
	if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
		goto out;

	/* set above range as isolated */
	ret = start_isolate_page_range(start_pfn, end_pfn,
				       MIGRATE_MOVABLE, true);
	if (ret)
		goto out;

	arg.start_pfn = start_pfn;
	arg.nr_pages = nr_pages;
	node_states_check_changes_offline(nr_pages, zone, &arg);

	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto failed_removal;

	pfn = start_pfn;
	expire = jiffies + timeout;
	drain = 0;
	retry_max = 5;
repeat:
	/* start memory hot removal */
	ret = -EAGAIN;
	if (time_after(jiffies, expire))
		goto failed_removal;
	ret = -EINTR;
	if (signal_pending(current))
		goto failed_removal;
	ret = 0;
	if (drain) {
		lru_add_drain_all();
		cond_resched();
		drain_all_pages();
	}

	pfn = scan_lru_pages(start_pfn, end_pfn);
	if (pfn) { /* We have page on LRU */
		ret = do_migrate_range(pfn, end_pfn);
		if (!ret) {
			drain = 1;
			goto repeat;
		} else {
			if (ret < 0)
				if (--retry_max == 0)
					goto failed_removal;
			yield();
			drain = 1;
			goto repeat;
		}
	}
	/* drain all zones' lru pagevecs; this is asynchronous... */
	lru_add_drain_all();
	yield();
	/* drain pcp pages, this is synchronous. */
	drain_all_pages();
	/* check again */
	offlined_pages = check_pages_isolated(start_pfn, end_pfn);
	if (offlined_pages < 0) {
		ret = -EBUSY;
		goto failed_removal;
	}
	printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
	/* Ok, all of our target is isolated.
	   We cannot do rollback at this point. */
	offline_isolated_pages(start_pfn, end_pfn);
	/* reset pagetype flags and makes migrate type to be MOVABLE */
	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
	/* removal success */
	zone->managed_pages -= offlined_pages;
	zone->present_pages -= offlined_pages;
	zone->zone_pgdat->node_present_pages -= offlined_pages;
	totalram_pages -= offlined_pages;

	init_per_zone_wmark_min();

	if (!populated_zone(zone)) {
		zone_pcp_reset(zone);
		mutex_lock(&zonelists_mutex);
		build_all_zonelists(NULL, NULL);
		mutex_unlock(&zonelists_mutex);
	} else
		zone_pcp_update(zone);

	node_states_clear_node(node, &arg);
	if (arg.status_change_nid >= 0)
		kswapd_stop(node);

	vm_total_pages = nr_free_pagecache_pages();
	writeback_set_ratelimit();

	memory_notify(MEM_OFFLINE, &arg);
	unlock_memory_hotplug();
	return 0;

failed_removal:
	printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n",
	       (unsigned long long) start_pfn << PAGE_SHIFT,
	       ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
	memory_notify(MEM_CANCEL_OFFLINE, &arg);
	/* pushback to free area */
	undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);

out:
	unlock_memory_hotplug();
	return ret;
}

int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
	return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
}
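
/*
 * Illustrative sketch: the memory sysfs code offlines one memory block at a
 * time by passing the block's first pfn and its size in pages, roughly:
 *
 *	start_pfn = section_nr_to_pfn(mem->start_section_nr);
 *	nr_pages = PAGES_PER_SECTION * sections_per_block;
 *	rc = offline_pages(start_pfn, nr_pages);
 */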

/**
 * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
 * @start_pfn: start pfn of the memory range
 * @end_pfn: end pfn of the memory range
 * @arg: argument passed to func
 * @func: callback for each memory section walked
 *
 * This function walks through all present mem sections in range
 * [start_pfn, end_pfn) and call func on each mem section.
 *
 * Returns the return value of func.
 */
static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
		void *arg, int (*func)(struct memory_block *, void *))
{
	struct memory_block *mem = NULL;
	struct mem_section *section;
	unsigned long pfn, section_nr;
	int ret;

	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		section_nr = pfn_to_section_nr(pfn);
		if (!present_section_nr(section_nr))
			continue;

		section = __nr_to_section(section_nr);
		/* same memblock? */
		if (mem)
			if ((section_nr >= mem->start_section_nr) &&
			    (section_nr <= mem->end_section_nr))
				continue;

		mem = find_memory_block_hinted(section, mem);
		if (!mem)
			continue;

		ret = func(mem, arg);
		if (ret) {
			kobject_put(&mem->dev.kobj);
			return ret;
		}
	}

	if (mem)
		kobject_put(&mem->dev.kobj);

	return 0;
}

/**
 * offline_memory_block_cb - callback function for offlining memory block
 * @mem: the memory block to be offlined
 * @arg: buffer to hold error msg
 *
 * Always return 0, and put the error msg in arg if any.
 */
static int offline_memory_block_cb(struct memory_block *mem, void *arg)
{
	int *ret = arg;
	int error = offline_memory_block(mem);

	if (error != 0 && *ret == 0)
		*ret = error;

	return 0;
}

static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
	int ret = !is_memblock_offlined(mem);

	if (unlikely(ret))
		pr_warn("removing memory fails, because memory "
			"[%#010llx-%#010llx] is onlined\n",
			PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)),
			PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1);

	return ret;
}

int __ref remove_memory(u64 start, u64 size)
{
	unsigned long start_pfn, end_pfn;
	int ret = 0;
	int retry = 1;

	start_pfn = PFN_DOWN(start);
	end_pfn = start_pfn + PFN_DOWN(size);

	/*
	 * When CONFIG_MEMCG is on, one memory block may be used by other
	 * blocks to store page cgroup when onlining pages. But we don't know
	 * in what order pages are onlined. So we iterate twice to offline
	 * memory:
	 * 1st iterate: offline every non primary memory block.
	 * 2nd iterate: offline primary (i.e. first added) memory block.
	 */
repeat:
	walk_memory_range(start_pfn, end_pfn, &ret,
			  offline_memory_block_cb);
	if (ret) {
		if (!retry)
			return ret;

		retry = 0;
		ret = 0;
		goto repeat;
	}

	lock_memory_hotplug();

	/*
	 * we have offlined all memory blocks like this:
	 *   1. lock memory hotplug
	 *   2. offline a memory block
	 *   3. unlock memory hotplug
	 *
	 * repeat steps 1-3 to offline each memory block. All memory blocks
	 * must be offlined before removing memory. But we don't hold the
	 * lock in the whole operation. So we should check whether all
	 * memory blocks are offlined.
	 */

	ret = walk_memory_range(start_pfn, end_pfn, NULL,
				is_memblock_offlined_cb);
	if (ret) {
		unlock_memory_hotplug();
		return ret;
	}

	/* remove memmap entry */
	firmware_map_remove(start, start + size, "System RAM");

	arch_remove_memory(start, size);

	unlock_memory_hotplug();

	return 0;
}
#else
int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
	return -EINVAL;
}
int remove_memory(u64 start, u64 size)
{
	return -EINVAL;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
EXPORT_SYMBOL_GPL(remove_memory);