/*
 * linux/mm/compaction.c
 *
 * Memory compaction for the reduction of external fragmentation. Note that
 * this heavily depends upon page migration to do all the real heavy
 * lifting
 *
 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
 */
#include <linux/cpu.h>
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
#include <linux/sched/signal.h>
#include <linux/backing-dev.h>
#include <linux/sysctl.h>
#include <linux/sysfs.h>
#include <linux/page-isolation.h>
#include <linux/kasan.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/page_owner.h>
#include "internal.h"

#ifdef CONFIG_COMPACTION
static inline void count_compact_event(enum vm_event_item item)
{
	count_vm_event(item);
}

static inline void count_compact_events(enum vm_event_item item, long delta)
{
	count_vm_events(item, delta);
}
#else
#define count_compact_event(item) do { } while (0)
#define count_compact_events(item, delta) do { } while (0)
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA

#define CREATE_TRACE_POINTS
#include <trace/events/compaction.h>

#define block_start_pfn(pfn, order)	round_down(pfn, 1UL << (order))
#define block_end_pfn(pfn, order)	ALIGN((pfn) + 1, 1UL << (order))
#define pageblock_start_pfn(pfn)	block_start_pfn(pfn, pageblock_order)
#define pageblock_end_pfn(pfn)		block_end_pfn(pfn, pageblock_order)
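
/*
 * Worked example (illustrative only, assuming 4K pages and
 * pageblock_order == 9, i.e. 2MB pageblocks as on x86-64 with THP):
 * block_start_pfn(1000, 9) == 512 and block_end_pfn(1000, 9) == 1024,
 * so pfn 1000 belongs to the pageblock covering pfns [512, 1024).
 */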

static unsigned long release_freepages(struct list_head *freelist)
{
	struct page *page, *next;
	unsigned long high_pfn = 0;

	list_for_each_entry_safe(page, next, freelist, lru) {
		unsigned long pfn = page_to_pfn(page);
		list_del(&page->lru);
		__free_page(page);
		if (pfn > high_pfn)
			high_pfn = pfn;
	}

	return high_pfn;
}

static void map_pages(struct list_head *list)
{
	unsigned int i, order, nr_pages;
	struct page *page, *next;
	LIST_HEAD(tmp_list);

	list_for_each_entry_safe(page, next, list, lru) {
		list_del(&page->lru);

		order = page_private(page);
		nr_pages = 1 << order;

		post_alloc_hook(page, order, __GFP_MOVABLE);
		if (order)
			split_page(page, order);

		for (i = 0; i < nr_pages; i++) {
			list_add(&page->lru, &tmp_list);
			page++;
		}
	}

	list_splice(&tmp_list, list);
}

#ifdef CONFIG_COMPACTION

int PageMovable(struct page *page)
{
	struct address_space *mapping;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	if (!__PageMovable(page))
		return 0;

	mapping = page_mapping(page);
	if (mapping && mapping->a_ops && mapping->a_ops->isolate_page)
		return 1;

	return 0;
}
EXPORT_SYMBOL(PageMovable);

void __SetPageMovable(struct page *page, struct address_space *mapping)
{
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE((unsigned long)mapping & PAGE_MAPPING_MOVABLE, page);
	page->mapping = (void *)((unsigned long)mapping | PAGE_MAPPING_MOVABLE);
}
EXPORT_SYMBOL(__SetPageMovable);

void __ClearPageMovable(struct page *page)
{
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageMovable(page), page);
	/*
	 * Clear registered address_space val with keeping PAGE_MAPPING_MOVABLE
	 * flag so that VM can catch up released page by driver after isolation.
	 * With it, VM migration doesn't try to put it back.
	 */
	page->mapping = (void *)((unsigned long)page->mapping &
				PAGE_MAPPING_MOVABLE);
}
EXPORT_SYMBOL(__ClearPageMovable);

/* Do not skip compaction more than 64 times */
#define COMPACT_MAX_DEFER_SHIFT 6

/*
 * Compaction is deferred when compaction fails to result in a page
 * allocation success. 1 << compact_defer_shift compactions are skipped up
 * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
 */
void defer_compaction(struct zone *zone, int order)
{
	zone->compact_considered = 0;
	zone->compact_defer_shift++;

	if (order < zone->compact_order_failed)
		zone->compact_order_failed = order;

	if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
		zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;

	trace_mm_compaction_defer_compaction(zone, order);
}

/* Returns true if compaction should be skipped this time */
bool compaction_deferred(struct zone *zone, int order)
{
	unsigned long defer_limit = 1UL << zone->compact_defer_shift;

	if (order < zone->compact_order_failed)
		return false;

	/* Avoid possible overflow */
	if (++zone->compact_considered > defer_limit)
		zone->compact_considered = defer_limit;

	if (zone->compact_considered >= defer_limit)
		return false;

	trace_mm_compaction_deferred(zone, order);

	return true;
}
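
/*
 * Illustration of the back-off (not a code path): each defer_compaction()
 * call bumps compact_defer_shift, so the defer_limit used above grows
 * 2, 4, 8, ... up to 64 (1 << COMPACT_MAX_DEFER_SHIFT), and
 * compaction_deferred() keeps returning true until compact_considered
 * reaches that limit. Successive failures therefore roughly double the
 * number of allocation attempts that skip compaction, until a success
 * (via compaction_defer_reset()) clears the counters.
 */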

/*
 * Update defer tracking counters after successful compaction of given order,
 * which means an allocation either succeeded (alloc_success == true) or is
 * expected to succeed.
 */
void compaction_defer_reset(struct zone *zone, int order,
		bool alloc_success)
{
	if (alloc_success) {
		zone->compact_considered = 0;
		zone->compact_defer_shift = 0;
	}
	if (order >= zone->compact_order_failed)
		zone->compact_order_failed = order + 1;

	trace_mm_compaction_defer_reset(zone, order);
}

/* Returns true if restarting compaction after many failures */
bool compaction_restarting(struct zone *zone, int order)
{
	if (order < zone->compact_order_failed)
		return false;

	return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT &&
		zone->compact_considered >= 1UL << zone->compact_defer_shift;
}

/* Returns true if the pageblock should be scanned for pages to isolate. */
static inline bool isolation_suitable(struct compact_control *cc,
					struct page *page)
{
	if (cc->ignore_skip_hint)
		return true;

	return !get_pageblock_skip(page);
}

static void reset_cached_positions(struct zone *zone)
{
	zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
	zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
	zone->compact_cached_free_pfn =
				pageblock_start_pfn(zone_end_pfn(zone) - 1);
}

/*
 * This function is called to clear all cached information on pageblocks that
 * should be skipped for page isolation when the migrate and free page scanner
 * meet.
 */
static void __reset_isolation_suitable(struct zone *zone)
{
	unsigned long start_pfn = zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	unsigned long pfn;

	zone->compact_blockskip_flush = false;

	/* Walk the zone and mark every pageblock as suitable for isolation */
	for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
		struct page *page;

		cond_resched();

		if (!pfn_valid(pfn))
			continue;

		page = pfn_to_page(pfn);
		if (zone != page_zone(page))
			continue;

		clear_pageblock_skip(page);
	}

	reset_cached_positions(zone);
}

void reset_isolation_suitable(pg_data_t *pgdat)
{
	int zoneid;

	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
		struct zone *zone = &pgdat->node_zones[zoneid];
		if (!populated_zone(zone))
			continue;

		/* Only flush if a full compaction finished recently */
		if (zone->compact_blockskip_flush)
			__reset_isolation_suitable(zone);
	}
}

/*
 * If no pages were isolated then mark this pageblock to be skipped in the
 * future. The information is later cleared by __reset_isolation_suitable().
 */
static void update_pageblock_skip(struct compact_control *cc,
			struct page *page, unsigned long nr_isolated,
			bool migrate_scanner)
{
	struct zone *zone = cc->zone;
	unsigned long pfn;

	if (cc->ignore_skip_hint)
		return;

	if (!page)
		return;

	if (nr_isolated)
		return;

	set_pageblock_skip(page);

	pfn = page_to_pfn(page);

	/* Update where async and sync compaction should restart */
	if (migrate_scanner) {
		if (pfn > zone->compact_cached_migrate_pfn[0])
			zone->compact_cached_migrate_pfn[0] = pfn;
		if (cc->mode != MIGRATE_ASYNC &&
		    pfn > zone->compact_cached_migrate_pfn[1])
			zone->compact_cached_migrate_pfn[1] = pfn;
	} else {
		if (pfn < zone->compact_cached_free_pfn)
			zone->compact_cached_free_pfn = pfn;
	}
}
#else
static inline bool isolation_suitable(struct compact_control *cc,
					struct page *page)
{
	return true;
}

static void update_pageblock_skip(struct compact_control *cc,
			struct page *page, unsigned long nr_isolated,
			bool migrate_scanner)
{
}
#endif /* CONFIG_COMPACTION */

/*
 * Compaction requires the taking of some coarse locks that are potentially
 * very heavily contended. For async compaction, back out if the lock cannot
 * be taken immediately. For sync compaction, spin on the lock if needed.
 *
 * Returns true if the lock is held
 * Returns false if the lock is not held and compaction should abort
 */
static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
						struct compact_control *cc)
{
	if (cc->mode == MIGRATE_ASYNC) {
		if (!spin_trylock_irqsave(lock, *flags)) {
			cc->contended = true;
			return false;
		}
	} else {
		spin_lock_irqsave(lock, *flags);
	}

	return true;
}

/*
 * Compaction requires the taking of some coarse locks that are potentially
 * very heavily contended. The lock should be periodically unlocked to avoid
 * having disabled IRQs for a long time, even when there is nobody waiting on
 * the lock. It might also be that allowing the IRQs will result in
 * need_resched() becoming true. If scheduling is needed, async compaction
 * aborts. Sync compaction schedules.
 * Either compaction type will also abort if a fatal signal is pending.
 * In either case if the lock was locked, it is dropped and not regained.
 *
 * Returns true if compaction should abort due to fatal signal pending, or
 *		async compaction due to need_resched()
 * Returns false when compaction can continue (sync compaction might have
 *		scheduled)
 */
static bool compact_unlock_should_abort(spinlock_t *lock,
		unsigned long flags, bool *locked, struct compact_control *cc)
{
	if (*locked) {
		spin_unlock_irqrestore(lock, flags);
		*locked = false;
	}

	if (fatal_signal_pending(current)) {
		cc->contended = true;
		return true;
	}

	if (need_resched()) {
		if (cc->mode == MIGRATE_ASYNC) {
			cc->contended = true;
			return true;
		}
		cond_resched();
	}

	return false;
}

/*
 * Aside from avoiding lock contention, compaction also periodically checks
 * need_resched() and either schedules in sync compaction or aborts async
 * compaction. This is similar to what compact_unlock_should_abort() does, but
 * is used where no lock is concerned.
 *
 * Returns false when no scheduling was needed, or sync compaction scheduled.
 * Returns true when async compaction should abort.
 */
static inline bool compact_should_abort(struct compact_control *cc)
{
	/* async compaction aborts if contended */
	if (need_resched()) {
		if (cc->mode == MIGRATE_ASYNC) {
			cc->contended = true;
			return true;
		}

		cond_resched();
	}

	return false;
}

/*
 * Isolate free pages onto a private freelist. If @strict is true, will abort
 * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
 * (even though it may still end up isolating some pages).
 */
static unsigned long isolate_freepages_block(struct compact_control *cc,
				unsigned long *start_pfn,
				unsigned long end_pfn,
				struct list_head *freelist,
				bool strict)
{
	int nr_scanned = 0, total_isolated = 0;
	struct page *cursor, *valid_page = NULL;
	unsigned long flags = 0;
	bool locked = false;
	unsigned long blockpfn = *start_pfn;
	unsigned int order;

	cursor = pfn_to_page(blockpfn);

	/* Isolate free pages. */
	for (; blockpfn < end_pfn; blockpfn++, cursor++) {
		int isolated;
		struct page *page = cursor;

		/*
		 * Periodically drop the lock (if held) regardless of its
		 * contention, to give chance to IRQs. Abort if fatal signal
		 * pending or async compaction detects need_resched()
		 */
		if (!(blockpfn % SWAP_CLUSTER_MAX)
		    && compact_unlock_should_abort(&cc->zone->lock, flags,
								&locked, cc))
			break;

		nr_scanned++;
		if (!pfn_valid_within(blockpfn))
			goto isolate_fail;

		if (!valid_page)
			valid_page = page;

		/*
		 * For compound pages such as THP and hugetlbfs, we can save
		 * potentially a lot of iterations if we skip them at once.
		 * The check is racy, but we can consider only valid values
		 * and the only danger is skipping too much.
		 */
		if (PageCompound(page)) {
			unsigned int comp_order = compound_order(page);

			if (likely(comp_order < MAX_ORDER)) {
				blockpfn += (1UL << comp_order) - 1;
				cursor += (1UL << comp_order) - 1;
			}

			goto isolate_fail;
		}

		if (!PageBuddy(page))
			goto isolate_fail;

		/*
		 * If we already hold the lock, we can skip some rechecking.
		 * Note that if we hold the lock now, checked_pageblock was
		 * already set in some previous iteration (or strict is true),
		 * so it is correct to skip the suitable migration target
		 * recheck as well.
		 */
		if (!locked) {
			/*
			 * The zone lock must be held to isolate freepages.
			 * Unfortunately this is a very coarse lock and can be
			 * heavily contended if there are parallel allocations
			 * or parallel compactions. For async compaction do not
			 * spin on the lock and we acquire the lock as late as
			 * possible.
			 */
			locked = compact_trylock_irqsave(&cc->zone->lock,
								&flags, cc);
			if (!locked)
				break;

			/* Recheck this is a buddy page under lock */
			if (!PageBuddy(page))
				goto isolate_fail;
		}

		/* Found a free page, will break it into order-0 pages */
		order = page_order(page);
		isolated = __isolate_free_page(page, order);
		if (!isolated)
			break;
		set_page_private(page, order);

		total_isolated += isolated;
		cc->nr_freepages += isolated;
		list_add_tail(&page->lru, freelist);

		if (!strict && cc->nr_migratepages <= cc->nr_freepages) {
			blockpfn += isolated;
			break;
		}
		/* Advance to the end of split page */
		blockpfn += isolated - 1;
		cursor += isolated - 1;
		continue;

isolate_fail:
		if (strict)
			break;
		else
			continue;

	}

	if (locked)
		spin_unlock_irqrestore(&cc->zone->lock, flags);

	/*
	 * There is a tiny chance that we have read bogus compound_order(),
	 * so be careful to not go outside of the pageblock.
	 */
	if (unlikely(blockpfn > end_pfn))
		blockpfn = end_pfn;

	trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
					nr_scanned, total_isolated);

	/* Record how far we have got within the block */
	*start_pfn = blockpfn;

	/*
	 * If strict isolation is requested by CMA then check that all the
	 * pages requested were isolated. If there were any failures, 0 is
	 * returned and CMA will fail.
	 */
	if (strict && blockpfn < end_pfn)
		total_isolated = 0;

	/* Update the pageblock-skip if the whole pageblock was scanned */
	if (blockpfn == end_pfn)
		update_pageblock_skip(cc, valid_page, total_isolated, false);

	cc->total_free_scanned += nr_scanned;
	if (total_isolated)
		count_compact_events(COMPACTISOLATED, total_isolated);
	return total_isolated;
}

/**
 * isolate_freepages_range() - isolate free pages.
 * @start_pfn: The first PFN to start isolating.
 * @end_pfn:   The one-past-last PFN.
 *
 * Non-free pages, invalid PFNs, or zone boundaries within the
 * [start_pfn, end_pfn) range are considered errors, cause function to
 * undo its actions and return zero.
 *
 * Otherwise, function returns one-past-the-last PFN of isolated page
 * (which may be greater than end_pfn if end fell in the middle of
 * a free page).
 */
unsigned long
isolate_freepages_range(struct compact_control *cc,
			unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
	LIST_HEAD(freelist);

	pfn = start_pfn;
	block_start_pfn = pageblock_start_pfn(pfn);
	if (block_start_pfn < cc->zone->zone_start_pfn)
		block_start_pfn = cc->zone->zone_start_pfn;
	block_end_pfn = pageblock_end_pfn(pfn);

	for (; pfn < end_pfn; pfn += isolated,
				block_start_pfn = block_end_pfn,
				block_end_pfn += pageblock_nr_pages) {
		/* Protect pfn from changing by isolate_freepages_block */
		unsigned long isolate_start_pfn = pfn;

		block_end_pfn = min(block_end_pfn, end_pfn);

		/*
		 * pfn could pass the block_end_pfn if isolated freepage
		 * is more than pageblock order. In this case, we adjust
		 * scanning range to right one.
		 */
		if (pfn >= block_end_pfn) {
			block_start_pfn = pageblock_start_pfn(pfn);
			block_end_pfn = pageblock_end_pfn(pfn);
			block_end_pfn = min(block_end_pfn, end_pfn);
		}

		if (!pageblock_pfn_to_page(block_start_pfn,
					block_end_pfn, cc->zone))
			break;

		isolated = isolate_freepages_block(cc, &isolate_start_pfn,
						block_end_pfn, &freelist, true);

		/*
		 * In strict mode, isolate_freepages_block() returns 0 if
		 * there are any holes in the block (ie. invalid PFNs or
		 * non-free pages).
		 */
		if (!isolated)
			break;

		/*
		 * If we managed to isolate pages, it is always (1 << n) *
		 * pageblock_nr_pages for some non-negative n.  (Max order
		 * page may span two pageblocks).
		 */
	}

	/* __isolate_free_page() does not map the pages */
	map_pages(&freelist);

	if (pfn < end_pfn) {
		/* Loop terminated early, cleanup. */
		release_freepages(&freelist);
		return 0;
	}

	/* We don't use freelists for anything. */
	return pfn;
}

/* Similar to reclaim, but different enough that they don't share logic */
static bool too_many_isolated(struct zone *zone)
{
	unsigned long active, inactive, isolated;

	inactive = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) +
			node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON);
	active = node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE) +
			node_page_state(zone->zone_pgdat, NR_ACTIVE_ANON);
	isolated = node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE) +
			node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON);

	return isolated > (inactive + active) / 2;
}
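
/*
 * Example of the threshold (illustrative numbers): on a node with
 * 100000 pages on the active + inactive LRU lists, the caller throttles
 * once more than 50000 pages sit isolated off the LRU, i.e. when
 * isolated > (inactive + active) / 2.
 */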

/**
 * isolate_migratepages_block() - isolate all migrate-able pages within
 *				  a single pageblock
 * @cc:		Compaction control structure.
 * @low_pfn:	The first PFN to isolate
 * @end_pfn:	The one-past-the-last PFN to isolate, within same pageblock
 * @isolate_mode: Isolation mode to be used.
 *
 * Isolate all pages that can be migrated from the range specified by
 * [low_pfn, end_pfn). The range is expected to be within same pageblock.
 * Returns zero if there is a fatal signal pending, otherwise PFN of the
 * first page that was not scanned (which may be less than, equal to or greater
 * than end_pfn).
 *
 * The pages are isolated on cc->migratepages list (not required to be empty),
 * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
 * is neither read nor updated.
 */
static unsigned long
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
			unsigned long end_pfn, isolate_mode_t isolate_mode)
{
	struct zone *zone = cc->zone;
	unsigned long nr_scanned = 0, nr_isolated = 0;
	struct lruvec *lruvec;
	unsigned long flags = 0;
	bool locked = false;
	struct page *page = NULL, *valid_page = NULL;
	unsigned long start_pfn = low_pfn;
	bool skip_on_failure = false;
	unsigned long next_skip_pfn = 0;

	/*
	 * Ensure that there are not too many pages isolated from the LRU
	 * list by either parallel reclaimers or compaction. If there are,
	 * delay for some time until fewer pages are isolated
	 */
	while (unlikely(too_many_isolated(zone))) {
		/* async migration should just abort */
		if (cc->mode == MIGRATE_ASYNC)
			return 0;

		congestion_wait(BLK_RW_ASYNC, HZ/10);

		if (fatal_signal_pending(current))
			return 0;
	}

	if (compact_should_abort(cc))
		return 0;

	if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
		skip_on_failure = true;
		next_skip_pfn = block_end_pfn(low_pfn, cc->order);
	}

	/* Time to isolate some pages for migration */
	for (; low_pfn < end_pfn; low_pfn++) {

		if (skip_on_failure && low_pfn >= next_skip_pfn) {
			/*
			 * We have isolated all migration candidates in the
			 * previous order-aligned block, and did not skip it due
			 * to failure. We should migrate the pages now and
			 * hopefully succeed compaction.
			 */
			if (nr_isolated)
				break;

			/*
			 * We failed to isolate in the previous order-aligned
			 * block. Set the new boundary to the end of the
			 * current block. Note we can't simply increase
			 * next_skip_pfn by 1 << order, as low_pfn might have
			 * been incremented by a higher number due to skipping
			 * a compound or a high-order buddy page in the
			 * previous loop iteration.
			 */
			next_skip_pfn = block_end_pfn(low_pfn, cc->order);
		}

		/*
		 * Periodically drop the lock (if held) regardless of its
		 * contention, to give chance to IRQs. Abort async compaction
		 * if contended.
		 */
		if (!(low_pfn % SWAP_CLUSTER_MAX)
		    && compact_unlock_should_abort(zone_lru_lock(zone), flags,
								&locked, cc))
			break;

		if (!pfn_valid_within(low_pfn))
			goto isolate_fail;
		nr_scanned++;

		page = pfn_to_page(low_pfn);

		if (!valid_page)
			valid_page = page;

		/*
		 * Skip if free. We read page order here without zone lock
		 * which is generally unsafe, but the race window is small and
		 * the worst thing that can happen is that we skip some
		 * potential isolation targets.
		 */
		if (PageBuddy(page)) {
			unsigned long freepage_order = page_order_unsafe(page);

			/*
			 * Without lock, we cannot be sure that what we got is
			 * a valid page order. Consider only values in the
			 * valid order range to prevent low_pfn overflow.
			 */
			if (freepage_order > 0 && freepage_order < MAX_ORDER)
				low_pfn += (1UL << freepage_order) - 1;
			continue;
		}

		/*
		 * Regardless of being on LRU, compound pages such as THP and
		 * hugetlbfs are not to be compacted. We can potentially save
		 * a lot of iterations if we skip them at once. The check is
		 * racy, but we can consider only valid values and the only
		 * danger is skipping too much.
		 */
		if (PageCompound(page)) {
			unsigned int comp_order = compound_order(page);

			if (likely(comp_order < MAX_ORDER))
				low_pfn += (1UL << comp_order) - 1;

			goto isolate_fail;
		}

		/*
		 * Check may be lockless but that's ok as we recheck later.
		 * It's possible to migrate LRU and non-lru movable pages.
		 * Skip any other type of page
		 */
		if (!PageLRU(page)) {
			/*
			 * __PageMovable can return false positive so we need
			 * to verify it under page_lock.
			 */
			if (unlikely(__PageMovable(page)) &&
					!PageIsolated(page)) {
				if (locked) {
					spin_unlock_irqrestore(zone_lru_lock(zone),
									flags);
					locked = false;
				}

				if (!isolate_movable_page(page, isolate_mode))
					goto isolate_success;
			}

			goto isolate_fail;
		}

		/*
		 * Migration will fail if an anonymous page is pinned in memory,
		 * so avoid taking lru_lock and isolating it unnecessarily in an
		 * admittedly racy check.
		 */
		if (!page_mapping(page) &&
		    page_count(page) > page_mapcount(page))
			goto isolate_fail;

		/*
		 * Only allow to migrate anonymous pages in GFP_NOFS context
		 * because those do not depend on fs locks.
		 */
		if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page))
			goto isolate_fail;

		/* If we already hold the lock, we can skip some rechecking */
		if (!locked) {
			locked = compact_trylock_irqsave(zone_lru_lock(zone),
								&flags, cc);
			if (!locked)
				break;

			/* Recheck PageLRU and PageCompound under lock */
			if (!PageLRU(page))
				goto isolate_fail;

			/*
			 * Page became compound since the non-locked check,
			 * and it's on LRU. It can only be a THP so the order
			 * is safe to read and it's 0 for tail pages.
			 */
			if (unlikely(PageCompound(page))) {
				low_pfn += (1UL << compound_order(page)) - 1;
				goto isolate_fail;
			}
		}

		lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);

		/* Try isolate the page */
		if (__isolate_lru_page(page, isolate_mode) != 0)
			goto isolate_fail;

		VM_BUG_ON_PAGE(PageCompound(page), page);

		/* Successfully isolated */
		del_page_from_lru_list(page, lruvec, page_lru(page));
		inc_node_page_state(page,
				NR_ISOLATED_ANON + page_is_file_cache(page));

isolate_success:
		list_add(&page->lru, &cc->migratepages);
		cc->nr_migratepages++;
		nr_isolated++;

		/*
		 * Record where we could have freed pages by migration and not
		 * yet flushed them to buddy allocator.
		 * - this is the lowest page that was isolated and likely be
		 * then freed by migration.
		 */
		if (!cc->last_migrated_pfn)
			cc->last_migrated_pfn = low_pfn;

		/* Avoid isolating too much */
		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
			++low_pfn;
			break;
		}

		continue;
isolate_fail:
		if (!skip_on_failure)
			continue;

		/*
		 * We have isolated some pages, but then failed. Release them
		 * instead of migrating, as we cannot form the cc->order buddy
		 * page anyway.
		 */
		if (nr_isolated) {
			if (locked) {
				spin_unlock_irqrestore(zone_lru_lock(zone), flags);
				locked = false;
			}
			putback_movable_pages(&cc->migratepages);
			cc->nr_migratepages = 0;
			cc->last_migrated_pfn = 0;
			nr_isolated = 0;
		}

		if (low_pfn < next_skip_pfn) {
			low_pfn = next_skip_pfn - 1;
			/*
			 * The check near the loop beginning would have updated
			 * next_skip_pfn too, but this is a bit simpler.
			 */
			next_skip_pfn += 1UL << cc->order;
		}
	}

	/*
	 * The PageBuddy() check could have potentially brought us outside
	 * the range to be scanned.
	 */
	if (unlikely(low_pfn > end_pfn))
		low_pfn = end_pfn;

	if (locked)
		spin_unlock_irqrestore(zone_lru_lock(zone), flags);

	/*
	 * Update the pageblock-skip information and cached scanner pfn,
	 * if the whole pageblock was scanned without isolating any page.
	 */
	if (low_pfn == end_pfn)
		update_pageblock_skip(cc, valid_page, nr_isolated, true);

	trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
						nr_scanned, nr_isolated);

	cc->total_migrate_scanned += nr_scanned;
	if (nr_isolated)
		count_compact_events(COMPACTISOLATED, nr_isolated);

	return low_pfn;
}

/**
 * isolate_migratepages_range() - isolate migrate-able pages in a PFN range
 * @cc:        Compaction control structure.
 * @start_pfn: The first PFN to start isolating.
 * @end_pfn:   The one-past-last PFN.
 *
 * Returns zero if isolation fails fatally due to e.g. pending signal.
 * Otherwise, function returns one-past-the-last PFN of isolated page
 * (which may be greater than end_pfn if end fell in the middle of a THP page).
 */
unsigned long
isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
							unsigned long end_pfn)
{
	unsigned long pfn, block_start_pfn, block_end_pfn;

	/* Scan block by block. First and last block may be incomplete */
	pfn = start_pfn;
	block_start_pfn = pageblock_start_pfn(pfn);
	if (block_start_pfn < cc->zone->zone_start_pfn)
		block_start_pfn = cc->zone->zone_start_pfn;
	block_end_pfn = pageblock_end_pfn(pfn);

	for (; pfn < end_pfn; pfn = block_end_pfn,
				block_start_pfn = block_end_pfn,
				block_end_pfn += pageblock_nr_pages) {

		block_end_pfn = min(block_end_pfn, end_pfn);

		if (!pageblock_pfn_to_page(block_start_pfn,
					block_end_pfn, cc->zone))
			continue;

		pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
							ISOLATE_UNEVICTABLE);

		if (!pfn)
			break;

		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
			break;
	}

	return pfn;
}

#endif /* CONFIG_COMPACTION || CONFIG_CMA */
#ifdef CONFIG_COMPACTION

static bool suitable_migration_source(struct compact_control *cc,
							struct page *page)
{
	int block_mt;

	if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction)
		return true;

	block_mt = get_pageblock_migratetype(page);

	if (cc->migratetype == MIGRATE_MOVABLE)
		return is_migrate_movable(block_mt);
	else
		return block_mt == cc->migratetype;
}

/* Returns true if the page is within a block suitable for migration to */
static bool suitable_migration_target(struct compact_control *cc,
							struct page *page)
{
	/* If the page is a large free page, then disallow migration */
	if (PageBuddy(page)) {
		/*
		 * We are checking page_order without zone->lock taken. But
		 * the only small danger is that we skip a potentially suitable
		 * pageblock, so it's not worth to check order for valid range.
		 */
		if (page_order_unsafe(page) >= pageblock_order)
			return false;
	}

	if (cc->ignore_block_suitable)
		return true;

	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
	if (is_migrate_movable(get_pageblock_migratetype(page)))
		return true;

	/* Otherwise skip the block */
	return false;
}

/*
 * Test whether the free scanner has reached the same or lower pageblock than
 * the migration scanner, and compaction should thus terminate.
 */
static inline bool compact_scanners_met(struct compact_control *cc)
{
	return (cc->free_pfn >> pageblock_order)
		<= (cc->migrate_pfn >> pageblock_order);
}

/*
 * Based on information in the current compact_control, find blocks
 * suitable for isolating free pages from and then isolate them.
 */
static void isolate_freepages(struct compact_control *cc)
{
	struct zone *zone = cc->zone;
	struct page *page;
	unsigned long block_start_pfn;	/* start of current pageblock */
	unsigned long isolate_start_pfn; /* exact pfn we start at */
	unsigned long block_end_pfn;	/* end of current pageblock */
	unsigned long low_pfn;	     /* lowest pfn scanner is able to scan */
	struct list_head *freelist = &cc->freepages;

	/*
	 * Initialise the free scanner. The starting point is where we last
	 * successfully isolated from, zone-cached value, or the end of the
	 * zone when isolating for the first time. For looping we also need
	 * this pfn aligned down to the pageblock boundary, because we do
	 * block_start_pfn -= pageblock_nr_pages in the for loop.
	 * For ending point, take care when isolating in last pageblock of a
	 * zone which ends in the middle of a pageblock.
	 * The low boundary is the end of the pageblock the migration scanner
	 * is using.
	 */
	isolate_start_pfn = cc->free_pfn;
	block_start_pfn = pageblock_start_pfn(cc->free_pfn);
	block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
						zone_end_pfn(zone));
	low_pfn = pageblock_end_pfn(cc->migrate_pfn);

	/*
	 * Isolate free pages until enough are available to migrate the
	 * pages on cc->migratepages. We stop searching if the migrate
	 * and free page scanners meet or enough free pages are isolated.
	 */
	for (; block_start_pfn >= low_pfn;
				block_end_pfn = block_start_pfn,
				block_start_pfn -= pageblock_nr_pages,
				isolate_start_pfn = block_start_pfn) {
		/*
		 * This can iterate a massively long zone without finding any
		 * suitable migration targets, so periodically check if we need
		 * to schedule, or even abort async compaction.
		 */
		if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
						&& compact_should_abort(cc))
			break;

		page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
									zone);
		if (!page)
			continue;

		/* Check the block is suitable for migration */
		if (!suitable_migration_target(cc, page))
			continue;

		/* If isolation recently failed, do not retry */
		if (!isolation_suitable(cc, page))
			continue;

		/* Found a block suitable for isolating free pages from. */
		isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn,
					freelist, false);

		/*
		 * If we isolated enough freepages, or aborted due to lock
		 * contention, terminate.
		 */
		if ((cc->nr_freepages >= cc->nr_migratepages)
							|| cc->contended) {
			if (isolate_start_pfn >= block_end_pfn) {
				/*
				 * Restart at previous pageblock if more
				 * freepages can be isolated next time.
				 */
				isolate_start_pfn =
					block_start_pfn - pageblock_nr_pages;
			}
			break;
		} else if (isolate_start_pfn < block_end_pfn) {
			/*
			 * If isolation failed early, do not continue
			 * needlessly.
			 */
			break;
		}
	}

	/* __isolate_free_page() does not map the pages */
	map_pages(freelist);

	/*
	 * Record where the free scanner will restart next time. Either we
	 * broke from the loop and set isolate_start_pfn based on the last
	 * call to isolate_freepages_block(), or we met the migration scanner
	 * and the loop terminated due to isolate_start_pfn < low_pfn
	 */
	cc->free_pfn = isolate_start_pfn;
}

/*
 * This is a migrate-callback that "allocates" freepages by taking pages
 * from the isolated freelists in the block we are migrating to.
 */
static struct page *compaction_alloc(struct page *migratepage,
					unsigned long data,
					int **result)
{
	struct compact_control *cc = (struct compact_control *)data;
	struct page *freepage;

	/*
	 * Isolate free pages if necessary, and if we are not aborting due to
	 * contention.
	 */
	if (list_empty(&cc->freepages)) {
		if (!cc->contended)
			isolate_freepages(cc);

		if (list_empty(&cc->freepages))
			return NULL;
	}

	freepage = list_entry(cc->freepages.next, struct page, lru);
	list_del(&freepage->lru);
	cc->nr_freepages--;

	return freepage;
}

/*
 * This is a migrate-callback that "frees" freepages back to the isolated
 * freelist.  All pages on the freelist are from the same zone, so there is no
 * special handling needed for NUMA.
 */
static void compaction_free(struct page *page, unsigned long data)
{
	struct compact_control *cc = (struct compact_control *)data;

	list_add(&page->lru, &cc->freepages);
	cc->nr_freepages++;
}

/* possible outcome of isolate_migratepages */
typedef enum {
	ISOLATE_ABORT,		/* Abort compaction now */
	ISOLATE_NONE,		/* No pages isolated, continue scanning */
	ISOLATE_SUCCESS,	/* Pages isolated, migrate */
} isolate_migrate_t;

/*
 * Allow userspace to control policy on scanning the unevictable LRU for
 * compactable pages.
 */
int sysctl_compact_unevictable_allowed __read_mostly = 1;
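
/*
 * Example: "echo 0 > /proc/sys/vm/compact_unevictable_allowed" makes the
 * migration scanner leave unevictable (e.g. mlocked) pages alone instead
 * of isolating and migrating them.
 */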

/*
 * Isolate all pages that can be migrated from the first suitable block,
 * starting at the block pointed to by the migrate scanner pfn within
 * compact_control.
 */
static isolate_migrate_t isolate_migratepages(struct zone *zone,
					struct compact_control *cc)
{
	unsigned long block_start_pfn;
	unsigned long block_end_pfn;
	unsigned long low_pfn;
	struct page *page;
	const isolate_mode_t isolate_mode =
		(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
		(cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0);

	/*
	 * Start at where we last stopped, or beginning of the zone as
	 * initialized by compact_zone()
	 */
	low_pfn = cc->migrate_pfn;
	block_start_pfn = pageblock_start_pfn(low_pfn);
	if (block_start_pfn < zone->zone_start_pfn)
		block_start_pfn = zone->zone_start_pfn;

	/* Only scan within a pageblock boundary */
	block_end_pfn = pageblock_end_pfn(low_pfn);

	/*
	 * Iterate over whole pageblocks until we find the first suitable.
	 * Do not cross the free scanner.
	 */
	for (; block_end_pfn <= cc->free_pfn;
			low_pfn = block_end_pfn,
			block_start_pfn = block_end_pfn,
			block_end_pfn += pageblock_nr_pages) {

		/*
		 * This can potentially iterate a massively long zone with
		 * many pageblocks unsuitable, so periodically check if we
		 * need to schedule, or even abort async compaction.
		 */
		if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
						&& compact_should_abort(cc))
			break;

		page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
									zone);
		if (!page)
			continue;

		/* If isolation recently failed, do not retry */
		if (!isolation_suitable(cc, page))
			continue;

		/*
		 * For async compaction, also only scan in MOVABLE blocks.
		 * Async compaction is optimistic to see if the minimum amount
		 * of work satisfies the allocation.
		 */
		if (!suitable_migration_source(cc, page))
			continue;

		/* Perform the isolation */
		low_pfn = isolate_migratepages_block(cc, low_pfn,
						block_end_pfn, isolate_mode);

		if (!low_pfn || cc->contended)
			return ISOLATE_ABORT;

		/*
		 * Either we isolated something and proceed with migration. Or
		 * we failed and compact_zone should decide if we should
		 * continue or not.
		 */
		break;
	}

	/* Record where migration scanner will be restarted. */
	cc->migrate_pfn = low_pfn;

	return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}

/*
 * order == -1 is expected when compacting via
 * /proc/sys/vm/compact_memory
 */
static inline bool is_via_compact_memory(int order)
{
	return order == -1;
}

static enum compact_result __compact_finished(struct zone *zone,
						struct compact_control *cc)
{
	unsigned int order;
	const int migratetype = cc->migratetype;

	if (cc->contended || fatal_signal_pending(current))
		return COMPACT_CONTENDED;

	/* Compaction run completes if the migrate and free scanner meet */
	if (compact_scanners_met(cc)) {
		/* Let the next compaction start anew. */
		reset_cached_positions(zone);

		/*
		 * Mark that the PG_migrate_skip information should be cleared
		 * by kswapd when it goes to sleep. kcompactd does not set the
		 * flag itself as the decision to be clear should be directly
		 * based on an allocation request.
		 */
		if (cc->direct_compaction)
			zone->compact_blockskip_flush = true;

		if (cc->whole_zone)
			return COMPACT_COMPLETE;
		else
			return COMPACT_PARTIAL_SKIPPED;
	}

	if (is_via_compact_memory(cc->order))
		return COMPACT_CONTINUE;

	/* Direct compactor: Is a suitable page free? */
	for (order = cc->order; order < MAX_ORDER; order++) {
		struct free_area *area = &zone->free_area[order];
		bool can_steal;

		/* Job done if page is free of the right migratetype */
		if (!list_empty(&area->free_list[migratetype]))
			return COMPACT_SUCCESS;

#ifdef CONFIG_CMA
		/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
		if (migratetype == MIGRATE_MOVABLE &&
			!list_empty(&area->free_list[MIGRATE_CMA]))
			return COMPACT_SUCCESS;
#endif
		/*
		 * Job done if allocation would steal freepages from
		 * other migratetype buddy lists.
		 */
		if (find_suitable_fallback(area, order, migratetype,
						true, &can_steal) != -1)
			return COMPACT_SUCCESS;
	}

	return COMPACT_NO_SUITABLE_PAGE;
}

static enum compact_result compact_finished(struct zone *zone,
			struct compact_control *cc)
{
	int ret;

	ret = __compact_finished(zone, cc);
	trace_mm_compaction_finished(zone, cc->order, ret);
	if (ret == COMPACT_NO_SUITABLE_PAGE)
		ret = COMPACT_CONTINUE;

	return ret;
}

/*
 * compaction_suitable: Is this suitable to run compaction on this zone now?
 * Returns
 *   COMPACT_SKIPPED  - If there are too few free pages for compaction
 *   COMPACT_SUCCESS  - If the allocation would succeed without compaction
 *   COMPACT_CONTINUE - If compaction should run now
 */
static enum compact_result __compaction_suitable(struct zone *zone, int order,
					unsigned int alloc_flags,
					int classzone_idx,
					unsigned long wmark_target)
{
	unsigned long watermark;

	if (is_via_compact_memory(order))
		return COMPACT_CONTINUE;

	watermark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
	/*
	 * If watermarks for high-order allocation are already met, there
	 * should be no need for compaction at all.
	 */
	if (zone_watermark_ok(zone, order, watermark, classzone_idx,
								alloc_flags))
		return COMPACT_SUCCESS;

	/*
	 * Watermarks for order-0 must be met for compaction to be able to
	 * isolate free pages for migration targets. This means that the
	 * watermark and alloc_flags have to match, or be more pessimistic than
	 * the check in __isolate_free_page(). We don't use the direct
	 * compactor's alloc_flags, as they are not relevant for freepage
	 * isolation. We however do use the direct compactor's classzone_idx to
	 * skip over zones where lowmem reserves would prevent allocation even
	 * if compaction succeeds.
	 * For costly orders, we require low watermark instead of min for
	 * compaction to proceed to increase its chances.
	 * ALLOC_CMA is used, as pages in CMA pageblocks are considered
	 * suitable migration targets
	 */
	watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
				low_wmark_pages(zone) : min_wmark_pages(zone);
	watermark += compact_gap(order);
	if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
						ALLOC_CMA, wmark_target))
		return COMPACT_SKIPPED;

	return COMPACT_CONTINUE;
}
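
/*
 * Rough numbers for the check above (illustrative, assuming
 * compact_gap(order) == 2UL << order as defined in
 * include/linux/compaction.h): for an order-9 request the zone needs its
 * low watermark (costly order) plus 1024 extra order-0 pages free before
 * compaction is attempted; below that, COMPACT_SKIPPED tells the caller
 * to reclaim first.
 */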

enum compact_result compaction_suitable(struct zone *zone, int order,
					unsigned int alloc_flags,
					int classzone_idx)
{
	enum compact_result ret;
	int fragindex;

	ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx,
				    zone_page_state(zone, NR_FREE_PAGES));
	/*
	 * fragmentation index determines if allocation failures are due to
	 * low memory or external fragmentation
	 *
	 * index of -1000 would imply allocations might succeed depending on
	 * watermarks, but we already failed the high-order watermark check
	 * index towards 0 implies failure is due to lack of memory
	 * index towards 1000 implies failure is due to fragmentation
	 *
	 * Only compact if a failure would be due to fragmentation. Also
	 * ignore fragindex for non-costly orders where the alternative to
	 * a successful reclaim/compaction is OOM. Fragindex and the
	 * vm.extfrag_threshold sysctl is meant as a heuristic to prevent
	 * excessive compaction for costly orders, but it should not be at the
	 * expense of system stability.
	 */
	if (ret == COMPACT_CONTINUE && (order > PAGE_ALLOC_COSTLY_ORDER)) {
		fragindex = fragmentation_index(zone, order);
		if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
			ret = COMPACT_NOT_SUITABLE_ZONE;
	}

	trace_mm_compaction_suitable(zone, order, ret);
	if (ret == COMPACT_NOT_SUITABLE_ZONE)
		ret = COMPACT_SKIPPED;

	return ret;
}

bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
		int alloc_flags)
{
	struct zone *zone;
	struct zoneref *z;

	/*
	 * Make sure at least one zone would pass __compaction_suitable if we continue
	 * retrying the reclaim.
	 */
	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
					ac->nodemask) {
		unsigned long available;
		enum compact_result compact_result;

		/*
		 * Do not consider all the reclaimable memory because we do not
		 * want to thrash just for a single high order allocation which
		 * is not even guaranteed to appear even if __compaction_suitable
		 * is happy about the watermark check.
		 */
		available = zone_reclaimable_pages(zone) / order;
		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
		compact_result = __compaction_suitable(zone, order, alloc_flags,
				ac_classzone_idx(ac), available);
		if (compact_result != COMPACT_SKIPPED)
			return true;
	}

	return false;
}

static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc)
{
	enum compact_result ret;
	unsigned long start_pfn = zone->zone_start_pfn;
	unsigned long end_pfn = zone_end_pfn(zone);
	const bool sync = cc->mode != MIGRATE_ASYNC;

	cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
	ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
							cc->classzone_idx);
	/* Compaction is likely to fail */
	if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
		return ret;

	/* huh, compaction_suitable is returning something unexpected */
	VM_BUG_ON(ret != COMPACT_CONTINUE);

	/*
	 * Clear pageblock skip if there were failures recently and compaction
	 * is about to be retried after being deferred.
	 */
	if (compaction_restarting(zone, cc->order))
		__reset_isolation_suitable(zone);

	/*
	 * Setup to move all movable pages to the end of the zone. Used cached
	 * information on where the scanners should start (unless we explicitly
	 * want to compact the whole zone), but check that it is initialised
	 * by ensuring the values are within zone boundaries.
	 */
	if (cc->whole_zone) {
		cc->migrate_pfn = start_pfn;
		cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
	} else {
		cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
		cc->free_pfn = zone->compact_cached_free_pfn;
		if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
			cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
			zone->compact_cached_free_pfn = cc->free_pfn;
		}
		if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
			cc->migrate_pfn = start_pfn;
			zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
			zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
		}

		if (cc->migrate_pfn == start_pfn)
			cc->whole_zone = true;
	}

	cc->last_migrated_pfn = 0;

	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
				cc->free_pfn, end_pfn, sync);

	migrate_prep_local();

	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
		int err;

		switch (isolate_migratepages(zone, cc)) {
		case ISOLATE_ABORT:
			ret = COMPACT_CONTENDED;
			putback_movable_pages(&cc->migratepages);
			cc->nr_migratepages = 0;
			goto out;
		case ISOLATE_NONE:
			/*
			 * We haven't isolated and migrated anything, but
			 * there might still be unflushed migrations from
			 * previous cc->order aligned block.
			 */
			goto check_drain;
		case ISOLATE_SUCCESS:
			;
		}

		err = migrate_pages(&cc->migratepages, compaction_alloc,
				compaction_free, (unsigned long)cc, cc->mode,
				MR_COMPACTION);

		trace_mm_compaction_migratepages(cc->nr_migratepages, err,
							&cc->migratepages);

		/* All pages were either migrated or will be released */
		cc->nr_migratepages = 0;
		if (err) {
			putback_movable_pages(&cc->migratepages);
			/*
			 * migrate_pages() may return -ENOMEM when scanners meet
			 * and we want compact_finished() to detect it
			 */
			if (err == -ENOMEM && !compact_scanners_met(cc)) {
				ret = COMPACT_CONTENDED;
				goto out;
			}
			/*
			 * We failed to migrate at least one page in the current
			 * order-aligned block, so skip the rest of it.
			 */
			if (cc->direct_compaction &&
						(cc->mode == MIGRATE_ASYNC)) {
				cc->migrate_pfn = block_end_pfn(
						cc->migrate_pfn - 1, cc->order);
				/* Draining pcplists is useless in this case */
				cc->last_migrated_pfn = 0;

			}
		}

check_drain:
		/*
		 * Has the migration scanner moved away from the previous
		 * cc->order aligned block where we migrated from? If yes,
		 * flush the pages that were freed, so that they can merge and
		 * compact_finished() can detect immediately if allocation
		 * would succeed.
		 */
		if (cc->order > 0 && cc->last_migrated_pfn) {
			int cpu;
			unsigned long current_block_start =
				block_start_pfn(cc->migrate_pfn, cc->order);

			if (cc->last_migrated_pfn < current_block_start) {
				cpu = get_cpu();
				lru_add_drain_cpu(cpu);
				drain_local_pages(zone);
				put_cpu();
				/* No more flushing until we migrate again */
				cc->last_migrated_pfn = 0;
			}
		}

	}

out:
	/*
	 * Release free pages and update where the free scanner should restart,
	 * so we don't leave any returned pages behind in the next attempt.
	 */
	if (cc->nr_freepages > 0) {
		unsigned long free_pfn = release_freepages(&cc->freepages);

		cc->nr_freepages = 0;
		VM_BUG_ON(free_pfn == 0);
		/* The cached pfn is always the first in a pageblock */
		free_pfn = pageblock_start_pfn(free_pfn);
		/*
		 * Only go back, not forward. The cached pfn might have been
		 * already reset to zone end in compact_finished()
		 */
		if (free_pfn > zone->compact_cached_free_pfn)
			zone->compact_cached_free_pfn = free_pfn;
	}

	count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned);
	count_compact_events(COMPACTFREE_SCANNED, cc->total_free_scanned);

	trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
				cc->free_pfn, end_pfn, sync, ret);

	return ret;
}

static enum compact_result compact_zone_order(struct zone *zone, int order,
		gfp_t gfp_mask, enum compact_priority prio,
		unsigned int alloc_flags, int classzone_idx)
{
	enum compact_result ret;
	struct compact_control cc = {
		.nr_freepages = 0,
		.nr_migratepages = 0,
		.total_migrate_scanned = 0,
		.total_free_scanned = 0,
		.order = order,
		.gfp_mask = gfp_mask,
		.zone = zone,
		.mode = (prio == COMPACT_PRIO_ASYNC) ?
					MIGRATE_ASYNC :	MIGRATE_SYNC_LIGHT,
		.alloc_flags = alloc_flags,
		.classzone_idx = classzone_idx,
		.direct_compaction = true,
		.whole_zone = (prio == MIN_COMPACT_PRIORITY),
		.ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
		.ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY)
	};
	INIT_LIST_HEAD(&cc.freepages);
	INIT_LIST_HEAD(&cc.migratepages);

	ret = compact_zone(zone, &cc);

	VM_BUG_ON(!list_empty(&cc.freepages));
	VM_BUG_ON(!list_empty(&cc.migratepages));

	return ret;
}

int sysctl_extfrag_threshold = 500;

/**
 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
 * @gfp_mask: The GFP mask of the current allocation
 * @order: The order of the current allocation
 * @alloc_flags: The allocation flags of the current allocation
 * @ac: The context of current allocation
 * @mode: The migration mode for async, sync light, or sync migration
 *
 * This is the main entry point for direct page compaction.
 */
enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
		unsigned int alloc_flags, const struct alloc_context *ac,
		enum compact_priority prio)
{
	int may_perform_io = gfp_mask & __GFP_IO;
	struct zoneref *z;
	struct zone *zone;
	enum compact_result rc = COMPACT_SKIPPED;

	/*
	 * Check if the GFP flags allow compaction - GFP_NOIO is really
	 * tricky context because the migration might require IO
	 */
	if (!may_perform_io)
		return COMPACT_SKIPPED;

	trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);

	/* Compact each zone in the list */
	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
								ac->nodemask) {
		enum compact_result status;

		if (prio > MIN_COMPACT_PRIORITY
					&& compaction_deferred(zone, order)) {
			rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
			continue;
		}

		status = compact_zone_order(zone, order, gfp_mask, prio,
					alloc_flags, ac_classzone_idx(ac));
		rc = max(status, rc);

		/* The allocation should succeed, stop compacting */
		if (status == COMPACT_SUCCESS) {
			/*
			 * We think the allocation will succeed in this zone,
			 * but it is not certain, hence the false. The caller
			 * will repeat this with true if allocation indeed
			 * succeeds in this zone.
			 */
			compaction_defer_reset(zone, order, false);

			break;
		}

		if (prio != COMPACT_PRIO_ASYNC && (status == COMPACT_COMPLETE ||
					status == COMPACT_PARTIAL_SKIPPED))
			/*
			 * We think that allocation won't succeed in this zone
			 * so we defer compaction there. If it ends up
			 * succeeding after all, it will be reset.
			 */
			defer_compaction(zone, order);

		/*
		 * We might have stopped compacting due to need_resched() in
		 * async compaction, or due to a fatal signal detected. In that
		 * case do not try further zones
		 */
		if ((prio == COMPACT_PRIO_ASYNC && need_resched())
					|| fatal_signal_pending(current))
			break;
	}

	return rc;
}


/* Compact all zones within a node */
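/*
 * order == -1 asks compact_zone() to compact the whole zone irrespective
 * of any particular allocation order (see is_via_compact_memory()), which
 * is also why skip hints are ignored and full sync migration is used.
 */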
static void compact_node(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	int zoneid;
	struct zone *zone;
	struct compact_control cc = {
		.order = -1,
		.total_migrate_scanned = 0,
		.total_free_scanned = 0,
		.mode = MIGRATE_SYNC,
		.ignore_skip_hint = true,
		.whole_zone = true,
		.gfp_mask = GFP_KERNEL,
	};


	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {

		zone = &pgdat->node_zones[zoneid];
		if (!populated_zone(zone))
			continue;

		cc.nr_freepages = 0;
		cc.nr_migratepages = 0;
		cc.zone = zone;
		INIT_LIST_HEAD(&cc.freepages);
		INIT_LIST_HEAD(&cc.migratepages);

		compact_zone(zone, &cc);

		VM_BUG_ON(!list_empty(&cc.freepages));
		VM_BUG_ON(!list_empty(&cc.migratepages));
	}
}

/* Compact all nodes in the system */
static void compact_nodes(void)
{
	int nid;

	/* Flush pending updates to the LRU lists */
	lru_add_drain_all();

	for_each_online_node(nid)
		compact_node(nid);
}

/* The written value is actually unused, all memory is compacted */
int sysctl_compact_memory;

/*
 * This is the entry point for compacting all nodes via
 * /proc/sys/vm/compact_memory
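 *
 * The written value is ignored; any write triggers a full pass, e.g.:
 *	echo 1 > /proc/sys/vm/compact_memory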
 */
int sysctl_compaction_handler(struct ctl_table *table, int write,
			void __user *buffer, size_t *length, loff_t *ppos)
{
	if (write)
		compact_nodes();

	return 0;
}

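/*
 * Handler for /proc/sys/vm/extfrag_threshold; proc_dointvec_minmax()
 * clamps the written value to the bounds registered in the sysctl table.
 */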
int sysctl_extfrag_handler(struct ctl_table *table, int write,
			void __user *buffer, size_t *length, loff_t *ppos)
{
	proc_dointvec_minmax(table, write, buffer, length, ppos);

	return 0;
}

#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
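/*
 * Per-node compaction trigger: writing anything to the node's "compact"
 * sysfs attribute (typically /sys/devices/system/node/nodeN/compact)
 * compacts every populated zone of that node via compact_node().
 */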
static ssize_t sysfs_compact_node(struct device *dev,
			struct device_attribute *attr,
			const char *buf, size_t count)
{
	int nid = dev->id;

	if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
		/* Flush pending updates to the LRU lists */
		lru_add_drain_all();

		compact_node(nid);
	}

	return count;
}
static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);

int compaction_register_node(struct node *node)
{
	return device_create_file(&node->dev, &dev_attr_compact);
}

void compaction_unregister_node(struct node *node)
{
	return device_remove_file(&node->dev, &dev_attr_compact);
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */

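/*
 * Wake-up condition for the kcompactd wait queue: either wakeup_kcompactd()
 * has recorded a pending request (kcompactd_max_order > 0) or the thread
 * is being stopped, so kthread_should_stop() also terminates the wait.
 */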
static inline bool kcompactd_work_requested(pg_data_t *pgdat)
{
	return pgdat->kcompactd_max_order > 0 || kthread_should_stop();
}

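/*
 * Check whether at least one zone up to the requested classzone_idx looks
 * compactable for the recorded kcompactd_max_order. Used by
 * wakeup_kcompactd() to avoid waking kcompactd when it could not make
 * progress anyway.
 */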
static bool kcompactd_node_suitable(pg_data_t *pgdat)
{
	int zoneid;
	struct zone *zone;
	enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx;

	for (zoneid = 0; zoneid <= classzone_idx; zoneid++) {
		zone = &pgdat->node_zones[zoneid];

		if (!populated_zone(zone))
			continue;

		if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
					classzone_idx) == COMPACT_CONTINUE)
			return true;
	}

	return false;
}

static void kcompactd_do_work(pg_data_t *pgdat)
{
	/*
	 * With no special task, compact all zones so that a page of requested
	 * order is allocatable.
	 */
	int zoneid;
	struct zone *zone;
	struct compact_control cc = {
		.order = pgdat->kcompactd_max_order,
		.total_migrate_scanned = 0,
		.total_free_scanned = 0,
		.classzone_idx = pgdat->kcompactd_classzone_idx,
		.mode = MIGRATE_SYNC_LIGHT,
		.ignore_skip_hint = true,
		.gfp_mask = GFP_KERNEL,

	};
	trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
							cc.classzone_idx);
	count_compact_event(KCOMPACTD_WAKE);

	for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) {
		int status;

		zone = &pgdat->node_zones[zoneid];
		if (!populated_zone(zone))
			continue;

		if (compaction_deferred(zone, cc.order))
			continue;

		if (compaction_suitable(zone, cc.order, 0, zoneid) !=
							COMPACT_CONTINUE)
			continue;

		cc.nr_freepages = 0;
		cc.nr_migratepages = 0;
		cc.total_migrate_scanned = 0;
		cc.total_free_scanned = 0;
		cc.zone = zone;
		INIT_LIST_HEAD(&cc.freepages);
		INIT_LIST_HEAD(&cc.migratepages);

		if (kthread_should_stop())
			return;
		status = compact_zone(zone, &cc);

		if (status == COMPACT_SUCCESS) {
			compaction_defer_reset(zone, cc.order, false);
		} else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) {
			/*
			 * We use sync migration mode here, so we defer like
			 * sync direct compaction does.
			 */
			defer_compaction(zone, cc.order);
		}

		count_compact_events(KCOMPACTD_MIGRATE_SCANNED,
				     cc.total_migrate_scanned);
		count_compact_events(KCOMPACTD_FREE_SCANNED,
				     cc.total_free_scanned);

		VM_BUG_ON(!list_empty(&cc.freepages));
		VM_BUG_ON(!list_empty(&cc.migratepages));
	}

	/*
	 * Regardless of success, we are done until woken up next. But remember
	 * the requested order/classzone_idx in case it was higher/tighter than
	 * our current ones
	 */
	if (pgdat->kcompactd_max_order <= cc.order)
		pgdat->kcompactd_max_order = 0;
	if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx)
		pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
}

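/*
 * Record a compaction request for kcompactd and wake it up if the node
 * looks compactable. kcompactd_max_order only grows and
 * kcompactd_classzone_idx only tightens here; kcompactd_do_work() resets
 * them once the work has been done.
 */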
void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
{
	if (!order)
		return;

	if (pgdat->kcompactd_max_order < order)
		pgdat->kcompactd_max_order = order;

	/*
	 * Pairs with implicit barrier in wait_event_freezable()
	 * such that wakeups are not missed in the lockless
	 * waitqueue_active() call.
	 */
	smp_acquire__after_ctrl_dep();

	if (pgdat->kcompactd_classzone_idx > classzone_idx)
		pgdat->kcompactd_classzone_idx = classzone_idx;

	if (!waitqueue_active(&pgdat->kcompactd_wait))
		return;

	if (!kcompactd_node_suitable(pgdat))
		return;

	trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order,
							classzone_idx);
	wake_up_interruptible(&pgdat->kcompactd_wait);
}

/*
 * The background compaction daemon, started as a kernel thread
 * from the init process.
 */
static int kcompactd(void *p)
{
	pg_data_t *pgdat = (pg_data_t*)p;
	struct task_struct *tsk = current;

	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);

	if (!cpumask_empty(cpumask))
		set_cpus_allowed_ptr(tsk, cpumask);

	set_freezable();

	pgdat->kcompactd_max_order = 0;
	pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;

	while (!kthread_should_stop()) {
		trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
		wait_event_freezable(pgdat->kcompactd_wait,
				kcompactd_work_requested(pgdat));

		kcompactd_do_work(pgdat);
	}

	return 0;
}

/*
 * This kcompactd start function will be called by init and node-hot-add.
 * On node-hot-add, kcompactd will be moved to the proper cpus if cpus are
 * hot-added.
 */
int kcompactd_run(int nid)
{
	pg_data_t *pgdat = NODE_DATA(nid);
	int ret = 0;

	if (pgdat->kcompactd)
		return 0;

	pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
	if (IS_ERR(pgdat->kcompactd)) {
		pr_err("Failed to start kcompactd on node %d\n", nid);
		ret = PTR_ERR(pgdat->kcompactd);
		pgdat->kcompactd = NULL;
	}
	return ret;
}

/*
 * Called by memory hotplug when all memory in a node is offlined. Caller must
 * hold mem_hotplug_begin/end().
 */
void kcompactd_stop(int nid)
{
	struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd;

	if (kcompactd) {
		kthread_stop(kcompactd);
		NODE_DATA(nid)->kcompactd = NULL;
	}
}

/*
 * It's optimal to keep kcompactd threads on the same CPUs as their memory,
 * but not required for correctness. So if the last CPU in a node goes
 * away, kcompactd is allowed to run anywhere; as soon as a CPU of that
 * node comes back online, restore its CPU binding.
 */
static int kcompactd_cpu_online(unsigned int cpu)
{
	int nid;

	for_each_node_state(nid, N_MEMORY) {
		pg_data_t *pgdat = NODE_DATA(nid);
		const struct cpumask *mask;

		mask = cpumask_of_node(pgdat->node_id);

		if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
			/* One of our CPUs online: restore mask */
			set_cpus_allowed_ptr(pgdat->kcompactd, mask);
	}
	return 0;
}

static int __init kcompactd_init(void)
{
	int nid;
	int ret;

	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					"mm/compaction:online",
					kcompactd_cpu_online, NULL);
	if (ret < 0) {
		pr_err("kcompactd: failed to register hotplug callbacks.\n");
		return ret;
	}

	for_each_node_state(nid, N_MEMORY)
		kcompactd_run(nid);
	return 0;
}
subsys_initcall(kcompactd_init)

#endif /* CONFIG_COMPACTION */