// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
#include <linux/huge_mm.h>

#include <asm/pgtable.h>

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_page_list.
 */
static const struct address_space_operations swap_aops = {
	.writepage	= swap_writepage,
	.set_page_dirty	= swap_set_page_dirty,
#ifdef CONFIG_MIGRATION
	.migratepage	= migrate_page,
#endif
};

struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
static bool enable_vma_readahead __read_mostly = true;

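/*
 * vma->swap_readahead_info packs three fields into one unsigned long:
 * the readahead hit count in the lowest SWAP_RA_WIN_SHIFT bits, the
 * readahead window size in the remaining sub-page bits, and the
 * page-aligned address of the last faulting access in the PAGE_MASK
 * bits.  The macros below encode and decode that value.
 */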
#define SWAP_RA_WIN_SHIFT	(PAGE_SHIFT / 2)
#define SWAP_RA_HITS_MASK	((1UL << SWAP_RA_WIN_SHIFT) - 1)
#define SWAP_RA_HITS_MAX	SWAP_RA_HITS_MASK
#define SWAP_RA_WIN_MASK	(~PAGE_MASK & ~SWAP_RA_HITS_MASK)

#define SWAP_RA_HITS(v)		((v) & SWAP_RA_HITS_MASK)
#define SWAP_RA_WIN(v)		(((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
#define SWAP_RA_ADDR(v)		((v) & PAGE_MASK)

#define SWAP_RA_VAL(addr, win, hits)				\
	(((addr) & PAGE_MASK) |					\
	 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |	\
	 ((hits) & SWAP_RA_HITS_MASK))

/* Initial readahead hits is 4 to start up with a small window */
#define GET_SWAP_RA_VAL(vma)					\
	(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)

#define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)
#define ADD_CACHE_INFO(x, nr)	do { swap_cache_info.x += (nr); } while (0)

static struct {
	unsigned long add_total;
	unsigned long del_total;
	unsigned long find_success;
	unsigned long find_total;
} swap_cache_info;

unsigned long total_swapcache_pages(void)
{
	unsigned int i, j, nr;
	unsigned long ret = 0;
	struct address_space *spaces;

	rcu_read_lock();
	for (i = 0; i < MAX_SWAPFILES; i++) {
		/*
		 * The corresponding entries in nr_swapper_spaces and
		 * swapper_spaces will be reused only after at least
		 * one grace period.  So it is impossible for them
		 * to belong to different usages.
		 */
		nr = nr_swapper_spaces[i];
		spaces = rcu_dereference(swapper_spaces[i]);
		if (!nr || !spaces)
			continue;
		for (j = 0; j < nr; j++)
			ret += spaces[j].nrpages;
	}
	rcu_read_unlock();
	return ret;
}

static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);

void show_swap_cache_info(void)
{
	printk("%lu pages in swap cache\n", total_swapcache_pages());
	printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
		swap_cache_info.add_total, swap_cache_info.del_total,
		swap_cache_info.find_success, swap_cache_info.find_total);
	printk("Free swap  = %ldkB\n",
		get_nr_swap_pages() << (PAGE_SHIFT - 10));
	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}

/*
 * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
 * but sets SwapCache flag and private instead of mapping and index.
 */
int __add_to_swap_cache(struct page *page, swp_entry_t entry)
{
	int error, i, nr = hpage_nr_pages(page);
	struct address_space *address_space;
	pgoff_t idx = swp_offset(entry);

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(PageSwapCache(page), page);
	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);

	page_ref_add(page, nr);
	SetPageSwapCache(page);

	address_space = swap_address_space(entry);
	xa_lock_irq(&address_space->i_pages);
	for (i = 0; i < nr; i++) {
		set_page_private(page + i, entry.val + i);
		error = radix_tree_insert(&address_space->i_pages,
					  idx + i, page + i);
		if (unlikely(error))
			break;
	}
	if (likely(!error)) {
		address_space->nrpages += nr;
		__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
		ADD_CACHE_INFO(add_total, nr);
	} else {
		/*
		 * Only the context which has set SWAP_HAS_CACHE flag
		 * would call add_to_swap_cache().
		 * So add_to_swap_cache() doesn't return -EEXIST.
		 */
		VM_BUG_ON(error == -EEXIST);
		set_page_private(page + i, 0UL);
		while (i--) {
			radix_tree_delete(&address_space->i_pages, idx + i);
			set_page_private(page + i, 0UL);
		}
		ClearPageSwapCache(page);
		page_ref_sub(page, nr);
	}
	xa_unlock_irq(&address_space->i_pages);

	return error;
}


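/*
 * add_to_swap_cache() wraps __add_to_swap_cache() with a radix tree
 * preload, so that tree nodes can be allocated with the caller's
 * gfp_mask while it is still legal to sleep; the insertion itself then
 * runs under the i_pages lock with interrupts disabled.
 */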
int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
{
	int error;

	error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page));
	if (!error) {
		error = __add_to_swap_cache(page, entry);
		radix_tree_preload_end();
	}
	return error;
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache.
 */
void __delete_from_swap_cache(struct page *page)
{
	struct address_space *address_space;
	int i, nr = hpage_nr_pages(page);
	swp_entry_t entry;
	pgoff_t idx;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
	VM_BUG_ON_PAGE(PageWriteback(page), page);

	entry.val = page_private(page);
	address_space = swap_address_space(entry);
	idx = swp_offset(entry);
	for (i = 0; i < nr; i++) {
		radix_tree_delete(&address_space->i_pages, idx + i);
		set_page_private(page + i, 0);
	}
	ClearPageSwapCache(page);
	address_space->nrpages -= nr;
	__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
	ADD_CACHE_INFO(del_total, nr);
}

/**
 * add_to_swap - allocate swap space for a page
 * @page: page we want to move to swap
 *
 * Allocate swap space for the page and add the page to the
 * swap cache.  Caller needs to hold the page lock. 
 */
int add_to_swap(struct page *page)
{
	swp_entry_t entry;
	int err;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageUptodate(page), page);

	entry = get_swap_page(page);
	if (!entry.val)
		return 0;

	/*
	 * Radix-tree node allocations from PF_MEMALLOC contexts could
	 * completely exhaust the page allocator. __GFP_NOMEMALLOC
	 * stops emergency reserves from being allocated.
	 *
	 * TODO: this could cause a theoretical memory reclaim
	 * deadlock in the swap out path.
	 */
	/*
	 * Add it to the swap cache.
	 */
	err = add_to_swap_cache(page, entry,
			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
	/* -ENOMEM radix-tree allocation failure */
	if (err)
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		goto fail;
	/*
	 * Normally the page will be dirtied in unmap because its pte should be
	 * dirty. A special case is a MADV_FREE page. The page's pte could
	 * have the dirty bit cleared but the page's SwapBacked bit is still
	 * set because clearing the dirty bit and the SwapBacked bit has no
	 * lock to protect it. For such a page, unmap will not set the dirty
	 * bit, so page reclaim will not write the page out. This can cause
	 * data corruption when the page is swapped in later. Always setting
	 * the dirty bit for the page solves the problem.
	 */
	set_page_dirty(page);

	return 1;

fail:
	put_swap_page(page, entry);
	return 0;
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache and locked.
 * It will never put the page into the free list,
 * the caller has a reference on the page.
 */
void delete_from_swap_cache(struct page *page)
{
	swp_entry_t entry;
	struct address_space *address_space;

	entry.val = page_private(page);

	address_space = swap_address_space(entry);
	xa_lock_irq(&address_space->i_pages);
	__delete_from_swap_cache(page);
	xa_unlock_irq(&address_space->i_pages);

	put_swap_page(page, entry);
	page_ref_sub(page, hpage_nr_pages(page));
}

/*
 * If we are the only user, then try to free up the swap cache.
 *
 * It's ok to check for PageSwapCache without the page lock
 * here because we are going to recheck again inside
 * try_to_free_swap() _with_ the lock.
 * 					- Marcelo
 */
static inline void free_swap_cache(struct page *page)
{
	if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
		try_to_free_swap(page);
		unlock_page(page);
	}
}

/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.
 */
void free_page_and_swap_cache(struct page *page)
{
	free_swap_cache(page);
	if (!is_huge_zero_page(page))
		put_page(page);
}

/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them.  They are removed from the LRU and freed if this is their last use.
 */
void free_pages_and_swap_cache(struct page **pages, int nr)
{
	struct page **pagep = pages;
	int i;

	lru_add_drain();
	for (i = 0; i < nr; i++)
		free_swap_cache(pagep[i]);
	release_pages(pagep, nr);
}

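/*
 * VMA based readahead is used only when it has not been disabled via
 * sysfs and no swap device sits on rotational media (nr_rotate_swap
 * counts such devices); otherwise the cluster based path is taken.
 */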
static inline bool swap_use_vma_readahead(void)
{
	return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
}

/*
 * Lookup a swap entry in the swap cache. A found page will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock getting page table operations atomic even if we drop the page
 * lock before returning.
 */
struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
			       unsigned long addr)
{
	struct page *page;

	page = find_get_page(swap_address_space(entry), swp_offset(entry));

	INC_CACHE_INFO(find_total);
	if (page) {
		bool vma_ra = swap_use_vma_readahead();
		bool readahead;

		INC_CACHE_INFO(find_success);
		/*
		 * At the moment, we don't support PG_readahead for anon THP
		 * so let's bail out rather than confusing the readahead stat.
		 */
		if (unlikely(PageTransCompound(page)))
			return page;

		readahead = TestClearPageReadahead(page);
		if (vma && vma_ra) {
			unsigned long ra_val;
			int win, hits;

			ra_val = GET_SWAP_RA_VAL(vma);
			win = SWAP_RA_WIN(ra_val);
			hits = SWAP_RA_HITS(ra_val);
			if (readahead)
				hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
			atomic_long_set(&vma->swap_readahead_info,
					SWAP_RA_VAL(addr, win, hits));
		}

		if (readahead) {
			count_vm_event(SWAP_RA_HIT);
			if (!vma || !vma_ra)
				atomic_inc(&swapin_readahead_hits);
		}
	}

	return page;
}

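/*
 * __read_swap_cache_async() looks up @entry in the swap cache and, if
 * it is absent, allocates a page, claims the SWAP_HAS_CACHE bit and
 * inserts the page into the swap cache.  It does not start the actual
 * read; *new_page_allocated tells the caller whether swap_readpage()
 * still needs to be issued for the returned page.
 */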
struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr,
			bool *new_page_allocated)
{
	struct page *found_page, *new_page = NULL;
	struct address_space *swapper_space = swap_address_space(entry);
	int err;
	*new_page_allocated = false;

	do {
		/*
		 * First check the swap cache.  Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics.
		 */
		found_page = find_get_page(swapper_space, swp_offset(entry));
		if (found_page)
			break;

		/*
		 * Just skip readahead for an unused swap slot.
		 * During swap_off, when swap_slot_cache is disabled,
		 * we have to handle the race between putting
		 * swap entry in swap cache and marking swap slot
		 * as SWAP_HAS_CACHE.  That's done in a later part of the
		 * code, or else swap_off will be aborted if we return NULL.
		 */
		if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
			break;

		/*
		 * Get a new page to read into from swap.
		 */
		if (!new_page) {
			new_page = alloc_page_vma(gfp_mask, vma, addr);
			if (!new_page)
				break;		/* Out of memory */
		}

		/*
		 * call radix_tree_preload() while we can wait.
		 */
		err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL);
		if (err)
			break;

		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
		err = swapcache_prepare(entry);
		if (err == -EEXIST) {
			radix_tree_preload_end();
			/*
			 * We might race against get_swap_page() and stumble
			 * across a SWAP_HAS_CACHE swap_map entry whose page
			 * has not been brought into the swapcache yet.
			 */
			cond_resched();
			continue;
		}
		if (err) {		/* swp entry is obsolete ? */
			radix_tree_preload_end();
			break;
		}

		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
		__SetPageLocked(new_page);
		__SetPageSwapBacked(new_page);
		err = __add_to_swap_cache(new_page, entry);
		if (likely(!err)) {
			radix_tree_preload_end();
			/*
			 * Initiate read into locked page and return.
			 */
			SetPageWorkingset(new_page);
			lru_cache_add_anon(new_page);
			*new_page_allocated = true;
			return new_page;
		}
		radix_tree_preload_end();
		__ClearPageLocked(new_page);
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		put_swap_page(new_page, entry);
	} while (err != -ENOMEM);

	if (new_page)
		put_page(new_page);
	return found_page;
}

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 */
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
		struct vm_area_struct *vma, unsigned long addr, bool do_poll)
{
	bool page_was_allocated;
	struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
			vma, addr, &page_was_allocated);

	if (page_was_allocated)
		swap_readpage(retpage, do_poll);

	return retpage;
}

static unsigned int __swapin_nr_pages(unsigned long prev_offset,
				      unsigned long offset,
				      int hits,
				      int max_pages,
				      int prev_win)
{
	unsigned int pages, last_ra;

	/*
	 * This heuristic has been found to work well on both sequential and
	 * random loads, swapping to hard disk or to SSD: please don't ask
	 * what the "+ 2" means, it just happens to work well, that's all.
	 */
	pages = hits + 2;
	if (pages == 2) {
		/*
		 * We can have no readahead hits to judge by: but must not get
		 * stuck here forever, so check for an adjacent offset instead
		 * (and don't even bother to check whether swap type is same).
		 */
		if (offset != prev_offset + 1 && offset != prev_offset - 1)
			pages = 1;
	} else {
		unsigned int roundup = 4;
		while (roundup < pages)
			roundup <<= 1;
		pages = roundup;
	}

	if (pages > max_pages)
		pages = max_pages;

	/* Don't shrink readahead too fast */
	last_ra = prev_win / 2;
	if (pages < last_ra)
		pages = last_ra;

	return pages;
}
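/*
 * Example: with 4 recent readahead hits the heuristic above asks for
 * 4 + 2 = 6 pages, rounds that up to the next power of two (8), then
 * clamps the result to max_pages (1 << page_cluster) and to no less
 * than half of the previous window.
 */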

static unsigned long swapin_nr_pages(unsigned long offset)
{
	static unsigned long prev_offset;
	unsigned int hits, pages, max_pages;
	static atomic_t last_readahead_pages;

	max_pages = 1 << READ_ONCE(page_cluster);
	if (max_pages <= 1)
		return 1;

	hits = atomic_xchg(&swapin_readahead_hits, 0);
	pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages,
				  atomic_read(&last_readahead_pages));
	if (!hits)
		prev_offset = offset;
	atomic_set(&last_readahead_pages, pages);

	return pages;
}

/**
 * swap_cluster_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * This has been extended to use the NUMA policies from the mm triggering
 * the readahead.
 *
 * Caller must hold down_read on the vma->vm_mm if vmf->vma is not NULL.
 */
struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
				struct vm_fault *vmf)
{
	struct page *page;
	unsigned long entry_offset = swp_offset(entry);
	unsigned long offset = entry_offset;
	unsigned long start_offset, end_offset;
	unsigned long mask;
	struct swap_info_struct *si = swp_swap_info(entry);
	struct blk_plug plug;
	bool do_poll = true, page_allocated;
	struct vm_area_struct *vma = vmf->vma;
	unsigned long addr = vmf->address;

	mask = swapin_nr_pages(offset) - 1;
	if (!mask)
		goto skip;

	do_poll = false;
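	/*
	 * Example: a window of 8 pages gives mask = 7, so the loop below
	 * reads the whole 8-slot aligned block that contains the faulting
	 * offset, i.e. offsets (offset & ~7) through (offset | 7).
	 */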
	/* Read a page_cluster sized and aligned cluster around offset. */
	start_offset = offset & ~mask;
	end_offset = offset | mask;
	if (!start_offset)	/* First page is swap header. */
		start_offset++;
	if (end_offset >= si->max)
		end_offset = si->max - 1;

	blk_start_plug(&plug);
	for (offset = start_offset; offset <= end_offset ; offset++) {
		/* Ok, do the async read-ahead now */
		page = __read_swap_cache_async(
			swp_entry(swp_type(entry), offset),
			gfp_mask, vma, addr, &page_allocated);
		if (!page)
			continue;
		if (page_allocated) {
			swap_readpage(page, false);
			if (offset != entry_offset) {
				SetPageReadahead(page);
				count_vm_event(SWAP_RA);
			}
		}
		put_page(page);
	}
	blk_finish_plug(&plug);

	lru_add_drain();	/* Push any new pages onto the LRU now */
skip:
	return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
}

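/*
 * The swap cache for one swap device is split into several
 * address_spaces, each covering SWAP_ADDRESS_SPACE_PAGES slots, so
 * that a single tree lock does not become a point of contention for
 * the whole device.
 */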
int init_swap_address_space(unsigned int type, unsigned long nr_pages)
{
	struct address_space *spaces, *space;
	unsigned int i, nr;

	nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
	spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
	if (!spaces)
		return -ENOMEM;
	for (i = 0; i < nr; i++) {
		space = spaces + i;
		INIT_RADIX_TREE(&space->i_pages, GFP_ATOMIC|__GFP_NOWARN);
		atomic_set(&space->i_mmap_writable, 0);
		space->a_ops = &swap_aops;
		/* swap cache doesn't use writeback related tags */
		mapping_set_no_writeback_tags(space);
	}
	nr_swapper_spaces[type] = nr;
	rcu_assign_pointer(swapper_spaces[type], spaces);

	return 0;
}

void exit_swap_address_space(unsigned int type)
{
	struct address_space *spaces;

	spaces = swapper_spaces[type];
	nr_swapper_spaces[type] = 0;
	rcu_assign_pointer(swapper_spaces[type], NULL);
	synchronize_rcu();
	kvfree(spaces);
}

static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
				     unsigned long faddr,
				     unsigned long lpfn,
				     unsigned long rpfn,
				     unsigned long *start,
				     unsigned long *end)
{
	*start = max3(lpfn, PFN_DOWN(vma->vm_start),
		      PFN_DOWN(faddr & PMD_MASK));
	*end = min3(rpfn, PFN_DOWN(vma->vm_end),
		    PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
}

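/*
 * swap_ra_info() sizes the VMA based readahead window for the fault at
 * vmf->address: it reads the per-VMA readahead state, asks
 * __swapin_nr_pages() for a window, biases the window towards the
 * direction of the previous fault and clamps it to the VMA and to the
 * PMD containing the fault, then records in @ra_info which PTEs
 * swap_vma_readahead() should scan.
 */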
static void swap_ra_info(struct vm_fault *vmf,
			struct vma_swap_readahead *ra_info)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long ra_val;
	swp_entry_t entry;
	unsigned long faddr, pfn, fpfn;
	unsigned long start, end;
	pte_t *pte, *orig_pte;
	unsigned int max_win, hits, prev_win, win, left;
#ifndef CONFIG_64BIT
	pte_t *tpte;
#endif

	max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
			     SWAP_RA_ORDER_CEILING);
	if (max_win == 1) {
		ra_info->win = 1;
		return;
	}

	faddr = vmf->address;
	orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
	entry = pte_to_swp_entry(*pte);
	if ((unlikely(non_swap_entry(entry)))) {
		pte_unmap(orig_pte);
		return;
	}

	fpfn = PFN_DOWN(faddr);
	ra_val = GET_SWAP_RA_VAL(vma);
	pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val));
	prev_win = SWAP_RA_WIN(ra_val);
	hits = SWAP_RA_HITS(ra_val);
	ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits,
					       max_win, prev_win);
	atomic_long_set(&vma->swap_readahead_info,
			SWAP_RA_VAL(faddr, win, 0));

	if (win == 1) {
		pte_unmap(orig_pte);
		return;
	}

	/* Copy the PTEs because the page table may be unmapped */
	if (fpfn == pfn + 1)
		swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
	else if (pfn == fpfn + 1)
		swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
				  &start, &end);
	else {
		left = (win - 1) / 2;
		swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
				  &start, &end);
	}
	ra_info->nr_pte = end - start;
	ra_info->offset = fpfn - start;
	pte -= ra_info->offset;
#ifdef CONFIG_64BIT
	ra_info->ptes = pte;
#else
	tpte = ra_info->ptes;
	for (pfn = start; pfn != end; pfn++)
		*tpte++ = *pte++;
#endif
	pte_unmap(orig_pte);
}

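/*
 * swap_vma_readahead() reads ahead based on the virtual address
 * pattern around the fault: it walks the PTEs recorded by
 * swap_ra_info() and brings in the swap entries it finds there,
 * instead of reading a physically contiguous cluster of swap slots.
 */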
static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
				       struct vm_fault *vmf)
{
	struct blk_plug plug;
	struct vm_area_struct *vma = vmf->vma;
	struct page *page;
	pte_t *pte, pentry;
	swp_entry_t entry;
	unsigned int i;
	bool page_allocated;
	struct vma_swap_readahead ra_info = {0,};

	swap_ra_info(vmf, &ra_info);
	if (ra_info.win == 1)
		goto skip;

	blk_start_plug(&plug);
	for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte;
	     i++, pte++) {
		pentry = *pte;
		if (pte_none(pentry))
			continue;
		if (pte_present(pentry))
			continue;
		entry = pte_to_swp_entry(pentry);
		if (unlikely(non_swap_entry(entry)))
			continue;
		page = __read_swap_cache_async(entry, gfp_mask, vma,
					       vmf->address, &page_allocated);
		if (!page)
			continue;
		if (page_allocated) {
			swap_readpage(page, false);
			if (i != ra_info.offset) {
				SetPageReadahead(page);
				count_vm_event(SWAP_RA);
			}
		}
		put_page(page);
	}
	blk_finish_plug(&plug);
	lru_add_drain();
skip:
	return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
				     ra_info.win == 1);
}

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * It's the main entry point for swap readahead. Depending on the
 * configuration, it reads ahead blocks using either cluster-based
 * (i.e. physical disk based) or vma-based (i.e. virtual address based
 * on the faulting address) readahead.
 */
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
				struct vm_fault *vmf)
{
	return swap_use_vma_readahead() ?
			swap_vma_readahead(entry, gfp_mask, vmf) :
			swap_cluster_readahead(entry, gfp_mask, vmf);
}

#ifdef CONFIG_SYSFS
static ssize_t vma_ra_enabled_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	return sprintf(buf, "%s\n", enable_vma_readahead ? "true" : "false");
}
static ssize_t vma_ra_enabled_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
		enable_vma_readahead = true;
	else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
		enable_vma_readahead = false;
	else
		return -EINVAL;

	return count;
}
static struct kobj_attribute vma_ra_enabled_attr =
	__ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show,
	       vma_ra_enabled_store);
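/*
 * The attribute above appears under the mm sysfs kobject, normally as
 * /sys/kernel/mm/swap/vma_ra_enabled; writing "true"/"1" or
 * "false"/"0" switches between VMA based and cluster based swap
 * readahead at runtime.
 */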

static struct attribute *swap_attrs[] = {
	&vma_ra_enabled_attr.attr,
	NULL,
};

static struct attribute_group swap_attr_group = {
	.attrs = swap_attrs,
};

static int __init swap_init_sysfs(void)
{
	int err;
	struct kobject *swap_kobj;

	swap_kobj = kobject_create_and_add("swap", mm_kobj);
	if (!swap_kobj) {
		pr_err("failed to create swap kobject\n");
		return -ENOMEM;
	}
	err = sysfs_create_group(swap_kobj, &swap_attr_group);
	if (err) {
		pr_err("failed to register swap group\n");
		goto delete_obj;
	}
	return 0;

delete_obj:
	kobject_put(swap_kobj);
	return err;
}
subsys_initcall(swap_init_sysfs);
#endif