swap_state.c 22.1 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9
/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/mm.h>
10
#include <linux/gfp.h>
L
Linus Torvalds 已提交
11 12
#include <linux/kernel_stat.h>
#include <linux/swap.h>
13
#include <linux/swapops.h>
L
Linus Torvalds 已提交
14 15 16
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
17
#include <linux/blkdev.h>
18
#include <linux/pagevec.h>
C
Christoph Lameter 已提交
19
#include <linux/migrate.h>
20
#include <linux/vmalloc.h>
21
#include <linux/swap_slots.h>
22
#include <linux/huge_mm.h>
L
Linus Torvalds 已提交
23 24 25 26 27

#include <asm/pgtable.h>

/*
 * swapper_space is a fiction, retained to simplify the path through
J
Jens Axboe 已提交
28
 * vmscan's shrink_page_list.
L
Linus Torvalds 已提交
29
 */
30
static const struct address_space_operations swap_aops = {
L
Linus Torvalds 已提交
31
	.writepage	= swap_writepage,
32
	.set_page_dirty	= swap_set_page_dirty,
33
#ifdef CONFIG_MIGRATION
34
	.migratepage	= migrate_page,
35
#endif
L
Linus Torvalds 已提交
36 37
};

38 39
/*
 * Indexed by swap type: swapper_spaces[type] points to an array of
 * address spaces and nr_swapper_spaces[type] holds the number of
 * elements in that array.  Both are filled in by
 * init_swap_address_space() and reset by exit_swap_address_space().
 */
struct address_space *swapper_spaces[MAX_SWAPFILES];
static unsigned int nr_swapper_spaces[MAX_SWAPFILES];
H
Huang Ying 已提交
40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62
/* Written by the vma_ra_enabled sysfs store handler (CONFIG_SYSFS). */
bool swap_vma_readahead = true;

/* Default value of swap_ra_max_order. */
#define SWAP_RA_MAX_ORDER_DEFAULT	3

/* Largest readahead window is (1 << swap_ra_max_order) pages. */
static int swap_ra_max_order = SWAP_RA_MAX_ORDER_DEFAULT;

/*
 * Encoding of the value stored in vma->swap_readahead_info:
 *   - bits selected by PAGE_MASK:        page-aligned address
 *   - bits selected by SWAP_RA_WIN_MASK: window size
 *   - bits selected by SWAP_RA_HITS_MASK: readahead hit count
 * The window and hits fields together occupy the bits cleared in
 * PAGE_MASK, split at SWAP_RA_WIN_SHIFT.
 */
#define SWAP_RA_WIN_SHIFT	(PAGE_SHIFT / 2)
#define SWAP_RA_HITS_MASK	((1UL << SWAP_RA_WIN_SHIFT) - 1)
#define SWAP_RA_HITS_MAX	SWAP_RA_HITS_MASK
#define SWAP_RA_WIN_MASK	(~PAGE_MASK & ~SWAP_RA_HITS_MASK)

/* Field extractors for an encoded readahead value @v. */
#define SWAP_RA_HITS(v)		((v) & SWAP_RA_HITS_MASK)
#define SWAP_RA_WIN(v)		(((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
#define SWAP_RA_ADDR(v)		((v) & PAGE_MASK)

/* Pack @addr, @win and @hits into one encoded value; each is masked to fit. */
#define SWAP_RA_VAL(addr, win, hits)				\
	(((addr) & PAGE_MASK) |					\
	 (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |	\
	 ((hits) & SWAP_RA_HITS_MASK))

/* Initial readahead hits is 4 to start up with a small window */
#define GET_SWAP_RA_VAL(vma)					\
	(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
L
Linus Torvalds 已提交
63 64

/* Swap cache event counters, printed by show_swap_cache_info(). */
static struct {
	unsigned long add_total;
	unsigned long del_total;
	unsigned long find_success;
	unsigned long find_total;
} swap_cache_info;

/* Add @nr to the swap_cache_info counter named @x. */
#define ADD_CACHE_INFO(x, nr)	do { swap_cache_info.x += (nr); } while (0)
/* Increase the swap_cache_info counter named @x by one. */
#define INC_CACHE_INFO(x)	ADD_CACHE_INFO(x, 1)

74 75
unsigned long total_swapcache_pages(void)
{
76
	unsigned int i, j, nr;
77
	unsigned long ret = 0;
78
	struct address_space *spaces;
79

80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
	rcu_read_lock();
	for (i = 0; i < MAX_SWAPFILES; i++) {
		/*
		 * The corresponding entries in nr_swapper_spaces and
		 * swapper_spaces will be reused only after at least
		 * one grace period.  So it is impossible for them
		 * belongs to different usage.
		 */
		nr = nr_swapper_spaces[i];
		spaces = rcu_dereference(swapper_spaces[i]);
		if (!nr || !spaces)
			continue;
		for (j = 0; j < nr; j++)
			ret += spaces[j].nrpages;
	}
	rcu_read_unlock();
96 97 98
	return ret;
}

99 100
/*
 * Readahead hit counter for lookups without a vma: incremented by
 * lookup_swap_cache(), read and reset to 0 by swapin_nr_pages().
 */
static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);

L
Linus Torvalds 已提交
101 102
void show_swap_cache_info(void)
{
103
	printk("%lu pages in swap cache\n", total_swapcache_pages());
104
	printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
L
Linus Torvalds 已提交
105
		swap_cache_info.add_total, swap_cache_info.del_total,
106
		swap_cache_info.find_success, swap_cache_info.find_total);
107 108
	printk("Free swap  = %ldkB\n",
		get_nr_swap_pages() << (PAGE_SHIFT - 10));
L
Linus Torvalds 已提交
109 110 111 112
	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}

/*
113
 * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
L
Linus Torvalds 已提交
114 115
 * but sets SwapCache flag and private instead of mapping and index.
 */
116
int __add_to_swap_cache(struct page *page, swp_entry_t entry)
L
Linus Torvalds 已提交
117
{
118
	int error, i, nr = hpage_nr_pages(page);
119
	struct address_space *address_space;
120
	pgoff_t idx = swp_offset(entry);
L
Linus Torvalds 已提交
121

122 123 124
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(PageSwapCache(page), page);
	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
125

126
	page_ref_add(page, nr);
127 128
	SetPageSwapCache(page);

129 130
	address_space = swap_address_space(entry);
	spin_lock_irq(&address_space->tree_lock);
131 132 133 134 135 136
	for (i = 0; i < nr; i++) {
		set_page_private(page + i, entry.val + i);
		error = radix_tree_insert(&address_space->page_tree,
					  idx + i, page + i);
		if (unlikely(error))
			break;
137
	}
138 139 140 141 142
	if (likely(!error)) {
		address_space->nrpages += nr;
		__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
		ADD_CACHE_INFO(add_total, nr);
	} else {
143 144 145 146 147 148
		/*
		 * Only the context which have set SWAP_HAS_CACHE flag
		 * would call add_to_swap_cache().
		 * So add_to_swap_cache() doesn't returns -EEXIST.
		 */
		VM_BUG_ON(error == -EEXIST);
149 150 151 152 153
		set_page_private(page + i, 0UL);
		while (i--) {
			radix_tree_delete(&address_space->page_tree, idx + i);
			set_page_private(page + i, 0UL);
		}
154
		ClearPageSwapCache(page);
155
		page_ref_sub(page, nr);
156
	}
157
	spin_unlock_irq(&address_space->tree_lock);
158 159 160 161 162 163 164 165 166

	return error;
}


/*
 * Preload radix-tree nodes for the order of @page using @gfp_mask, then
 * insert @page at @entry with __add_to_swap_cache().  Returns 0 on
 * success, otherwise the error from the preload or from the insertion.
 */
int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
{
	int ret;

	ret = radix_tree_maybe_preload_order(gfp_mask, compound_order(page));
	if (ret)
		return ret;

	ret = __add_to_swap_cache(page, entry);
	radix_tree_preload_end();

	return ret;
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache.
 */
void __delete_from_swap_cache(struct page *page)
{
181
	struct address_space *address_space;
182 183 184
	int i, nr = hpage_nr_pages(page);
	swp_entry_t entry;
	pgoff_t idx;
185

186 187 188
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageSwapCache(page), page);
	VM_BUG_ON_PAGE(PageWriteback(page), page);
L
Linus Torvalds 已提交
189

190 191
	entry.val = page_private(page);
	address_space = swap_address_space(entry);
192 193 194 195 196
	idx = swp_offset(entry);
	for (i = 0; i < nr; i++) {
		radix_tree_delete(&address_space->page_tree, idx + i);
		set_page_private(page + i, 0);
	}
L
Linus Torvalds 已提交
197
	ClearPageSwapCache(page);
198 199 200
	address_space->nrpages -= nr;
	__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
	ADD_CACHE_INFO(del_total, nr);
L
Linus Torvalds 已提交
201 202 203 204 205 206 207 208 209
}

/**
 * add_to_swap - allocate swap space for a page
 * @page: page we want to move to swap
 *
 * Allocate swap space for the page and add the page to the
 * swap cache.  Caller needs to hold the page lock. 
 */
210
int add_to_swap(struct page *page)
L
Linus Torvalds 已提交
211 212 213 214
{
	swp_entry_t entry;
	int err;

215 216
	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(!PageUptodate(page), page);
L
Linus Torvalds 已提交
217

218
	entry = get_swap_page(page);
219
	if (!entry.val)
220 221
		return 0;

222
	if (mem_cgroup_try_charge_swap(page, entry))
223
		goto fail;
A
Andrea Arcangeli 已提交
224

225 226 227 228 229 230 231 232 233
	/*
	 * Radix-tree node allocations from PF_MEMALLOC contexts could
	 * completely exhaust the page allocator. __GFP_NOMEMALLOC
	 * stops emergency reserves from being allocated.
	 *
	 * TODO: this could cause a theoretical memory reclaim
	 * deadlock in the swap out path.
	 */
	/*
M
Minchan Kim 已提交
234
	 * Add it to the swap cache.
235 236 237
	 */
	err = add_to_swap_cache(page, entry,
			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
238 239
	/* -ENOMEM radix-tree allocation failure */
	if (err)
N
Nick Piggin 已提交
240
		/*
241 242
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
L
Linus Torvalds 已提交
243
		 */
244
		goto fail;
245 246 247 248 249 250 251 252 253 254 255
	/*
	 * Normally the page will be dirtied in unmap because its pte should be
	 * dirty. A special case is MADV_FREE page. The page'e pte could have
	 * dirty bit cleared but the page's SwapBacked bit is still set because
	 * clearing the dirty bit and SwapBacked bit has no lock protected. For
	 * such page, unmap will not set dirty bit for it, so page reclaim will
	 * not write the page out. This can cause data corruption when the page
	 * is swap in later. Always setting the dirty bit for the page solves
	 * the problem.
	 */
	set_page_dirty(page);
256 257 258 259

	return 1;

fail:
260
	put_swap_page(page, entry);
261
	return 0;
L
Linus Torvalds 已提交
262 263 264 265 266 267 268 269 270 271 272
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache and locked.
 * It will never put the page into the free list,
 * the caller has a reference on the page.
 */
void delete_from_swap_cache(struct page *page)
{
	swp_entry_t entry;
273
	struct address_space *address_space;
L
Linus Torvalds 已提交
274

H
Hugh Dickins 已提交
275
	entry.val = page_private(page);
L
Linus Torvalds 已提交
276

277 278
	address_space = swap_address_space(entry);
	spin_lock_irq(&address_space->tree_lock);
L
Linus Torvalds 已提交
279
	__delete_from_swap_cache(page);
280
	spin_unlock_irq(&address_space->tree_lock);
L
Linus Torvalds 已提交
281

282
	put_swap_page(page, entry);
283
	page_ref_sub(page, hpage_nr_pages(page));
L
Linus Torvalds 已提交
284 285 286 287 288 289
}

/*
 * If we are the only user, then try to free up the swap cache.
 *
 * Its ok to check for PageSwapCache without the page lock
 * here because we are going to recheck again inside
 * try_to_free_swap() _with_ the lock.
 *					- Marcelo
 */
static inline void free_swap_cache(struct page *page)
{
	if (!PageSwapCache(page) || page_mapped(page))
		return;
	/* Best effort only: give up if the page lock is contended. */
	if (!trylock_page(page))
		return;

	try_to_free_swap(page);
	unlock_page(page);
}

/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.  No reference is
 * dropped for the huge zero page.
 */
void free_page_and_swap_cache(struct page *page)
{
	free_swap_cache(page);
	if (is_huge_zero_page(page))
		return;
	put_page(page);
}

/*
 * Passed an array of @nr pages, drop them all from swapcache and then
 * release them.  They are removed from the LRU and freed if this is
 * their last use.
 */
void free_pages_and_swap_cache(struct page **pages, int nr)
{
	int idx = 0;

	lru_add_drain();
	while (idx < nr)
		free_swap_cache(pages[idx++]);
	release_pages(pages, nr, false);
}

/*
 * Lookup a swap entry in the swap cache. A found page will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock getting page table operations atomic even if we drop the page
 * lock before returning.
 */
H
Huang Ying 已提交
334 335
struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
			       unsigned long addr)
L
Linus Torvalds 已提交
336 337
{
	struct page *page;
H
Huang Ying 已提交
338 339
	unsigned long ra_info;
	int win, hits, readahead;
L
Linus Torvalds 已提交
340

341
	page = find_get_page(swap_address_space(entry), swp_offset(entry));
L
Linus Torvalds 已提交
342

H
Huang Ying 已提交
343 344
	INC_CACHE_INFO(find_total);
	if (page) {
L
Linus Torvalds 已提交
345
		INC_CACHE_INFO(find_success);
H
Huang Ying 已提交
346 347 348 349 350 351 352 353 354 355 356 357 358
		if (unlikely(PageTransCompound(page)))
			return page;
		readahead = TestClearPageReadahead(page);
		if (vma) {
			ra_info = GET_SWAP_RA_VAL(vma);
			win = SWAP_RA_WIN(ra_info);
			hits = SWAP_RA_HITS(ra_info);
			if (readahead)
				hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
			atomic_long_set(&vma->swap_readahead_info,
					SWAP_RA_VAL(addr, win, hits));
		}
		if (readahead) {
359
			count_vm_event(SWAP_RA_HIT);
H
Huang Ying 已提交
360 361
			if (!vma)
				atomic_inc(&swapin_readahead_hits);
362
		}
363
	}
L
Linus Torvalds 已提交
364 365 366
	return page;
}

367 368 369
/*
 * Return the swap cache page for @entry, allocating a new page and
 * inserting it into the swap cache when none is there yet.
 *
 * *@new_page_allocated is set to true only when a newly allocated page
 * was added to the swap cache; that page is returned with its Locked
 * flag still set and no read has been started on it.  In every other
 * case the return value is the page found in the cache, or NULL.
 */
struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr,
			bool *new_page_allocated)
{
	struct address_space *mapping = swap_address_space(entry);
	struct page *cached, *fresh = NULL;
	int err;

	*new_page_allocated = false;

	for (;;) {
		/*
		 * First check the swap cache.  Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics.
		 */
		cached = find_get_page(mapping, swp_offset(entry));
		if (cached)
			break;

		/*
		 * Just skip read ahead for unused swap slot.
		 * During swap_off when swap_slot_cache is disabled,
		 * we have to handle the race between putting
		 * swap entry in swap cache and marking swap slot
		 * as SWAP_HAS_CACHE.  That's done in later part of code or
		 * else swap_off will be aborted if we return NULL.
		 */
		if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
			break;

		/* Get a new page to read into from swap (kept across retries). */
		if (!fresh) {
			fresh = alloc_page_vma(gfp_mask, vma, addr);
			if (!fresh)
				break;		/* Out of memory */
		}

		/* call radix_tree_preload() while we can wait. */
		err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL);
		if (err)
			break;

		/* Swap entry may have been freed since our caller observed it. */
		err = swapcache_prepare(entry);
		if (err) {
			radix_tree_preload_end();
			if (err != -EEXIST)
				break;	/* swp entry is obsolete ? */
			/*
			 * We might race against get_swap_page() and stumble
			 * across a SWAP_HAS_CACHE swap_map entry whose page
			 * has not been brought into the swapcache yet.
			 */
			cond_resched();
			continue;
		}

		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
		__SetPageLocked(fresh);
		__SetPageSwapBacked(fresh);
		err = __add_to_swap_cache(fresh, entry);
		radix_tree_preload_end();
		if (likely(!err)) {
			/* Hand the locked page back; the caller initiates the read. */
			lru_cache_add_anon(fresh);
			*new_page_allocated = true;
			return fresh;
		}

		__ClearPageLocked(fresh);
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		put_swap_page(fresh, entry);
		if (err == -ENOMEM)
			break;
	}

	if (fresh)
		put_page(fresh);
	return cached;
}
458

459 460 461 462 463 464 465
/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 *
 * @do_poll is passed through to swap_readpage() when a read is started.
 */
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
		struct vm_area_struct *vma, unsigned long addr, bool do_poll)
{
	struct page *page;
	bool is_new;

	page = __read_swap_cache_async(entry, gfp_mask, vma, addr, &is_new);
	/* Only a page that was just added to the cache needs reading in. */
	if (is_new)
		swap_readpage(page, do_poll);

	return page;
}

H
Huang Ying 已提交
478 479 480 481 482
/*
 * Compute the readahead window size, in pages.
 *
 * @prev_offset: offset recorded from an earlier call
 * @offset:      offset being faulted now
 * @hits:        readahead hits seen since the last call
 * @max_pages:   upper bound for the window derived from @hits
 * @prev_win:    previous window size; the result is at least half of it
 */
static unsigned int __swapin_nr_pages(unsigned long prev_offset,
				      unsigned long offset,
				      int hits,
				      int max_pages,
				      int prev_win)
{
	/*
	 * This heuristic has been found to work well on both sequential and
	 * random loads, swapping to hard disk or to SSD: please don't ask
	 * what the "+ 2" means, it just happens to work well, that's all.
	 */
	unsigned int want = hits + 2;
	unsigned int min_win = prev_win / 2;

	if (want != 2) {
		/* Round up to a power of two, starting from 4. */
		unsigned int pow2;

		for (pow2 = 4; pow2 < want; pow2 <<= 1)
			;
		want = pow2;
	} else if (offset != prev_offset + 1 && offset != prev_offset - 1) {
		/*
		 * We can have no readahead hits to judge by: but must not get
		 * stuck here forever, so check for an adjacent offset instead
		 * (and don't even bother to check whether swap type is same).
		 */
		want = 1;
	}

	if (want > max_pages)
		want = max_pages;

	/* Don't shrink readahead too fast */
	return want < min_win ? min_win : want;
}

/*
 * Return the readahead window size (in pages) for a fault at swap
 * @offset, bounded by (1 << page_cluster).  The global hit counter is
 * consumed (reset to 0) and the resulting window is remembered for the
 * next call.
 */
static unsigned long swapin_nr_pages(unsigned long offset)
{
	static atomic_t last_readahead_pages;
	static unsigned long prev_offset;
	unsigned int limit, hit_count, window;

	limit = 1 << READ_ONCE(page_cluster);
	if (limit <= 1)
		return 1;

	hit_count = atomic_xchg(&swapin_readahead_hits, 0);
	window = __swapin_nr_pages(prev_offset, offset, hit_count, limit,
				   atomic_read(&last_readahead_pages));
	/* The reference offset only moves when there were no hits. */
	if (hit_count == 0)
		prev_offset = offset;
	atomic_set(&last_readahead_pages, window);

	return window;
}

538 539 540
/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
541
 * @gfp_mask: memory allocation flags
542 543 544 545 546 547 548 549 550 551 552 553 554 555 556
 * @vma: user vma this address belongs to
 * @addr: target address for mempolicy
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * This has been extended to use the NUMA policies from the mm triggering
 * the readahead.
 *
 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
 */
557
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
558 559 560
			struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
561 562
	unsigned long entry_offset = swp_offset(entry);
	unsigned long offset = entry_offset;
563
	unsigned long start_offset, end_offset;
564
	unsigned long mask;
565
	struct blk_plug plug;
566
	bool do_poll = true, page_allocated;
567

568 569 570 571
	mask = swapin_nr_pages(offset) - 1;
	if (!mask)
		goto skip;

572
	do_poll = false;
573 574 575 576 577 578
	/* Read a page_cluster sized and aligned cluster around offset. */
	start_offset = offset & ~mask;
	end_offset = offset | mask;
	if (!start_offset)	/* First page is swap header. */
		start_offset++;

579
	blk_start_plug(&plug);
580
	for (offset = start_offset; offset <= end_offset ; offset++) {
581
		/* Ok, do the async read-ahead now */
582 583 584
		page = __read_swap_cache_async(
			swp_entry(swp_type(entry), offset),
			gfp_mask, vma, addr, &page_allocated);
585
		if (!page)
586
			continue;
587 588 589 590 591 592 593
		if (page_allocated) {
			swap_readpage(page, false);
			if (offset != entry_offset &&
			    likely(!PageTransCompound(page))) {
				SetPageReadahead(page);
				count_vm_event(SWAP_RA);
			}
594
		}
595
		put_page(page);
596
	}
597 598
	blk_finish_plug(&plug);

599
	lru_add_drain();	/* Push any new pages onto the LRU now */
600
skip:
601
	return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
602
}
603 604 605 606 607 608 609

int init_swap_address_space(unsigned int type, unsigned long nr_pages)
{
	struct address_space *spaces, *space;
	unsigned int i, nr;

	nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
610
	spaces = kvzalloc(sizeof(struct address_space) * nr, GFP_KERNEL);
611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637
	if (!spaces)
		return -ENOMEM;
	for (i = 0; i < nr; i++) {
		space = spaces + i;
		INIT_RADIX_TREE(&space->page_tree, GFP_ATOMIC|__GFP_NOWARN);
		atomic_set(&space->i_mmap_writable, 0);
		space->a_ops = &swap_aops;
		/* swap cache doesn't use writeback related tags */
		mapping_set_no_writeback_tags(space);
		spin_lock_init(&space->tree_lock);
	}
	nr_swapper_spaces[type] = nr;
	rcu_assign_pointer(swapper_spaces[type], spaces);

	return 0;
}

/*
 * Tear down the address spaces of swap type @type: unpublish the array,
 * wait for an RCU grace period, then free it.
 */
void exit_swap_address_space(unsigned int type)
{
	struct address_space *old = swapper_spaces[type];

	nr_swapper_spaces[type] = 0;
	rcu_assign_pointer(swapper_spaces[type], NULL);
	/* RCU readers such as total_swapcache_pages() may still hold @old. */
	synchronize_rcu();
	kvfree(old);
}
H
Huang Ying 已提交
638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764

/*
 * Clamp the pfn range [@lpfn, @rpfn) so that it stays inside both @vma
 * and the PMD-sized region containing @faddr; the result is stored in
 * *@start and *@end.
 */
static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
				     unsigned long faddr,
				     unsigned long lpfn,
				     unsigned long rpfn,
				     unsigned long *start,
				     unsigned long *end)
{
	unsigned long pmd_base = faddr & PMD_MASK;

	*start = max3(lpfn,
		      PFN_DOWN(vma->vm_start),
		      PFN_DOWN(pmd_base));
	*end = min3(rpfn,
		    PFN_DOWN(vma->vm_end),
		    PFN_DOWN(pmd_base + PMD_SIZE));
}

/*
 * Look up the faulting swap entry and, on a miss, plan a VMA based
 * readahead window.
 *
 * Returns the page if the entry in vmf->orig_pte is already in the swap
 * cache.  Otherwise returns NULL after filling in @swap_ra: ->win always,
 * and ->nr_pte, ->offset and ->ptes when the window is larger than one
 * page.  NULL is also returned, with @swap_ra untouched, for a non-swap
 * entry.
 */
struct page *swap_readahead_detect(struct vm_fault *vmf,
				   struct vma_swap_readahead *swap_ra)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long swap_ra_info;
	struct page *page;
	swp_entry_t entry;
	unsigned long faddr, pfn, fpfn;
	unsigned long start, end;
	pte_t *pte;
	unsigned int max_win, hits, prev_win, win, left;
#ifndef CONFIG_64BIT
	pte_t *tpte;
#endif

	faddr = vmf->address;
	entry = pte_to_swp_entry(vmf->orig_pte);
	if ((unlikely(non_swap_entry(entry))))
		return NULL;
	page = lookup_swap_cache(entry, vma, faddr);
	if (page)
		return page;

	/* A maximum order of 0 means a one page window: no readahead. */
	max_win = 1 << READ_ONCE(swap_ra_max_order);
	if (max_win == 1) {
		swap_ra->win = 1;
		return NULL;
	}

	/*
	 * Decode the previous fault address, window and hit count of this
	 * vma, size the new window from them, and store the new state with
	 * the hit count reset to 0.
	 */
	fpfn = PFN_DOWN(faddr);
	swap_ra_info = GET_SWAP_RA_VAL(vma);
	pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info));
	prev_win = SWAP_RA_WIN(swap_ra_info);
	hits = SWAP_RA_HITS(swap_ra_info);
	swap_ra->win = win = __swapin_nr_pages(pfn, fpfn, hits,
					       max_win, prev_win);
	atomic_long_set(&vma->swap_readahead_info,
			SWAP_RA_VAL(faddr, win, 0));

	if (win == 1)
		return NULL;

	/*
	 * Place the window relative to the fault: starting at the faulting
	 * pfn when the previous fault was the page just before it, ending
	 * at it when the previous fault was the page just after it, and
	 * roughly centred on it otherwise.
	 */
	if (fpfn == pfn + 1)
		swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win, &start, &end);
	else if (pfn == fpfn + 1)
		swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
				  &start, &end);
	else {
		left = (win - 1) / 2;
		swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
				  &start, &end);
	}
	swap_ra->nr_pte = end - start;
	swap_ra->offset = fpfn - start;	/* index of the faulting pte */
	pte = vmf->pte - swap_ra->offset;
	/*
	 * With CONFIG_64BIT, ->ptes is pointed at the page table entries
	 * themselves.  Otherwise copy the PTEs into the buffer ->ptes
	 * already refers to, because the page table may be unmapped.
	 */
#ifdef CONFIG_64BIT
	swap_ra->ptes = pte;
#else
	tpte = swap_ra->ptes;
	for (pfn = start; pfn != end; pfn++)
		*tpte++ = *pte++;
#endif

	return NULL;
}

/*
 * Start readahead for the swap entries found in the swap_ra->nr_pte
 * PTEs at swap_ra->ptes, then return the page for @fentry via
 * read_swap_cache_async().  With a window of one page the readahead
 * loop is skipped and the final read is issued with polling enabled.
 */
struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask,
				    struct vm_fault *vmf,
				    struct vma_swap_readahead *swap_ra)
{
	struct vm_area_struct *vma = vmf->vma;
	struct blk_plug plug;
	unsigned int idx;
	pte_t *ptep;

	if (swap_ra->win != 1) {
		blk_start_plug(&plug);
		for (idx = 0, ptep = swap_ra->ptes; idx < swap_ra->nr_pte;
		     idx++, ptep++) {
			pte_t pteval = *ptep;
			swp_entry_t entry;
			struct page *page;
			bool is_new;

			/* Only non-present, non-empty PTEs can hold swap entries. */
			if (pte_none(pteval) || pte_present(pteval))
				continue;
			entry = pte_to_swp_entry(pteval);
			if (unlikely(non_swap_entry(entry)))
				continue;

			page = __read_swap_cache_async(entry, gfp_mask, vma,
						       vmf->address, &is_new);
			if (!page)
				continue;
			if (is_new) {
				swap_readpage(page, false);
				/* The faulting slot itself is not readahead. */
				if (idx != swap_ra->offset &&
				    likely(!PageTransCompound(page))) {
					SetPageReadahead(page);
					count_vm_event(SWAP_RA);
				}
			}
			put_page(page);
		}
		blk_finish_plug(&plug);
		lru_add_drain();
	}

	return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
				     swap_ra->win == 1);
}
765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844

#ifdef CONFIG_SYSFS
/* sysfs show handler: writes "true" or "false" for swap_vma_readahead. */
static ssize_t vma_ra_enabled_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	const char *state = swap_vma_readahead ? "true" : "false";

	return sprintf(buf, "%s\n", state);
}
/*
 * sysfs store handler for vma_ra_enabled.
 *
 * Accepts exactly "true" or "1" to enable and "false" or "0" to disable
 * VMA based swap readahead; sysfs_streq() tolerates one trailing
 * newline.  Whole-string matching is used so that inputs which merely
 * start with an accepted token (e.g. "10" or "0abc") are rejected.
 *
 * Returns @count on success or -EINVAL for any other input.
 */
static ssize_t vma_ra_enabled_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	if (sysfs_streq(buf, "true") || sysfs_streq(buf, "1"))
		swap_vma_readahead = true;
	else if (sysfs_streq(buf, "false") || sysfs_streq(buf, "0"))
		swap_vma_readahead = false;
	else
		return -EINVAL;

	return count;
}
/* "vma_ra_enabled" attribute, mode 0644, served by the handlers above. */
static struct kobj_attribute vma_ra_enabled_attr =
	__ATTR(vma_ra_enabled, 0644, vma_ra_enabled_show,
	       vma_ra_enabled_store);

/* sysfs show handler: writes the current swap_ra_max_order in decimal. */
static ssize_t vma_ra_max_order_show(struct kobject *kobj,
				     struct kobj_attribute *attr, char *buf)
{
	int order = swap_ra_max_order;

	return sprintf(buf, "%d\n", order);
}
/*
 * sysfs store handler for vma_ra_max_order.
 *
 * Parses @buf as a base-10 integer and accepts values in the range
 * 1..SWAP_RA_ORDER_CEILING.  Returns @count on success, or -EINVAL if
 * the input does not parse or is out of range.
 */
static ssize_t vma_ra_max_order_store(struct kobject *kobj,
				      struct kobj_attribute *attr,
				      const char *buf, size_t count)
{
	int v;

	if (kstrtoint(buf, 10, &v))
		return -EINVAL;
	if (v <= 0 || v > SWAP_RA_ORDER_CEILING)
		return -EINVAL;

	/* Pairs with the lockless READ_ONCE() in swap_readahead_detect(). */
	WRITE_ONCE(swap_ra_max_order, v);

	return count;
}
/* "vma_ra_max_order" attribute, mode 0644. */
static struct kobj_attribute vma_ra_max_order_attr =
	__ATTR(vma_ra_max_order, 0644, vma_ra_max_order_show,
	       vma_ra_max_order_store);

/* NULL-terminated list of the swap readahead attributes. */
static struct attribute *swap_attrs[] = {
	&vma_ra_enabled_attr.attr,
	&vma_ra_max_order_attr.attr,
	NULL,
};

/* Attribute group registered by swap_init_sysfs(). */
static struct attribute_group swap_attr_group = {
	.attrs = swap_attrs,
};

/*
 * Create the "swap" kobject under mm_kobj and register the swap
 * attribute group on it.  Returns 0 on success, -ENOMEM if the kobject
 * cannot be created, or the sysfs_create_group() error (after dropping
 * the kobject again).
 */
static int __init swap_init_sysfs(void)
{
	struct kobject *kobj = kobject_create_and_add("swap", mm_kobj);
	int ret;

	if (!kobj) {
		pr_err("failed to create swap kobject\n");
		return -ENOMEM;
	}

	ret = sysfs_create_group(kobj, &swap_attr_group);
	if (!ret)
		return 0;

	pr_err("failed to register swap group\n");
	kobject_put(kobj);
	return ret;
}
subsys_initcall(swap_init_sysfs);
#endif