/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>
#include <linux/page_cgroup.h>
#include <linux/sched.h>

#include <asm/pgtable.h>

/*
 * swapper_space is a fiction, retained to simplify the path through
J
Jens Axboe 已提交
26
 * vmscan's shrink_page_list.
L
Linus Torvalds 已提交
27
 */
28
static const struct address_space_operations swap_aops = {
L
Linus Torvalds 已提交
29
	.writepage	= swap_writepage,
30
	.set_page_dirty	= swap_set_page_dirty,
31
	.migratepage	= migrate_page,
L
Linus Torvalds 已提交
32 33 34
};

static struct backing_dev_info swap_backing_dev_info = {
35
	.name		= "swap",
36
	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
L
Linus Torvalds 已提交
37 38
};

39 40 41 42 43 44
struct address_space swapper_spaces[MAX_SWAPFILES] = {
	[0 ... MAX_SWAPFILES - 1] = {
		.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
		.a_ops		= &swap_aops,
		.backing_dev_info = &swap_backing_dev_info,
	}
L
Linus Torvalds 已提交
45 46 47 48 49 50 51 52 53 54 55
};

/*
 * Swap cache event counters, reported by show_swap_cache_info().
 * They are bumped with plain, non-atomic increments.
 */
static struct {
	unsigned long add_total;	/* pages inserted into the swap cache */
	unsigned long del_total;	/* pages removed from the swap cache */
	unsigned long find_success;	/* lookups that found a page */
	unsigned long find_total;	/* lookups attempted */
} swap_cache_info;

/* Bump the swap_cache_info counter named by @x by one. */
#define INC_CACHE_INFO(x)	do { ++swap_cache_info.x; } while (0)

56 57 58 59 60 61 62 63 64 65
unsigned long total_swapcache_pages(void)
{
	int i;
	unsigned long ret = 0;

	for (i = 0; i < MAX_SWAPFILES; i++)
		ret += swapper_spaces[i].nrpages;
	return ret;
}

L
Linus Torvalds 已提交
66 67
void show_swap_cache_info(void)
{
68
	printk("%lu pages in swap cache\n", total_swapcache_pages());
69
	printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
L
Linus Torvalds 已提交
70
		swap_cache_info.add_total, swap_cache_info.del_total,
71
		swap_cache_info.find_success, swap_cache_info.find_total);
72 73
	printk("Free swap  = %ldkB\n",
		get_nr_swap_pages() << (PAGE_SHIFT - 10));
L
Linus Torvalds 已提交
74 75 76 77
	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}

/*
78
 * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
L
Linus Torvalds 已提交
79 80
 * but sets SwapCache flag and private instead of mapping and index.
 */
81
int __add_to_swap_cache(struct page *page, swp_entry_t entry)
L
Linus Torvalds 已提交
82 83
{
	int error;
84
	struct address_space *address_space;
L
Linus Torvalds 已提交
85

86 87 88 89
	VM_BUG_ON(!PageLocked(page));
	VM_BUG_ON(PageSwapCache(page));
	VM_BUG_ON(!PageSwapBacked(page));

90 91 92 93
	page_cache_get(page);
	SetPageSwapCache(page);
	set_page_private(page, entry.val);

94 95 96 97
	address_space = swap_address_space(entry);
	spin_lock_irq(&address_space->tree_lock);
	error = radix_tree_insert(&address_space->page_tree,
					entry.val, page);
98
	if (likely(!error)) {
99
		address_space->nrpages++;
100 101 102
		__inc_zone_page_state(page, NR_FILE_PAGES);
		INC_CACHE_INFO(add_total);
	}
103
	spin_unlock_irq(&address_space->tree_lock);
104 105

	if (unlikely(error)) {
106 107 108 109 110 111
		/*
		 * Only the context which have set SWAP_HAS_CACHE flag
		 * would call add_to_swap_cache().
		 * So add_to_swap_cache() doesn't returns -EEXIST.
		 */
		VM_BUG_ON(error == -EEXIST);
112 113 114 115 116 117 118 119 120 121 122 123 124
		set_page_private(page, 0UL);
		ClearPageSwapCache(page);
		page_cache_release(page);
	}

	return error;
}


/*
 * Insert @page into the swap cache under @entry, preloading radix-tree
 * nodes with @gfp_mask first.
 *
 * Returns 0 on success, the radix_tree_preload() error if preloading
 * failed (in which case nothing else is attempted), or the error from
 * __add_to_swap_cache().
 */
int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
{
	int error;

	error = radix_tree_preload(gfp_mask);
	if (!error) {
		error = __add_to_swap_cache(page, entry);
		/* Pairs with the successful radix_tree_preload() above. */
		radix_tree_preload_end();
	}
	return error;
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache.
 */
void __delete_from_swap_cache(struct page *page)
{
139 140 141
	swp_entry_t entry;
	struct address_space *address_space;

142 143 144
	VM_BUG_ON(!PageLocked(page));
	VM_BUG_ON(!PageSwapCache(page));
	VM_BUG_ON(PageWriteback(page));
L
Linus Torvalds 已提交
145

146 147 148
	entry.val = page_private(page);
	address_space = swap_address_space(entry);
	radix_tree_delete(&address_space->page_tree, page_private(page));
H
Hugh Dickins 已提交
149
	set_page_private(page, 0);
L
Linus Torvalds 已提交
150
	ClearPageSwapCache(page);
151
	address_space->nrpages--;
152
	__dec_zone_page_state(page, NR_FILE_PAGES);
L
Linus Torvalds 已提交
153 154 155 156 157 158 159 160 161 162
	INC_CACHE_INFO(del_total);
}

/**
 * add_to_swap - allocate swap space for a page
 * @page: page we want to move to swap
 *
 * Allocate swap space for the page and add the page to the
 * swap cache.  Caller needs to hold the page lock. 
 */
163
int add_to_swap(struct page *page)
L
Linus Torvalds 已提交
164 165 166 167
{
	swp_entry_t entry;
	int err;

168 169
	VM_BUG_ON(!PageLocked(page));
	VM_BUG_ON(!PageUptodate(page));
L
Linus Torvalds 已提交
170

171 172 173 174
	entry = get_swap_page();
	if (!entry.val)
		return 0;

A
Andrea Arcangeli 已提交
175 176 177 178 179 180
	if (unlikely(PageTransHuge(page)))
		if (unlikely(split_huge_page(page))) {
			swapcache_free(entry, NULL);
			return 0;
		}

181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
	/*
	 * Radix-tree node allocations from PF_MEMALLOC contexts could
	 * completely exhaust the page allocator. __GFP_NOMEMALLOC
	 * stops emergency reserves from being allocated.
	 *
	 * TODO: this could cause a theoretical memory reclaim
	 * deadlock in the swap out path.
	 */
	/*
	 * Add it to the swap cache and mark it dirty
	 */
	err = add_to_swap_cache(page, entry,
			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);

	if (!err) {	/* Success */
		SetPageDirty(page);
		return 1;
	} else {	/* -ENOMEM radix-tree allocation failure */
N
Nick Piggin 已提交
199
		/*
200 201
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
L
Linus Torvalds 已提交
202
		 */
203 204
		swapcache_free(entry, NULL);
		return 0;
L
Linus Torvalds 已提交
205 206 207 208 209 210 211 212 213 214 215 216
	}
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache and locked.
 * It will never put the page into the free list,
 * the caller has a reference on the page.
 */
void delete_from_swap_cache(struct page *page)
{
	swp_entry_t entry;
217
	struct address_space *address_space;
L
Linus Torvalds 已提交
218

H
Hugh Dickins 已提交
219
	entry.val = page_private(page);
L
Linus Torvalds 已提交
220

221 222
	address_space = swap_address_space(entry);
	spin_lock_irq(&address_space->tree_lock);
L
Linus Torvalds 已提交
223
	__delete_from_swap_cache(page);
224
	spin_unlock_irq(&address_space->tree_lock);
L
Linus Torvalds 已提交
225

226
	swapcache_free(entry, page);
L
Linus Torvalds 已提交
227 228 229 230 231 232 233
	page_cache_release(page);
}

/*
 * If we are the only user, then try to free up the swap cache.
 *
 * It's ok to check for PageSwapCache without the page lock
 * here because we are going to recheck again inside
 * try_to_free_swap() _with_ the lock.
 *					- Marcelo
 *
 * The page lock is only tried, never waited for: if trylock_page()
 * fails, or the page is still mapped, nothing is done.
 */
static inline void free_swap_cache(struct page *page)
{
	if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
		try_to_free_swap(page);
		unlock_page(page);
	}
}

/*
 * Release one reference on @page, first giving free_swap_cache() the
 * chance to drop any swap cache association the page still has.
 */
void free_page_and_swap_cache(struct page *page)
{
	/* Best effort: does nothing if the page is mapped or lock is busy. */
	free_swap_cache(page);

	/* Drop the caller's reference. */
	page_cache_release(page);
}

/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them.  They are removed from the LRU and freed if this is their last use.
 */
void free_pages_and_swap_cache(struct page **pages, int nr)
{
	struct page **pagep = pages;

	lru_add_drain();
	while (nr) {
266
		int todo = min(nr, PAGEVEC_SIZE);
L
Linus Torvalds 已提交
267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286
		int i;

		for (i = 0; i < todo; i++)
			free_swap_cache(pagep[i]);
		release_pages(pagep, todo, 0);
		pagep += todo;
		nr -= todo;
	}
}

/*
 * Lookup a swap entry in the swap cache. A found page will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock getting page table operations atomic even if we drop the page
 * lock before returning.
 *
 * Returns the page found for @entry, or NULL.  Every call bumps the
 * find_total statistic; successful ones also bump find_success.
 */
struct page *lookup_swap_cache(swp_entry_t entry)
{
	struct page *page;

	page = find_get_page(swap_address_space(entry), entry.val);

	if (page)
		INC_CACHE_INFO(find_success);

	INC_CACHE_INFO(find_total);
	return page;
}

/* 
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 */
302
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
L
Linus Torvalds 已提交
303 304 305 306 307 308 309 310 311 312 313
			struct vm_area_struct *vma, unsigned long addr)
{
	struct page *found_page, *new_page = NULL;
	int err;

	do {
		/*
		 * First check the swap cache.  Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics.
		 */
314 315
		found_page = find_get_page(swap_address_space(entry),
					entry.val);
L
Linus Torvalds 已提交
316 317 318 319 320 321 322
		if (found_page)
			break;

		/*
		 * Get a new page to read into from swap.
		 */
		if (!new_page) {
323
			new_page = alloc_page_vma(gfp_mask, vma, addr);
L
Linus Torvalds 已提交
324 325 326 327
			if (!new_page)
				break;		/* Out of memory */
		}

328 329 330 331 332 333 334
		/*
		 * call radix_tree_preload() while we can wait.
		 */
		err = radix_tree_preload(gfp_mask & GFP_KERNEL);
		if (err)
			break;

H
Hugh Dickins 已提交
335 336 337
		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
338
		err = swapcache_prepare(entry);
339 340
		if (err == -EEXIST) {	/* seems racy */
			radix_tree_preload_end();
341
			continue;
342 343 344
		}
		if (err) {		/* swp entry is obsolete ? */
			radix_tree_preload_end();
H
Hugh Dickins 已提交
345
			break;
346
		}
H
Hugh Dickins 已提交
347

348
		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
349
		__set_page_locked(new_page);
R
Rik van Riel 已提交
350
		SetPageSwapBacked(new_page);
351
		err = __add_to_swap_cache(new_page, entry);
N
Nick Piggin 已提交
352
		if (likely(!err)) {
353
			radix_tree_preload_end();
L
Linus Torvalds 已提交
354 355 356
			/*
			 * Initiate read into locked page and return.
			 */
357
			lru_cache_add_anon(new_page);
358
			swap_readpage(new_page);
L
Linus Torvalds 已提交
359 360
			return new_page;
		}
361
		radix_tree_preload_end();
R
Rik van Riel 已提交
362
		ClearPageSwapBacked(new_page);
363
		__clear_page_locked(new_page);
364 365 366 367
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
368
		swapcache_free(entry, NULL);
H
Hugh Dickins 已提交
369
	} while (err != -ENOMEM);
L
Linus Torvalds 已提交
370 371 372 373 374

	if (new_page)
		page_cache_release(new_page);
	return found_page;
}
375 376 377 378

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
379
 * @gfp_mask: memory allocation flags
380 381 382 383 384 385 386 387 388 389 390 391 392 393 394
 * @vma: user vma this address belongs to
 * @addr: target address for mempolicy
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * This has been extended to use the NUMA policies from the mm triggering
 * the readahead.
 *
 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
 */
395
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
396 397 398
			struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;
399 400 401
	unsigned long offset = swp_offset(entry);
	unsigned long start_offset, end_offset;
	unsigned long mask = (1UL << page_cluster) - 1;
402
	struct blk_plug plug;
403

404 405 406 407 408 409
	/* Read a page_cluster sized and aligned cluster around offset. */
	start_offset = offset & ~mask;
	end_offset = offset | mask;
	if (!start_offset)	/* First page is swap header. */
		start_offset++;

410
	blk_start_plug(&plug);
411
	for (offset = start_offset; offset <= end_offset ; offset++) {
412 413
		/* Ok, do the async read-ahead now */
		page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
414
						gfp_mask, vma, addr);
415
		if (!page)
416
			continue;
417 418
		page_cache_release(page);
	}
419 420
	blk_finish_plug(&plug);

421
	lru_add_drain();	/* Push any new pages onto the LRU now */
422
	return read_swap_cache_async(entry, gfp_mask, vma, addr);
423
}