swap_state.c 10.7 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9
/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/mm.h>
10
#include <linux/gfp.h>
L
Linus Torvalds 已提交
11 12
#include <linux/kernel_stat.h>
#include <linux/swap.h>
13
#include <linux/swapops.h>
L
Linus Torvalds 已提交
14 15 16
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
17
#include <linux/blkdev.h>
18
#include <linux/pagevec.h>
C
Christoph Lameter 已提交
19
#include <linux/migrate.h>
20
#include <linux/page_cgroup.h>
L
Linus Torvalds 已提交
21 22 23 24 25

#include <asm/pgtable.h>

/*
 * swapper_space is a fiction, retained to simplify the path through
J
Jens Axboe 已提交
26
 * vmscan's shrink_page_list.
L
Linus Torvalds 已提交
27
 */
28
static const struct address_space_operations swap_aops = {
L
Linus Torvalds 已提交
29
	.writepage	= swap_writepage,
30
	.set_page_dirty	= swap_set_page_dirty,
31
	.migratepage	= migrate_page,
L
Linus Torvalds 已提交
32 33 34
};

static struct backing_dev_info swap_backing_dev_info = {
35
	.name		= "swap",
36
	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
L
Linus Torvalds 已提交
37 38
};

39 40 41 42 43 44
/*
 * One address_space per possible swap file.  Every slot is initialised
 * identically: an empty radix tree plus the shared swap operations and
 * backing-device descriptor.
 */
struct address_space swapper_spaces[MAX_SWAPFILES] = {
	[0 ... MAX_SWAPFILES - 1] = {
		.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
		.a_ops		= &swap_aops,
		.backing_dev_info = &swap_backing_dev_info,
	}
};

/*
 * Bump one of the swap_cache_info counters by name.  This is a plain
 * (non-atomic) increment of the named field.
 */
#define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)

/* Swap cache statistics, reported by show_swap_cache_info(). */
static struct {
	unsigned long add_total;	/* successful insertions into the swap cache */
	unsigned long del_total;	/* removals via __delete_from_swap_cache() */
	unsigned long find_success;	/* lookup_swap_cache() calls that found a page */
	unsigned long find_total;	/* all lookup_swap_cache() calls */
} swap_cache_info;

56 57 58 59 60 61 62 63 64 65
unsigned long total_swapcache_pages(void)
{
	int i;
	unsigned long ret = 0;

	for (i = 0; i < MAX_SWAPFILES; i++)
		ret += swapper_spaces[i].nrpages;
	return ret;
}

L
Linus Torvalds 已提交
66 67
void show_swap_cache_info(void)
{
68
	printk("%lu pages in swap cache\n", total_swapcache_pages());
69
	printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
L
Linus Torvalds 已提交
70
		swap_cache_info.add_total, swap_cache_info.del_total,
71
		swap_cache_info.find_success, swap_cache_info.find_total);
H
Hugh Dickins 已提交
72
	printk("Free swap  = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10));
L
Linus Torvalds 已提交
73 74 75 76
	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}

/*
77
 * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
L
Linus Torvalds 已提交
78 79
 * but sets SwapCache flag and private instead of mapping and index.
 */
80
static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
{
	struct address_space *space;
	int ret;

	/* The page must be locked, swap-backed and not yet in the swap cache. */
	VM_BUG_ON(!PageLocked(page));
	VM_BUG_ON(PageSwapCache(page));
	VM_BUG_ON(!PageSwapBacked(page));

	/* Take the cache's reference and tag the page before publishing it. */
	page_cache_get(page);
	SetPageSwapCache(page);
	set_page_private(page, entry.val);

	space = swap_address_space(entry);
	spin_lock_irq(&space->tree_lock);
	ret = radix_tree_insert(&space->page_tree, entry.val, page);
	if (likely(!ret)) {
		space->nrpages++;
		__inc_zone_page_state(page, NR_FILE_PAGES);
		INC_CACHE_INFO(add_total);
	}
	spin_unlock_irq(&space->tree_lock);

	if (likely(!ret))
		return 0;

	/*
	 * Only the context that has set the SWAP_HAS_CACHE flag calls
	 * add_to_swap_cache(), so the insertion never fails with -EEXIST.
	 */
	VM_BUG_ON(ret == -EEXIST);

	/* Insertion failed: undo the tagging and drop the reference. */
	set_page_private(page, 0UL);
	ClearPageSwapCache(page);
	page_cache_release(page);

	return ret;
}


/*
 * Preload radix-tree nodes using @gfp_mask, then insert @page into the
 * swap cache at @entry.  Returns 0 on success, or the error from the
 * preload or from the insertion.
 */
int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
{
	int ret = radix_tree_preload(gfp_mask);

	if (ret)
		return ret;

	ret = __add_to_swap_cache(page, entry);
	radix_tree_preload_end();

	return ret;
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache.
 */
void __delete_from_swap_cache(struct page *page)
{
138 139 140
	swp_entry_t entry;
	struct address_space *address_space;

141 142 143
	VM_BUG_ON(!PageLocked(page));
	VM_BUG_ON(!PageSwapCache(page));
	VM_BUG_ON(PageWriteback(page));
L
Linus Torvalds 已提交
144

145 146 147
	entry.val = page_private(page);
	address_space = swap_address_space(entry);
	radix_tree_delete(&address_space->page_tree, page_private(page));
H
Hugh Dickins 已提交
148
	set_page_private(page, 0);
L
Linus Torvalds 已提交
149
	ClearPageSwapCache(page);
150
	address_space->nrpages--;
151
	__dec_zone_page_state(page, NR_FILE_PAGES);
L
Linus Torvalds 已提交
152 153 154 155 156 157 158 159 160 161
	INC_CACHE_INFO(del_total);
}

/**
 * add_to_swap - allocate swap space for a page
 * @page: page we want to move to swap
 *
 * Allocate swap space for the page and add the page to the
 * swap cache.  Caller needs to hold the page lock. 
 */
162
int add_to_swap(struct page *page)
L
Linus Torvalds 已提交
163 164 165 166
{
	swp_entry_t entry;
	int err;

167 168
	VM_BUG_ON(!PageLocked(page));
	VM_BUG_ON(!PageUptodate(page));
L
Linus Torvalds 已提交
169

170 171 172 173
	entry = get_swap_page();
	if (!entry.val)
		return 0;

A
Andrea Arcangeli 已提交
174 175 176 177 178 179
	if (unlikely(PageTransHuge(page)))
		if (unlikely(split_huge_page(page))) {
			swapcache_free(entry, NULL);
			return 0;
		}

180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
	/*
	 * Radix-tree node allocations from PF_MEMALLOC contexts could
	 * completely exhaust the page allocator. __GFP_NOMEMALLOC
	 * stops emergency reserves from being allocated.
	 *
	 * TODO: this could cause a theoretical memory reclaim
	 * deadlock in the swap out path.
	 */
	/*
	 * Add it to the swap cache and mark it dirty
	 */
	err = add_to_swap_cache(page, entry,
			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);

	if (!err) {	/* Success */
		SetPageDirty(page);
		return 1;
	} else {	/* -ENOMEM radix-tree allocation failure */
N
Nick Piggin 已提交
198
		/*
199 200
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
L
Linus Torvalds 已提交
201
		 */
202 203
		swapcache_free(entry, NULL);
		return 0;
L
Linus Torvalds 已提交
204 205 206 207 208 209 210 211 212 213 214 215
	}
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache and locked.
 * It will never put the page into the free list,
 * the caller has a reference on the page.
 */
void delete_from_swap_cache(struct page *page)
{
	swp_entry_t entry;
216
	struct address_space *address_space;
L
Linus Torvalds 已提交
217

H
Hugh Dickins 已提交
218
	entry.val = page_private(page);
L
Linus Torvalds 已提交
219

220 221
	address_space = swap_address_space(entry);
	spin_lock_irq(&address_space->tree_lock);
L
Linus Torvalds 已提交
222
	__delete_from_swap_cache(page);
223
	spin_unlock_irq(&address_space->tree_lock);
L
Linus Torvalds 已提交
224

225
	swapcache_free(entry, page);
L
Linus Torvalds 已提交
226 227 228 229 230 231 232
	page_cache_release(page);
}

/* 
 * If we are the only user, then try to free up the swap cache. 
 * 
 * It's OK to check for PageSwapCache without the page lock
233 234
 * here because we are going to recheck again inside
 * try_to_free_swap() _with_ the lock.
L
Linus Torvalds 已提交
235 236 237 238
 * 					- Marcelo
 */
static inline void free_swap_cache(struct page *page)
{
	/* Nothing to do for pages outside the swap cache or still mapped. */
	if (!PageSwapCache(page) || page_mapped(page))
		return;

	/* Best effort only: skip the page if its lock is contended. */
	if (!trylock_page(page))
		return;

	try_to_free_swap(page);
	unlock_page(page);
}

/* 
 * Perform a free_page(), also freeing any swap cache associated with
247
 * this page if it is the last user of the page.
L
Linus Torvalds 已提交
248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264
 */
void free_page_and_swap_cache(struct page *page)
{
	/* Opportunistically drop the page's swap cache entry first ... */
	free_swap_cache(page);
	/* ... then release the caller's reference on the page. */
	page_cache_release(page);
}

/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them.  They are removed from the LRU and freed if this is their last use.
 */
void free_pages_and_swap_cache(struct page **pages, int nr)
{
	struct page **pagep = pages;

	lru_add_drain();
	while (nr) {
265
		int todo = min(nr, PAGEVEC_SIZE);
L
Linus Torvalds 已提交
266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285
		int i;

		for (i = 0; i < todo; i++)
			free_swap_cache(pagep[i]);
		release_pages(pagep, todo, 0);
		pagep += todo;
		nr -= todo;
	}
}

/*
 * Lookup a swap entry in the swap cache. A found page will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock getting page table operations atomic even if we drop the page
 * lock before returning.
 */
struct page *lookup_swap_cache(swp_entry_t entry)
{
	struct page *found = find_get_page(swap_address_space(entry),
					   entry.val);

	/* Count hits first, then every lookup, for the find x/y statistic. */
	if (found)
		INC_CACHE_INFO(find_success);
	INC_CACHE_INFO(find_total);

	return found;
}

/* 
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 */
301
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr)
{
	struct page *found_page, *new_page = NULL;
	int err;

	/*
	 * Returns the page already cached for @entry, or a newly allocated,
	 * locked page on which the read has been started.  Returns NULL if
	 * page allocation or the radix-tree preload fails, if the swap entry
	 * is no longer valid, or if inserting the new page fails with
	 * -ENOMEM.
	 */
	do {
		/*
		 * First check the swap cache.  Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics.
		 */
		found_page = find_get_page(swap_address_space(entry),
					entry.val);
		if (found_page)
			break;

		/*
		 * Get a new page to read into from swap.  It is kept
		 * across retries so we allocate at most once.
		 */
		if (!new_page) {
			new_page = alloc_page_vma(gfp_mask, vma, addr);
			if (!new_page)
				break;		/* Out of memory */
		}

		/*
		 * call radix_tree_preload() while we can wait.
		 */
		err = radix_tree_preload(gfp_mask & GFP_KERNEL);
		if (err)
			break;

		/*
		 * Swap entry may have been freed since our caller observed it.
		 */
		err = swapcache_prepare(entry);
		if (err == -EEXIST) {
			radix_tree_preload_end();
			/*
			 * Another context has claimed the swap cache slot
			 * for this entry but its page is not in the swap
			 * cache yet (the lookup above missed).  If that
			 * context is waiting to run on this CPU and the
			 * kernel is not preemptible, spinning here would
			 * never let it finish.  Offer to reschedule before
			 * retrying; this must come after
			 * radix_tree_preload_end().
			 */
			cond_resched();
			continue;
		}
		if (err) {		/* swp entry is obsolete ? */
			radix_tree_preload_end();
			break;
		}

		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
		__set_page_locked(new_page);
		SetPageSwapBacked(new_page);
		err = __add_to_swap_cache(new_page, entry);
		if (likely(!err)) {
			radix_tree_preload_end();
			/*
			 * Initiate read into locked page and return.
			 */
			lru_cache_add_anon(new_page);
			swap_readpage(new_page);
			return new_page;
		}

		/* Insertion failed: undo the page state set up above. */
		radix_tree_preload_end();
		ClearPageSwapBacked(new_page);
		__clear_page_locked(new_page);
		/*
		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
		 * clear SWAP_HAS_CACHE flag.
		 */
		swapcache_free(entry, NULL);
	} while (err != -ENOMEM);

	/* Drop the page we allocated but did not end up using. */
	if (new_page)
		page_cache_release(new_page);
	return found_page;
}
374 375 376 377

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
378
 * @gfp_mask: memory allocation flags
379 380 381 382 383 384 385 386 387 388 389 390 391 392 393
 * @vma: user vma this address belongs to
 * @addr: target address for mempolicy
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * This has been extended to use the NUMA policies from the mm triggering
 * the readahead.
 *
 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
 */
394
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
			struct vm_area_struct *vma, unsigned long addr)
{
	const unsigned long target = swp_offset(entry);
	const unsigned long cluster_mask = (1UL << page_cluster) - 1;
	/* Aligned window of (1 << page_cluster) offsets containing target. */
	unsigned long first = target & ~cluster_mask;
	const unsigned long last = target | cluster_mask;
	unsigned long cur;
	struct blk_plug plug;

	/* Offset 0 holds the swap header, so never read it. */
	if (first == 0)
		first = 1;

	blk_start_plug(&plug);
	for (cur = first; cur <= last; cur++) {
		struct page *ra_page;

		/* Ok, do the async read-ahead now */
		ra_page = read_swap_cache_async(
				swp_entry(swp_type(entry), cur),
				gfp_mask, vma, addr);
		/* Only the queued read matters here; drop our reference. */
		if (ra_page)
			page_cache_release(ra_page);
	}
	blk_finish_plug(&plug);

	lru_add_drain();	/* Push any new pages onto the LRU now */

	/* Look the original entry up again to return its page. */
	return read_swap_cache_async(entry, gfp_mask, vma, addr);
}