readahead.c 13.8 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5
/*
 * mm/readahead.c - address_space-level file readahead.
 *
 * Copyright (C) 2002, Linus Torvalds
 *
6
 * 09Apr2002	Andrew Morton
L
Linus Torvalds 已提交
7 8 9 10 11 12 13 14 15
 *		Initial version.
 */

#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
16
#include <linux/task_io_accounting_ops.h>
L
Linus Torvalds 已提交
17
#include <linux/pagevec.h>
J
Jens Axboe 已提交
18
#include <linux/pagemap.h>
L
Linus Torvalds 已提交
19 20 21 22 23 24 25 26 27

/*
 * Initialise a struct file's readahead state.  Assumes that the caller has
 * memset *ra to zero.
 */
void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
	ra->ra_pages = mapping->backing_dev_info->ra_pages;
28
	ra->prev_pos = -1;
L
Linus Torvalds 已提交
29
}
30
EXPORT_SYMBOL_GPL(file_ra_state_init);
L
Linus Torvalds 已提交
31 32 33

#define list_to_page(head) (list_entry((head)->prev, struct page, lru))

34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
/*
 * see if a page needs releasing upon read_cache_pages() failure
 * - the caller of read_cache_pages() may have set PG_private before calling,
 *   such as the NFS fs marking pages that are cached locally on disk, thus we
 *   need to give the fs a chance to clean up in the event of an error
 */
static void read_cache_pages_invalidate_page(struct address_space *mapping,
					     struct page *page)
{
	if (PagePrivate(page)) {
		if (!trylock_page(page))
			BUG();
		page->mapping = mapping;
		do_invalidatepage(page, 0);
		page->mapping = NULL;
		unlock_page(page);
	}
	page_cache_release(page);
}

/*
 * release a list of pages, invalidating them first if need be
 */
static void read_cache_pages_invalidate_pages(struct address_space *mapping,
					      struct list_head *pages)
{
	struct page *victim;

	while (!list_empty(pages)) {
		victim = list_to_page(pages);
		list_del(&victim->lru);
		read_cache_pages_invalidate_page(mapping, victim);
	}
}

L
Linus Torvalds 已提交
69
/**
70
 * read_cache_pages - populate an address space with some pages & start reads against them
L
Linus Torvalds 已提交
71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87
 * @mapping: the address_space
 * @pages: The address of a list_head which contains the target pages.  These
 *   pages have their ->index populated and are otherwise uninitialised.
 * @filler: callback routine for filling a single page.
 * @data: private data for the callback routine.
 *
 * Hides the details of the LRU cache etc from the filesystems.
 */
int read_cache_pages(struct address_space *mapping, struct list_head *pages,
			int (*filler)(void *, struct page *), void *data)
{
	struct page *page;
	int ret = 0;

	while (!list_empty(pages)) {
		page = list_to_page(pages);
		list_del(&page->lru);
N
Nick Piggin 已提交
88 89
		if (add_to_page_cache_lru(page, mapping,
					page->index, GFP_KERNEL)) {
90
			read_cache_pages_invalidate_page(mapping, page);
L
Linus Torvalds 已提交
91 92
			continue;
		}
N
Nick Piggin 已提交
93 94
		page_cache_release(page);

L
Linus Torvalds 已提交
95
		ret = filler(data, page);
N
Nick Piggin 已提交
96
		if (unlikely(ret)) {
97
			read_cache_pages_invalidate_pages(mapping, pages);
L
Linus Torvalds 已提交
98 99
			break;
		}
100
		task_io_account_read(PAGE_CACHE_SIZE);
L
Linus Torvalds 已提交
101 102 103 104 105 106 107 108 109 110
	}
	return ret;
}

EXPORT_SYMBOL(read_cache_pages);

static int read_pages(struct address_space *mapping, struct file *filp,
		struct list_head *pages, unsigned nr_pages)
{
	unsigned page_idx;
111
	int ret;
L
Linus Torvalds 已提交
112 113 114

	if (mapping->a_ops->readpages) {
		ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
O
OGAWA Hirofumi 已提交
115 116
		/* Clean up the remaining pages */
		put_pages_list(pages);
L
Linus Torvalds 已提交
117 118 119 120 121 122
		goto out;
	}

	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
		struct page *page = list_to_page(pages);
		list_del(&page->lru);
N
Nick Piggin 已提交
123
		if (!add_to_page_cache_lru(page, mapping,
L
Linus Torvalds 已提交
124
					page->index, GFP_KERNEL)) {
125
			mapping->a_ops->readpage(filp, page);
N
Nick Piggin 已提交
126 127
		}
		page_cache_release(page);
L
Linus Torvalds 已提交
128
	}
129
	ret = 0;
L
Linus Torvalds 已提交
130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
out:
	return ret;
}

/*
 * do_page_cache_readahead actually reads a chunk of disk.  It allocates all
 * the pages first, then submits them all for I/O. This avoids the very bad
 * behaviour which would occur if page allocations are causing VM writeback.
 * We really don't want to intermingle reads and writes like that.
 *
 * Returns the number of pages requested, or the maximum amount of I/O allowed.
 *
 * do_page_cache_readahead() returns -1 if it encountered request queue
 * congestion.
 */
static int
__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
147 148
			pgoff_t offset, unsigned long nr_to_read,
			unsigned long lookahead_size)
L
Linus Torvalds 已提交
149 150 151 152 153 154 155 156 157 158 159 160
{
	struct inode *inode = mapping->host;
	struct page *page;
	unsigned long end_index;	/* The last page we want to read */
	LIST_HEAD(page_pool);
	int page_idx;
	int ret = 0;
	loff_t isize = i_size_read(inode);

	if (isize == 0)
		goto out;

161
	end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
L
Linus Torvalds 已提交
162 163 164 165 166

	/*
	 * Preallocate as many pages as we will need.
	 */
	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
A
Andrew Morton 已提交
167
		pgoff_t page_offset = offset + page_idx;
168

L
Linus Torvalds 已提交
169 170 171
		if (page_offset > end_index)
			break;

N
Nick Piggin 已提交
172
		rcu_read_lock();
L
Linus Torvalds 已提交
173
		page = radix_tree_lookup(&mapping->page_tree, page_offset);
N
Nick Piggin 已提交
174
		rcu_read_unlock();
L
Linus Torvalds 已提交
175 176 177 178 179 180 181 182
		if (page)
			continue;

		page = page_cache_alloc_cold(mapping);
		if (!page)
			break;
		page->index = page_offset;
		list_add(&page->lru, &page_pool);
183 184
		if (page_idx == nr_to_read - lookahead_size)
			SetPageReadahead(page);
L
Linus Torvalds 已提交
185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204
		ret++;
	}

	/*
	 * Now start the IO.  We ignore I/O errors - if the page is not
	 * uptodate then the caller will launch readpage again, and
	 * will then handle the error.
	 */
	if (ret)
		read_pages(mapping, filp, &page_pool, ret);
	BUG_ON(!list_empty(&page_pool));
out:
	return ret;
}

/*
 * Chunk the readahead into 2 megabyte units, so that we don't pin too much
 * memory at once.
 */
int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
A
Andrew Morton 已提交
205
		pgoff_t offset, unsigned long nr_to_read)
L
Linus Torvalds 已提交
206 207 208 209 210 211 212 213 214 215 216 217 218 219
{
	int ret = 0;

	if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
		return -EINVAL;

	while (nr_to_read) {
		int err;

		unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_CACHE_SIZE;

		if (this_chunk > nr_to_read)
			this_chunk = nr_to_read;
		err = __do_page_cache_readahead(mapping, filp,
220
						offset, this_chunk, 0);
L
Linus Torvalds 已提交
221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239
		if (err < 0) {
			ret = err;
			break;
		}
		ret += err;
		offset += this_chunk;
		nr_to_read -= this_chunk;
	}
	return ret;
}

/*
 * This version skips the IO if the queue is read-congested, and will tell the
 * block layer to abandon the readahead if request allocation would block.
 *
 * force_page_cache_readahead() will ignore queue congestion and will block on
 * request queues.
 */
int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
A
Andrew Morton 已提交
240
			pgoff_t offset, unsigned long nr_to_read)
L
Linus Torvalds 已提交
241 242 243 244
{
	if (bdi_read_congested(mapping->backing_dev_info))
		return -1;

245
	return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0);
L
Linus Torvalds 已提交
246 247 248 249 250 251 252 253
}

/*
 * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a
 * sensible upper limit.
 */
unsigned long max_sane_readahead(unsigned long nr)
{
254
	return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE)
255
		+ node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
L
Linus Torvalds 已提交
256
}
257 258 259 260

/*
 * Submit IO for the read-ahead request in file_ra_state.
 */
261
static unsigned long ra_submit(struct file_ra_state *ra,
262 263 264 265 266
		       struct address_space *mapping, struct file *filp)
{
	int actual;

	actual = __do_page_cache_readahead(mapping, filp,
267
					ra->start, ra->size, ra->async_size);
268 269 270

	return actual;
}
271

272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291
/*
 * Set the initial window size, round to next power of 2 and square
 * for small size, x 4 for medium, and x 2 for large
 * for 128k (32 page) max ra
 * 1-8 page = 32k initial, > 8 page = 128k initial
 */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long newsize = roundup_pow_of_two(size);

	if (newsize <= max / 32)
		newsize = newsize * 4;
	else if (newsize <= max / 4)
		newsize = newsize * 2;
	else
		newsize = max;

	return newsize;
}

292 293 294 295
/*
 *  Get the previous window size, ramp it up, and
 *  return it as the new window size.
 */
296
static unsigned long get_next_ra_size(struct file_ra_state *ra,
297 298
						unsigned long max)
{
299
	unsigned long cur = ra->size;
300 301 302
	unsigned long newsize;

	if (cur < max / 16)
303
		newsize = 4 * cur;
304
	else
305
		newsize = 2 * cur;
306 307 308 309 310 311 312 313 314 315

	return min(newsize, max);
}

/*
 * On-demand readahead design.
 *
 * The fields in struct file_ra_state represent the most-recently-executed
 * readahead attempt:
 *
316 317 318 319
 *                        |<----- async_size ---------|
 *     |------------------- size -------------------->|
 *     |==================#===========================|
 *     ^start             ^page marked with PG_readahead
320 321 322 323
 *
 * To overlap application thinking time and disk I/O time, we do
 * `readahead pipelining': Do not wait until the application consumed all
 * readahead pages and stalled on the missing page at readahead_index;
324 325 326
 * Instead, submit an asynchronous readahead I/O as soon as there are
 * only async_size pages left in the readahead window. Normally async_size
 * will be equal to size, for maximum pipelining.
327 328 329
 *
 * In interleaved sequential reads, concurrent streams on the same fd can
 * be invalidating each other's readahead state. So we flag the new readahead
330
 * page at (start+size-async_size) with PG_readahead, and use it as readahead
331 332 333
 * indicator. The flag won't be set on already cached pages, to avoid the
 * readahead-for-nothing fuss, saving pointless page cache lookups.
 *
334
 * prev_pos tracks the last visited byte in the _previous_ read request.
335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354
 * It should be maintained by the caller, and will be used for detecting
 * small random reads. Note that the readahead algorithm checks loosely
 * for sequential patterns. Hence interleaved reads might be served as
 * sequential ones.
 *
 * There is a special-case: if the first page which the application tries to
 * read happens to be the first page of the file, it is assumed that a linear
 * read is about to happen and the window is immediately set to the initial size
 * based on I/O request size and the max_readahead.
 *
 * The code ramps up the readahead size aggressively at first, but slow down as
 * it approaches max_readhead.
 */

/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 */
static unsigned long
ondemand_readahead(struct address_space *mapping,
		   struct file_ra_state *ra, struct file *filp,
355
		   bool hit_readahead_marker, pgoff_t offset,
356 357
		   unsigned long req_size)
{
358 359 360
	int	max = ra->ra_pages;	/* max readahead pages */
	pgoff_t prev_offset;
	int	sequential;
361 362

	/*
363
	 * It's the expected callback offset, assume sequential access.
364 365
	 * Ramp up sizes, and push forward the readahead window.
	 */
366 367 368 369 370 371
	if (offset && (offset == (ra->start + ra->size - ra->async_size) ||
			offset == (ra->start + ra->size))) {
		ra->start += ra->size;
		ra->size = get_next_ra_size(ra, max);
		ra->async_size = ra->size;
		goto readit;
372 373
	}

374 375 376
	prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT;
	sequential = offset - prev_offset <= 1UL || req_size > max;

377 378 379 380
	/*
	 * Standalone, small read.
	 * Read as is, and do not pollute the readahead state.
	 */
381
	if (!hit_readahead_marker && !sequential) {
382 383 384 385
		return __do_page_cache_readahead(mapping, filp,
						offset, req_size, 0);
	}

386 387 388 389 390 391 392 393 394
	/*
	 * Hit a marked page without valid readahead state.
	 * E.g. interleaved reads.
	 * Query the pagecache for async_size, which normally equals to
	 * readahead size. Ramp it up and use it as the new readahead size.
	 */
	if (hit_readahead_marker) {
		pgoff_t start;

N
Nick Piggin 已提交
395 396 397
		rcu_read_lock();
		start = radix_tree_next_hole(&mapping->page_tree, offset,max+1);
		rcu_read_unlock();
398 399 400 401 402 403 404 405 406 407 408

		if (!start || start - offset > max)
			return 0;

		ra->start = start;
		ra->size = start - offset;	/* old async_size */
		ra->size = get_next_ra_size(ra, max);
		ra->async_size = ra->size;
		goto readit;
	}

409 410 411 412 413 414 415
	/*
	 * It may be one of
	 * 	- first read on start of file
	 * 	- sequential cache miss
	 * 	- oversize random read
	 * Start readahead for it.
	 */
416 417 418
	ra->start = offset;
	ra->size = get_init_ra_size(req_size, max);
	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
419

420
readit:
421 422 423 424
	return ra_submit(ra, mapping, filp);
}

/**
425
 * page_cache_sync_readahead - generic file readahead
426 427 428
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @filp: passed on to ->readpage() and ->readpages()
429
 * @offset: start offset into @mapping, in pagecache page-sized units
430
 * @req_size: hint: total size of the read which the caller is performing in
431
 *            pagecache pages
432
 *
433 434 435 436
 * page_cache_sync_readahead() should be called when a cache miss happened:
 * it will submit the read.  The readahead logic may decide to piggyback more
 * pages onto the read request if access patterns suggest it will improve
 * performance.
437
 */
438 439 440
void page_cache_sync_readahead(struct address_space *mapping,
			       struct file_ra_state *ra, struct file *filp,
			       pgoff_t offset, unsigned long req_size)
441 442 443
{
	/* no read-ahead */
	if (!ra->ra_pages)
444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461
		return;

	/* do read-ahead */
	ondemand_readahead(mapping, ra, filp, false, offset, req_size);
}
EXPORT_SYMBOL_GPL(page_cache_sync_readahead);

/**
 * page_cache_async_readahead - file readahead for marked pages
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @filp: passed on to ->readpage() and ->readpages()
 * @page: the page at @offset which has the PG_readahead flag set
 * @offset: start offset into @mapping, in pagecache page-sized units
 * @req_size: hint: total size of the read which the caller is performing in
 *            pagecache pages
 *
 * page_cache_async_ondemand() should be called when a page is used which
462
 * has the PG_readahead flag; this is a marker to suggest that the application
463
 * has used up enough of the readahead window that we should start pulling in
464 465
 * more pages.
 */
466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488
void
page_cache_async_readahead(struct address_space *mapping,
			   struct file_ra_state *ra, struct file *filp,
			   struct page *page, pgoff_t offset,
			   unsigned long req_size)
{
	/* no read-ahead */
	if (!ra->ra_pages)
		return;

	/*
	 * Same bit is used for PG_readahead and PG_reclaim.
	 */
	if (PageWriteback(page))
		return;

	ClearPageReadahead(page);

	/*
	 * Defer asynchronous read-ahead on IO congestion.
	 */
	if (bdi_read_congested(mapping->backing_dev_info))
		return;
489 490

	/* do read-ahead */
491
	ondemand_readahead(mapping, ra, filp, true, offset, req_size);
492
}
493
EXPORT_SYMBOL_GPL(page_cache_async_readahead);