/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include "xfs.h"
#include <linux/stddef.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/bio.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
#include <linux/percpu.h>
#include <linux/blkdev.h>
#include <linux/hash.h>
#include <linux/kthread.h>
#include <linux/migrate.h>
#include <linux/backing-dev.h>
#include <linux/freezer.h>

#include "xfs_sb.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_trace.h"

static kmem_zone_t *xfs_buf_zone;

static struct workqueue_struct *xfslogd_workqueue;

#ifdef XFS_BUF_LOCK_TRACKING
# define XB_SET_OWNER(bp)	((bp)->b_last_holder = current->pid)
# define XB_CLEAR_OWNER(bp)	((bp)->b_last_holder = -1)
# define XB_GET_OWNER(bp)	((bp)->b_last_holder)
#else
# define XB_SET_OWNER(bp)	do { } while (0)
# define XB_CLEAR_OWNER(bp)	do { } while (0)
# define XB_GET_OWNER(bp)	do { } while (0)
#endif

#define xb_to_gfp(flags) \
	((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)
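
/*
 * Note on the mapping above (descriptive comment, not from the original
 * file): read-ahead allocations use __GFP_NORETRY so they fail fast under
 * memory pressure instead of retrying, while all other buffer allocations
 * use GFP_NOFS so that reclaim cannot recurse back into the filesystem.
 */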
static inline int
xfs_buf_is_vmapped(
	struct xfs_buf	*bp)
{
	/*
	 * Return true if the buffer is vmapped.
	 *
	 * b_addr is null if the buffer is not mapped, but the code is clever
	 * enough to know it doesn't have to map a single page, so the check has
	 * to be both for b_addr and bp->b_page_count > 1.
	 */
	return bp->b_addr && bp->b_page_count > 1;
}

static inline int
xfs_buf_vmap_len(
	struct xfs_buf	*bp)
{
	return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
}

/*
 * xfs_buf_lru_add - add a buffer to the LRU.
 *
 * The LRU takes a new reference to the buffer so that it will only be freed
 * once the shrinker takes the buffer off the LRU.
 */
STATIC void
xfs_buf_lru_add(
	struct xfs_buf	*bp)
{
	struct xfs_buftarg *btp = bp->b_target;

	spin_lock(&btp->bt_lru_lock);
	if (list_empty(&bp->b_lru)) {
		atomic_inc(&bp->b_hold);
		list_add_tail(&bp->b_lru, &btp->bt_lru);
		btp->bt_lru_nr++;
	}
	spin_unlock(&btp->bt_lru_lock);
}

/*
 * xfs_buf_lru_del - remove a buffer from the LRU
 *
 * The unlocked check is safe here because it only occurs when there are no
 * b_lru_ref counts left on the buffer under the pag->pag_buf_lock. It is there
 * to optimise the shrinker removing the buffer from the LRU and calling
 * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
 * bt_lru_lock.
 */
STATIC void
xfs_buf_lru_del(
	struct xfs_buf	*bp)
{
	struct xfs_buftarg *btp = bp->b_target;

	if (list_empty(&bp->b_lru))
		return;

	spin_lock(&btp->bt_lru_lock);
	if (!list_empty(&bp->b_lru)) {
		list_del_init(&bp->b_lru);
		btp->bt_lru_nr--;
	}
	spin_unlock(&btp->bt_lru_lock);
}

/*
 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
 * b_lru_ref count so that the buffer is freed immediately when the buffer
 * reference count falls to zero. If the buffer is already on the LRU, we need
 * to remove the reference that LRU holds on the buffer.
 *
 * This prevents build-up of stale buffers on the LRU.
 */
void
xfs_buf_stale(
	struct xfs_buf	*bp)
{
	ASSERT(xfs_buf_islocked(bp));

	bp->b_flags |= XBF_STALE;

	/*
	 * Clear the delwri status so that a delwri queue walker will not
	 * flush this buffer to disk now that it is stale. The delwri queue has
	 * a reference to the buffer, so this is safe to do.
	 */
	bp->b_flags &= ~_XBF_DELWRI_Q;

	atomic_set(&(bp)->b_lru_ref, 0);
	if (!list_empty(&bp->b_lru)) {
		struct xfs_buftarg *btp = bp->b_target;

		spin_lock(&btp->bt_lru_lock);
		if (!list_empty(&bp->b_lru)) {
			list_del_init(&bp->b_lru);
			btp->bt_lru_nr--;
			atomic_dec(&bp->b_hold);
		}
		spin_unlock(&btp->bt_lru_lock);
	}
	ASSERT(atomic_read(&bp->b_hold) >= 1);
}

struct xfs_buf *
xfs_buf_alloc(
	struct xfs_buftarg	*target,
	xfs_daddr_t		blkno,
	size_t			numblks,
	xfs_buf_flags_t		flags)
{
	struct xfs_buf		*bp;

	bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
	if (unlikely(!bp))
		return NULL;

	/*
	 * We don't want certain flags to appear in b_flags unless they are
	 * specifically set by later operations on the buffer.
	 */
	flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);

	atomic_set(&bp->b_hold, 1);
	atomic_set(&bp->b_lru_ref, 1);
	init_completion(&bp->b_iowait);
	INIT_LIST_HEAD(&bp->b_lru);
	INIT_LIST_HEAD(&bp->b_list);
	RB_CLEAR_NODE(&bp->b_rbnode);
	sema_init(&bp->b_sema, 0); /* held, no waiters */
	XB_SET_OWNER(bp);
	bp->b_target = target;

	/*
	 * Set length and io_length to the same value initially.
	 * I/O routines should use io_length, which will be the same in
	 * most cases but may be reset (e.g. XFS recovery).
	 */
	bp->b_length = numblks;
	bp->b_io_length = numblks;
	bp->b_flags = flags;
	bp->b_bn = blkno;
	bp->b_map.bm_bn = blkno;
	bp->b_map.bm_len = numblks;
	atomic_set(&bp->b_pin_count, 0);
	init_waitqueue_head(&bp->b_waiters);

	XFS_STATS_INC(xb_create);
	trace_xfs_buf_init(bp, _RET_IP_);

	return bp;
}

/*
 *	Allocate a page array capable of holding a specified number
 *	of pages, and point the page buf at it.
 */
STATIC int
_xfs_buf_get_pages(
	xfs_buf_t		*bp,
	int			page_count,
	xfs_buf_flags_t		flags)
{
	/* Make sure that we have a page list */
	if (bp->b_pages == NULL) {
		bp->b_page_count = page_count;
		if (page_count <= XB_PAGES) {
			bp->b_pages = bp->b_page_array;
		} else {
			bp->b_pages = kmem_alloc(sizeof(struct page *) *
						 page_count, KM_NOFS);
			if (bp->b_pages == NULL)
				return -ENOMEM;
		}
		memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
	}
	return 0;
}

/*
 *	Frees b_pages if it was allocated.
 */
STATIC void
_xfs_buf_free_pages(
	xfs_buf_t	*bp)
{
	if (bp->b_pages != bp->b_page_array) {
		kmem_free(bp->b_pages);
		bp->b_pages = NULL;
	}
}

/*
 *	Releases the specified buffer.
 *
 * 	The modification state of any associated pages is left unchanged.
 * 	The buffer must not be on any hash - use xfs_buf_rele instead for
 * 	hashed and refcounted buffers.
 */
void
xfs_buf_free(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_free(bp, _RET_IP_);

	ASSERT(list_empty(&bp->b_lru));

	if (bp->b_flags & _XBF_PAGES) {
		uint		i;

		if (xfs_buf_is_vmapped(bp))
			vm_unmap_ram(bp->b_addr - bp->b_offset,
					bp->b_page_count);

		for (i = 0; i < bp->b_page_count; i++) {
			struct page	*page = bp->b_pages[i];

			__free_page(page);
		}
	} else if (bp->b_flags & _XBF_KMEM)
		kmem_free(bp->b_addr);
	_xfs_buf_free_pages(bp);
	kmem_zone_free(xfs_buf_zone, bp);
}

/*
 * Allocates all the pages for the buffer in question and builds its page list.
 */
STATIC int
xfs_buf_allocate_memory(
	xfs_buf_t		*bp,
	uint			flags)
{
	size_t			size;
	size_t			nbytes, offset;
	gfp_t			gfp_mask = xb_to_gfp(flags);
	unsigned short		page_count, i;
	xfs_off_t		start, end;
	int			error;

	/*
	 * for buffers that are contained within a single page, just allocate
	 * the memory from the heap - there's no need for the complexity of
	 * page arrays to keep allocation down to order 0.
	 */
	size = BBTOB(bp->b_length);
	if (size < PAGE_SIZE) {
		bp->b_addr = kmem_alloc(size, KM_NOFS);
		if (!bp->b_addr) {
			/* low memory - use alloc_page loop instead */
			goto use_alloc_page;
		}

		if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
		    ((unsigned long)bp->b_addr & PAGE_MASK)) {
			/* b_addr spans two pages - use alloc_page instead */
			kmem_free(bp->b_addr);
			bp->b_addr = NULL;
			goto use_alloc_page;
		}
		bp->b_offset = offset_in_page(bp->b_addr);
		bp->b_pages = bp->b_page_array;
		bp->b_pages[0] = virt_to_page(bp->b_addr);
		bp->b_page_count = 1;
		bp->b_flags |= _XBF_KMEM;
		return 0;
	}

use_alloc_page:
	start = BBTOB(bp->b_map.bm_bn) >> PAGE_SHIFT;
	end = (BBTOB(bp->b_map.bm_bn + bp->b_length) + PAGE_SIZE - 1)
								>> PAGE_SHIFT;
	page_count = end - start;
	error = _xfs_buf_get_pages(bp, page_count, flags);
	if (unlikely(error))
		return error;

	offset = bp->b_offset;
	bp->b_flags |= _XBF_PAGES;

	for (i = 0; i < bp->b_page_count; i++) {
		struct page	*page;
		uint		retries = 0;
retry:
		page = alloc_page(gfp_mask);
		if (unlikely(page == NULL)) {
			if (flags & XBF_READ_AHEAD) {
				bp->b_page_count = i;
				error = ENOMEM;
				goto out_free_pages;
			}

			/*
			 * This could deadlock.
			 *
			 * But until all the XFS lowlevel code is revamped to
			 * handle buffer allocation failures we can't do much.
			 */
			if (!(++retries % 100))
				xfs_err(NULL,
		"possible memory allocation deadlock in %s (mode:0x%x)",
					__func__, gfp_mask);

			XFS_STATS_INC(xb_page_retries);
			congestion_wait(BLK_RW_ASYNC, HZ/50);
			goto retry;
		}

		XFS_STATS_INC(xb_page_found);

		nbytes = min_t(size_t, size, PAGE_SIZE - offset);
		size -= nbytes;
		bp->b_pages[i] = page;
		offset = 0;
	}
	return 0;

out_free_pages:
	for (i = 0; i < bp->b_page_count; i++)
		__free_page(bp->b_pages[i]);
	return error;
}

/*
 *	Map buffer into kernel address-space if necessary.
 */
STATIC int
_xfs_buf_map_pages(
	xfs_buf_t		*bp,
	uint			flags)
{
	ASSERT(bp->b_flags & _XBF_PAGES);
	if (bp->b_page_count == 1) {
		/* A single page buffer is always mappable */
		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
	} else if (flags & XBF_UNMAPPED) {
		bp->b_addr = NULL;
	} else {
		int retried = 0;

		do {
			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
						-1, PAGE_KERNEL);
			if (bp->b_addr)
				break;
			vm_unmap_aliases();
		} while (retried++ <= 1);

		if (!bp->b_addr)
			return -ENOMEM;
		bp->b_addr += bp->b_offset;
	}

	return 0;
}

/*
 *	Finding and Reading Buffers
 */

/*
 *	Looks up, and creates if absent, a lockable buffer for
 *	a given range of an inode.  The buffer is returned
 *	locked.	No I/O is implied by this call.
 */
xfs_buf_t *
_xfs_buf_find(
	struct xfs_buftarg	*btp,
	xfs_daddr_t		blkno,
	size_t			numblks,
	xfs_buf_flags_t		flags,
	xfs_buf_t		*new_bp)
{
	size_t			numbytes;
	struct xfs_perag	*pag;
	struct rb_node		**rbp;
	struct rb_node		*parent;
	xfs_buf_t		*bp;

	numbytes = BBTOB(numblks);

	/* Check for IOs smaller than the sector size / not sector aligned */
	ASSERT(!(numbytes < (1 << btp->bt_sshift)));
	ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask));

	/* get tree root */
	pag = xfs_perag_get(btp->bt_mount,
				xfs_daddr_to_agno(btp->bt_mount, blkno));

	/* walk tree */
	spin_lock(&pag->pag_buf_lock);
	rbp = &pag->pag_buf_tree.rb_node;
	parent = NULL;
	bp = NULL;
	while (*rbp) {
		parent = *rbp;
		bp = rb_entry(parent, struct xfs_buf, b_rbnode);

		if (blkno < bp->b_bn)
			rbp = &(*rbp)->rb_left;
		else if (blkno > bp->b_bn)
			rbp = &(*rbp)->rb_right;
		else {
			/*
			 * found a block number match. If the range doesn't
			 * match, the only way this is allowed is if the buffer
			 * in the cache is stale and the transaction that made
			 * it stale has not yet committed. i.e. we are
			 * reallocating a busy extent. Skip this buffer and
			 * continue searching to the right for an exact match.
			 */
			if (bp->b_length != numblks) {
				ASSERT(bp->b_flags & XBF_STALE);
				rbp = &(*rbp)->rb_right;
				continue;
			}
			atomic_inc(&bp->b_hold);
			goto found;
		}
	}

	/* No match found */
	if (new_bp) {
		rb_link_node(&new_bp->b_rbnode, parent, rbp);
		rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
		/* the buffer keeps the perag reference until it is freed */
		new_bp->b_pag = pag;
		spin_unlock(&pag->pag_buf_lock);
	} else {
		XFS_STATS_INC(xb_miss_locked);
		spin_unlock(&pag->pag_buf_lock);
		xfs_perag_put(pag);
	}
	return new_bp;

found:
	spin_unlock(&pag->pag_buf_lock);
	xfs_perag_put(pag);

	if (!xfs_buf_trylock(bp)) {
		if (flags & XBF_TRYLOCK) {
			xfs_buf_rele(bp);
			XFS_STATS_INC(xb_busy_locked);
			return NULL;
		}
		xfs_buf_lock(bp);
		XFS_STATS_INC(xb_get_locked_waited);
	}

	/*
	 * if the buffer is stale, clear all the external state associated with
	 * it. We need to keep flags such as how we allocated the buffer memory
	 * intact here.
	 */
	if (bp->b_flags & XBF_STALE) {
		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
		bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
	}

	trace_xfs_buf_find(bp, flags, _RET_IP_);
	XFS_STATS_INC(xb_get_locked);
	return bp;
}

/*
 * Assembles a buffer covering the specified range. The code is optimised for
 * cache hits, as metadata intensive workloads will see 3 orders of magnitude
 * more hits than misses.
 */
struct xfs_buf *
xfs_buf_get(
	xfs_buftarg_t		*target,
	xfs_daddr_t		blkno,
	size_t			numblks,
	xfs_buf_flags_t		flags)
{
	struct xfs_buf		*bp;
	struct xfs_buf		*new_bp;
	int			error = 0;

	bp = _xfs_buf_find(target, blkno, numblks, flags, NULL);
	if (likely(bp))
		goto found;

	new_bp = xfs_buf_alloc(target, blkno, numblks, flags);
	if (unlikely(!new_bp))
		return NULL;

	error = xfs_buf_allocate_memory(new_bp, flags);
	if (error) {
		kmem_zone_free(xfs_buf_zone, new_bp);
		return NULL;
	}

	bp = _xfs_buf_find(target, blkno, numblks, flags, new_bp);
	if (!bp) {
		xfs_buf_free(new_bp);
		return NULL;
	}

	if (bp != new_bp)
		xfs_buf_free(new_bp);

found:
	if (!bp->b_addr) {
		error = _xfs_buf_map_pages(bp, flags);
		if (unlikely(error)) {
			xfs_warn(target->bt_mount,
				"%s: failed to map pages\n", __func__);
			xfs_buf_relse(bp);
			return NULL;
		}
	}

	XFS_STATS_INC(xb_get);
	trace_xfs_buf_get(bp, flags, _RET_IP_);
	return bp;
}
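
/*
 * Example (illustrative sketch, not a caller in this file): a typical user
 * gets a locked, mapped buffer for a block range, modifies it and then
 * releases it:
 *
 *	bp = xfs_buf_get(target, blkno, numblks, 0);
 *	if (bp) {
 *		memset(bp->b_addr, 0, BBTOB(bp->b_length));
 *		xfs_buf_relse(bp);
 *	}
 */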

STATIC int
_xfs_buf_read(
	xfs_buf_t		*bp,
	xfs_buf_flags_t		flags)
{
	ASSERT(!(flags & XBF_WRITE));
	ASSERT(bp->b_map.bm_bn != XFS_BUF_DADDR_NULL);

	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);

	xfs_buf_iorequest(bp);
	if (flags & XBF_ASYNC)
		return 0;
	return xfs_buf_iowait(bp);
}

xfs_buf_t *
xfs_buf_read(
	xfs_buftarg_t		*target,
	xfs_daddr_t		blkno,
	size_t			numblks,
	xfs_buf_flags_t		flags)
{
	xfs_buf_t		*bp;

	flags |= XBF_READ;

	bp = xfs_buf_get(target, blkno, numblks, flags);
	if (bp) {
		trace_xfs_buf_read(bp, flags, _RET_IP_);

		if (!XFS_BUF_ISDONE(bp)) {
			XFS_STATS_INC(xb_get_read);
			_xfs_buf_read(bp, flags);
		} else if (flags & XBF_ASYNC) {
			/*
			 * Read ahead call which is already satisfied,
			 * drop the buffer
			 */
			xfs_buf_relse(bp);
			return NULL;
		} else {
			/* We do not want read in the flags */
			bp->b_flags &= ~XBF_READ;
		}
	}

	return bp;
}

/*
 *	If we are not low on memory then do the readahead in a deadlock
 *	safe manner.
 */
void
xfs_buf_readahead(
	xfs_buftarg_t		*target,
	xfs_daddr_t		blkno,
	size_t			numblks)
{
	if (bdi_read_congested(target->bt_bdi))
		return;

	xfs_buf_read(target, blkno, numblks,
		     XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
}
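
/*
 * Example (sketch): callers issue readahead for blocks they expect to need
 * shortly, e.g. while walking directory blocks; the call never blocks on
 * I/O and quietly does nothing if memory or the device is congested:
 *
 *	xfs_buf_readahead(target, next_blkno, numblks);
 */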

/*
 * Read an uncached buffer from disk. Allocates and returns a locked
 * buffer containing the disk contents or nothing.
 */
struct xfs_buf *
xfs_buf_read_uncached(
	struct xfs_buftarg	*target,
	xfs_daddr_t		daddr,
	size_t			numblks,
	int			flags)
{
	xfs_buf_t		*bp;
	int			error;

	bp = xfs_buf_get_uncached(target, numblks, flags);
	if (!bp)
		return NULL;

	/* set up the buffer for a read IO */
	bp->b_map.bm_bn = daddr;
	bp->b_flags |= XBF_READ;

	xfsbdstrat(target->bt_mount, bp);
	error = xfs_buf_iowait(bp);
	if (error) {
		xfs_buf_relse(bp);
		return NULL;
	}
	return bp;
}

/*
 * Return a buffer allocated as an empty buffer and associated to external
 * memory via xfs_buf_associate_memory() back to its empty state.
 */
void
xfs_buf_set_empty(
	struct xfs_buf		*bp,
	size_t			numblks)
{
	if (bp->b_pages)
		_xfs_buf_free_pages(bp);

	bp->b_pages = NULL;
	bp->b_page_count = 0;
	bp->b_addr = NULL;
	bp->b_length = numblks;
	bp->b_io_length = numblks;
	bp->b_bn = XFS_BUF_DADDR_NULL;
	bp->b_map.bm_bn = XFS_BUF_DADDR_NULL;
	bp->b_map.bm_len = bp->b_length;
}

static inline struct page *
mem_to_page(
	void			*addr)
{
	if ((!is_vmalloc_addr(addr))) {
		return virt_to_page(addr);
	} else {
		return vmalloc_to_page(addr);
	}
}

int
xfs_buf_associate_memory(
	xfs_buf_t		*bp,
	void			*mem,
	size_t			len)
{
	int			rval;
	int			i = 0;
	unsigned long		pageaddr;
	unsigned long		offset;
	size_t			buflen;
	int			page_count;

	pageaddr = (unsigned long)mem & PAGE_MASK;
	offset = (unsigned long)mem - pageaddr;
	buflen = PAGE_ALIGN(len + offset);
	page_count = buflen >> PAGE_SHIFT;

	/* Free any previous set of page pointers */
	if (bp->b_pages)
		_xfs_buf_free_pages(bp);

	bp->b_pages = NULL;
	bp->b_addr = mem;

	rval = _xfs_buf_get_pages(bp, page_count, 0);
	if (rval)
		return rval;

	bp->b_offset = offset;

	for (i = 0; i < bp->b_page_count; i++) {
		bp->b_pages[i] = mem_to_page((void *)pageaddr);
		pageaddr += PAGE_SIZE;
	}

	bp->b_io_length = BTOBB(len);
	bp->b_length = BTOBB(buflen);

	return 0;
}
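
/*
 * Example (sketch, under the assumption log recovery reuses buffers this
 * way; "mem" and "len" are illustrative names): resetting a buffer and
 * attaching caller-owned memory to it:
 *
 *	xfs_buf_set_empty(bp, numblks);
 *	error = xfs_buf_associate_memory(bp, mem, len);
 */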

xfs_buf_t *
xfs_buf_get_uncached(
	struct xfs_buftarg	*target,
	size_t			numblks,
	int			flags)
{
	unsigned long		page_count;
	int			error, i;
	xfs_buf_t		*bp;

	bp = xfs_buf_alloc(target, XFS_BUF_DADDR_NULL, numblks, 0);
	if (unlikely(bp == NULL))
		goto fail;

	page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
	error = _xfs_buf_get_pages(bp, page_count, 0);
	if (error)
		goto fail_free_buf;

	for (i = 0; i < page_count; i++) {
		bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
		if (!bp->b_pages[i])
			goto fail_free_mem;
	}
	bp->b_flags |= _XBF_PAGES;

	error = _xfs_buf_map_pages(bp, 0);
	if (unlikely(error)) {
		xfs_warn(target->bt_mount,
			"%s: failed to map pages\n", __func__);
		goto fail_free_mem;
	}

	trace_xfs_buf_get_uncached(bp, _RET_IP_);
	return bp;

 fail_free_mem:
	while (--i >= 0)
		__free_page(bp->b_pages[i]);
	_xfs_buf_free_pages(bp);
 fail_free_buf:
	kmem_zone_free(xfs_buf_zone, bp);
 fail:
	return NULL;
}

/*
 *	Increment reference count on buffer, to hold the buffer concurrently
 *	with another thread which may release (free) the buffer asynchronously.
 *	Must hold the buffer already to call this function.
 */
void
xfs_buf_hold(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_hold(bp, _RET_IP_);
	atomic_inc(&bp->b_hold);
}

/*
 *	Releases a hold on the specified buffer.  If the
 *	hold count is 1, calls xfs_buf_free.
 */
void
xfs_buf_rele(
	xfs_buf_t		*bp)
{
	struct xfs_perag	*pag = bp->b_pag;

	trace_xfs_buf_rele(bp, _RET_IP_);

	if (!pag) {
		ASSERT(list_empty(&bp->b_lru));
		ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
		if (atomic_dec_and_test(&bp->b_hold))
			xfs_buf_free(bp);
		return;
	}

	ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));

	ASSERT(atomic_read(&bp->b_hold) > 0);
	if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
		if (!(bp->b_flags & XBF_STALE) &&
			   atomic_read(&bp->b_lru_ref)) {
			xfs_buf_lru_add(bp);
			spin_unlock(&pag->pag_buf_lock);
		} else {
			xfs_buf_lru_del(bp);
			ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
			rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
			spin_unlock(&pag->pag_buf_lock);
			xfs_perag_put(pag);
			xfs_buf_free(bp);
		}
	}
}


/*
 *	Lock a buffer object, if it is not already locked.
 *
 *	If we come across a stale, pinned, locked buffer, we know that we are
 *	being asked to lock a buffer that has been reallocated. Because it is
 *	pinned, we know that the log has not been pushed to disk and hence it
 *	will still be locked.  Rather than continuing to have trylock attempts
 *	fail until someone else pushes the log, push it ourselves before
 *	returning.  This means that the xfsaild will not get stuck trying
 *	to push on stale inode buffers.
 */
int
xfs_buf_trylock(
	struct xfs_buf		*bp)
{
	int			locked;

	locked = down_trylock(&bp->b_sema) == 0;
	if (locked)
		XB_SET_OWNER(bp);
	else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
		xfs_log_force(bp->b_target->bt_mount, 0);

	trace_xfs_buf_trylock(bp, _RET_IP_);
	return locked;
}

/*
 *	Lock a buffer object.
 *
 *	If we come across a stale, pinned, locked buffer, we know that we
 *	are being asked to lock a buffer that has been reallocated. Because
 *	it is pinned, we know that the log has not been pushed to disk and
 *	hence it will still be locked. Rather than sleeping until someone
 *	else pushes the log, push it ourselves before trying to get the lock.
 */
void
xfs_buf_lock(
	struct xfs_buf		*bp)
{
	trace_xfs_buf_lock(bp, _RET_IP_);

	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
		xfs_log_force(bp->b_target->bt_mount, 0);
	down(&bp->b_sema);
	XB_SET_OWNER(bp);

	trace_xfs_buf_lock_done(bp, _RET_IP_);
}

void
xfs_buf_unlock(
	struct xfs_buf		*bp)
{
	XB_CLEAR_OWNER(bp);
	up(&bp->b_sema);

	trace_xfs_buf_unlock(bp, _RET_IP_);
}
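
/*
 * Example (sketch): a caller that must not sleep uses the trylock variant
 * and backs off if the buffer is contended:
 *
 *	if (xfs_buf_trylock(bp)) {
 *		... modify the buffer ...
 *		xfs_buf_unlock(bp);
 *	}
 */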

STATIC void
xfs_buf_wait_unpin(
	xfs_buf_t		*bp)
{
	DECLARE_WAITQUEUE	(wait, current);

	if (atomic_read(&bp->b_pin_count) == 0)
		return;

	add_wait_queue(&bp->b_waiters, &wait);
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (atomic_read(&bp->b_pin_count) == 0)
			break;
		io_schedule();
	}
	remove_wait_queue(&bp->b_waiters, &wait);
	set_current_state(TASK_RUNNING);
}

/*
 *	Buffer Utility Routines
 */

STATIC void
xfs_buf_iodone_work(
	struct work_struct	*work)
{
	xfs_buf_t		*bp =
		container_of(work, xfs_buf_t, b_iodone_work);

	if (bp->b_iodone)
		(*(bp->b_iodone))(bp);
	else if (bp->b_flags & XBF_ASYNC)
		xfs_buf_relse(bp);
}

void
xfs_buf_ioend(
	xfs_buf_t		*bp,
	int			schedule)
{
	trace_xfs_buf_iodone(bp, _RET_IP_);

	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
	if (bp->b_error == 0)
		bp->b_flags |= XBF_DONE;

	if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
		if (schedule) {
			INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
			queue_work(xfslogd_workqueue, &bp->b_iodone_work);
		} else {
			xfs_buf_iodone_work(&bp->b_iodone_work);
		}
	} else {
		complete(&bp->b_iowait);
	}
}

void
xfs_buf_ioerror(
	xfs_buf_t		*bp,
	int			error)
{
	ASSERT(error >= 0 && error <= 0xffff);
	bp->b_error = (unsigned short)error;
	trace_xfs_buf_ioerror(bp, error, _RET_IP_);
}

void
xfs_buf_ioerror_alert(
	struct xfs_buf		*bp,
	const char		*func)
{
	xfs_alert(bp->b_target->bt_mount,
"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d",
		(__uint64_t)XFS_BUF_ADDR(bp), func, bp->b_error, bp->b_length);
}

int
xfs_bwrite(
	struct xfs_buf		*bp)
{
	int			error;

	ASSERT(xfs_buf_islocked(bp));

	bp->b_flags |= XBF_WRITE;
	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q);

	xfs_bdstrat_cb(bp);

	error = xfs_buf_iowait(bp);
	if (error) {
		xfs_force_shutdown(bp->b_target->bt_mount,
				   SHUTDOWN_META_IO_ERROR);
	}
	return error;
}
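
/*
 * Example (sketch): the usual synchronous write pattern for a locked,
 * modified buffer, releasing it once the I/O has completed:
 *
 *	error = xfs_bwrite(bp);
 *	xfs_buf_relse(bp);
 */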

/*
 * Called when we want to stop a buffer from getting written or read.
 * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
 * so that the proper iodone callbacks get called.
 */
STATIC int
xfs_bioerror(
	xfs_buf_t *bp)
{
#ifdef XFSERRORDEBUG
	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
#endif

	/*
	 * No need to wait until the buffer is unpinned, we aren't flushing it.
	 */
	xfs_buf_ioerror(bp, EIO);

	/*
	 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
	 */
	XFS_BUF_UNREAD(bp);
	XFS_BUF_UNDONE(bp);
	xfs_buf_stale(bp);

	xfs_buf_ioend(bp, 0);

	return EIO;
}

/*
 * Same as xfs_bioerror, except that we are releasing the buffer
 * here ourselves, and avoiding the xfs_buf_ioend call.
 * This is meant for userdata errors; metadata bufs come with
 * iodone functions attached, so that we can track down errors.
 */
STATIC int
xfs_bioerror_relse(
	struct xfs_buf	*bp)
{
	int64_t		fl = bp->b_flags;
	/*
	 * No need to wait until the buffer is unpinned.
	 * We aren't flushing it.
	 *
	 * chunkhold expects B_DONE to be set, whether
	 * we actually finish the I/O or not. We don't want to
	 * change that interface.
	 */
	XFS_BUF_UNREAD(bp);
	XFS_BUF_DONE(bp);
	xfs_buf_stale(bp);
	bp->b_iodone = NULL;
	if (!(fl & XBF_ASYNC)) {
		/*
		 * Mark b_error and B_ERROR _both_.
		 * Lots of chunkcache code assumes that.
		 * There's no reason to mark error for
		 * ASYNC buffers.
		 */
		xfs_buf_ioerror(bp, EIO);
		complete(&bp->b_iowait);
	} else {
		xfs_buf_relse(bp);
	}

	return EIO;
}


/*
 * All xfs metadata buffers except log state machine buffers
 * get this attached as their b_bdstrat callback function.
 * This is so that we can catch a buffer
 * after prematurely unpinning it to forcibly shutdown the filesystem.
 */
int
xfs_bdstrat_cb(
	struct xfs_buf	*bp)
{
	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
		trace_xfs_bdstrat_shut(bp, _RET_IP_);
		/*
		 * Metadata write that didn't get logged but
		 * written delayed anyway. These aren't associated
		 * with a transaction, and can be ignored.
		 */
		if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
			return xfs_bioerror_relse(bp);
		else
			return xfs_bioerror(bp);
	}

	xfs_buf_iorequest(bp);
	return 0;
}

/*
 * Wrapper around bdstrat so that we can stop data from going to disk in case
 * we are shutting down the filesystem.  Typically user data goes through this
 * path; one of the exceptions is the superblock.
 */
void
xfsbdstrat(
	struct xfs_mount	*mp,
	struct xfs_buf		*bp)
{
	if (XFS_FORCED_SHUTDOWN(mp)) {
		trace_xfs_bdstrat_shut(bp, _RET_IP_);
		xfs_bioerror_relse(bp);
		return;
	}

	xfs_buf_iorequest(bp);
}

STATIC void
_xfs_buf_ioend(
	xfs_buf_t		*bp,
	int			schedule)
{
	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
		xfs_buf_ioend(bp, schedule);
}

STATIC void
xfs_buf_bio_end_io(
	struct bio		*bio,
	int			error)
{
	xfs_buf_t		*bp = (xfs_buf_t *)bio->bi_private;

	xfs_buf_ioerror(bp, -error);

	if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));

	_xfs_buf_ioend(bp, 1);
	bio_put(bio);
}

STATIC void
_xfs_buf_ioapply(
	xfs_buf_t		*bp)
{
	int			rw, map_i, total_nr_pages, nr_pages;
	struct bio		*bio;
	int			offset = bp->b_offset;
	int			size = BBTOB(bp->b_io_length);
	sector_t		sector = bp->b_map.bm_bn;

	total_nr_pages = bp->b_page_count;
	map_i = 0;

	if (bp->b_flags & XBF_WRITE) {
		if (bp->b_flags & XBF_SYNCIO)
			rw = WRITE_SYNC;
		else
			rw = WRITE;
		if (bp->b_flags & XBF_FUA)
			rw |= REQ_FUA;
		if (bp->b_flags & XBF_FLUSH)
			rw |= REQ_FLUSH;
	} else if (bp->b_flags & XBF_READ_AHEAD) {
		rw = READA;
	} else {
		rw = READ;
	}

	/* we only use the buffer cache for meta-data */
	rw |= REQ_META;

next_chunk:
	atomic_inc(&bp->b_io_remaining);
	nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
	if (nr_pages > total_nr_pages)
		nr_pages = total_nr_pages;

	bio = bio_alloc(GFP_NOIO, nr_pages);
	bio->bi_bdev = bp->b_target->bt_bdev;
	bio->bi_sector = sector;
	bio->bi_end_io = xfs_buf_bio_end_io;
	bio->bi_private = bp;

	for (; size && nr_pages; nr_pages--, map_i++) {
		int	rbytes, nbytes = PAGE_SIZE - offset;

		if (nbytes > size)
			nbytes = size;

		rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset);
		if (rbytes < nbytes)
			break;

		offset = 0;
		sector += BTOBB(nbytes);
		size -= nbytes;
		total_nr_pages--;
	}

	if (likely(bio->bi_size)) {
		if (xfs_buf_is_vmapped(bp)) {
			flush_kernel_vmap_range(bp->b_addr,
						xfs_buf_vmap_len(bp));
		}
		submit_bio(rw, bio);
		if (size)
			goto next_chunk;
	} else {
		xfs_buf_ioerror(bp, EIO);
		bio_put(bio);
	}
}

void
xfs_buf_iorequest(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_iorequest(bp, _RET_IP_);

	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));

	if (bp->b_flags & XBF_WRITE)
		xfs_buf_wait_unpin(bp);
	xfs_buf_hold(bp);

	/* Set the count to 1 initially, this will stop an I/O
	 * completion callout which happens before we have started
	 * all the I/O from calling xfs_buf_ioend too early.
	 */
	atomic_set(&bp->b_io_remaining, 1);
	_xfs_buf_ioapply(bp);
	_xfs_buf_ioend(bp, 0);

	xfs_buf_rele(bp);
}

/*
 * Waits for I/O to complete on the buffer supplied.  It returns immediately if
 * no I/O is pending or there is already a pending error on the buffer.  It
 * returns the I/O error code, if any, or 0 if there was no error.
 */
int
xfs_buf_iowait(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_iowait(bp, _RET_IP_);

	if (!bp->b_error)
		wait_for_completion(&bp->b_iowait);

	trace_xfs_buf_iowait_done(bp, _RET_IP_);
	return bp->b_error;
}

xfs_caddr_t
xfs_buf_offset(
	xfs_buf_t		*bp,
	size_t			offset)
{
	struct page		*page;

	if (bp->b_addr)
		return bp->b_addr + offset;

	offset += bp->b_offset;
	page = bp->b_pages[offset >> PAGE_SHIFT];
	return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
}

/*
 *	Move data into or out of a buffer.
 */
void
xfs_buf_iomove(
	xfs_buf_t		*bp,	/* buffer to process		*/
	size_t			boff,	/* starting buffer offset	*/
	size_t			bsize,	/* length to copy		*/
	void			*data,	/* data address			*/
	xfs_buf_rw_t		mode)	/* read/write/zero flag		*/
{
	size_t			bend;

	bend = boff + bsize;
	while (boff < bend) {
		struct page	*page;
		int		page_index, page_offset, csize;

		page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
		page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
		page = bp->b_pages[page_index];
		csize = min_t(size_t, PAGE_SIZE - page_offset,
				      BBTOB(bp->b_io_length) - boff);

		ASSERT((csize + page_offset) <= PAGE_SIZE);

		switch (mode) {
		case XBRW_ZERO:
			memset(page_address(page) + page_offset, 0, csize);
			break;
		case XBRW_READ:
			memcpy(data, page_address(page) + page_offset, csize);
			break;
		case XBRW_WRITE:
			memcpy(page_address(page) + page_offset, data, csize);
		}

		boff += csize;
		data += csize;
	}
}
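
/*
 * Example (sketch): zeroing a whole buffer through xfs_buf_iomove(), which
 * is how a helper such as xfs_buf_zero() would be expected to use it:
 *
 *	xfs_buf_iomove(bp, 0, BBTOB(bp->b_io_length), NULL, XBRW_ZERO);
 */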

/*
 *	Handling of buffer targets (buftargs).
 */

/*
 * Wait for any bufs with callbacks that have been submitted but have not yet
 * returned. These buffers will have an elevated hold count, so wait on those
 * while freeing all the buffers only held by the LRU.
 */
void
xfs_wait_buftarg(
	struct xfs_buftarg	*btp)
{
	struct xfs_buf		*bp;

restart:
	spin_lock(&btp->bt_lru_lock);
	while (!list_empty(&btp->bt_lru)) {
		bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
		if (atomic_read(&bp->b_hold) > 1) {
			spin_unlock(&btp->bt_lru_lock);
			delay(100);
			goto restart;
		}
		/*
		 * clear the LRU reference count so the buffer doesn't get
		 * ignored in xfs_buf_rele().
		 */
		atomic_set(&bp->b_lru_ref, 0);
		spin_unlock(&btp->bt_lru_lock);
		xfs_buf_rele(bp);
		spin_lock(&btp->bt_lru_lock);
	}
	spin_unlock(&btp->bt_lru_lock);
}

int
xfs_buftarg_shrink(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_buftarg	*btp = container_of(shrink,
					struct xfs_buftarg, bt_shrinker);
	struct xfs_buf		*bp;
	int nr_to_scan = sc->nr_to_scan;
	LIST_HEAD(dispose);

	if (!nr_to_scan)
		return btp->bt_lru_nr;

	spin_lock(&btp->bt_lru_lock);
	while (!list_empty(&btp->bt_lru)) {
		if (nr_to_scan-- <= 0)
			break;

		bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);

		/*
		 * Decrement the b_lru_ref count unless the value is already
		 * zero. If the value is already zero, we need to reclaim the
		 * buffer, otherwise it gets another trip through the LRU.
		 */
		if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
			list_move_tail(&bp->b_lru, &btp->bt_lru);
			continue;
		}

		/*
		 * remove the buffer from the LRU now to avoid needing another
		 * lock round trip inside xfs_buf_rele().
		 */
		list_move(&bp->b_lru, &dispose);
		btp->bt_lru_nr--;
	}
	spin_unlock(&btp->bt_lru_lock);

	while (!list_empty(&dispose)) {
		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
		list_del_init(&bp->b_lru);
		xfs_buf_rele(bp);
	}

	return btp->bt_lru_nr;
}

void
xfs_free_buftarg(
	struct xfs_mount	*mp,
	struct xfs_buftarg	*btp)
{
	unregister_shrinker(&btp->bt_shrinker);

	if (mp->m_flags & XFS_MOUNT_BARRIER)
		xfs_blkdev_issue_flush(btp);

	kmem_free(btp);
}

STATIC int
xfs_setsize_buftarg_flags(
	xfs_buftarg_t		*btp,
	unsigned int		blocksize,
	unsigned int		sectorsize,
	int			verbose)
{
	btp->bt_bsize = blocksize;
	btp->bt_sshift = ffs(sectorsize) - 1;
	btp->bt_smask = sectorsize - 1;

	if (set_blocksize(btp->bt_bdev, sectorsize)) {
		char name[BDEVNAME_SIZE];

		bdevname(btp->bt_bdev, name);

		xfs_warn(btp->bt_mount,
			"Cannot set_blocksize to %u on device %s\n",
			sectorsize, name);
		return EINVAL;
	}

	return 0;
}

/*
 *	When allocating the initial buffer target we have not yet
 *	read in the superblock, so we don't know what size sectors
 *	are being used at this early stage.  Play safe.
 */
STATIC int
xfs_setsize_buftarg_early(
	xfs_buftarg_t		*btp,
	struct block_device	*bdev)
{
	return xfs_setsize_buftarg_flags(btp,
			PAGE_SIZE, bdev_logical_block_size(bdev), 0);
}

int
xfs_setsize_buftarg(
	xfs_buftarg_t		*btp,
	unsigned int		blocksize,
	unsigned int		sectorsize)
{
	return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
}

xfs_buftarg_t *
xfs_alloc_buftarg(
	struct xfs_mount	*mp,
	struct block_device	*bdev,
	int			external,
	const char		*fsname)
{
	xfs_buftarg_t		*btp;

	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);

	btp->bt_mount = mp;
	btp->bt_dev =  bdev->bd_dev;
	btp->bt_bdev = bdev;
	btp->bt_bdi = blk_get_backing_dev_info(bdev);
	if (!btp->bt_bdi)
		goto error;

	INIT_LIST_HEAD(&btp->bt_lru);
	spin_lock_init(&btp->bt_lru_lock);
	if (xfs_setsize_buftarg_early(btp, bdev))
		goto error;
	btp->bt_shrinker.shrink = xfs_buftarg_shrink;
	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
	register_shrinker(&btp->bt_shrinker);
	return btp;

error:
	kmem_free(btp);
	return NULL;
}

/*
 * Add a buffer to the delayed write list.
 *
 * This queues a buffer for writeout if it hasn't already been.  Note that
 * neither this routine nor the buffer list submission functions perform
 * any internal synchronization.  It is expected that the lists are thread-local
 * to the callers.
 *
 * Returns true if we queued up the buffer, or false if it already had
 * been on the buffer list.
 */
bool
xfs_buf_delwri_queue(
	struct xfs_buf		*bp,
	struct list_head	*list)
{
	ASSERT(xfs_buf_islocked(bp));
	ASSERT(!(bp->b_flags & XBF_READ));

	/*
	 * If the buffer is already marked delwri it already is queued up
	 * by someone else for immediate writeout.  Just ignore it in that
	 * case.
	 */
	if (bp->b_flags & _XBF_DELWRI_Q) {
		trace_xfs_buf_delwri_queued(bp, _RET_IP_);
		return false;
	}

	trace_xfs_buf_delwri_queue(bp, _RET_IP_);

	/*
	 * If a buffer gets written out synchronously or marked stale while it
	 * is on a delwri list we lazily remove it. To do this, the other party
	 * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
	 * It remains referenced and on the list.  In a rare corner case it
	 * might get readded to a delwri list after the synchronous writeout, in
	 * which case we just need to re-add the flag here.
	 */
	bp->b_flags |= _XBF_DELWRI_Q;
	if (list_empty(&bp->b_list)) {
		atomic_inc(&bp->b_hold);
		list_add_tail(&bp->b_list, list);
	}

	return true;

1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569
/*
 * Compare function is more complex than it needs to be because
 * the return value is only 32 bits and we are doing comparisons
 * on 64 bit values
 */
static int
xfs_buf_cmp(
	void		*priv,
	struct list_head *a,
	struct list_head *b)
{
	struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
	struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);
	xfs_daddr_t		diff;

1570
	diff = ap->b_map.bm_bn - bp->b_map.bm_bn;
1571 1572 1573 1574 1575 1576 1577
	if (diff < 0)
		return -1;
	if (diff > 0)
		return 1;
	return 0;
}

1578 1579 1580 1581 1582
static int
__xfs_buf_delwri_submit(
	struct list_head	*buffer_list,
	struct list_head	*io_list,
	bool			wait)
L
Linus Torvalds 已提交
1583
{
1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598
	struct blk_plug		plug;
	struct xfs_buf		*bp, *n;
	int			pinned = 0;

	list_for_each_entry_safe(bp, n, buffer_list, b_list) {
		if (!wait) {
			if (xfs_buf_ispinned(bp)) {
				pinned++;
				continue;
			}
			if (!xfs_buf_trylock(bp))
				continue;
		} else {
			xfs_buf_lock(bp);
		}
1599

1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610
		/*
		 * Someone else might have written the buffer synchronously or
		 * marked it stale in the meantime.  In that case only the
		 * _XBF_DELWRI_Q flag got cleared, and we have to drop the
		 * reference and remove it from the list here.
		 */
		if (!(bp->b_flags & _XBF_DELWRI_Q)) {
			list_del_init(&bp->b_list);
			xfs_buf_relse(bp);
			continue;
		}
D
Dave Chinner 已提交
1611

1612 1613 1614
		list_move_tail(&bp->b_list, io_list);
		trace_xfs_buf_delwri_split(bp, _RET_IP_);
	}
L
Linus Torvalds 已提交
1615

1616
	list_sort(NULL, io_list, xfs_buf_cmp);
L
Linus Torvalds 已提交
1617

1618 1619 1620 1621
	blk_start_plug(&plug);
	list_for_each_entry_safe(bp, n, io_list, b_list) {
		bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC);
		bp->b_flags |= XBF_WRITE;
1622

1623 1624
		if (!wait) {
			bp->b_flags |= XBF_ASYNC;
1625
			list_del_init(&bp->b_list);
L
Linus Torvalds 已提交
1626
		}
1627 1628 1629
		xfs_bdstrat_cb(bp);
	}
	blk_finish_plug(&plug);
L
Linus Torvalds 已提交
1630

1631
	return pinned;
L
Linus Torvalds 已提交
1632 1633 1634
}

/*
1635 1636 1637 1638 1639 1640 1641
 * Write out a buffer list asynchronously.
 *
 * This will take the @buffer_list, write all non-locked and non-pinned buffers
 * out and not wait for I/O completion on any of the buffers.  This interface
 * is only safely useable for callers that can track I/O completion by higher
 * level means, e.g. AIL pushing as the @buffer_list is consumed in this
 * function.
L
Linus Torvalds 已提交
1642 1643
 */
int
1644 1645
xfs_buf_delwri_submit_nowait(
	struct list_head	*buffer_list)
L
Linus Torvalds 已提交
1646
{
1647 1648 1649
	LIST_HEAD		(io_list);
	return __xfs_buf_delwri_submit(buffer_list, &io_list, false);
}
L
 * Write out a buffer list synchronously.
 *
 * This will take the @buffer_list, write all buffers out and wait for I/O
 * completion on all of the buffers. @buffer_list is consumed by the function,
 * so callers must have some other way of tracking buffers if they require such
 * functionality.
 */
int
xfs_buf_delwri_submit(
	struct list_head	*buffer_list)
{
	LIST_HEAD		(io_list);
	int			error = 0, error2;
	struct xfs_buf		*bp;

	__xfs_buf_delwri_submit(buffer_list, &io_list, true);

	/* Wait for IO to complete. */
	while (!list_empty(&io_list)) {
		bp = list_first_entry(&io_list, struct xfs_buf, b_list);

		list_del_init(&bp->b_list);
		error2 = xfs_buf_iowait(bp);
		xfs_buf_relse(bp);
		if (!error)
			error = error2;
	}

	return error;
}

int __init
xfs_buf_init(void)
{
	xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
						KM_ZONE_HWALIGN, NULL);
	if (!xfs_buf_zone)
		goto out;

	xfslogd_workqueue = alloc_workqueue("xfslogd",
					WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
	if (!xfslogd_workqueue)
		goto out_free_buf_zone;

	return 0;

 out_free_buf_zone:
	kmem_zone_destroy(xfs_buf_zone);
 out:
	return -ENOMEM;
}

void
xfs_buf_terminate(void)
{
	destroy_workqueue(xfslogd_workqueue);
	kmem_zone_destroy(xfs_buf_zone);
}