/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include "xfs.h"
#include <linux/stddef.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/bio.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
#include <linux/percpu.h>
#include <linux/blkdev.h>
#include <linux/hash.h>
#include <linux/kthread.h>
#include <linux/migrate.h>
#include <linux/backing-dev.h>
#include <linux/freezer.h>

#include "xfs_sb.h"
#include "xfs_inum.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_trace.h"

static kmem_zone_t *xfs_buf_zone;

static struct workqueue_struct *xfslogd_workqueue;

#ifdef XFS_BUF_LOCK_TRACKING
# define XB_SET_OWNER(bp)	((bp)->b_last_holder = current->pid)
# define XB_CLEAR_OWNER(bp)	((bp)->b_last_holder = -1)
# define XB_GET_OWNER(bp)	((bp)->b_last_holder)
#else
# define XB_SET_OWNER(bp)	do { } while (0)
# define XB_CLEAR_OWNER(bp)	do { } while (0)
# define XB_GET_OWNER(bp)	do { } while (0)
#endif

#define xb_to_gfp(flags) \
	((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \
	  ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN)

#define xb_to_km(flags) \
	 (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)

static inline int
xfs_buf_is_vmapped(
	struct xfs_buf	*bp)
{
	/*
	 * Return true if the buffer is vmapped.
	 *
	 * The XBF_MAPPED flag is set if the buffer should be mapped, but the
	 * code is clever enough to know it doesn't have to map a single page,
	 * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1.
	 */
	return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1;
}

static inline int
xfs_buf_vmap_len(
	struct xfs_buf	*bp)
{
	return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
}

/*
 * xfs_buf_lru_add - add a buffer to the LRU.
 *
 * The LRU takes a new reference to the buffer so that it will only be freed
 * once the shrinker takes the buffer off the LRU.
 */
STATIC void
xfs_buf_lru_add(
	struct xfs_buf	*bp)
{
	struct xfs_buftarg *btp = bp->b_target;

	spin_lock(&btp->bt_lru_lock);
	if (list_empty(&bp->b_lru)) {
		atomic_inc(&bp->b_hold);
		list_add_tail(&bp->b_lru, &btp->bt_lru);
		btp->bt_lru_nr++;
	}
	spin_unlock(&btp->bt_lru_lock);
}

/*
 * xfs_buf_lru_del - remove a buffer from the LRU
 *
 * The unlocked check is safe here because it only occurs when there are no
 * b_lru_ref counts left on the buffer under the pag->pag_buf_lock. It is there
 * to optimise the shrinker removing the buffer from the LRU and calling
 * xfs_buf_free(), i.e. it removes an unnecessary round trip on the
 * bt_lru_lock.
 */
STATIC void
xfs_buf_lru_del(
	struct xfs_buf	*bp)
{
	struct xfs_buftarg *btp = bp->b_target;

	if (list_empty(&bp->b_lru))
		return;

	spin_lock(&btp->bt_lru_lock);
	if (!list_empty(&bp->b_lru)) {
		list_del_init(&bp->b_lru);
		btp->bt_lru_nr--;
	}
	spin_unlock(&btp->bt_lru_lock);
}

/*
 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
 * b_lru_ref count so that the buffer is freed immediately when the buffer
 * reference count falls to zero. If the buffer is already on the LRU, we need
 * to remove the reference that the LRU holds on the buffer.
 *
 * This prevents build-up of stale buffers on the LRU.
 */
void
xfs_buf_stale(
	struct xfs_buf	*bp)
{
	ASSERT(xfs_buf_islocked(bp));

	bp->b_flags |= XBF_STALE;

	/*
	 * Clear the delwri status so that a delwri queue walker will not
	 * flush this buffer to disk now that it is stale. The delwri queue has
	 * a reference to the buffer, so this is safe to do.
	 */
	bp->b_flags &= ~_XBF_DELWRI_Q;

	atomic_set(&(bp)->b_lru_ref, 0);
	if (!list_empty(&bp->b_lru)) {
		struct xfs_buftarg *btp = bp->b_target;

		spin_lock(&btp->bt_lru_lock);
		if (!list_empty(&bp->b_lru)) {
			list_del_init(&bp->b_lru);
			btp->bt_lru_nr--;
			atomic_dec(&bp->b_hold);
		}
		spin_unlock(&btp->bt_lru_lock);
	}
	ASSERT(atomic_read(&bp->b_hold) >= 1);
}
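
/*
 * Allocate a new buffer structure for the given target and disk range.
 * The buffer is returned locked, with no disk address and no backing
 * memory yet, so no I/O can be issued against it until the caller has
 * finished initialising it.
 */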
struct xfs_buf *
xfs_buf_alloc(
	struct xfs_buftarg	*target,
	xfs_daddr_t		blkno,
	size_t			numblks,
	xfs_buf_flags_t		flags)
{
	struct xfs_buf		*bp;

	bp = kmem_zone_zalloc(xfs_buf_zone, xb_to_km(flags));
	if (unlikely(!bp))
		return NULL;

	/*
	 * We don't want certain flags to appear in b_flags.
	 */
	flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD);

	atomic_set(&bp->b_hold, 1);
	atomic_set(&bp->b_lru_ref, 1);
	init_completion(&bp->b_iowait);
	INIT_LIST_HEAD(&bp->b_lru);
	INIT_LIST_HEAD(&bp->b_list);
	RB_CLEAR_NODE(&bp->b_rbnode);
	sema_init(&bp->b_sema, 0); /* held, no waiters */
	XB_SET_OWNER(bp);
	bp->b_target = target;

	/*
	 * Set buffer_length and count_desired to the same value initially.
	 * I/O routines should use count_desired, which will be the same in
	 * most cases but may be reset (e.g. XFS recovery).
	 */
	bp->b_buffer_length = bp->b_count_desired = numblks << BBSHIFT;
	bp->b_flags = flags;

	/*
	 * We do not set the block number here in the buffer because we have not
	 * finished initialising the buffer. We insert the buffer into the cache
	 * in this state, so this ensures that we are unable to do IO on a
	 * buffer that hasn't been fully initialised.
	 */
	bp->b_bn = XFS_BUF_DADDR_NULL;
	atomic_set(&bp->b_pin_count, 0);
	init_waitqueue_head(&bp->b_waiters);

	XFS_STATS_INC(xb_create);
	trace_xfs_buf_init(bp, _RET_IP_);

	return bp;
}

/*
 *	Allocate a page array capable of holding a specified number
 *	of pages, and point the page buf at it.
 */
STATIC int
_xfs_buf_get_pages(
	xfs_buf_t		*bp,
	int			page_count,
	xfs_buf_flags_t		flags)
{
	/* Make sure that we have a page list */
	if (bp->b_pages == NULL) {
		bp->b_page_count = page_count;
		if (page_count <= XB_PAGES) {
			bp->b_pages = bp->b_page_array;
		} else {
			bp->b_pages = kmem_alloc(sizeof(struct page *) *
					page_count, xb_to_km(flags));
			if (bp->b_pages == NULL)
				return -ENOMEM;
		}
		memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
	}
	return 0;
}

/*
 *	Frees b_pages if it was allocated.
 */
STATIC void
_xfs_buf_free_pages(
	xfs_buf_t	*bp)
{
	if (bp->b_pages != bp->b_page_array) {
		kmem_free(bp->b_pages);
		bp->b_pages = NULL;
	}
}

/*
 *	Releases the specified buffer.
 *
 * 	The modification state of any associated pages is left unchanged.
 * 	The buffer must not be on any hash - use xfs_buf_rele instead for
 * 	hashed and refcounted buffers.
 */
void
xfs_buf_free(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_free(bp, _RET_IP_);

	ASSERT(list_empty(&bp->b_lru));

	if (bp->b_flags & _XBF_PAGES) {
		uint		i;

		if (xfs_buf_is_vmapped(bp))
			vm_unmap_ram(bp->b_addr - bp->b_offset,
					bp->b_page_count);

		for (i = 0; i < bp->b_page_count; i++) {
			struct page	*page = bp->b_pages[i];

			__free_page(page);
		}
	} else if (bp->b_flags & _XBF_KMEM)
		kmem_free(bp->b_addr);
	_xfs_buf_free_pages(bp);
	kmem_zone_free(xfs_buf_zone, bp);
}

/*
 * Allocates all the pages for the buffer in question and builds its page list.
 */
STATIC int
xfs_buf_allocate_memory(
	xfs_buf_t		*bp,
	uint			flags)
{
	size_t			size = bp->b_count_desired;
	size_t			nbytes, offset;
	gfp_t			gfp_mask = xb_to_gfp(flags);
	unsigned short		page_count, i;
	xfs_off_t		end;
	int			error;

	/*
	 * for buffers that are contained within a single page, just allocate
	 * the memory from the heap - there's no need for the complexity of
	 * page arrays to keep allocation down to order 0.
	 */
	if (bp->b_buffer_length < PAGE_SIZE) {
		bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags));
		if (!bp->b_addr) {
			/* low memory - use alloc_page loop instead */
			goto use_alloc_page;
		}

		if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) &
								PAGE_MASK) !=
		    ((unsigned long)bp->b_addr & PAGE_MASK)) {
			/* b_addr spans two pages - use alloc_page instead */
			kmem_free(bp->b_addr);
			bp->b_addr = NULL;
			goto use_alloc_page;
		}
		bp->b_offset = offset_in_page(bp->b_addr);
		bp->b_pages = bp->b_page_array;
		bp->b_pages[0] = virt_to_page(bp->b_addr);
		bp->b_page_count = 1;
		bp->b_flags |= XBF_MAPPED | _XBF_KMEM;
		return 0;
	}

use_alloc_page:
	end = BBTOB(bp->b_bn) + bp->b_buffer_length;
	page_count = xfs_buf_btoc(end) - xfs_buf_btoct(BBTOB(bp->b_bn));
	error = _xfs_buf_get_pages(bp, page_count, flags);
	if (unlikely(error))
		return error;

	offset = bp->b_offset;
	bp->b_flags |= _XBF_PAGES;

	for (i = 0; i < bp->b_page_count; i++) {
		struct page	*page;
		uint		retries = 0;
retry:
		page = alloc_page(gfp_mask);
		if (unlikely(page == NULL)) {
			if (flags & XBF_READ_AHEAD) {
				bp->b_page_count = i;
				error = ENOMEM;
				goto out_free_pages;
			}

			/*
			 * This could deadlock.
			 *
			 * But until all the XFS lowlevel code is revamped to
			 * handle buffer allocation failures we can't do much.
			 */
			if (!(++retries % 100))
				xfs_err(NULL,
		"possible memory allocation deadlock in %s (mode:0x%x)",
					__func__, gfp_mask);

			XFS_STATS_INC(xb_page_retries);
			congestion_wait(BLK_RW_ASYNC, HZ/50);
			goto retry;
		}

		XFS_STATS_INC(xb_page_found);

		nbytes = min_t(size_t, size, PAGE_SIZE - offset);
		size -= nbytes;
		bp->b_pages[i] = page;
		offset = 0;
	}
	return 0;

out_free_pages:
	for (i = 0; i < bp->b_page_count; i++)
		__free_page(bp->b_pages[i]);
	return error;
}

/*
 *	Map buffer into kernel address-space if necessary.
 */
STATIC int
_xfs_buf_map_pages(
	xfs_buf_t		*bp,
	uint			flags)
{
	ASSERT(bp->b_flags & _XBF_PAGES);
	if (bp->b_page_count == 1) {
		/* A single page buffer is always mappable */
		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
		bp->b_flags |= XBF_MAPPED;
	} else if (flags & XBF_MAPPED) {
		int retried = 0;

		do {
			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
						-1, PAGE_KERNEL);
			if (bp->b_addr)
				break;
			vm_unmap_aliases();
		} while (retried++ <= 1);

		if (!bp->b_addr)
			return -ENOMEM;
		bp->b_addr += bp->b_offset;
		bp->b_flags |= XBF_MAPPED;
	}

	return 0;
}

/*
 *	Finding and Reading Buffers
 */

/*
 *	Looks up, and creates if absent, a lockable buffer for
 *	a given range of an inode.  The buffer is returned
 *	locked.  No I/O is implied by this call.
 */
xfs_buf_t *
_xfs_buf_find(
	struct xfs_buftarg	*btp,
	xfs_daddr_t		blkno,
	size_t			numblks,
	xfs_buf_flags_t		flags,
	xfs_buf_t		*new_bp)
{
	size_t			numbytes;
	struct xfs_perag	*pag;
	struct rb_node		**rbp;
	struct rb_node		*parent;
	xfs_buf_t		*bp;

	numbytes = BBTOB(numblks);

	/* Check for IOs smaller than the sector size / not sector aligned */
	ASSERT(!(numbytes < (1 << btp->bt_sshift)));
	ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask));

	/* get tree root */
	pag = xfs_perag_get(btp->bt_mount,
				xfs_daddr_to_agno(btp->bt_mount, blkno));

	/* walk tree */
	spin_lock(&pag->pag_buf_lock);
	rbp = &pag->pag_buf_tree.rb_node;
	parent = NULL;
	bp = NULL;
	while (*rbp) {
		parent = *rbp;
		bp = rb_entry(parent, struct xfs_buf, b_rbnode);

		if (blkno < bp->b_bn)
			rbp = &(*rbp)->rb_left;
		else if (blkno > bp->b_bn)
			rbp = &(*rbp)->rb_right;
		else {
			/*
			 * found a block number match. If the range doesn't
			 * match, the only way this is allowed is if the buffer
			 * in the cache is stale and the transaction that made
			 * it stale has not yet committed. i.e. we are
			 * reallocating a busy extent. Skip this buffer and
			 * continue searching to the right for an exact match.
			 */
			if (bp->b_buffer_length != numbytes) {
				ASSERT(bp->b_flags & XBF_STALE);
				rbp = &(*rbp)->rb_right;
				continue;
			}
			atomic_inc(&bp->b_hold);
			goto found;
		}
	}

	/* No match found */
	if (new_bp) {
		rb_link_node(&new_bp->b_rbnode, parent, rbp);
		rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
		/* the buffer keeps the perag reference until it is freed */
		new_bp->b_pag = pag;
		spin_unlock(&pag->pag_buf_lock);
	} else {
		XFS_STATS_INC(xb_miss_locked);
		spin_unlock(&pag->pag_buf_lock);
		xfs_perag_put(pag);
	}
	return new_bp;

found:
	spin_unlock(&pag->pag_buf_lock);
	xfs_perag_put(pag);

	if (!xfs_buf_trylock(bp)) {
		if (flags & XBF_TRYLOCK) {
			xfs_buf_rele(bp);
			XFS_STATS_INC(xb_busy_locked);
			return NULL;
		}
		xfs_buf_lock(bp);
		XFS_STATS_INC(xb_get_locked_waited);
	}

	/*
	 * if the buffer is stale, clear all the external state associated with
	 * it. We need to keep flags such as how we allocated the buffer memory
	 * intact here.
	 */
	if (bp->b_flags & XBF_STALE) {
		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
		bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES;
	}

	trace_xfs_buf_find(bp, flags, _RET_IP_);
	XFS_STATS_INC(xb_get_locked);
	return bp;
}

/*
 * Assembles a buffer covering the specified range. The code is optimised for
 * cache hits, as metadata intensive workloads will see 3 orders of magnitude
 * more hits than misses.
 */
struct xfs_buf *
xfs_buf_get(
	xfs_buftarg_t		*target,
	xfs_daddr_t		blkno,
	size_t			numblks,
	xfs_buf_flags_t		flags)
{
	struct xfs_buf		*bp;
	struct xfs_buf		*new_bp;
	int			error = 0;

	bp = _xfs_buf_find(target, blkno, numblks, flags, NULL);
	if (likely(bp))
		goto found;

	new_bp = xfs_buf_alloc(target, blkno, numblks, flags);
	if (unlikely(!new_bp))
		return NULL;

	error = xfs_buf_allocate_memory(new_bp, flags);
	if (error) {
		kmem_zone_free(xfs_buf_zone, new_bp);
		return NULL;
	}

	bp = _xfs_buf_find(target, blkno, numblks, flags, new_bp);
	if (!bp) {
		xfs_buf_free(new_bp);
		return NULL;
	}

	if (bp != new_bp)
		xfs_buf_free(new_bp);

	/*
	 * Now we have a workable buffer, fill in the block number so
	 * that we can do IO on it.
	 */
	bp->b_bn = blkno;
	bp->b_count_desired = bp->b_buffer_length;

found:
	if (!(bp->b_flags & XBF_MAPPED)) {
		error = _xfs_buf_map_pages(bp, flags);
		if (unlikely(error)) {
			xfs_warn(target->bt_mount,
				"%s: failed to map pages\n", __func__);
			goto no_buffer;
		}
	}

	XFS_STATS_INC(xb_get);
	trace_xfs_buf_get(bp, flags, _RET_IP_);
	return bp;

no_buffer:
	if (flags & (XBF_LOCK | XBF_TRYLOCK))
		xfs_buf_unlock(bp);
	xfs_buf_rele(bp);
	return NULL;
}

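/*
 * Issue the read I/O for a buffer.  Asynchronous reads return immediately;
 * synchronous reads wait for completion and return the I/O error, if any.
 */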
STATIC int
_xfs_buf_read(
	xfs_buf_t		*bp,
	xfs_buf_flags_t		flags)
{
	ASSERT(!(flags & XBF_WRITE));
	ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);

	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);

	xfs_buf_iorequest(bp);
	if (flags & XBF_ASYNC)
		return 0;
	return xfs_buf_iowait(bp);
}

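/*
 * Get a buffer for the given range and read it from disk if its contents
 * are not already valid.  An already-satisfied asynchronous readahead
 * request releases the buffer and returns NULL.
 */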
xfs_buf_t *
xfs_buf_read(
	xfs_buftarg_t		*target,
	xfs_daddr_t		blkno,
	size_t			numblks,
	xfs_buf_flags_t		flags)
{
	xfs_buf_t		*bp;

	flags |= XBF_READ;

	bp = xfs_buf_get(target, blkno, numblks, flags);
	if (bp) {
		trace_xfs_buf_read(bp, flags, _RET_IP_);

		if (!XFS_BUF_ISDONE(bp)) {
			XFS_STATS_INC(xb_get_read);
			_xfs_buf_read(bp, flags);
		} else if (flags & XBF_ASYNC) {
			/*
			 * Read ahead call which is already satisfied,
			 * drop the buffer
			 */
			goto no_buffer;
		} else {
			/* We do not want read in the flags */
			bp->b_flags &= ~XBF_READ;
		}
	}

	return bp;

 no_buffer:
	if (flags & (XBF_LOCK | XBF_TRYLOCK))
		xfs_buf_unlock(bp);
	xfs_buf_rele(bp);
	return NULL;
}

/*
 *	If we are not low on memory then do the readahead in a deadlock
 *	safe manner.
 */
void
xfs_buf_readahead(
	xfs_buftarg_t		*target,
	xfs_daddr_t		blkno,
	size_t			numblks)
{
	if (bdi_read_congested(target->bt_bdi))
		return;

	xfs_buf_read(target, blkno, numblks,
		     XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK);
}

/*
 * Read an uncached buffer from disk. Allocates and returns a locked
 * buffer containing the disk contents or nothing.
 */
struct xfs_buf *
xfs_buf_read_uncached(
	struct xfs_buftarg	*target,
	xfs_daddr_t		daddr,
	size_t			numblks,
	int			flags)
{
	xfs_buf_t		*bp;
	int			error;

	bp = xfs_buf_get_uncached(target, numblks, flags);
	if (!bp)
		return NULL;

	/* set up the buffer for a read IO */
	XFS_BUF_SET_ADDR(bp, daddr);
	XFS_BUF_READ(bp);

	xfsbdstrat(target->bt_mount, bp);
	error = xfs_buf_iowait(bp);
	if (error) {
		xfs_buf_relse(bp);
		return NULL;
	}
	return bp;
}

/*
 * Return a buffer allocated as an empty buffer and associated with external
 * memory via xfs_buf_associate_memory() back to its empty state.
 */
void
xfs_buf_set_empty(
	struct xfs_buf		*bp,
	size_t			numblks)
{
	if (bp->b_pages)
		_xfs_buf_free_pages(bp);

	bp->b_pages = NULL;
	bp->b_page_count = 0;
	bp->b_addr = NULL;
	bp->b_buffer_length = bp->b_count_desired = numblks << BBSHIFT;
	bp->b_bn = XFS_BUF_DADDR_NULL;
	bp->b_flags &= ~XBF_MAPPED;
}

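/*
 * Find the page backing an arbitrary kernel address, whether it lives
 * in the direct map or in vmalloc space.
 */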
static inline struct page *
mem_to_page(
	void			*addr)
{
	if (!is_vmalloc_addr(addr)) {
		return virt_to_page(addr);
	} else {
		return vmalloc_to_page(addr);
	}
}

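/*
 * Attach an externally allocated memory region to a buffer, building the
 * page list that covers it.  The caller retains ownership of the memory.
 */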
int
xfs_buf_associate_memory(
	xfs_buf_t		*bp,
	void			*mem,
	size_t			len)
{
	int			rval;
	int			i = 0;
	unsigned long		pageaddr;
	unsigned long		offset;
	size_t			buflen;
	int			page_count;

	pageaddr = (unsigned long)mem & PAGE_MASK;
	offset = (unsigned long)mem - pageaddr;
	buflen = PAGE_ALIGN(len + offset);
	page_count = buflen >> PAGE_SHIFT;

	/* Free any previous set of page pointers */
	if (bp->b_pages)
		_xfs_buf_free_pages(bp);

	bp->b_pages = NULL;
	bp->b_addr = mem;

	rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK);
	if (rval)
		return rval;

	bp->b_offset = offset;

	for (i = 0; i < bp->b_page_count; i++) {
		bp->b_pages[i] = mem_to_page((void *)pageaddr);
		pageaddr += PAGE_SIZE;
	}

	bp->b_count_desired = len;
	bp->b_buffer_length = buflen;
	bp->b_flags |= XBF_MAPPED;

	return 0;
}

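/*
 * Allocate a buffer that is not inserted into the per-AG buffer cache.
 * The backing pages are allocated and mapped here; the caller supplies
 * the disk address later if I/O is required.
 */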
xfs_buf_t *
xfs_buf_get_uncached(
	struct xfs_buftarg	*target,
	size_t			numblks,
	int			flags)
{
	unsigned long		page_count;
	int			error, i;
	xfs_buf_t		*bp;

	bp = xfs_buf_alloc(target, 0, numblks, 0);
	if (unlikely(bp == NULL))
		goto fail;

	page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
	error = _xfs_buf_get_pages(bp, page_count, 0);
	if (error)
		goto fail_free_buf;

	for (i = 0; i < page_count; i++) {
		bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
		if (!bp->b_pages[i])
			goto fail_free_mem;
	}
	bp->b_flags |= _XBF_PAGES;

	error = _xfs_buf_map_pages(bp, XBF_MAPPED);
	if (unlikely(error)) {
		xfs_warn(target->bt_mount,
			"%s: failed to map pages\n", __func__);
		goto fail_free_mem;
	}

	trace_xfs_buf_get_uncached(bp, _RET_IP_);
	return bp;

 fail_free_mem:
	while (--i >= 0)
		__free_page(bp->b_pages[i]);
	_xfs_buf_free_pages(bp);
 fail_free_buf:
	kmem_zone_free(xfs_buf_zone, bp);
	return NULL;
}

/*
 *	Increment reference count on buffer, to hold the buffer concurrently
 *	with another thread which may release (free) the buffer asynchronously.
 *	Must hold the buffer already to call this function.
 */
void
830 831
xfs_buf_hold(
	xfs_buf_t		*bp)
L
Linus Torvalds 已提交
832
{
C
Christoph Hellwig 已提交
833
	trace_xfs_buf_hold(bp, _RET_IP_);
834
	atomic_inc(&bp->b_hold);
L
Linus Torvalds 已提交
835 836 837
}

/*
838 839
 *	Releases a hold on the specified buffer.  If the
 *	the hold count is 1, calls xfs_buf_free.
L
Linus Torvalds 已提交
840 841
 */
void
842 843
xfs_buf_rele(
	xfs_buf_t		*bp)
L
Linus Torvalds 已提交
844
{
845
	struct xfs_perag	*pag = bp->b_pag;
L
Linus Torvalds 已提交
846

C
Christoph Hellwig 已提交
847
	trace_xfs_buf_rele(bp, _RET_IP_);
L
Linus Torvalds 已提交
848

849
	if (!pag) {
850
		ASSERT(list_empty(&bp->b_lru));
851
		ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
852 853 854 855 856
		if (atomic_dec_and_test(&bp->b_hold))
			xfs_buf_free(bp);
		return;
	}

857
	ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
858

859
	ASSERT(atomic_read(&bp->b_hold) > 0);
860
	if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
861
		if (!(bp->b_flags & XBF_STALE) &&
862 863 864
			   atomic_read(&bp->b_lru_ref)) {
			xfs_buf_lru_add(bp);
			spin_unlock(&pag->pag_buf_lock);
L
Linus Torvalds 已提交
865
		} else {
866
			xfs_buf_lru_del(bp);
867
			ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
868 869 870
			rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
			spin_unlock(&pag->pag_buf_lock);
			xfs_perag_put(pag);
871
			xfs_buf_free(bp);
L
Linus Torvalds 已提交
872 873 874 875 876 877
		}
	}
}


/*
 *	Lock a buffer object, if it is not already locked.
 *
 *	If we come across a stale, pinned, locked buffer, we know that we are
 *	being asked to lock a buffer that has been reallocated. Because it is
 *	pinned, we know that the log has not been pushed to disk and hence it
 *	will still be locked.  Rather than continuing to have trylock attempts
 *	fail until someone else pushes the log, push it ourselves before
 *	returning.  This means that the xfsaild will not get stuck trying
 *	to push on stale inode buffers.
 */
int
xfs_buf_trylock(
	struct xfs_buf		*bp)
{
	int			locked;

	locked = down_trylock(&bp->b_sema) == 0;
	if (locked)
		XB_SET_OWNER(bp);
	else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
		xfs_log_force(bp->b_target->bt_mount, 0);

	trace_xfs_buf_trylock(bp, _RET_IP_);
	return locked;
}

/*
 *	Lock a buffer object.
 *
 *	If we come across a stale, pinned, locked buffer, we know that we
 *	are being asked to lock a buffer that has been reallocated. Because
 *	it is pinned, we know that the log has not been pushed to disk and
 *	hence it will still be locked. Rather than sleeping until someone
 *	else pushes the log, push it ourselves before trying to get the lock.
 */
void
xfs_buf_lock(
	struct xfs_buf		*bp)
{
	trace_xfs_buf_lock(bp, _RET_IP_);

	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
		xfs_log_force(bp->b_target->bt_mount, 0);
	down(&bp->b_sema);
	XB_SET_OWNER(bp);

	trace_xfs_buf_lock_done(bp, _RET_IP_);
}

void
xfs_buf_unlock(
	struct xfs_buf		*bp)
{
	XB_CLEAR_OWNER(bp);
	up(&bp->b_sema);

	trace_xfs_buf_unlock(bp, _RET_IP_);
}

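/*
 * Wait for the buffer's pin count to drop to zero so that it is safe
 * to write it back.
 */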
STATIC void
xfs_buf_wait_unpin(
	xfs_buf_t		*bp)
{
	DECLARE_WAITQUEUE	(wait, current);

	if (atomic_read(&bp->b_pin_count) == 0)
		return;

	add_wait_queue(&bp->b_waiters, &wait);
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (atomic_read(&bp->b_pin_count) == 0)
			break;
		io_schedule();
	}
	remove_wait_queue(&bp->b_waiters, &wait);
	set_current_state(TASK_RUNNING);
}

/*
 *	Buffer Utility Routines
 */

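/*
 * Workqueue handler for deferred I/O completion: invoke the buffer's
 * b_iodone callback if one is set, otherwise release async buffers.
 */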
STATIC void
xfs_buf_iodone_work(
	struct work_struct	*work)
{
	xfs_buf_t		*bp =
		container_of(work, xfs_buf_t, b_iodone_work);

	if (bp->b_iodone)
		(*(bp->b_iodone))(bp);
	else if (bp->b_flags & XBF_ASYNC)
		xfs_buf_relse(bp);
}

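/*
 * Mark I/O on a buffer complete.  Completion work runs here or, if
 * @schedule is set, on the xfslogd workqueue; synchronous waiters are
 * woken through b_iowait instead.
 */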
void
xfs_buf_ioend(
	xfs_buf_t		*bp,
	int			schedule)
{
	trace_xfs_buf_iodone(bp, _RET_IP_);

	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
	if (bp->b_error == 0)
		bp->b_flags |= XBF_DONE;

	if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
		if (schedule) {
			INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
			queue_work(xfslogd_workqueue, &bp->b_iodone_work);
		} else {
			xfs_buf_iodone_work(&bp->b_iodone_work);
		}
	} else {
		complete(&bp->b_iowait);
	}
}

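/*
 * Record an I/O error against the buffer so callers and completion
 * handlers can see it.
 */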
void
xfs_buf_ioerror(
	xfs_buf_t		*bp,
	int			error)
{
	ASSERT(error >= 0 && error <= 0xffff);
	bp->b_error = (unsigned short)error;
	trace_xfs_buf_ioerror(bp, error, _RET_IP_);
}

void
xfs_buf_ioerror_alert(
	struct xfs_buf		*bp,
	const char		*func)
{
	xfs_alert(bp->b_target->bt_mount,
"metadata I/O error: block 0x%llx (\"%s\") error %d buf count %zd",
		(__uint64_t)XFS_BUF_ADDR(bp), func,
		bp->b_error, XFS_BUF_COUNT(bp));
}

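/*
 * Write a buffer synchronously, shutting down the filesystem if the
 * write fails.
 */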
int
xfs_bwrite(
	struct xfs_buf		*bp)
{
	int			error;

	ASSERT(xfs_buf_islocked(bp));

	bp->b_flags |= XBF_WRITE;
	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q);

	xfs_bdstrat_cb(bp);

	error = xfs_buf_iowait(bp);
	if (error) {
		xfs_force_shutdown(bp->b_target->bt_mount,
				   SHUTDOWN_META_IO_ERROR);
	}
	return error;
}

/*
 * Called when we want to stop a buffer from getting written or read.
 * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
 * so that the proper iodone callbacks get called.
 */
STATIC int
xfs_bioerror(
	xfs_buf_t *bp)
{
#ifdef XFSERRORDEBUG
	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
#endif

	/*
	 * No need to wait until the buffer is unpinned, we aren't flushing it.
	 */
	xfs_buf_ioerror(bp, EIO);

	/*
	 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
	 */
	XFS_BUF_UNREAD(bp);
	XFS_BUF_UNDONE(bp);
	xfs_buf_stale(bp);

	xfs_buf_ioend(bp, 0);

	return EIO;
}

/*
 * Same as xfs_bioerror, except that we are releasing the buffer
 * here ourselves, and avoiding the xfs_buf_ioend call.
 * This is meant for userdata errors; metadata bufs come with
 * iodone functions attached, so that we can track down errors.
 */
STATIC int
xfs_bioerror_relse(
	struct xfs_buf	*bp)
{
	int64_t		fl = bp->b_flags;
	/*
	 * No need to wait until the buffer is unpinned.
	 * We aren't flushing it.
	 *
	 * chunkhold expects B_DONE to be set, whether
	 * we actually finish the I/O or not. We don't want to
	 * change that interface.
	 */
	XFS_BUF_UNREAD(bp);
	XFS_BUF_DONE(bp);
	xfs_buf_stale(bp);
	bp->b_iodone = NULL;
	if (!(fl & XBF_ASYNC)) {
		/*
		 * Mark b_error and B_ERROR _both_.
		 * Lots of chunkcache code assumes that.
		 * There's no reason to mark error for
		 * ASYNC buffers.
		 */
		xfs_buf_ioerror(bp, EIO);
		complete(&bp->b_iowait);
	} else {
		xfs_buf_relse(bp);
	}

	return EIO;
}


/*
 * All xfs metadata buffers except log state machine buffers
 * get this attached as their b_bdstrat callback function.
 * This is so that we can catch a buffer
 * after prematurely unpinning it to forcibly shutdown the filesystem.
 */
int
xfs_bdstrat_cb(
	struct xfs_buf	*bp)
{
	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
		trace_xfs_bdstrat_shut(bp, _RET_IP_);
		/*
		 * Metadata write that didn't get logged but
		 * written delayed anyway. These aren't associated
		 * with a transaction, and can be ignored.
		 */
		if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
			return xfs_bioerror_relse(bp);
		else
			return xfs_bioerror(bp);
	}

	xfs_buf_iorequest(bp);
	return 0;
}

/*
 * Wrapper around bdstrat so that we can stop data from going to disk in case
 * we are shutting down the filesystem.  Typically user data goes through this
 * path; one of the exceptions is the superblock.
 */
void
xfsbdstrat(
	struct xfs_mount	*mp,
	struct xfs_buf		*bp)
{
	if (XFS_FORCED_SHUTDOWN(mp)) {
		trace_xfs_bdstrat_shut(bp, _RET_IP_);
		xfs_bioerror_relse(bp);
		return;
	}

	xfs_buf_iorequest(bp);
}

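/*
 * Drop one reference from the in-flight I/O count and complete the
 * buffer once the last outstanding bio has finished.
 */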
STATIC void
_xfs_buf_ioend(
	xfs_buf_t		*bp,
	int			schedule)
{
	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
		xfs_buf_ioend(bp, schedule);
}

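/*
 * bio completion handler: record any error, invalidate the vmap alias
 * of mapped read buffers, and drop this bio's I/O reference.
 */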
STATIC void
xfs_buf_bio_end_io(
	struct bio		*bio,
	int			error)
{
	xfs_buf_t		*bp = (xfs_buf_t *)bio->bi_private;

	xfs_buf_ioerror(bp, -error);

	if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));

	_xfs_buf_ioend(bp, 1);
	bio_put(bio);
}

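/*
 * Build and submit the bios needed to cover the buffer's pages, with the
 * request type and flags chosen from the buffer state.
 */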
STATIC void
_xfs_buf_ioapply(
	xfs_buf_t		*bp)
{
	int			rw, map_i, total_nr_pages, nr_pages;
	struct bio		*bio;
	int			offset = bp->b_offset;
	int			size = bp->b_count_desired;
	sector_t		sector = bp->b_bn;

	total_nr_pages = bp->b_page_count;
	map_i = 0;

	if (bp->b_flags & XBF_WRITE) {
		if (bp->b_flags & XBF_SYNCIO)
			rw = WRITE_SYNC;
		else
			rw = WRITE;
		if (bp->b_flags & XBF_FUA)
			rw |= REQ_FUA;
		if (bp->b_flags & XBF_FLUSH)
			rw |= REQ_FLUSH;
	} else if (bp->b_flags & XBF_READ_AHEAD) {
		rw = READA;
	} else {
		rw = READ;
	}

	/* we only use the buffer cache for meta-data */
	rw |= REQ_META;

next_chunk:
	atomic_inc(&bp->b_io_remaining);
	nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
	if (nr_pages > total_nr_pages)
		nr_pages = total_nr_pages;

	bio = bio_alloc(GFP_NOIO, nr_pages);
	bio->bi_bdev = bp->b_target->bt_bdev;
	bio->bi_sector = sector;
	bio->bi_end_io = xfs_buf_bio_end_io;
	bio->bi_private = bp;

	for (; size && nr_pages; nr_pages--, map_i++) {
		int	rbytes, nbytes = PAGE_SIZE - offset;

		if (nbytes > size)
			nbytes = size;

		rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset);
		if (rbytes < nbytes)
			break;

		offset = 0;
		sector += nbytes >> BBSHIFT;
		size -= nbytes;
		total_nr_pages--;
	}

	if (likely(bio->bi_size)) {
		if (xfs_buf_is_vmapped(bp)) {
			flush_kernel_vmap_range(bp->b_addr,
						xfs_buf_vmap_len(bp));
		}
		submit_bio(rw, bio);
		if (size)
			goto next_chunk;
	} else {
		xfs_buf_ioerror(bp, EIO);
		bio_put(bio);
	}
}

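/*
 * Initiate I/O on a buffer based on the flags that are set on it.
 * Completion is signalled through xfs_buf_ioend().
 */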
void
xfs_buf_iorequest(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_iorequest(bp, _RET_IP_);

	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));

	if (bp->b_flags & XBF_WRITE)
		xfs_buf_wait_unpin(bp);
	xfs_buf_hold(bp);

	/* Set the count to 1 initially; this will stop an I/O
	 * completion callout which happens before we have started
	 * all the I/O from calling xfs_buf_ioend too early.
	 */
	atomic_set(&bp->b_io_remaining, 1);
	_xfs_buf_ioapply(bp);
	_xfs_buf_ioend(bp, 0);

	xfs_buf_rele(bp);
}

/*
 * Waits for I/O to complete on the buffer supplied.  It returns immediately if
 * no I/O is pending or there is already a pending error on the buffer.  It
 * returns the I/O error code, if any, or 0 if there was no error.
 */
int
xfs_buf_iowait(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_iowait(bp, _RET_IP_);

	if (!bp->b_error)
		wait_for_completion(&bp->b_iowait);

	trace_xfs_buf_iowait_done(bp, _RET_IP_);
	return bp->b_error;
}

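/*
 * Return a kernel address for the given byte offset into the buffer,
 * going through the page array when the buffer is not mapped.
 */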
xfs_caddr_t
xfs_buf_offset(
	xfs_buf_t		*bp,
	size_t			offset)
{
	struct page		*page;

	if (bp->b_flags & XBF_MAPPED)
		return bp->b_addr + offset;

	offset += bp->b_offset;
	page = bp->b_pages[offset >> PAGE_SHIFT];
	return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
}

/*
 *	Move data into or out of a buffer.
 */
void
xfs_buf_iomove(
	xfs_buf_t		*bp,	/* buffer to process		*/
	size_t			boff,	/* starting buffer offset	*/
	size_t			bsize,	/* length to copy		*/
	void			*data,	/* data address			*/
	xfs_buf_rw_t		mode)	/* read/write/zero flag		*/
{
	size_t			bend, cpoff, csize;
	struct page		*page;

	bend = boff + bsize;
	while (boff < bend) {
		page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
		cpoff = xfs_buf_poff(boff + bp->b_offset);
		csize = min_t(size_t,
			      PAGE_SIZE-cpoff, bp->b_count_desired-boff);

		ASSERT(((csize + cpoff) <= PAGE_SIZE));

		switch (mode) {
		case XBRW_ZERO:
			memset(page_address(page) + cpoff, 0, csize);
			break;
		case XBRW_READ:
			memcpy(data, page_address(page) + cpoff, csize);
			break;
		case XBRW_WRITE:
			memcpy(page_address(page) + cpoff, data, csize);
		}

		boff += csize;
		data += csize;
	}
}

/*
 *	Handling of buffer targets (buftargs).
 */

/*
 * Wait for any bufs with callbacks that have been submitted but have not yet
 * returned. These buffers will have an elevated hold count, so wait on those
 * while freeing all the buffers only held by the LRU.
 */
void
xfs_wait_buftarg(
	struct xfs_buftarg	*btp)
{
	struct xfs_buf		*bp;

restart:
	spin_lock(&btp->bt_lru_lock);
	while (!list_empty(&btp->bt_lru)) {
		bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
		if (atomic_read(&bp->b_hold) > 1) {
			spin_unlock(&btp->bt_lru_lock);
			delay(100);
			goto restart;
		}
		/*
		 * clear the LRU reference count so the buffer doesn't get
		 * ignored in xfs_buf_rele().
		 */
		atomic_set(&bp->b_lru_ref, 0);
		spin_unlock(&btp->bt_lru_lock);
		xfs_buf_rele(bp);
		spin_lock(&btp->bt_lru_lock);
	}
	spin_unlock(&btp->bt_lru_lock);
}

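/*
 * Shrinker callback for the buffer LRU: give buffers that still have a
 * b_lru_ref count another trip around the LRU and reclaim the rest,
 * returning the number of buffers remaining on the list.
 */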
int
xfs_buftarg_shrink(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_buftarg	*btp = container_of(shrink,
					struct xfs_buftarg, bt_shrinker);
	struct xfs_buf		*bp;
	int nr_to_scan = sc->nr_to_scan;
	LIST_HEAD(dispose);

	if (!nr_to_scan)
		return btp->bt_lru_nr;

	spin_lock(&btp->bt_lru_lock);
	while (!list_empty(&btp->bt_lru)) {
		if (nr_to_scan-- <= 0)
			break;

		bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);

		/*
		 * Decrement the b_lru_ref count unless the value is already
		 * zero. If the value is already zero, we need to reclaim the
		 * buffer, otherwise it gets another trip through the LRU.
		 */
		if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
			list_move_tail(&bp->b_lru, &btp->bt_lru);
			continue;
		}

		/*
		 * remove the buffer from the LRU now to avoid needing another
		 * lock round trip inside xfs_buf_rele().
		 */
		list_move(&bp->b_lru, &dispose);
		btp->bt_lru_nr--;
	}
	spin_unlock(&btp->bt_lru_lock);

	while (!list_empty(&dispose)) {
		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
		list_del_init(&bp->b_lru);
		xfs_buf_rele(bp);
	}

	return btp->bt_lru_nr;
}

void
xfs_free_buftarg(
	struct xfs_mount	*mp,
	struct xfs_buftarg	*btp)
{
	unregister_shrinker(&btp->bt_shrinker);

	if (mp->m_flags & XFS_MOUNT_BARRIER)
		xfs_blkdev_issue_flush(btp);

	kmem_free(btp);
}

STATIC int
xfs_setsize_buftarg_flags(
	xfs_buftarg_t		*btp,
	unsigned int		blocksize,
	unsigned int		sectorsize,
	int			verbose)
{
	btp->bt_bsize = blocksize;
	btp->bt_sshift = ffs(sectorsize) - 1;
	btp->bt_smask = sectorsize - 1;

	if (set_blocksize(btp->bt_bdev, sectorsize)) {
		char name[BDEVNAME_SIZE];

		bdevname(btp->bt_bdev, name);

		xfs_warn(btp->bt_mount,
			"Cannot set_blocksize to %u on device %s\n",
			sectorsize, name);
		return EINVAL;
	}

	return 0;
}

/*
 *	When allocating the initial buffer target we have not yet
 *	read in the superblock, so we don't know what sized sectors
 *	are being used at this early stage.  Play safe.
 */
STATIC int
xfs_setsize_buftarg_early(
	xfs_buftarg_t		*btp,
	struct block_device	*bdev)
{
	return xfs_setsize_buftarg_flags(btp,
			PAGE_SIZE, bdev_logical_block_size(bdev), 0);
}

int
xfs_setsize_buftarg(
	xfs_buftarg_t		*btp,
	unsigned int		blocksize,
	unsigned int		sectorsize)
{
	return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
}

xfs_buftarg_t *
xfs_alloc_buftarg(
	struct xfs_mount	*mp,
	struct block_device	*bdev,
	int			external,
	const char		*fsname)
{
	xfs_buftarg_t		*btp;

	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);

	btp->bt_mount = mp;
	btp->bt_dev = bdev->bd_dev;
	btp->bt_bdev = bdev;
	btp->bt_bdi = blk_get_backing_dev_info(bdev);
	if (!btp->bt_bdi)
		goto error;

	INIT_LIST_HEAD(&btp->bt_lru);
	spin_lock_init(&btp->bt_lru_lock);
	if (xfs_setsize_buftarg_early(btp, bdev))
		goto error;
	btp->bt_shrinker.shrink = xfs_buftarg_shrink;
	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
	register_shrinker(&btp->bt_shrinker);
	return btp;

error:
	kmem_free(btp);
	return NULL;
}

/*
 * Add a buffer to the delayed write list.
 *
 * This queues a buffer for writeout if it hasn't already been.  Note that
 * neither this routine nor the buffer list submission functions perform
 * any internal synchronization.  It is expected that the lists are thread-local
 * to the callers.
 *
 * Returns true if we queued up the buffer, or false if it had already
 * been on the buffer list.
 */
bool
xfs_buf_delwri_queue(
	struct xfs_buf		*bp,
	struct list_head	*list)
{
	ASSERT(xfs_buf_islocked(bp));
	ASSERT(!(bp->b_flags & XBF_READ));

	/*
	 * If the buffer is already marked delwri it already is queued up
	 * by someone else for immediate writeout.  Just ignore it in that
	 * case.
	 */
	if (bp->b_flags & _XBF_DELWRI_Q) {
		trace_xfs_buf_delwri_queued(bp, _RET_IP_);
		return false;
	}

	trace_xfs_buf_delwri_queue(bp, _RET_IP_);

	/*
	 * If a buffer gets written out synchronously or marked stale while it
	 * is on a delwri list we lazily remove it. To do this, the other party
	 * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
	 * It remains referenced and on the list.  In a rare corner case it
	 * might get re-added to a delwri list after the synchronous writeout,
	 * in which case we just need to re-add the flag here.
	 */
	bp->b_flags |= _XBF_DELWRI_Q;
	if (list_empty(&bp->b_list)) {
		atomic_inc(&bp->b_hold);
		list_add_tail(&bp->b_list, list);
	}

	return true;
}

/*
 * The compare function is more complex than it needs to be because
 * the return value is only 32 bits and we are doing comparisons
 * on 64 bit values.
 */
static int
xfs_buf_cmp(
	void		*priv,
	struct list_head *a,
	struct list_head *b)
{
	struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
	struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);
	xfs_daddr_t		diff;

	diff = ap->b_bn - bp->b_bn;
	if (diff < 0)
		return -1;
	if (diff > 0)
		return 1;
	return 0;
}

static int
__xfs_buf_delwri_submit(
	struct list_head	*buffer_list,
	struct list_head	*io_list,
	bool			wait)
{
	struct blk_plug		plug;
	struct xfs_buf		*bp, *n;
	int			pinned = 0;

	list_for_each_entry_safe(bp, n, buffer_list, b_list) {
		if (!wait) {
			if (xfs_buf_ispinned(bp)) {
				pinned++;
				continue;
			}
			if (!xfs_buf_trylock(bp))
				continue;
		} else {
			xfs_buf_lock(bp);
		}

		/*
		 * Someone else might have written the buffer synchronously or
		 * marked it stale in the meantime.  In that case only the
		 * _XBF_DELWRI_Q flag got cleared, and we have to drop the
		 * reference and remove it from the list here.
		 */
		if (!(bp->b_flags & _XBF_DELWRI_Q)) {
			list_del_init(&bp->b_list);
			xfs_buf_relse(bp);
			continue;
		}

		list_move_tail(&bp->b_list, io_list);
		trace_xfs_buf_delwri_split(bp, _RET_IP_);
	}

	list_sort(NULL, io_list, xfs_buf_cmp);

	blk_start_plug(&plug);
	list_for_each_entry_safe(bp, n, io_list, b_list) {
		bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC);
		bp->b_flags |= XBF_WRITE;

		if (!wait) {
			bp->b_flags |= XBF_ASYNC;
			list_del_init(&bp->b_list);
		}
		xfs_bdstrat_cb(bp);
	}
	blk_finish_plug(&plug);

	return pinned;
}

/*
 * Write out a buffer list asynchronously.
 *
 * This will take the @buffer_list, write all non-locked and non-pinned buffers
 * out and not wait for I/O completion on any of the buffers.  This interface
 * is only safely usable for callers that can track I/O completion by higher
 * level means, e.g. AIL pushing as the @buffer_list is consumed in this
 * function.
 */
int
xfs_buf_delwri_submit_nowait(
	struct list_head	*buffer_list)
{
	LIST_HEAD		(io_list);
	return __xfs_buf_delwri_submit(buffer_list, &io_list, false);
}

/*
 * Write out a buffer list synchronously.
 *
 * This will take the @buffer_list, write all buffers out and wait for I/O
 * completion on all of the buffers. @buffer_list is consumed by the function,
 * so callers must have some other way of tracking buffers if they require such
 * functionality.
 */
int
xfs_buf_delwri_submit(
	struct list_head	*buffer_list)
{
	LIST_HEAD		(io_list);
	int			error = 0, error2;
	struct xfs_buf		*bp;

	__xfs_buf_delwri_submit(buffer_list, &io_list, true);

	/* Wait for IO to complete. */
	while (!list_empty(&io_list)) {
		bp = list_first_entry(&io_list, struct xfs_buf, b_list);

		list_del_init(&bp->b_list);
		error2 = xfs_buf_iowait(bp);
		xfs_buf_relse(bp);
		if (!error)
			error = error2;
	}

	return error;
}

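/*
 * Set up the buffer cache at module initialisation time: create the
 * buffer zone and the xfslogd workqueue used for I/O completion.
 */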
int __init
xfs_buf_init(void)
{
	xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
						KM_ZONE_HWALIGN, NULL);
	if (!xfs_buf_zone)
		goto out;

	xfslogd_workqueue = alloc_workqueue("xfslogd",
					WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
	if (!xfslogd_workqueue)
		goto out_free_buf_zone;

	return 0;

 out_free_buf_zone:
	kmem_zone_destroy(xfs_buf_zone);
 out:
	return -ENOMEM;
}

void
xfs_buf_terminate(void)
{
	destroy_workqueue(xfslogd_workqueue);
	kmem_zone_destroy(xfs_buf_zone);
}