/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include "xfs.h"
#include <linux/stddef.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/bio.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
#include <linux/percpu.h>
#include <linux/blkdev.h>
#include <linux/hash.h>
#include <linux/kthread.h>
#include <linux/migrate.h>
#include <linux/backing-dev.h>
#include <linux/freezer.h>

#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_trace.h"
#include "xfs_log.h"

/* Zone that struct xfs_buf objects are allocated from and freed to. */
static kmem_zone_t *xfs_buf_zone;

/*
 * Lock owner tracking: when XFS_BUF_LOCK_TRACKING is defined the pid of the
 * last task to take the buffer lock is recorded in b_last_holder; otherwise
 * the macros expand to empty statements.
 */
#ifdef XFS_BUF_LOCK_TRACKING
# define XB_SET_OWNER(bp)	((bp)->b_last_holder = current->pid)
# define XB_CLEAR_OWNER(bp)	((bp)->b_last_holder = -1)
# define XB_GET_OWNER(bp)	((bp)->b_last_holder)
#else
# define XB_SET_OWNER(bp)	do { } while (0)
# define XB_CLEAR_OWNER(bp)	do { } while (0)
# define XB_GET_OWNER(bp)	do { } while (0)
#endif

/*
 * Translate buffer flags into a page allocation mask: readahead requests
 * use __GFP_NORETRY, everything else GFP_NOFS; __GFP_NOWARN is always set.
 */
#define xb_to_gfp(flags) \
	((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)
L
Linus Torvalds 已提交
59 60


61 62 63 64 65 66 67
static inline int
xfs_buf_is_vmapped(
	struct xfs_buf	*bp)
{
	/*
	 * Return true if the buffer is vmapped.
	 *
68 69 70
	 * b_addr is null if the buffer is not mapped, but the code is clever
	 * enough to know it doesn't have to map a single page, so the check has
	 * to be both for b_addr and bp->b_page_count > 1.
71
	 */
72
	return bp->b_addr && bp->b_page_count > 1;
73 74 75 76 77 78 79 80 81
}

/*
 * Length in bytes of the buffer's page span, measured from b_offset within
 * the first page to the end of the last page.
 */
static inline int
xfs_buf_vmap_len(
	struct xfs_buf	*bp)
{
	return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
}

82 83 84 85 86 87 88 89 90 91 92 93
/*
 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
 * b_lru_ref count so that the buffer is freed immediately when the buffer
 * reference count falls to zero. If the buffer is already on the LRU, we need
 * to remove the reference that LRU holds on the buffer.
 *
 * This prevents build-up of stale buffers on the LRU.
 */
void
xfs_buf_stale(
	struct xfs_buf	*bp)
{
94 95
	ASSERT(xfs_buf_islocked(bp));

96
	bp->b_flags |= XBF_STALE;
97 98 99 100 101 102 103 104

	/*
	 * Clear the delwri status so that a delwri queue walker will not
	 * flush this buffer to disk now that it is stale. The delwri queue has
	 * a reference to the buffer, so this is safe to do.
	 */
	bp->b_flags &= ~_XBF_DELWRI_Q;

105 106 107
	spin_lock(&bp->b_lock);
	atomic_set(&bp->b_lru_ref, 0);
	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
108 109 110
	    (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
		atomic_dec(&bp->b_hold);

111
	ASSERT(atomic_read(&bp->b_hold) >= 1);
112
	spin_unlock(&bp->b_lock);
113
}
L
Linus Torvalds 已提交
114

115 116 117 118 119 120 121 122 123
static int
xfs_buf_get_maps(
	struct xfs_buf		*bp,
	int			map_count)
{
	ASSERT(bp->b_maps == NULL);
	bp->b_map_count = map_count;

	if (map_count == 1) {
124
		bp->b_maps = &bp->__b_map;
125 126 127 128 129 130
		return 0;
	}

	bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
				KM_NOFS);
	if (!bp->b_maps)
D
Dave Chinner 已提交
131
		return -ENOMEM;
132 133 134 135 136 137 138 139 140 141
	return 0;
}

/*
 *	Frees b_pages if it was allocated.
 */
static void
xfs_buf_free_maps(
	struct xfs_buf	*bp)
{
142
	if (bp->b_maps != &bp->__b_map) {
143 144 145 146 147
		kmem_free(bp->b_maps);
		bp->b_maps = NULL;
	}
}

148
struct xfs_buf *
149
_xfs_buf_alloc(
150
	struct xfs_buftarg	*target,
151 152
	struct xfs_buf_map	*map,
	int			nmaps,
153
	xfs_buf_flags_t		flags)
L
Linus Torvalds 已提交
154
{
155
	struct xfs_buf		*bp;
156 157
	int			error;
	int			i;
158

D
Dave Chinner 已提交
159
	bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
160 161 162
	if (unlikely(!bp))
		return NULL;

L
Linus Torvalds 已提交
163
	/*
164 165
	 * We don't want certain flags to appear in b_flags unless they are
	 * specifically set by later operations on the buffer.
L
Linus Torvalds 已提交
166
	 */
167
	flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
168 169

	atomic_set(&bp->b_hold, 1);
170
	atomic_set(&bp->b_lru_ref, 1);
171
	init_completion(&bp->b_iowait);
172
	INIT_LIST_HEAD(&bp->b_lru);
173
	INIT_LIST_HEAD(&bp->b_list);
174
	RB_CLEAR_NODE(&bp->b_rbnode);
T
Thomas Gleixner 已提交
175
	sema_init(&bp->b_sema, 0); /* held, no waiters */
176
	spin_lock_init(&bp->b_lock);
177 178
	XB_SET_OWNER(bp);
	bp->b_target = target;
179
	bp->b_flags = flags;
D
Dave Chinner 已提交
180

L
Linus Torvalds 已提交
181
	/*
182 183
	 * Set length and io_length to the same value initially.
	 * I/O routines should use io_length, which will be the same in
L
Linus Torvalds 已提交
184 185
	 * most cases but may be reset (e.g. XFS recovery).
	 */
186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
	error = xfs_buf_get_maps(bp, nmaps);
	if (error)  {
		kmem_zone_free(xfs_buf_zone, bp);
		return NULL;
	}

	bp->b_bn = map[0].bm_bn;
	bp->b_length = 0;
	for (i = 0; i < nmaps; i++) {
		bp->b_maps[i].bm_bn = map[i].bm_bn;
		bp->b_maps[i].bm_len = map[i].bm_len;
		bp->b_length += map[i].bm_len;
	}
	bp->b_io_length = bp->b_length;

201 202 203 204
	atomic_set(&bp->b_pin_count, 0);
	init_waitqueue_head(&bp->b_waiters);

	XFS_STATS_INC(xb_create);
C
Christoph Hellwig 已提交
205
	trace_xfs_buf_init(bp, _RET_IP_);
206 207

	return bp;
L
Linus Torvalds 已提交
208 209 210
}

/*
211 212
 *	Allocate a page array capable of holding a specified number
 *	of pages, and point the page buf at it.
L
Linus Torvalds 已提交
213 214
 */
STATIC int
215 216
_xfs_buf_get_pages(
	xfs_buf_t		*bp,
217
	int			page_count)
L
Linus Torvalds 已提交
218 219
{
	/* Make sure that we have a page list */
220 221 222 223
	if (bp->b_pages == NULL) {
		bp->b_page_count = page_count;
		if (page_count <= XB_PAGES) {
			bp->b_pages = bp->b_page_array;
L
Linus Torvalds 已提交
224
		} else {
225
			bp->b_pages = kmem_alloc(sizeof(struct page *) *
D
Dave Chinner 已提交
226
						 page_count, KM_NOFS);
227
			if (bp->b_pages == NULL)
L
Linus Torvalds 已提交
228 229
				return -ENOMEM;
		}
230
		memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
L
Linus Torvalds 已提交
231 232 233 234 235
	}
	return 0;
}

/*
236
 *	Frees b_pages if it was allocated.
L
Linus Torvalds 已提交
237 238
 */
STATIC void
239
_xfs_buf_free_pages(
L
Linus Torvalds 已提交
240 241
	xfs_buf_t	*bp)
{
242
	if (bp->b_pages != bp->b_page_array) {
243
		kmem_free(bp->b_pages);
244
		bp->b_pages = NULL;
L
Linus Torvalds 已提交
245 246 247 248 249 250 251
	}
}

/*
 *	Releases the specified buffer.
 *
 * 	The modification state of any associated pages is left unchanged.
252
 * 	The buffer must not be on any hash - use xfs_buf_rele instead for
L
Linus Torvalds 已提交
253 254 255
 * 	hashed and refcounted buffers
 */
void
256
xfs_buf_free(
L
Linus Torvalds 已提交
257 258
	xfs_buf_t		*bp)
{
C
Christoph Hellwig 已提交
259
	trace_xfs_buf_free(bp, _RET_IP_);
L
Linus Torvalds 已提交
260

261 262
	ASSERT(list_empty(&bp->b_lru));

263
	if (bp->b_flags & _XBF_PAGES) {
L
Linus Torvalds 已提交
264 265
		uint		i;

266
		if (xfs_buf_is_vmapped(bp))
A
Alex Elder 已提交
267 268
			vm_unmap_ram(bp->b_addr - bp->b_offset,
					bp->b_page_count);
L
Linus Torvalds 已提交
269

270 271 272
		for (i = 0; i < bp->b_page_count; i++) {
			struct page	*page = bp->b_pages[i];

273
			__free_page(page);
274
		}
275 276
	} else if (bp->b_flags & _XBF_KMEM)
		kmem_free(bp->b_addr);
277
	_xfs_buf_free_pages(bp);
278
	xfs_buf_free_maps(bp);
279
	kmem_zone_free(xfs_buf_zone, bp);
L
Linus Torvalds 已提交
280 281 282
}

/*
283
 * Allocates all the pages for buffer in question and builds it's page list.
L
Linus Torvalds 已提交
284 285
 */
STATIC int
286
xfs_buf_allocate_memory(
L
Linus Torvalds 已提交
287 288 289
	xfs_buf_t		*bp,
	uint			flags)
{
290
	size_t			size;
L
Linus Torvalds 已提交
291
	size_t			nbytes, offset;
292
	gfp_t			gfp_mask = xb_to_gfp(flags);
L
Linus Torvalds 已提交
293
	unsigned short		page_count, i;
D
Dave Chinner 已提交
294
	xfs_off_t		start, end;
L
Linus Torvalds 已提交
295 296
	int			error;

297 298 299 300 301
	/*
	 * for buffers that are contained within a single page, just allocate
	 * the memory from the heap - there's no need for the complexity of
	 * page arrays to keep allocation down to order 0.
	 */
D
Dave Chinner 已提交
302 303
	size = BBTOB(bp->b_length);
	if (size < PAGE_SIZE) {
D
Dave Chinner 已提交
304
		bp->b_addr = kmem_alloc(size, KM_NOFS);
305 306 307 308 309
		if (!bp->b_addr) {
			/* low memory - use alloc_page loop instead */
			goto use_alloc_page;
		}

D
Dave Chinner 已提交
310
		if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
311 312 313 314 315 316 317 318 319 320
		    ((unsigned long)bp->b_addr & PAGE_MASK)) {
			/* b_addr spans two pages - use alloc_page instead */
			kmem_free(bp->b_addr);
			bp->b_addr = NULL;
			goto use_alloc_page;
		}
		bp->b_offset = offset_in_page(bp->b_addr);
		bp->b_pages = bp->b_page_array;
		bp->b_pages[0] = virt_to_page(bp->b_addr);
		bp->b_page_count = 1;
321
		bp->b_flags |= _XBF_KMEM;
322 323 324 325
		return 0;
	}

use_alloc_page:
326 327
	start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT;
	end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1)
328
								>> PAGE_SHIFT;
D
Dave Chinner 已提交
329
	page_count = end - start;
330
	error = _xfs_buf_get_pages(bp, page_count);
L
Linus Torvalds 已提交
331 332 333
	if (unlikely(error))
		return error;

334
	offset = bp->b_offset;
335
	bp->b_flags |= _XBF_PAGES;
L
Linus Torvalds 已提交
336

337
	for (i = 0; i < bp->b_page_count; i++) {
L
Linus Torvalds 已提交
338 339
		struct page	*page;
		uint		retries = 0;
340 341
retry:
		page = alloc_page(gfp_mask);
L
Linus Torvalds 已提交
342
		if (unlikely(page == NULL)) {
343 344
			if (flags & XBF_READ_AHEAD) {
				bp->b_page_count = i;
D
Dave Chinner 已提交
345
				error = -ENOMEM;
346
				goto out_free_pages;
L
Linus Torvalds 已提交
347 348 349 350 351 352 353 354 355
			}

			/*
			 * This could deadlock.
			 *
			 * But until all the XFS lowlevel code is revamped to
			 * handle buffer allocation failures we can't do much.
			 */
			if (!(++retries % 100))
356 357
				xfs_err(NULL,
		"possible memory allocation deadlock in %s (mode:0x%x)",
358
					__func__, gfp_mask);
L
Linus Torvalds 已提交
359

360
			XFS_STATS_INC(xb_page_retries);
361
			congestion_wait(BLK_RW_ASYNC, HZ/50);
L
Linus Torvalds 已提交
362 363 364
			goto retry;
		}

365
		XFS_STATS_INC(xb_page_found);
L
Linus Torvalds 已提交
366

367
		nbytes = min_t(size_t, size, PAGE_SIZE - offset);
L
Linus Torvalds 已提交
368
		size -= nbytes;
369
		bp->b_pages[i] = page;
L
Linus Torvalds 已提交
370 371
		offset = 0;
	}
372
	return 0;
L
Linus Torvalds 已提交
373

374 375 376
out_free_pages:
	for (i = 0; i < bp->b_page_count; i++)
		__free_page(bp->b_pages[i]);
L
Linus Torvalds 已提交
377 378 379 380
	return error;
}

/*
L
Lucas De Marchi 已提交
381
 *	Map buffer into kernel address-space if necessary.
L
Linus Torvalds 已提交
382 383
 */
STATIC int
384
_xfs_buf_map_pages(
L
Linus Torvalds 已提交
385 386 387
	xfs_buf_t		*bp,
	uint			flags)
{
388
	ASSERT(bp->b_flags & _XBF_PAGES);
389
	if (bp->b_page_count == 1) {
390
		/* A single page buffer is always mappable */
391
		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
392 393 394
	} else if (flags & XBF_UNMAPPED) {
		bp->b_addr = NULL;
	} else {
395
		int retried = 0;
396 397 398 399 400 401 402 403 404 405 406
		unsigned noio_flag;

		/*
		 * vm_map_ram() will allocate auxillary structures (e.g.
		 * pagetables) with GFP_KERNEL, yet we are likely to be under
		 * GFP_NOFS context here. Hence we need to tell memory reclaim
		 * that we are in such a context via PF_MEMALLOC_NOIO to prevent
		 * memory reclaim re-entering the filesystem here and
		 * potentially deadlocking.
		 */
		noio_flag = memalloc_noio_save();
407 408 409 410 411 412 413
		do {
			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
						-1, PAGE_KERNEL);
			if (bp->b_addr)
				break;
			vm_unmap_aliases();
		} while (retried++ <= 1);
414
		memalloc_noio_restore(noio_flag);
415 416

		if (!bp->b_addr)
L
Linus Torvalds 已提交
417
			return -ENOMEM;
418
		bp->b_addr += bp->b_offset;
L
Linus Torvalds 已提交
419 420 421 422 423 424 425 426 427 428
	}

	return 0;
}

/*
 *	Finding and Reading Buffers
 */

/*
429
 *	Look up, and creates if absent, a lockable buffer for
L
Linus Torvalds 已提交
430
 *	a given range of an inode.  The buffer is returned
431
 *	locked.	No I/O is implied by this call.
L
Linus Torvalds 已提交
432 433
 */
xfs_buf_t *
434
_xfs_buf_find(
435
	struct xfs_buftarg	*btp,
436 437
	struct xfs_buf_map	*map,
	int			nmaps,
438 439
	xfs_buf_flags_t		flags,
	xfs_buf_t		*new_bp)
L
Linus Torvalds 已提交
440
{
441
	size_t			numbytes;
442 443 444 445
	struct xfs_perag	*pag;
	struct rb_node		**rbp;
	struct rb_node		*parent;
	xfs_buf_t		*bp;
446
	xfs_daddr_t		blkno = map[0].bm_bn;
447
	xfs_daddr_t		eofs;
448 449
	int			numblks = 0;
	int			i;
L
Linus Torvalds 已提交
450

451 452
	for (i = 0; i < nmaps; i++)
		numblks += map[i].bm_len;
453
	numbytes = BBTOB(numblks);
L
Linus Torvalds 已提交
454 455

	/* Check for IOs smaller than the sector size / not sector aligned */
456 457
	ASSERT(!(numbytes < btp->bt_meta_sectorsize));
	ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask));
L
Linus Torvalds 已提交
458

459 460 461 462 463
	/*
	 * Corrupted block numbers can get through to here, unfortunately, so we
	 * have to check that the buffer falls within the filesystem bounds.
	 */
	eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
464
	if (blkno < 0 || blkno >= eofs) {
465
		/*
D
Dave Chinner 已提交
466
		 * XXX (dgc): we should really be returning -EFSCORRUPTED here,
467 468 469 470 471 472
		 * but none of the higher level infrastructure supports
		 * returning a specific error on buffer lookup failures.
		 */
		xfs_alert(btp->bt_mount,
			  "%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
			  __func__, blkno, eofs);
D
Dave Chinner 已提交
473
		WARN_ON(1);
474 475 476
		return NULL;
	}

477 478
	/* get tree root */
	pag = xfs_perag_get(btp->bt_mount,
479
				xfs_daddr_to_agno(btp->bt_mount, blkno));
480 481 482 483 484 485 486 487 488 489

	/* walk tree */
	spin_lock(&pag->pag_buf_lock);
	rbp = &pag->pag_buf_tree.rb_node;
	parent = NULL;
	bp = NULL;
	while (*rbp) {
		parent = *rbp;
		bp = rb_entry(parent, struct xfs_buf, b_rbnode);

D
Dave Chinner 已提交
490
		if (blkno < bp->b_bn)
491
			rbp = &(*rbp)->rb_left;
D
Dave Chinner 已提交
492
		else if (blkno > bp->b_bn)
493 494 495
			rbp = &(*rbp)->rb_right;
		else {
			/*
D
Dave Chinner 已提交
496
			 * found a block number match. If the range doesn't
497 498 499 500 501 502
			 * match, the only way this is allowed is if the buffer
			 * in the cache is stale and the transaction that made
			 * it stale has not yet committed. i.e. we are
			 * reallocating a busy extent. Skip this buffer and
			 * continue searching to the right for an exact match.
			 */
503
			if (bp->b_length != numblks) {
504 505 506 507
				ASSERT(bp->b_flags & XBF_STALE);
				rbp = &(*rbp)->rb_right;
				continue;
			}
508
			atomic_inc(&bp->b_hold);
L
Linus Torvalds 已提交
509 510 511 512 513
			goto found;
		}
	}

	/* No match found */
514
	if (new_bp) {
515 516 517 518 519
		rb_link_node(&new_bp->b_rbnode, parent, rbp);
		rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
		/* the buffer keeps the perag reference until it is freed */
		new_bp->b_pag = pag;
		spin_unlock(&pag->pag_buf_lock);
L
Linus Torvalds 已提交
520
	} else {
521
		XFS_STATS_INC(xb_miss_locked);
522 523
		spin_unlock(&pag->pag_buf_lock);
		xfs_perag_put(pag);
L
Linus Torvalds 已提交
524
	}
525
	return new_bp;
L
Linus Torvalds 已提交
526 527

found:
528 529
	spin_unlock(&pag->pag_buf_lock);
	xfs_perag_put(pag);
L
Linus Torvalds 已提交
530

531 532
	if (!xfs_buf_trylock(bp)) {
		if (flags & XBF_TRYLOCK) {
533 534 535
			xfs_buf_rele(bp);
			XFS_STATS_INC(xb_busy_locked);
			return NULL;
L
Linus Torvalds 已提交
536
		}
537 538
		xfs_buf_lock(bp);
		XFS_STATS_INC(xb_get_locked_waited);
L
Linus Torvalds 已提交
539 540
	}

541 542 543 544 545
	/*
	 * if the buffer is stale, clear all the external state associated with
	 * it. We need to keep flags such as how we allocated the buffer memory
	 * intact here.
	 */
546 547
	if (bp->b_flags & XBF_STALE) {
		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
D
Dave Chinner 已提交
548
		ASSERT(bp->b_iodone == NULL);
549
		bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
550
		bp->b_ops = NULL;
551
	}
C
Christoph Hellwig 已提交
552 553

	trace_xfs_buf_find(bp, flags, _RET_IP_);
554 555
	XFS_STATS_INC(xb_get_locked);
	return bp;
L
Linus Torvalds 已提交
556 557 558
}

/*
559 560 561
 * Assembles a buffer covering the specified range. The code is optimised for
 * cache hits, as metadata intensive workloads will see 3 orders of magnitude
 * more hits than misses.
L
Linus Torvalds 已提交
562
 */
563
struct xfs_buf *
564 565 566 567
xfs_buf_get_map(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
568
	xfs_buf_flags_t		flags)
L
Linus Torvalds 已提交
569
{
570 571
	struct xfs_buf		*bp;
	struct xfs_buf		*new_bp;
572
	int			error = 0;
L
Linus Torvalds 已提交
573

574
	bp = _xfs_buf_find(target, map, nmaps, flags, NULL);
575 576 577
	if (likely(bp))
		goto found;

578
	new_bp = _xfs_buf_alloc(target, map, nmaps, flags);
579
	if (unlikely(!new_bp))
L
Linus Torvalds 已提交
580 581
		return NULL;

582 583
	error = xfs_buf_allocate_memory(new_bp, flags);
	if (error) {
584
		xfs_buf_free(new_bp);
585 586 587
		return NULL;
	}

588
	bp = _xfs_buf_find(target, map, nmaps, flags, new_bp);
589
	if (!bp) {
590
		xfs_buf_free(new_bp);
591 592 593
		return NULL;
	}

594 595
	if (bp != new_bp)
		xfs_buf_free(new_bp);
L
Linus Torvalds 已提交
596

597
found:
598
	if (!bp->b_addr) {
599
		error = _xfs_buf_map_pages(bp, flags);
L
Linus Torvalds 已提交
600
		if (unlikely(error)) {
601
			xfs_warn(target->bt_mount,
602
				"%s: failed to map pagesn", __func__);
D
Dave Chinner 已提交
603 604
			xfs_buf_relse(bp);
			return NULL;
L
Linus Torvalds 已提交
605 606 607
		}
	}

608
	XFS_STATS_INC(xb_get);
C
Christoph Hellwig 已提交
609
	trace_xfs_buf_get(bp, flags, _RET_IP_);
610
	return bp;
L
Linus Torvalds 已提交
611 612
}

C
Christoph Hellwig 已提交
613 614 615 616 617
/*
 * Issue a read for an already set up buffer.  The READ/ASYNC/READ_AHEAD bits
 * of flags replace the corresponding bits in b_flags.  Async reads are
 * submitted and 0 is returned immediately; otherwise the result of
 * xfs_buf_submit_wait() is returned.
 */
STATIC int
_xfs_buf_read(
	xfs_buf_t		*bp,
	xfs_buf_flags_t		flags)
{
	ASSERT(!(flags & XBF_WRITE));
	ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);

	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);

	if (flags & XBF_ASYNC) {
		xfs_buf_submit(bp);
		return 0;
	}
	return xfs_buf_submit_wait(bp);
}

L
Linus Torvalds 已提交
631
xfs_buf_t *
632 633 634 635
xfs_buf_read_map(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
	int			nmaps,
636
	xfs_buf_flags_t		flags,
637
	const struct xfs_buf_ops *ops)
L
Linus Torvalds 已提交
638
{
639
	struct xfs_buf		*bp;
640 641 642

	flags |= XBF_READ;

643
	bp = xfs_buf_get_map(target, map, nmaps, flags);
644
	if (bp) {
C
Christoph Hellwig 已提交
645 646
		trace_xfs_buf_read(bp, flags, _RET_IP_);

647 648
		if (!XFS_BUF_ISDONE(bp)) {
			XFS_STATS_INC(xb_get_read);
649
			bp->b_ops = ops;
C
Christoph Hellwig 已提交
650
			_xfs_buf_read(bp, flags);
651
		} else if (flags & XBF_ASYNC) {
L
Linus Torvalds 已提交
652 653 654 655
			/*
			 * Read ahead call which is already satisfied,
			 * drop the buffer
			 */
D
Dave Chinner 已提交
656 657
			xfs_buf_relse(bp);
			return NULL;
L
Linus Torvalds 已提交
658 659
		} else {
			/* We do not want read in the flags */
660
			bp->b_flags &= ~XBF_READ;
L
Linus Torvalds 已提交
661 662 663
		}
	}

664
	return bp;
L
Linus Torvalds 已提交
665 666 667
}

/*
668 669
 *	If we are not low on memory then do the readahead in a deadlock
 *	safe manner.
L
Linus Torvalds 已提交
670 671
 */
void
672 673 674
xfs_buf_readahead_map(
	struct xfs_buftarg	*target,
	struct xfs_buf_map	*map,
675
	int			nmaps,
676
	const struct xfs_buf_ops *ops)
L
Linus Torvalds 已提交
677
{
678
	if (bdi_read_congested(target->bt_bdi))
L
Linus Torvalds 已提交
679 680
		return;

681
	xfs_buf_read_map(target, map, nmaps,
682
		     XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops);
L
Linus Torvalds 已提交
683 684
}

685 686 687 688
/*
 * Read an uncached buffer from disk. Allocates and returns a locked
 * buffer containing the disk contents or nothing.
 */
689
int
690 691 692
xfs_buf_read_uncached(
	struct xfs_buftarg	*target,
	xfs_daddr_t		daddr,
693
	size_t			numblks,
694
	int			flags,
695
	struct xfs_buf		**bpp,
696
	const struct xfs_buf_ops *ops)
697
{
698
	struct xfs_buf		*bp;
699

700 701
	*bpp = NULL;

702
	bp = xfs_buf_get_uncached(target, numblks, flags);
703
	if (!bp)
704
		return -ENOMEM;
705 706

	/* set up the buffer for a read IO */
707
	ASSERT(bp->b_map_count == 1);
708
	bp->b_bn = XFS_BUF_DADDR_NULL;  /* always null for uncached buffers */
709
	bp->b_maps[0].bm_bn = daddr;
710
	bp->b_flags |= XBF_READ;
711
	bp->b_ops = ops;
712

713
	xfs_buf_submit_wait(bp);
714 715
	if (bp->b_error) {
		int	error = bp->b_error;
C
Christoph Hellwig 已提交
716
		xfs_buf_relse(bp);
717
		return error;
C
Christoph Hellwig 已提交
718
	}
719 720 721

	*bpp = bp;
	return 0;
L
Linus Torvalds 已提交
722 723
}

724 725 726 727 728 729 730
/*
 * Return a buffer allocated as an empty buffer and associated to external
 * memory via xfs_buf_associate_memory() back to it's empty state.
 */
void
xfs_buf_set_empty(
	struct xfs_buf		*bp,
731
	size_t			numblks)
732 733 734 735 736 737 738
{
	if (bp->b_pages)
		_xfs_buf_free_pages(bp);

	bp->b_pages = NULL;
	bp->b_page_count = 0;
	bp->b_addr = NULL;
739
	bp->b_length = numblks;
740
	bp->b_io_length = numblks;
741 742

	ASSERT(bp->b_map_count == 1);
743
	bp->b_bn = XFS_BUF_DADDR_NULL;
744 745
	bp->b_maps[0].bm_bn = XFS_BUF_DADDR_NULL;
	bp->b_maps[0].bm_len = bp->b_length;
746 747
}

L
Linus Torvalds 已提交
748 749 750 751
/*
 * Translate a kernel virtual address to its struct page, using
 * vmalloc_to_page() for vmalloc addresses and virt_to_page() otherwise.
 */
static inline struct page *
mem_to_page(
	void			*addr)
{
	if ((!is_vmalloc_addr(addr))) {
		return virt_to_page(addr);
	} else {
		return vmalloc_to_page(addr);
	}
}

int
760 761
xfs_buf_associate_memory(
	xfs_buf_t		*bp,
L
Linus Torvalds 已提交
762 763 764 765 766
	void			*mem,
	size_t			len)
{
	int			rval;
	int			i = 0;
767 768 769
	unsigned long		pageaddr;
	unsigned long		offset;
	size_t			buflen;
L
Linus Torvalds 已提交
770 771
	int			page_count;

772
	pageaddr = (unsigned long)mem & PAGE_MASK;
773
	offset = (unsigned long)mem - pageaddr;
774 775
	buflen = PAGE_ALIGN(len + offset);
	page_count = buflen >> PAGE_SHIFT;
L
Linus Torvalds 已提交
776 777

	/* Free any previous set of page pointers */
778 779
	if (bp->b_pages)
		_xfs_buf_free_pages(bp);
L
Linus Torvalds 已提交
780

781 782
	bp->b_pages = NULL;
	bp->b_addr = mem;
L
Linus Torvalds 已提交
783

784
	rval = _xfs_buf_get_pages(bp, page_count);
L
Linus Torvalds 已提交
785 786 787
	if (rval)
		return rval;

788
	bp->b_offset = offset;
789 790 791

	for (i = 0; i < bp->b_page_count; i++) {
		bp->b_pages[i] = mem_to_page((void *)pageaddr);
792
		pageaddr += PAGE_SIZE;
L
Linus Torvalds 已提交
793 794
	}

795
	bp->b_io_length = BTOBB(len);
796
	bp->b_length = BTOBB(buflen);
L
Linus Torvalds 已提交
797 798 799 800 801

	return 0;
}

xfs_buf_t *
802 803
xfs_buf_get_uncached(
	struct xfs_buftarg	*target,
804
	size_t			numblks,
805
	int			flags)
L
Linus Torvalds 已提交
806
{
807
	unsigned long		page_count;
808
	int			error, i;
809 810
	struct xfs_buf		*bp;
	DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
L
Linus Torvalds 已提交
811

812
	bp = _xfs_buf_alloc(target, &map, 1, 0);
L
Linus Torvalds 已提交
813 814 815
	if (unlikely(bp == NULL))
		goto fail;

816
	page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
817
	error = _xfs_buf_get_pages(bp, page_count);
818
	if (error)
L
Linus Torvalds 已提交
819 820
		goto fail_free_buf;

821
	for (i = 0; i < page_count; i++) {
822
		bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
823 824
		if (!bp->b_pages[i])
			goto fail_free_mem;
L
Linus Torvalds 已提交
825
	}
826
	bp->b_flags |= _XBF_PAGES;
L
Linus Torvalds 已提交
827

828
	error = _xfs_buf_map_pages(bp, 0);
829
	if (unlikely(error)) {
830
		xfs_warn(target->bt_mount,
831
			"%s: failed to map pages", __func__);
L
Linus Torvalds 已提交
832
		goto fail_free_mem;
833
	}
L
Linus Torvalds 已提交
834

835
	trace_xfs_buf_get_uncached(bp, _RET_IP_);
L
Linus Torvalds 已提交
836
	return bp;
837

L
Linus Torvalds 已提交
838
 fail_free_mem:
839 840
	while (--i >= 0)
		__free_page(bp->b_pages[i]);
841
	_xfs_buf_free_pages(bp);
L
Linus Torvalds 已提交
842
 fail_free_buf:
843
	xfs_buf_free_maps(bp);
844
	kmem_zone_free(xfs_buf_zone, bp);
L
Linus Torvalds 已提交
845 846 847 848 849 850 851 852 853 854
 fail:
	return NULL;
}

/*
 *	Increment reference count on buffer, to hold the buffer concurrently
 *	with another thread which may release (free) the buffer asynchronously.
 *	Must hold the buffer already to call this function.
 */
void
855 856
xfs_buf_hold(
	xfs_buf_t		*bp)
L
Linus Torvalds 已提交
857
{
C
Christoph Hellwig 已提交
858
	trace_xfs_buf_hold(bp, _RET_IP_);
859
	atomic_inc(&bp->b_hold);
L
Linus Torvalds 已提交
860 861 862
}

/*
863 864
 *	Releases a hold on the specified buffer.  If the
 *	the hold count is 1, calls xfs_buf_free.
L
Linus Torvalds 已提交
865 866
 */
void
867 868
xfs_buf_rele(
	xfs_buf_t		*bp)
L
Linus Torvalds 已提交
869
{
870
	struct xfs_perag	*pag = bp->b_pag;
L
Linus Torvalds 已提交
871

C
Christoph Hellwig 已提交
872
	trace_xfs_buf_rele(bp, _RET_IP_);
L
Linus Torvalds 已提交
873

874
	if (!pag) {
875
		ASSERT(list_empty(&bp->b_lru));
876
		ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
877 878 879 880 881
		if (atomic_dec_and_test(&bp->b_hold))
			xfs_buf_free(bp);
		return;
	}

882
	ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
883

884
	ASSERT(atomic_read(&bp->b_hold) > 0);
885
	if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
886 887 888 889 890 891 892 893 894 895 896 897
		spin_lock(&bp->b_lock);
		if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
			/*
			 * If the buffer is added to the LRU take a new
			 * reference to the buffer for the LRU and clear the
			 * (now stale) dispose list state flag
			 */
			if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
				bp->b_state &= ~XFS_BSTATE_DISPOSE;
				atomic_inc(&bp->b_hold);
			}
			spin_unlock(&bp->b_lock);
898
			spin_unlock(&pag->pag_buf_lock);
L
Linus Torvalds 已提交
899
		} else {
900 901 902 903 904 905 906 907 908 909 910 911 912
			/*
			 * most of the time buffers will already be removed from
			 * the LRU, so optimise that case by checking for the
			 * XFS_BSTATE_DISPOSE flag indicating the last list the
			 * buffer was on was the disposal list
			 */
			if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
				list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
			} else {
				ASSERT(list_empty(&bp->b_lru));
			}
			spin_unlock(&bp->b_lock);

913
			ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
914 915 916
			rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
			spin_unlock(&pag->pag_buf_lock);
			xfs_perag_put(pag);
917
			xfs_buf_free(bp);
L
Linus Torvalds 已提交
918 919 920 921 922 923
		}
	}
}


/*
924
 *	Lock a buffer object, if it is not already locked.
925 926 927 928 929 930 931 932
 *
 *	If we come across a stale, pinned, locked buffer, we know that we are
 *	being asked to lock a buffer that has been reallocated. Because it is
 *	pinned, we know that the log has not been pushed to disk and hence it
 *	will still be locked.  Rather than continuing to have trylock attempts
 *	fail until someone else pushes the log, push it ourselves before
 *	returning.  This means that the xfsaild will not get stuck trying
 *	to push on stale inode buffers.
L
Linus Torvalds 已提交
933 934
 */
int
935 936
xfs_buf_trylock(
	struct xfs_buf		*bp)
L
Linus Torvalds 已提交
937 938 939
{
	int			locked;

940
	locked = down_trylock(&bp->b_sema) == 0;
C
Christoph Hellwig 已提交
941
	if (locked)
942
		XB_SET_OWNER(bp);
C
Christoph Hellwig 已提交
943

944 945
	trace_xfs_buf_trylock(bp, _RET_IP_);
	return locked;
L
Linus Torvalds 已提交
946 947 948
}

/*
949
 *	Lock a buffer object.
950 951 952 953 954 955
 *
 *	If we come across a stale, pinned, locked buffer, we know that we
 *	are being asked to lock a buffer that has been reallocated. Because
 *	it is pinned, we know that the log has not been pushed to disk and
 *	hence it will still be locked. Rather than sleeping until someone
 *	else pushes the log, push it ourselves before trying to get the lock.
L
Linus Torvalds 已提交
956
 */
957 958
void
xfs_buf_lock(
959
	struct xfs_buf		*bp)
L
Linus Torvalds 已提交
960
{
C
Christoph Hellwig 已提交
961 962
	trace_xfs_buf_lock(bp, _RET_IP_);

963
	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
964
		xfs_log_force(bp->b_target->bt_mount, 0);
965 966
	down(&bp->b_sema);
	XB_SET_OWNER(bp);
C
Christoph Hellwig 已提交
967 968

	trace_xfs_buf_lock_done(bp, _RET_IP_);
L
Linus Torvalds 已提交
969 970 971
}

void
972
xfs_buf_unlock(
973
	struct xfs_buf		*bp)
L
Linus Torvalds 已提交
974
{
975 976
	XB_CLEAR_OWNER(bp);
	up(&bp->b_sema);
C
Christoph Hellwig 已提交
977 978

	trace_xfs_buf_unlock(bp, _RET_IP_);
L
Linus Torvalds 已提交
979 980
}

981 982 983
/*
 * Sleep uninterruptibly on b_waiters until the buffer's pin count reaches
 * zero.  Returns immediately if the buffer is not pinned.
 */
STATIC void
xfs_buf_wait_unpin(
	xfs_buf_t		*bp)
{
	DECLARE_WAITQUEUE	(wait, current);

	if (atomic_read(&bp->b_pin_count) == 0)
		return;

	add_wait_queue(&bp->b_waiters, &wait);
	for (;;) {
		/* set the state before re-checking so a wakeup is not lost */
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (atomic_read(&bp->b_pin_count) == 0)
			break;
		io_schedule();
	}
	remove_wait_queue(&bp->b_waiters, &wait);
	set_current_state(TASK_RUNNING);
}

/*
 *	Buffer Utility Routines
 */

1005 1006 1007
void
xfs_buf_ioend(
	struct xfs_buf	*bp)
L
Linus Torvalds 已提交
1008
{
1009 1010 1011
	bool		read = bp->b_flags & XBF_READ;

	trace_xfs_buf_iodone(bp, _RET_IP_);
1012 1013

	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1014

1015 1016 1017 1018 1019 1020 1021
	/*
	 * Pull in IO completion errors now. We are guaranteed to be running
	 * single threaded, so we don't need the lock to read b_io_error.
	 */
	if (!bp->b_error && bp->b_io_error)
		xfs_buf_ioerror(bp, bp->b_io_error);

1022 1023 1024
	/* Only validate buffers that were read without errors */
	if (read && !bp->b_error && bp->b_ops) {
		ASSERT(!bp->b_iodone);
1025
		bp->b_ops->verify_read(bp);
1026 1027 1028 1029
	}

	if (!bp->b_error)
		bp->b_flags |= XBF_DONE;
L
Linus Torvalds 已提交
1030

1031
	if (bp->b_iodone)
1032 1033
		(*(bp->b_iodone))(bp);
	else if (bp->b_flags & XBF_ASYNC)
L
Linus Torvalds 已提交
1034
		xfs_buf_relse(bp);
1035
	else
1036
		complete(&bp->b_iowait);
L
Linus Torvalds 已提交
1037 1038
}

1039 1040 1041
static void
xfs_buf_ioend_work(
	struct work_struct	*work)
L
Linus Torvalds 已提交
1042
{
1043
	struct xfs_buf		*bp =
1044
		container_of(work, xfs_buf_t, b_ioend_work);
C
Christoph Hellwig 已提交
1045

1046 1047
	xfs_buf_ioend(bp);
}
L
Linus Torvalds 已提交
1048

1049 1050 1051 1052
void
xfs_buf_ioend_async(
	struct xfs_buf	*bp)
{
1053 1054
	INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
	queue_work(bp->b_ioend_wq, &bp->b_ioend_work);
L
Linus Torvalds 已提交
1055 1056 1057
}

void
1058 1059 1060
xfs_buf_ioerror(
	xfs_buf_t		*bp,
	int			error)
L
Linus Torvalds 已提交
1061
{
D
Dave Chinner 已提交
1062 1063
	ASSERT(error <= 0 && error >= -1000);
	bp->b_error = error;
C
Christoph Hellwig 已提交
1064
	trace_xfs_buf_ioerror(bp, error, _RET_IP_);
L
Linus Torvalds 已提交
1065 1066
}

1067 1068 1069 1070 1071 1072
void
xfs_buf_ioerror_alert(
	struct xfs_buf		*bp,
	const char		*func)
{
	xfs_alert(bp->b_target->bt_mount,
1073
"metadata I/O error: block 0x%llx (\"%s\") error %d numblks %d",
D
Dave Chinner 已提交
1074
		(__uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length);
1075 1076
}

1077 1078 1079 1080 1081 1082 1083 1084 1085
int
xfs_bwrite(
	struct xfs_buf		*bp)
{
	int			error;

	ASSERT(xfs_buf_islocked(bp));

	bp->b_flags |= XBF_WRITE;
D
Dave Chinner 已提交
1086 1087
	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
			 XBF_WRITE_FAIL | XBF_DONE);
1088

1089
	error = xfs_buf_submit_wait(bp);
1090 1091 1092 1093 1094 1095 1096
	if (error) {
		xfs_force_shutdown(bp->b_target->bt_mount,
				   SHUTDOWN_META_IO_ERROR);
	}
	return error;
}

A
Al Viro 已提交
1097
STATIC void
1098
xfs_buf_bio_end_io(
L
Linus Torvalds 已提交
1099 1100 1101
	struct bio		*bio,
	int			error)
{
1102
	xfs_buf_t		*bp = (xfs_buf_t *)bio->bi_private;
L
Linus Torvalds 已提交
1103

1104 1105 1106 1107
	/*
	 * don't overwrite existing errors - otherwise we can lose errors on
	 * buffers that require multiple bios to complete.
	 */
1108 1109 1110 1111 1112 1113
	if (error) {
		spin_lock(&bp->b_lock);
		if (!bp->b_io_error)
			bp->b_io_error = error;
		spin_unlock(&bp->b_lock);
	}
L
Linus Torvalds 已提交
1114

1115
	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1116 1117
		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));

1118 1119
	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
		xfs_buf_ioend_async(bp);
L
Linus Torvalds 已提交
1120 1121 1122
	bio_put(bio);
}

1123 1124 1125 1126 1127 1128 1129
static void
xfs_buf_ioapply_map(
	struct xfs_buf	*bp,
	int		map,
	int		*buf_offset,
	int		*count,
	int		rw)
L
Linus Torvalds 已提交
1130
{
1131 1132 1133 1134 1135 1136 1137
	int		page_index;
	int		total_nr_pages = bp->b_page_count;
	int		nr_pages;
	struct bio	*bio;
	sector_t	sector =  bp->b_maps[map].bm_bn;
	int		size;
	int		offset;
L
Linus Torvalds 已提交
1138

1139
	total_nr_pages = bp->b_page_count;
L
Linus Torvalds 已提交
1140

1141 1142 1143 1144 1145 1146
	/* skip the pages in the buffer before the start offset */
	page_index = 0;
	offset = *buf_offset;
	while (offset >= PAGE_SIZE) {
		page_index++;
		offset -= PAGE_SIZE;
1147 1148
	}

1149 1150 1151 1152 1153 1154 1155
	/*
	 * Limit the IO size to the length of the current vector, and update the
	 * remaining IO count for the next time around.
	 */
	size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
	*count -= size;
	*buf_offset += size;
1156

L
Linus Torvalds 已提交
1157
next_chunk:
1158
	atomic_inc(&bp->b_io_remaining);
L
Linus Torvalds 已提交
1159 1160 1161 1162 1163
	nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
	if (nr_pages > total_nr_pages)
		nr_pages = total_nr_pages;

	bio = bio_alloc(GFP_NOIO, nr_pages);
1164
	bio->bi_bdev = bp->b_target->bt_bdev;
1165
	bio->bi_iter.bi_sector = sector;
1166 1167
	bio->bi_end_io = xfs_buf_bio_end_io;
	bio->bi_private = bp;
L
Linus Torvalds 已提交
1168

1169

1170
	for (; size && nr_pages; nr_pages--, page_index++) {
1171
		int	rbytes, nbytes = PAGE_SIZE - offset;
L
Linus Torvalds 已提交
1172 1173 1174 1175

		if (nbytes > size)
			nbytes = size;

1176 1177
		rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
				      offset);
1178
		if (rbytes < nbytes)
L
Linus Torvalds 已提交
1179 1180 1181
			break;

		offset = 0;
1182
		sector += BTOBB(nbytes);
L
Linus Torvalds 已提交
1183 1184 1185 1186
		size -= nbytes;
		total_nr_pages--;
	}

1187
	if (likely(bio->bi_iter.bi_size)) {
1188 1189 1190 1191
		if (xfs_buf_is_vmapped(bp)) {
			flush_kernel_vmap_range(bp->b_addr,
						xfs_buf_vmap_len(bp));
		}
L
Linus Torvalds 已提交
1192 1193 1194 1195
		submit_bio(rw, bio);
		if (size)
			goto next_chunk;
	} else {
1196 1197
		/*
		 * This is guaranteed not to be the last io reference count
1198
		 * because the caller (xfs_buf_submit) holds a count itself.
1199 1200
		 */
		atomic_dec(&bp->b_io_remaining);
D
Dave Chinner 已提交
1201
		xfs_buf_ioerror(bp, -EIO);
1202
		bio_put(bio);
L
Linus Torvalds 已提交
1203
	}
1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216

}

STATIC void
_xfs_buf_ioapply(
	struct xfs_buf	*bp)
{
	struct blk_plug	plug;
	int		rw;
	int		offset;
	int		size;
	int		i;

1217 1218 1219 1220 1221 1222
	/*
	 * Make sure we capture only current IO errors rather than stale errors
	 * left over from previous use of the buffer (e.g. failed readahead).
	 */
	bp->b_error = 0;

1223 1224 1225 1226 1227 1228 1229
	/*
	 * Initialize the I/O completion workqueue if we haven't yet or the
	 * submitter has not opted to specify a custom one.
	 */
	if (!bp->b_ioend_wq)
		bp->b_ioend_wq = bp->b_target->bt_mount->m_buf_workqueue;

1230 1231 1232 1233 1234 1235 1236 1237 1238
	if (bp->b_flags & XBF_WRITE) {
		if (bp->b_flags & XBF_SYNCIO)
			rw = WRITE_SYNC;
		else
			rw = WRITE;
		if (bp->b_flags & XBF_FUA)
			rw |= REQ_FUA;
		if (bp->b_flags & XBF_FLUSH)
			rw |= REQ_FLUSH;
1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251

		/*
		 * Run the write verifier callback function if it exists. If
		 * this function fails it will mark the buffer with an error and
		 * the IO should not be dispatched.
		 */
		if (bp->b_ops) {
			bp->b_ops->verify_write(bp);
			if (bp->b_error) {
				xfs_force_shutdown(bp->b_target->bt_mount,
						   SHUTDOWN_CORRUPT_INCORE);
				return;
			}
1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265
		} else if (bp->b_bn != XFS_BUF_DADDR_NULL) {
			struct xfs_mount *mp = bp->b_target->bt_mount;

			/*
			 * non-crc filesystems don't attach verifiers during
			 * log recovery, so don't warn for such filesystems.
			 */
			if (xfs_sb_version_hascrc(&mp->m_sb)) {
				xfs_warn(mp,
					"%s: no ops on block 0x%llx/0x%x",
					__func__, bp->b_bn, bp->b_length);
				xfs_hex_dump(bp->b_addr, 64);
				dump_stack();
			}
1266
		}
1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292
	} else if (bp->b_flags & XBF_READ_AHEAD) {
		rw = READA;
	} else {
		rw = READ;
	}

	/* we only use the buffer cache for meta-data */
	rw |= REQ_META;

	/*
	 * Walk all the vectors issuing IO on them. Set up the initial offset
	 * into the buffer and the desired IO size before we start -
	 * _xfs_buf_ioapply_vec() will modify them appropriately for each
	 * subsequent call.
	 */
	offset = bp->b_offset;
	size = BBTOB(bp->b_io_length);
	blk_start_plug(&plug);
	for (i = 0; i < bp->b_map_count; i++) {
		xfs_buf_ioapply_map(bp, i, &offset, &size, rw);
		if (bp->b_error)
			break;
		if (size <= 0)
			break;	/* all done */
	}
	blk_finish_plug(&plug);
L
Linus Torvalds 已提交
1293 1294
}

1295 1296 1297 1298 1299 1300
/*
 * Asynchronous IO submission path. This transfers the buffer lock ownership and
 * the current reference to the IO. It is not safe to reference the buffer after
 * a call to this function unless the caller holds an additional reference
 * itself.
 */
1301
void
1302 1303
xfs_buf_submit(
	struct xfs_buf	*bp)
L
Linus Torvalds 已提交
1304
{
1305
	trace_xfs_buf_submit(bp, _RET_IP_);
L
Linus Torvalds 已提交
1306

1307
	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
1308 1309 1310 1311 1312 1313 1314 1315 1316 1317
	ASSERT(bp->b_flags & XBF_ASYNC);

	/* on shutdown we stale and complete the buffer immediately */
	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
		xfs_buf_ioerror(bp, -EIO);
		bp->b_flags &= ~XBF_DONE;
		xfs_buf_stale(bp);
		xfs_buf_ioend(bp);
		return;
	}
L
Linus Torvalds 已提交
1318

1319
	if (bp->b_flags & XBF_WRITE)
1320
		xfs_buf_wait_unpin(bp);
1321

1322 1323 1324
	/* clear the internal error state to avoid spurious errors */
	bp->b_io_error = 0;

1325
	/*
1326 1327 1328 1329 1330 1331
	 * The caller's reference is released during I/O completion.
	 * This occurs some time after the last b_io_remaining reference is
	 * released, so after we drop our Io reference we have to have some
	 * other reference to ensure the buffer doesn't go away from underneath
	 * us. Take a direct reference to ensure we have safe access to the
	 * buffer until we are finished with it.
1332
	 */
1333
	xfs_buf_hold(bp);
L
Linus Torvalds 已提交
1334

1335
	/*
1336 1337 1338
	 * Set the count to 1 initially, this will stop an I/O completion
	 * callout which happens before we have started all the I/O from calling
	 * xfs_buf_ioend too early.
L
Linus Torvalds 已提交
1339
	 */
1340 1341
	atomic_set(&bp->b_io_remaining, 1);
	_xfs_buf_ioapply(bp);
1342

1343
	/*
1344 1345 1346
	 * If _xfs_buf_ioapply failed, we can get back here with only the IO
	 * reference we took above. If we drop it to zero, run completion so
	 * that we don't return to the caller with completion still pending.
1347
	 */
1348
	if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
1349
		if (bp->b_error)
1350 1351 1352 1353
			xfs_buf_ioend(bp);
		else
			xfs_buf_ioend_async(bp);
	}
L
Linus Torvalds 已提交
1354

1355
	xfs_buf_rele(bp);
1356
	/* Note: it is not safe to reference bp now we've dropped our ref */
L
Linus Torvalds 已提交
1357 1358 1359
}

/*
1360
 * Synchronous buffer IO submission path, read or write.
L
Linus Torvalds 已提交
1361 1362
 */
int
1363 1364
xfs_buf_submit_wait(
	struct xfs_buf	*bp)
L
Linus Torvalds 已提交
1365
{
1366
	int		error;
C
Christoph Hellwig 已提交
1367

1368 1369 1370
	trace_xfs_buf_submit_wait(bp, _RET_IP_);

	ASSERT(!(bp->b_flags & (_XBF_DELWRI_Q | XBF_ASYNC)));
C
Christoph Hellwig 已提交
1371

1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406
	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
		xfs_buf_ioerror(bp, -EIO);
		xfs_buf_stale(bp);
		bp->b_flags &= ~XBF_DONE;
		return -EIO;
	}

	if (bp->b_flags & XBF_WRITE)
		xfs_buf_wait_unpin(bp);

	/* clear the internal error state to avoid spurious errors */
	bp->b_io_error = 0;

	/*
	 * For synchronous IO, the IO does not inherit the submitters reference
	 * count, nor the buffer lock. Hence we cannot release the reference we
	 * are about to take until we've waited for all IO completion to occur,
	 * including any xfs_buf_ioend_async() work that may be pending.
	 */
	xfs_buf_hold(bp);

	/*
	 * Set the count to 1 initially, this will stop an I/O completion
	 * callout which happens before we have started all the I/O from calling
	 * xfs_buf_ioend too early.
	 */
	atomic_set(&bp->b_io_remaining, 1);
	_xfs_buf_ioapply(bp);

	/*
	 * make sure we run completion synchronously if it raced with us and is
	 * already complete.
	 */
	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
		xfs_buf_ioend(bp);
C
Christoph Hellwig 已提交
1407

1408 1409 1410
	/* wait for completion before gathering the error from the buffer */
	trace_xfs_buf_iowait(bp, _RET_IP_);
	wait_for_completion(&bp->b_iowait);
C
Christoph Hellwig 已提交
1411
	trace_xfs_buf_iowait_done(bp, _RET_IP_);
1412 1413 1414 1415 1416 1417 1418 1419
	error = bp->b_error;

	/*
	 * all done now, we can release the hold that keeps the buffer
	 * referenced for the entire IO.
	 */
	xfs_buf_rele(bp);
	return error;
L
Linus Torvalds 已提交
1420 1421
}

1422 1423 1424
xfs_caddr_t
xfs_buf_offset(
	xfs_buf_t		*bp,
L
Linus Torvalds 已提交
1425 1426 1427 1428
	size_t			offset)
{
	struct page		*page;

1429
	if (bp->b_addr)
1430
		return bp->b_addr + offset;
L
Linus Torvalds 已提交
1431

1432
	offset += bp->b_offset;
1433 1434
	page = bp->b_pages[offset >> PAGE_SHIFT];
	return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
L
Linus Torvalds 已提交
1435 1436 1437 1438 1439 1440
}

/*
 *	Move data into or out of a buffer.
 */
void
1441 1442
xfs_buf_iomove(
	xfs_buf_t		*bp,	/* buffer to process		*/
L
Linus Torvalds 已提交
1443 1444
	size_t			boff,	/* starting buffer offset	*/
	size_t			bsize,	/* length to copy		*/
1445
	void			*data,	/* data address			*/
1446
	xfs_buf_rw_t		mode)	/* read/write/zero flag		*/
L
Linus Torvalds 已提交
1447
{
D
Dave Chinner 已提交
1448
	size_t			bend;
L
Linus Torvalds 已提交
1449 1450 1451

	bend = boff + bsize;
	while (boff < bend) {
D
Dave Chinner 已提交
1452 1453 1454 1455 1456 1457 1458 1459
		struct page	*page;
		int		page_index, page_offset, csize;

		page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
		page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
		page = bp->b_pages[page_index];
		csize = min_t(size_t, PAGE_SIZE - page_offset,
				      BBTOB(bp->b_io_length) - boff);
L
Linus Torvalds 已提交
1460

D
Dave Chinner 已提交
1461
		ASSERT((csize + page_offset) <= PAGE_SIZE);
L
Linus Torvalds 已提交
1462 1463

		switch (mode) {
1464
		case XBRW_ZERO:
D
Dave Chinner 已提交
1465
			memset(page_address(page) + page_offset, 0, csize);
L
Linus Torvalds 已提交
1466
			break;
1467
		case XBRW_READ:
D
Dave Chinner 已提交
1468
			memcpy(data, page_address(page) + page_offset, csize);
L
Linus Torvalds 已提交
1469
			break;
1470
		case XBRW_WRITE:
D
Dave Chinner 已提交
1471
			memcpy(page_address(page) + page_offset, data, csize);
L
Linus Torvalds 已提交
1472 1473 1474 1475 1476 1477 1478 1479
		}

		boff += csize;
		data += csize;
	}
}

/*
1480
 *	Handling of buffer targets (buftargs).
L
Linus Torvalds 已提交
1481 1482 1483
 */

/*
1484 1485 1486
 * Wait for any bufs with callbacks that have been submitted but have not yet
 * returned. These buffers will have an elevated hold count, so wait on those
 * while freeing all the buffers only held by the LRU.
L
Linus Torvalds 已提交
1487
 */
1488 1489 1490
static enum lru_status
xfs_buftarg_wait_rele(
	struct list_head	*item,
1491
	struct list_lru_one	*lru,
1492 1493 1494
	spinlock_t		*lru_lock,
	void			*arg)

L
Linus Torvalds 已提交
1495
{
1496
	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
1497
	struct list_head	*dispose = arg;
1498

1499
	if (atomic_read(&bp->b_hold) > 1) {
1500
		/* need to wait, so skip it this pass */
1501
		trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
1502
		return LRU_SKIP;
L
Linus Torvalds 已提交
1503
	}
1504 1505
	if (!spin_trylock(&bp->b_lock))
		return LRU_SKIP;
1506

1507 1508 1509 1510 1511 1512
	/*
	 * clear the LRU reference count so the buffer doesn't get
	 * ignored in xfs_buf_rele().
	 */
	atomic_set(&bp->b_lru_ref, 0);
	bp->b_state |= XFS_BSTATE_DISPOSE;
1513
	list_lru_isolate_move(lru, item, dispose);
1514 1515
	spin_unlock(&bp->b_lock);
	return LRU_REMOVED;
L
Linus Torvalds 已提交
1516 1517
}

1518 1519 1520 1521
void
xfs_wait_buftarg(
	struct xfs_buftarg	*btp)
{
1522 1523 1524 1525 1526
	LIST_HEAD(dispose);
	int loop = 0;

	/* loop until there is nothing left on the lru list. */
	while (list_lru_count(&btp->bt_lru)) {
1527
		list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
1528 1529 1530 1531 1532 1533
			      &dispose, LONG_MAX);

		while (!list_empty(&dispose)) {
			struct xfs_buf *bp;
			bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
			list_del_init(&bp->b_lru);
1534 1535 1536 1537 1538 1539
			if (bp->b_flags & XBF_WRITE_FAIL) {
				xfs_alert(btp->bt_mount,
"Corruption Alert: Buffer at block 0x%llx had permanent write failures!\n"
"Please run xfs_repair to determine the extent of the problem.",
					(long long)bp->b_bn);
			}
1540 1541 1542 1543 1544
			xfs_buf_rele(bp);
		}
		if (loop++ != 0)
			delay(100);
	}
1545 1546 1547 1548 1549
}

static enum lru_status
xfs_buftarg_isolate(
	struct list_head	*item,
1550
	struct list_lru_one	*lru,
1551 1552 1553 1554 1555 1556
	spinlock_t		*lru_lock,
	void			*arg)
{
	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
	struct list_head	*dispose = arg;

1557 1558 1559 1560 1561 1562
	/*
	 * we are inverting the lru lock/bp->b_lock here, so use a trylock.
	 * If we fail to get the lock, just skip it.
	 */
	if (!spin_trylock(&bp->b_lock))
		return LRU_SKIP;
1563 1564 1565 1566 1567
	/*
	 * Decrement the b_lru_ref count unless the value is already
	 * zero. If the value is already zero, we need to reclaim the
	 * buffer, otherwise it gets another trip through the LRU.
	 */
1568 1569
	if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
		spin_unlock(&bp->b_lock);
1570
		return LRU_ROTATE;
1571
	}
1572

1573
	bp->b_state |= XFS_BSTATE_DISPOSE;
1574
	list_lru_isolate_move(lru, item, dispose);
1575
	spin_unlock(&bp->b_lock);
1576 1577 1578
	return LRU_REMOVED;
}

1579
static unsigned long
1580
xfs_buftarg_shrink_scan(
1581
	struct shrinker		*shrink,
1582
	struct shrink_control	*sc)
1583
{
1584 1585
	struct xfs_buftarg	*btp = container_of(shrink,
					struct xfs_buftarg, bt_shrinker);
1586
	LIST_HEAD(dispose);
1587
	unsigned long		freed;
1588

1589 1590
	freed = list_lru_shrink_walk(&btp->bt_lru, sc,
				     xfs_buftarg_isolate, &dispose);
1591 1592

	while (!list_empty(&dispose)) {
1593
		struct xfs_buf *bp;
1594 1595 1596 1597 1598
		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
		list_del_init(&bp->b_lru);
		xfs_buf_rele(bp);
	}

1599 1600 1601
	return freed;
}

1602
static unsigned long
1603 1604 1605 1606 1607 1608
xfs_buftarg_shrink_count(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_buftarg	*btp = container_of(shrink,
					struct xfs_buftarg, bt_shrinker);
1609
	return list_lru_shrink_count(&btp->bt_lru, sc);
1610 1611
}

L
Linus Torvalds 已提交
1612 1613
void
xfs_free_buftarg(
1614 1615
	struct xfs_mount	*mp,
	struct xfs_buftarg	*btp)
L
Linus Torvalds 已提交
1616
{
1617
	unregister_shrinker(&btp->bt_shrinker);
G
Glauber Costa 已提交
1618
	list_lru_destroy(&btp->bt_lru);
1619

1620 1621
	if (mp->m_flags & XFS_MOUNT_BARRIER)
		xfs_blkdev_issue_flush(btp);
1622

1623
	kmem_free(btp);
L
Linus Torvalds 已提交
1624 1625
}

1626 1627
int
xfs_setsize_buftarg(
L
Linus Torvalds 已提交
1628
	xfs_buftarg_t		*btp,
1629
	unsigned int		sectorsize)
L
Linus Torvalds 已提交
1630
{
1631
	/* Set up metadata sector size info */
1632 1633
	btp->bt_meta_sectorsize = sectorsize;
	btp->bt_meta_sectormask = sectorsize - 1;
L
Linus Torvalds 已提交
1634

1635
	if (set_blocksize(btp->bt_bdev, sectorsize)) {
1636 1637 1638 1639
		char name[BDEVNAME_SIZE];

		bdevname(btp->bt_bdev, name);

1640
		xfs_warn(btp->bt_mount,
1641
			"Cannot set_blocksize to %u on device %s",
1642
			sectorsize, name);
D
Dave Chinner 已提交
1643
		return -EINVAL;
L
Linus Torvalds 已提交
1644 1645
	}

1646 1647 1648 1649
	/* Set up device logical sector size mask */
	btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
	btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;

L
Linus Torvalds 已提交
1650 1651 1652 1653
	return 0;
}

/*
1654 1655 1656
 * When allocating the initial buffer target we have not yet
 * read in the superblock, so don't know what sized sectors
 * are being used at this early stage.  Play safe.
1657
 */
L
Linus Torvalds 已提交
1658 1659 1660 1661 1662
STATIC int
xfs_setsize_buftarg_early(
	xfs_buftarg_t		*btp,
	struct block_device	*bdev)
{
1663
	return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
L
Linus Torvalds 已提交
1664 1665 1666 1667
}

xfs_buftarg_t *
xfs_alloc_buftarg(
1668
	struct xfs_mount	*mp,
1669
	struct block_device	*bdev)
L
Linus Torvalds 已提交
1670 1671 1672
{
	xfs_buftarg_t		*btp;

1673
	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP | KM_NOFS);
L
Linus Torvalds 已提交
1674

1675
	btp->bt_mount = mp;
1676 1677
	btp->bt_dev =  bdev->bd_dev;
	btp->bt_bdev = bdev;
1678 1679
	btp->bt_bdi = blk_get_backing_dev_info(bdev);

L
Linus Torvalds 已提交
1680 1681
	if (xfs_setsize_buftarg_early(btp, bdev))
		goto error;
1682 1683 1684 1685

	if (list_lru_init(&btp->bt_lru))
		goto error;

1686 1687
	btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
	btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
1688
	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
1689
	btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
1690
	register_shrinker(&btp->bt_shrinker);
L
Linus Torvalds 已提交
1691 1692 1693
	return btp;

error:
1694
	kmem_free(btp);
L
Linus Torvalds 已提交
1695 1696 1697 1698
	return NULL;
}

/*
1699 1700 1701 1702 1703 1704 1705 1706 1707
 * Add a buffer to the delayed write list.
 *
 * This queues a buffer for writeout if it hasn't already been.  Note that
 * neither this routine nor the buffer list submission functions perform
 * any internal synchronization.  It is expected that the lists are thread-local
 * to the callers.
 *
 * Returns true if we queued up the buffer, or false if it already had
 * been on the buffer list.
L
Linus Torvalds 已提交
1708
 */
1709
bool
1710
xfs_buf_delwri_queue(
1711 1712
	struct xfs_buf		*bp,
	struct list_head	*list)
L
Linus Torvalds 已提交
1713
{
1714
	ASSERT(xfs_buf_islocked(bp));
1715
	ASSERT(!(bp->b_flags & XBF_READ));
L
Linus Torvalds 已提交
1716

1717 1718 1719 1720 1721 1722 1723 1724
	/*
	 * If the buffer is already marked delwri it already is queued up
	 * by someone else for imediate writeout.  Just ignore it in that
	 * case.
	 */
	if (bp->b_flags & _XBF_DELWRI_Q) {
		trace_xfs_buf_delwri_queued(bp, _RET_IP_);
		return false;
L
Linus Torvalds 已提交
1725 1726
	}

1727
	trace_xfs_buf_delwri_queue(bp, _RET_IP_);
1728 1729

	/*
1730 1731 1732 1733 1734 1735
	 * If a buffer gets written out synchronously or marked stale while it
	 * is on a delwri list we lazily remove it. To do this, the other party
	 * clears the  _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
	 * It remains referenced and on the list.  In a rare corner case it
	 * might get readded to a delwri list after the synchronous writeout, in
	 * which case we need just need to re-add the flag here.
1736
	 */
1737 1738 1739 1740
	bp->b_flags |= _XBF_DELWRI_Q;
	if (list_empty(&bp->b_list)) {
		atomic_inc(&bp->b_hold);
		list_add_tail(&bp->b_list, list);
1741 1742
	}

1743
	return true;
1744 1745
}

1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760
/*
 * Compare function is more complex than it needs to be because
 * the return value is only 32 bits and we are doing comparisons
 * on 64 bit values
 */
static int
xfs_buf_cmp(
	void		*priv,
	struct list_head *a,
	struct list_head *b)
{
	struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
	struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);
	xfs_daddr_t		diff;

1761
	diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;
1762 1763 1764 1765 1766 1767 1768
	if (diff < 0)
		return -1;
	if (diff > 0)
		return 1;
	return 0;
}

1769 1770 1771 1772 1773
/*
 * Common implementation of delayed write list submission.
 *
 * Buffers on @buffer_list that can be locked are moved to @io_list, sorted
 * by block number and submitted as asynchronous writes under a block plug.
 * When @wait is false, pinned buffers and buffers whose lock cannot be
 * taken immediately are left on @buffer_list, and submitted buffers are
 * removed from @io_list.  When @wait is true every buffer is locked
 * (blocking if necessary) and an extra reference is taken so the caller can
 * wait on the buffers left on @io_list.
 *
 * Returns the number of pinned buffers that were skipped.
 */
static int
__xfs_buf_delwri_submit(
	struct list_head	*buffer_list,
	struct list_head	*io_list,
	bool			wait)
{
	struct blk_plug		plug;
	struct xfs_buf		*bp, *n;
	int			pinned = 0;

	list_for_each_entry_safe(bp, n, buffer_list, b_list) {
		if (wait) {
			xfs_buf_lock(bp);
		} else {
			if (xfs_buf_ispinned(bp)) {
				pinned++;
				continue;
			}
			if (!xfs_buf_trylock(bp))
				continue;
		}

		/*
		 * Someone else might have written the buffer synchronously or
		 * marked it stale in the meantime.  In that case only the
		 * _XBF_DELWRI_Q flag got cleared, and we have to drop the
		 * reference and remove it from the list here.
		 */
		if (!(bp->b_flags & _XBF_DELWRI_Q)) {
			list_del_init(&bp->b_list);
			xfs_buf_relse(bp);
			continue;
		}

		list_move_tail(&bp->b_list, io_list);
		trace_xfs_buf_delwri_split(bp, _RET_IP_);
	}

	list_sort(NULL, io_list, xfs_buf_cmp);

	blk_start_plug(&plug);
	list_for_each_entry_safe(bp, n, io_list, b_list) {
		bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
		bp->b_flags |= XBF_WRITE | XBF_ASYNC;

		/*
		 * we do all IO submission async. This means if we need to wait
		 * for IO completion we need to take an extra reference so the
		 * buffer is still valid on the other side.
		 */
		if (!wait)
			list_del_init(&bp->b_list);
		else
			xfs_buf_hold(bp);

		xfs_buf_submit(bp);
	}
	blk_finish_plug(&plug);

	return pinned;
}

/*
1832 1833 1834 1835 1836 1837 1838
 * Write out a buffer list asynchronously.
 *
 * This will take the @buffer_list, write all non-locked and non-pinned buffers
 * out and not wait for I/O completion on any of the buffers.  This interface
 * is only safely useable for callers that can track I/O completion by higher
 * level means, e.g. AIL pushing as the @buffer_list is consumed in this
 * function.
L
Linus Torvalds 已提交
1839 1840
 */
int
1841 1842
xfs_buf_delwri_submit_nowait(
	struct list_head	*buffer_list)
L
Linus Torvalds 已提交
1843
{
1844 1845 1846
	LIST_HEAD		(io_list);
	return __xfs_buf_delwri_submit(buffer_list, &io_list, false);
}
L
Linus Torvalds 已提交
1847

1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862
/*
 * Write out a buffer list synchronously.
 *
 * This will take the @buffer_list, write all buffers out and wait for I/O
 * completion on all of the buffers. @buffer_list is consumed by the function,
 * so callers must have some other way of tracking buffers if they require such
 * functionality.
 */
int
xfs_buf_delwri_submit(
	struct list_head	*buffer_list)
{
	LIST_HEAD		(io_list);
	int			error = 0, error2;
	struct xfs_buf		*bp;
L
Linus Torvalds 已提交
1863

1864
	__xfs_buf_delwri_submit(buffer_list, &io_list, true);
L
Linus Torvalds 已提交
1865

1866 1867 1868
	/* Wait for IO to complete. */
	while (!list_empty(&io_list)) {
		bp = list_first_entry(&io_list, struct xfs_buf, b_list);
1869

1870
		list_del_init(&bp->b_list);
1871 1872 1873 1874

		/* locking the buffer will wait for async IO completion. */
		xfs_buf_lock(bp);
		error2 = bp->b_error;
1875 1876 1877
		xfs_buf_relse(bp);
		if (!error)
			error = error2;
L
Linus Torvalds 已提交
1878 1879
	}

1880
	return error;
L
Linus Torvalds 已提交
1881 1882
}

1883
int __init
1884
xfs_buf_init(void)
L
Linus Torvalds 已提交
1885
{
1886 1887
	xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
						KM_ZONE_HWALIGN, NULL);
1888
	if (!xfs_buf_zone)
C
Christoph Hellwig 已提交
1889
		goto out;
1890

1891
	return 0;
L
Linus Torvalds 已提交
1892

C
Christoph Hellwig 已提交
1893
 out:
1894
	return -ENOMEM;
L
Linus Torvalds 已提交
1895 1896 1897
}

void
1898
xfs_buf_terminate(void)
L
Linus Torvalds 已提交
1899
{
1900
	kmem_zone_destroy(xfs_buf_zone);
L
Linus Torvalds 已提交
1901
}