xfs_buf.c 42.2 KB
Newer Older
L
Linus Torvalds 已提交
1
/*
2
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3
 * All Rights Reserved.
L
Linus Torvalds 已提交
4
 *
5 6
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
L
Linus Torvalds 已提交
7 8
 * published by the Free Software Foundation.
 *
9 10 11 12
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
L
Linus Torvalds 已提交
13
 *
14 15 16
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
L
Linus Torvalds 已提交
17
 */
18
#include "xfs.h"
L
Linus Torvalds 已提交
19 20
#include <linux/stddef.h>
#include <linux/errno.h>
21
#include <linux/gfp.h>
L
Linus Torvalds 已提交
22 23 24 25 26 27 28 29 30 31
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/bio.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
#include <linux/percpu.h>
#include <linux/blkdev.h>
#include <linux/hash.h>
32
#include <linux/kthread.h>
C
Christoph Lameter 已提交
33
#include <linux/migrate.h>
34
#include <linux/backing-dev.h>
35
#include <linux/freezer.h>
36
#include <linux/list_sort.h>
L
Linus Torvalds 已提交
37

38 39
#include "xfs_sb.h"
#include "xfs_inum.h"
40
#include "xfs_log.h"
41 42
#include "xfs_ag.h"
#include "xfs_mount.h"
C
Christoph Hellwig 已提交
43
#include "xfs_trace.h"
44

45
/* Zone that the xfs_buf_allocate()/xfs_buf_deallocate() macros draw from. */
static kmem_zone_t *xfs_buf_zone;
46
/* Forward declaration; the definition is not in this part of the file. */
STATIC int xfsbufd(void *);
47
/* Forward declaration; called from xfs_buf_unlock() and xfs_bdwrite(). */
STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
48

49
/* Workqueue that xfs_buf_ioend() queues deferred I/O completion work on. */
static struct workqueue_struct *xfslogd_workqueue;
50
/* Non-static workqueue pointers; NOTE(review): users are outside this view. */
struct workqueue_struct *xfsdatad_workqueue;
51
struct workqueue_struct *xfsconvertd_workqueue;
L
Linus Torvalds 已提交
52

53 54 55 56
/*
 * Lock-owner tracking helpers.  With XFS_BUF_LOCK_TRACKING defined they
 * record the pid of the current task in b_last_holder (or -1 when cleared);
 * otherwise all three expand to empty statements.
 */
#ifdef XFS_BUF_LOCK_TRACKING
# define XB_SET_OWNER(bp)	((bp)->b_last_holder = current->pid)
# define XB_CLEAR_OWNER(bp)	((bp)->b_last_holder = -1)
# define XB_GET_OWNER(bp)	((bp)->b_last_holder)
L
Linus Torvalds 已提交
57
#else
58 59 60
/* NOTE(review): XB_GET_OWNER is not usable as an expression in this branch. */
# define XB_SET_OWNER(bp)	do { } while (0)
# define XB_CLEAR_OWNER(bp)	do { } while (0)
# define XB_GET_OWNER(bp)	do { } while (0)
L
Linus Torvalds 已提交
61 62
#endif

63 64 65
/*
 * Translate buffer flags into a page allocator gfp mask.  XBF_READ_AHEAD
 * takes precedence over XBF_DONT_BLOCK, and __GFP_NOWARN is always set.
 */
#define xb_to_gfp(flags) \
	(__GFP_NOWARN | \
	 (((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \
	  (((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL)))
L
Linus Torvalds 已提交
66

67 68
/*
 * Translate buffer flags into kmem allocation flags: KM_NOFS when
 * XBF_DONT_BLOCK is set, KM_SLEEP otherwise.
 */
#define xb_to_km(flags) \
	(!((flags) & XBF_DONT_BLOCK) ? KM_SLEEP : KM_NOFS)
L
Linus Torvalds 已提交
69

70 71 72 73
/*
 * Allocate / free an xfs_buf structure from xfs_buf_zone.
 *
 * Neither macro ends in a semicolon: the caller supplies it.  A trailing
 * ';' inside the expansion would turn "xfs_buf_deallocate(bp);" into two
 * statements and break unbraced if/else bodies.
 */
#define xfs_buf_allocate(flags) \
	kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags))
#define xfs_buf_deallocate(bp) \
	kmem_zone_free(xfs_buf_zone, (bp))
L
Linus Torvalds 已提交
74

75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
/*
 * Return non-zero if the buffer is vmapped.
 *
 * XBF_MAPPED only says the buffer should be mapped; a single page buffer
 * never needs a vmap, so both XBF_MAPPED and b_page_count > 1 must hold.
 */
static inline int
xfs_buf_is_vmapped(
	struct xfs_buf	*bp)
{
	if (!(bp->b_flags & XBF_MAPPED))
		return 0;
	return bp->b_page_count > 1;
}

/*
 * Return the number of bytes from the buffer's offset within its first page
 * to the end of its last page: (b_page_count * PAGE_SIZE) - b_offset.
 */
static inline int
xfs_buf_vmap_len(
	struct xfs_buf	*bp)
{
	return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
}

L
Linus Torvalds 已提交
96
/*
 * xfs_buf_lru_add - add a buffer to the tail of its target's LRU.
 *
 * If the buffer is not already on the LRU, the LRU takes its own hold
 * reference on it; a buffer already on the list is left untouched.
 * All list manipulation is done under bt_lru_lock.
 */
STATIC void
xfs_buf_lru_add(
	struct xfs_buf	*bp)
{
	struct xfs_buftarg *target = bp->b_target;

	spin_lock(&target->bt_lru_lock);
	if (!list_empty(&bp->b_lru))
		goto out_unlock;	/* already on the LRU */

	atomic_inc(&bp->b_hold);
	list_add_tail(&bp->b_lru, &target->bt_lru);
	target->bt_lru_nr++;

out_unlock:
	spin_unlock(&target->bt_lru_lock);
}

/*
 * xfs_buf_lru_del - remove a buffer from the LRU
 *
 * The unlocked check is safe here because it only occurs when there are not
 * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there
 * to optimise the shrinker removing the buffer from the LRU and calling
 * xfs_buf_free(). i.e. it removes an unneccessary round trip on the
 * bt_lru_lock.
L
Linus Torvalds 已提交
125
 */
126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169
STATIC void
xfs_buf_lru_del(
	struct xfs_buf	*bp)
{
	struct xfs_buftarg *btp = bp->b_target;

	if (list_empty(&bp->b_lru))
		return;

	spin_lock(&btp->bt_lru_lock);
	if (!list_empty(&bp->b_lru)) {
		list_del_init(&bp->b_lru);
		btp->bt_lru_nr--;
	}
	spin_unlock(&btp->bt_lru_lock);
}

/*
 * Mark a buffer stale.
 *
 * The b_lru_ref count is zeroed so the buffer is not put back on the LRU
 * when its hold count drops to zero.  If the buffer is currently on the
 * LRU it is removed and the hold reference the LRU owned is dropped, which
 * prevents stale buffers from building up on the LRU.
 */
void
xfs_buf_stale(
	struct xfs_buf	*bp)
{
	struct xfs_buftarg *target = bp->b_target;

	bp->b_flags |= XBF_STALE;
	atomic_set(&bp->b_lru_ref, 0);

	/* unlocked check first, re-checked under bt_lru_lock */
	if (!list_empty(&bp->b_lru)) {
		spin_lock(&target->bt_lru_lock);
		if (!list_empty(&bp->b_lru)) {
			list_del_init(&bp->b_lru);
			target->bt_lru_nr--;
			atomic_dec(&bp->b_hold);
		}
		spin_unlock(&target->bt_lru_lock);
	}

	/* the caller must still own a reference */
	ASSERT(atomic_read(&bp->b_hold) >= 1);
}
L
Linus Torvalds 已提交
170 171

STATIC void
172 173
_xfs_buf_initialize(
	xfs_buf_t		*bp,
L
Linus Torvalds 已提交
174
	xfs_buftarg_t		*target,
175
	xfs_off_t		range_base,
L
Linus Torvalds 已提交
176
	size_t			range_length,
177
	xfs_buf_flags_t		flags)
L
Linus Torvalds 已提交
178 179
{
	/*
180
	 * We don't want certain flags to appear in b_flags.
L
Linus Torvalds 已提交
181
	 */
182 183 184 185
	flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD);

	memset(bp, 0, sizeof(xfs_buf_t));
	atomic_set(&bp->b_hold, 1);
186
	atomic_set(&bp->b_lru_ref, 1);
187
	init_completion(&bp->b_iowait);
188
	INIT_LIST_HEAD(&bp->b_lru);
189
	INIT_LIST_HEAD(&bp->b_list);
190
	RB_CLEAR_NODE(&bp->b_rbnode);
T
Thomas Gleixner 已提交
191
	sema_init(&bp->b_sema, 0); /* held, no waiters */
192 193 194
	XB_SET_OWNER(bp);
	bp->b_target = target;
	bp->b_file_offset = range_base;
L
Linus Torvalds 已提交
195 196 197 198 199
	/*
	 * Set buffer_length and count_desired to the same value initially.
	 * I/O routines should use count_desired, which will be the same in
	 * most cases but may be reset (e.g. XFS recovery).
	 */
200 201 202 203 204 205 206
	bp->b_buffer_length = bp->b_count_desired = range_length;
	bp->b_flags = flags;
	bp->b_bn = XFS_BUF_DADDR_NULL;
	atomic_set(&bp->b_pin_count, 0);
	init_waitqueue_head(&bp->b_waiters);

	XFS_STATS_INC(xb_create);
C
Christoph Hellwig 已提交
207 208

	trace_xfs_buf_init(bp, _RET_IP_);
L
Linus Torvalds 已提交
209 210 211
}

/*
212 213
 *	Allocate a page array capable of holding a specified number
 *	of pages, and point the page buf at it.
L
Linus Torvalds 已提交
214 215
 */
STATIC int
216 217
_xfs_buf_get_pages(
	xfs_buf_t		*bp,
L
Linus Torvalds 已提交
218
	int			page_count,
219
	xfs_buf_flags_t		flags)
L
Linus Torvalds 已提交
220 221
{
	/* Make sure that we have a page list */
222 223 224 225 226
	if (bp->b_pages == NULL) {
		bp->b_offset = xfs_buf_poff(bp->b_file_offset);
		bp->b_page_count = page_count;
		if (page_count <= XB_PAGES) {
			bp->b_pages = bp->b_page_array;
L
Linus Torvalds 已提交
227
		} else {
228 229 230
			bp->b_pages = kmem_alloc(sizeof(struct page *) *
					page_count, xb_to_km(flags));
			if (bp->b_pages == NULL)
L
Linus Torvalds 已提交
231 232
				return -ENOMEM;
		}
233
		memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
L
Linus Torvalds 已提交
234 235 236 237 238
	}
	return 0;
}

/*
239
 *	Frees b_pages if it was allocated.
L
Linus Torvalds 已提交
240 241
 */
STATIC void
242
_xfs_buf_free_pages(
L
Linus Torvalds 已提交
243 244
	xfs_buf_t	*bp)
{
245
	if (bp->b_pages != bp->b_page_array) {
246
		kmem_free(bp->b_pages);
247
		bp->b_pages = NULL;
L
Linus Torvalds 已提交
248 249 250 251 252 253 254
	}
}

/*
 *	Releases the specified buffer.
 *
 * 	The modification state of any associated pages is left unchanged.
255
 * 	The buffer most not be on any hash - use xfs_buf_rele instead for
L
Linus Torvalds 已提交
256 257 258
 * 	hashed and refcounted buffers
 */
void
259
xfs_buf_free(
L
Linus Torvalds 已提交
260 261
	xfs_buf_t		*bp)
{
C
Christoph Hellwig 已提交
262
	trace_xfs_buf_free(bp, _RET_IP_);
L
Linus Torvalds 已提交
263

264 265
	ASSERT(list_empty(&bp->b_lru));

266
	if (bp->b_flags & _XBF_PAGES) {
L
Linus Torvalds 已提交
267 268
		uint		i;

269
		if (xfs_buf_is_vmapped(bp))
A
Alex Elder 已提交
270 271
			vm_unmap_ram(bp->b_addr - bp->b_offset,
					bp->b_page_count);
L
Linus Torvalds 已提交
272

273 274 275
		for (i = 0; i < bp->b_page_count; i++) {
			struct page	*page = bp->b_pages[i];

276
			__free_page(page);
277
		}
278 279
	} else if (bp->b_flags & _XBF_KMEM)
		kmem_free(bp->b_addr);
280
	_xfs_buf_free_pages(bp);
281
	xfs_buf_deallocate(bp);
L
Linus Torvalds 已提交
282 283 284
}

/*
285
 * Allocates all the pages for buffer in question and builds it's page list.
L
Linus Torvalds 已提交
286 287
 */
STATIC int
288
xfs_buf_allocate_memory(
L
Linus Torvalds 已提交
289 290 291
	xfs_buf_t		*bp,
	uint			flags)
{
292
	size_t			size = bp->b_count_desired;
L
Linus Torvalds 已提交
293
	size_t			nbytes, offset;
294
	gfp_t			gfp_mask = xb_to_gfp(flags);
L
Linus Torvalds 已提交
295 296
	unsigned short		page_count, i;
	pgoff_t			first;
297
	xfs_off_t		end;
L
Linus Torvalds 已提交
298 299
	int			error;

300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328
	/*
	 * for buffers that are contained within a single page, just allocate
	 * the memory from the heap - there's no need for the complexity of
	 * page arrays to keep allocation down to order 0.
	 */
	if (bp->b_buffer_length < PAGE_SIZE) {
		bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags));
		if (!bp->b_addr) {
			/* low memory - use alloc_page loop instead */
			goto use_alloc_page;
		}

		if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) &
								PAGE_MASK) !=
		    ((unsigned long)bp->b_addr & PAGE_MASK)) {
			/* b_addr spans two pages - use alloc_page instead */
			kmem_free(bp->b_addr);
			bp->b_addr = NULL;
			goto use_alloc_page;
		}
		bp->b_offset = offset_in_page(bp->b_addr);
		bp->b_pages = bp->b_page_array;
		bp->b_pages[0] = virt_to_page(bp->b_addr);
		bp->b_page_count = 1;
		bp->b_flags |= XBF_MAPPED | _XBF_KMEM;
		return 0;
	}

use_alloc_page:
329 330 331
	end = bp->b_file_offset + bp->b_buffer_length;
	page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
	error = _xfs_buf_get_pages(bp, page_count, flags);
L
Linus Torvalds 已提交
332 333 334
	if (unlikely(error))
		return error;

335
	offset = bp->b_offset;
336 337
	first = bp->b_file_offset >> PAGE_SHIFT;
	bp->b_flags |= _XBF_PAGES;
L
Linus Torvalds 已提交
338

339
	for (i = 0; i < bp->b_page_count; i++) {
L
Linus Torvalds 已提交
340 341
		struct page	*page;
		uint		retries = 0;
342 343
retry:
		page = alloc_page(gfp_mask);
L
Linus Torvalds 已提交
344
		if (unlikely(page == NULL)) {
345 346
			if (flags & XBF_READ_AHEAD) {
				bp->b_page_count = i;
347 348
				error = ENOMEM;
				goto out_free_pages;
L
Linus Torvalds 已提交
349 350 351 352 353 354 355 356 357
			}

			/*
			 * This could deadlock.
			 *
			 * But until all the XFS lowlevel code is revamped to
			 * handle buffer allocation failures we can't do much.
			 */
			if (!(++retries % 100))
358 359
				xfs_err(NULL,
		"possible memory allocation deadlock in %s (mode:0x%x)",
360
					__func__, gfp_mask);
L
Linus Torvalds 已提交
361

362
			XFS_STATS_INC(xb_page_retries);
363
			congestion_wait(BLK_RW_ASYNC, HZ/50);
L
Linus Torvalds 已提交
364 365 366
			goto retry;
		}

367
		XFS_STATS_INC(xb_page_found);
L
Linus Torvalds 已提交
368

369
		nbytes = min_t(size_t, size, PAGE_SIZE - offset);
L
Linus Torvalds 已提交
370
		size -= nbytes;
371
		bp->b_pages[i] = page;
L
Linus Torvalds 已提交
372 373
		offset = 0;
	}
374
	return 0;
L
Linus Torvalds 已提交
375

376 377 378
out_free_pages:
	for (i = 0; i < bp->b_page_count; i++)
		__free_page(bp->b_pages[i]);
L
Linus Torvalds 已提交
379 380 381 382 383 384 385
	return error;
}

/*
 *	Map buffer into kernel address-space if nessecary.
 */
STATIC int
386
_xfs_buf_map_pages(
L
Linus Torvalds 已提交
387 388 389
	xfs_buf_t		*bp,
	uint			flags)
{
390
	ASSERT(bp->b_flags & _XBF_PAGES);
391
	if (bp->b_page_count == 1) {
392
		/* A single page buffer is always mappable */
393 394 395
		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
		bp->b_flags |= XBF_MAPPED;
	} else if (flags & XBF_MAPPED) {
396 397 398 399 400 401 402 403 404 405 406
		int retried = 0;

		do {
			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
						-1, PAGE_KERNEL);
			if (bp->b_addr)
				break;
			vm_unmap_aliases();
		} while (retried++ <= 1);

		if (!bp->b_addr)
L
Linus Torvalds 已提交
407
			return -ENOMEM;
408 409
		bp->b_addr += bp->b_offset;
		bp->b_flags |= XBF_MAPPED;
L
Linus Torvalds 已提交
410 411 412 413 414 415 416 417 418 419
	}

	return 0;
}

/*
 *	Finding and Reading Buffers
 */

/*
420
 *	Look up, and creates if absent, a lockable buffer for
L
Linus Torvalds 已提交
421 422 423 424 425 426 427
 *	a given range of an inode.  The buffer is returned
 *	locked.	 If other overlapping buffers exist, they are
 *	released before the new buffer is created and locked,
 *	which may imply that this call will block until those buffers
 *	are unlocked.  No I/O is implied by this call.
 */
xfs_buf_t *
428
_xfs_buf_find(
L
Linus Torvalds 已提交
429
	xfs_buftarg_t		*btp,	/* block device target		*/
430
	xfs_off_t		ioff,	/* starting offset of range	*/
L
Linus Torvalds 已提交
431
	size_t			isize,	/* length of range		*/
432 433
	xfs_buf_flags_t		flags,
	xfs_buf_t		*new_bp)
L
Linus Torvalds 已提交
434
{
435
	xfs_off_t		range_base;
L
Linus Torvalds 已提交
436
	size_t			range_length;
437 438 439 440
	struct xfs_perag	*pag;
	struct rb_node		**rbp;
	struct rb_node		*parent;
	xfs_buf_t		*bp;
L
Linus Torvalds 已提交
441 442 443 444 445

	range_base = (ioff << BBSHIFT);
	range_length = (isize << BBSHIFT);

	/* Check for IOs smaller than the sector size / not sector aligned */
446
	ASSERT(!(range_length < (1 << btp->bt_sshift)));
447
	ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
L
Linus Torvalds 已提交
448

449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479
	/* get tree root */
	pag = xfs_perag_get(btp->bt_mount,
				xfs_daddr_to_agno(btp->bt_mount, ioff));

	/* walk tree */
	spin_lock(&pag->pag_buf_lock);
	rbp = &pag->pag_buf_tree.rb_node;
	parent = NULL;
	bp = NULL;
	while (*rbp) {
		parent = *rbp;
		bp = rb_entry(parent, struct xfs_buf, b_rbnode);

		if (range_base < bp->b_file_offset)
			rbp = &(*rbp)->rb_left;
		else if (range_base > bp->b_file_offset)
			rbp = &(*rbp)->rb_right;
		else {
			/*
			 * found a block offset match. If the range doesn't
			 * match, the only way this is allowed is if the buffer
			 * in the cache is stale and the transaction that made
			 * it stale has not yet committed. i.e. we are
			 * reallocating a busy extent. Skip this buffer and
			 * continue searching to the right for an exact match.
			 */
			if (bp->b_buffer_length != range_length) {
				ASSERT(bp->b_flags & XBF_STALE);
				rbp = &(*rbp)->rb_right;
				continue;
			}
480
			atomic_inc(&bp->b_hold);
L
Linus Torvalds 已提交
481 482 483 484 485
			goto found;
		}
	}

	/* No match found */
486 487
	if (new_bp) {
		_xfs_buf_initialize(new_bp, btp, range_base,
L
Linus Torvalds 已提交
488
				range_length, flags);
489 490 491 492 493
		rb_link_node(&new_bp->b_rbnode, parent, rbp);
		rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
		/* the buffer keeps the perag reference until it is freed */
		new_bp->b_pag = pag;
		spin_unlock(&pag->pag_buf_lock);
L
Linus Torvalds 已提交
494
	} else {
495
		XFS_STATS_INC(xb_miss_locked);
496 497
		spin_unlock(&pag->pag_buf_lock);
		xfs_perag_put(pag);
L
Linus Torvalds 已提交
498
	}
499
	return new_bp;
L
Linus Torvalds 已提交
500 501

found:
502 503
	spin_unlock(&pag->pag_buf_lock);
	xfs_perag_put(pag);
L
Linus Torvalds 已提交
504

505 506
	if (xfs_buf_cond_lock(bp)) {
		/* failed, so wait for the lock if requested. */
507 508 509
		if (!(flags & XBF_TRYLOCK)) {
			xfs_buf_lock(bp);
			XFS_STATS_INC(xb_get_locked_waited);
L
Linus Torvalds 已提交
510
		} else {
511 512 513
			xfs_buf_rele(bp);
			XFS_STATS_INC(xb_busy_locked);
			return NULL;
L
Linus Torvalds 已提交
514 515 516
		}
	}

517 518 519 520 521
	/*
	 * if the buffer is stale, clear all the external state associated with
	 * it. We need to keep flags such as how we allocated the buffer memory
	 * intact here.
	 */
522 523
	if (bp->b_flags & XBF_STALE) {
		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
524
		bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES;
525
	}
C
Christoph Hellwig 已提交
526 527

	trace_xfs_buf_find(bp, flags, _RET_IP_);
528 529
	XFS_STATS_INC(xb_get_locked);
	return bp;
L
Linus Torvalds 已提交
530 531 532
}

/*
533
 *	Assembles a buffer covering the specified range.
L
Linus Torvalds 已提交
534 535 536 537
 *	Storage in memory for all portions of the buffer will be allocated,
 *	although backing storage may not be.
 */
xfs_buf_t *
538
xfs_buf_get(
L
Linus Torvalds 已提交
539
	xfs_buftarg_t		*target,/* target for buffer		*/
540
	xfs_off_t		ioff,	/* starting offset of range	*/
L
Linus Torvalds 已提交
541
	size_t			isize,	/* length of range		*/
542
	xfs_buf_flags_t		flags)
L
Linus Torvalds 已提交
543
{
544
	xfs_buf_t		*bp, *new_bp;
545
	int			error = 0;
L
Linus Torvalds 已提交
546

547 548
	new_bp = xfs_buf_allocate(flags);
	if (unlikely(!new_bp))
L
Linus Torvalds 已提交
549 550
		return NULL;

551 552
	bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
	if (bp == new_bp) {
553
		error = xfs_buf_allocate_memory(bp, flags);
L
Linus Torvalds 已提交
554 555 556
		if (error)
			goto no_buffer;
	} else {
557 558
		xfs_buf_deallocate(new_bp);
		if (unlikely(bp == NULL))
L
Linus Torvalds 已提交
559 560 561
			return NULL;
	}

562 563
	if (!(bp->b_flags & XBF_MAPPED)) {
		error = _xfs_buf_map_pages(bp, flags);
L
Linus Torvalds 已提交
564
		if (unlikely(error)) {
565 566
			xfs_warn(target->bt_mount,
				"%s: failed to map pages\n", __func__);
L
Linus Torvalds 已提交
567 568 569 570
			goto no_buffer;
		}
	}

571
	XFS_STATS_INC(xb_get);
L
Linus Torvalds 已提交
572 573 574 575 576

	/*
	 * Always fill in the block number now, the mapped cases can do
	 * their own overlay of this later.
	 */
577 578
	bp->b_bn = ioff;
	bp->b_count_desired = bp->b_buffer_length;
L
Linus Torvalds 已提交
579

C
Christoph Hellwig 已提交
580
	trace_xfs_buf_get(bp, flags, _RET_IP_);
581
	return bp;
L
Linus Torvalds 已提交
582 583

 no_buffer:
584 585 586
	if (flags & (XBF_LOCK | XBF_TRYLOCK))
		xfs_buf_unlock(bp);
	xfs_buf_rele(bp);
L
Linus Torvalds 已提交
587 588 589
	return NULL;
}

C
Christoph Hellwig 已提交
590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605
/*
 * Issue a read for a buffer that already has a disk address.
 *
 * The I/O direction and mode bits in b_flags are reset and replaced by
 * those requested in flags.  Returns the xfs_buf_iorequest() status if
 * it is non-zero, if the buffer is in error, or for XBF_ASYNC requests;
 * otherwise waits for the I/O and returns the xfs_buf_iowait() result.
 */
STATIC int
_xfs_buf_read(
	xfs_buf_t		*bp,
	xfs_buf_flags_t		flags)
{
	int			status;

	ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE)));
	ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);

	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI |
			XBF_READ_AHEAD | _XBF_RUN_QUEUES);
	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC |
			XBF_READ_AHEAD | _XBF_RUN_QUEUES);

	status = xfs_buf_iorequest(bp);
	if (status || XFS_BUF_ISERROR(bp))
		return status;
	if (flags & XBF_ASYNC)
		return status;

	return xfs_buf_iowait(bp);
}

L
Linus Torvalds 已提交
611
xfs_buf_t *
612
xfs_buf_read(
L
Linus Torvalds 已提交
613
	xfs_buftarg_t		*target,
614
	xfs_off_t		ioff,
L
Linus Torvalds 已提交
615
	size_t			isize,
616
	xfs_buf_flags_t		flags)
L
Linus Torvalds 已提交
617
{
618 619 620 621
	xfs_buf_t		*bp;

	flags |= XBF_READ;

622
	bp = xfs_buf_get(target, ioff, isize, flags);
623
	if (bp) {
C
Christoph Hellwig 已提交
624 625
		trace_xfs_buf_read(bp, flags, _RET_IP_);

626 627
		if (!XFS_BUF_ISDONE(bp)) {
			XFS_STATS_INC(xb_get_read);
C
Christoph Hellwig 已提交
628
			_xfs_buf_read(bp, flags);
629
		} else if (flags & XBF_ASYNC) {
L
Linus Torvalds 已提交
630 631 632 633 634 635 636
			/*
			 * Read ahead call which is already satisfied,
			 * drop the buffer
			 */
			goto no_buffer;
		} else {
			/* We do not want read in the flags */
637
			bp->b_flags &= ~XBF_READ;
L
Linus Torvalds 已提交
638 639 640
		}
	}

641
	return bp;
L
Linus Torvalds 已提交
642 643

 no_buffer:
644 645 646
	if (flags & (XBF_LOCK | XBF_TRYLOCK))
		xfs_buf_unlock(bp);
	xfs_buf_rele(bp);
L
Linus Torvalds 已提交
647 648 649 650
	return NULL;
}

/*
651 652
 *	If we are not low on memory then do the readahead in a deadlock
 *	safe manner.
L
Linus Torvalds 已提交
653 654
 */
void
655
xfs_buf_readahead(
L
Linus Torvalds 已提交
656
	xfs_buftarg_t		*target,
657
	xfs_off_t		ioff,
C
Christoph Hellwig 已提交
658
	size_t			isize)
L
Linus Torvalds 已提交
659
{
660
	if (bdi_read_congested(target->bt_bdi))
L
Linus Torvalds 已提交
661 662
		return;

C
Christoph Hellwig 已提交
663 664
	xfs_buf_read(target, ioff, isize,
		     XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK);
L
Linus Torvalds 已提交
665 666
}

667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692
/*
 * Read an uncached buffer from disk.
 *
 * Allocates an uncached buffer of the given length, locks it, points it
 * at daddr and issues a read that is waited for.  Returns the locked
 * buffer on success, or NULL if allocation or the read failed (the buffer
 * is released in the latter case).
 */
struct xfs_buf *
xfs_buf_read_uncached(
	struct xfs_mount	*mp,
	struct xfs_buftarg	*target,
	xfs_daddr_t		daddr,
	size_t			length,
	int			flags)
{
	xfs_buf_t		*bp;
	int			error;

	bp = xfs_buf_get_uncached(target, length, flags);
	if (!bp)
		return NULL;

	/* set up the buffer for a read IO */
	xfs_buf_lock(bp);
	XFS_BUF_SET_ADDR(bp, daddr);
	XFS_BUF_READ(bp);
	XFS_BUF_BUSY(bp);

	xfsbdstrat(mp, bp);
	error = xfs_buf_iowait(bp);
	if (!error && !bp->b_error)
		return bp;

	xfs_buf_relse(bp);
	return NULL;
}

xfs_buf_t *
702
xfs_buf_get_empty(
L
Linus Torvalds 已提交
703 704 705
	size_t			len,
	xfs_buftarg_t		*target)
{
706
	xfs_buf_t		*bp;
L
Linus Torvalds 已提交
707

708 709 710 711
	bp = xfs_buf_allocate(0);
	if (bp)
		_xfs_buf_initialize(bp, target, 0, len, 0);
	return bp;
L
Linus Torvalds 已提交
712 713 714 715 716 717
}

/*
 * Return the struct page backing a kernel virtual address, using
 * vmalloc_to_page() for vmalloc addresses and virt_to_page() otherwise.
 */
static inline struct page *
mem_to_page(
	void			*addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);

	return virt_to_page(addr);
}

int
726 727
xfs_buf_associate_memory(
	xfs_buf_t		*bp,
L
Linus Torvalds 已提交
728 729 730 731 732
	void			*mem,
	size_t			len)
{
	int			rval;
	int			i = 0;
733 734 735
	unsigned long		pageaddr;
	unsigned long		offset;
	size_t			buflen;
L
Linus Torvalds 已提交
736 737
	int			page_count;

738
	pageaddr = (unsigned long)mem & PAGE_MASK;
739
	offset = (unsigned long)mem - pageaddr;
740 741
	buflen = PAGE_ALIGN(len + offset);
	page_count = buflen >> PAGE_SHIFT;
L
Linus Torvalds 已提交
742 743

	/* Free any previous set of page pointers */
744 745
	if (bp->b_pages)
		_xfs_buf_free_pages(bp);
L
Linus Torvalds 已提交
746

747 748
	bp->b_pages = NULL;
	bp->b_addr = mem;
L
Linus Torvalds 已提交
749

750
	rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK);
L
Linus Torvalds 已提交
751 752 753
	if (rval)
		return rval;

754
	bp->b_offset = offset;
755 756 757

	for (i = 0; i < bp->b_page_count; i++) {
		bp->b_pages[i] = mem_to_page((void *)pageaddr);
758
		pageaddr += PAGE_SIZE;
L
Linus Torvalds 已提交
759 760
	}

761 762
	bp->b_count_desired = len;
	bp->b_buffer_length = buflen;
763
	bp->b_flags |= XBF_MAPPED;
L
Linus Torvalds 已提交
764 765 766 767 768

	return 0;
}

xfs_buf_t *
769 770
xfs_buf_get_uncached(
	struct xfs_buftarg	*target,
L
Linus Torvalds 已提交
771
	size_t			len,
772
	int			flags)
L
Linus Torvalds 已提交
773
{
774 775
	unsigned long		page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
	int			error, i;
L
Linus Torvalds 已提交
776 777
	xfs_buf_t		*bp;

778
	bp = xfs_buf_allocate(0);
L
Linus Torvalds 已提交
779 780
	if (unlikely(bp == NULL))
		goto fail;
781
	_xfs_buf_initialize(bp, target, 0, len, 0);
L
Linus Torvalds 已提交
782

783 784
	error = _xfs_buf_get_pages(bp, page_count, 0);
	if (error)
L
Linus Torvalds 已提交
785 786
		goto fail_free_buf;

787
	for (i = 0; i < page_count; i++) {
788
		bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
789 790
		if (!bp->b_pages[i])
			goto fail_free_mem;
L
Linus Torvalds 已提交
791
	}
792
	bp->b_flags |= _XBF_PAGES;
L
Linus Torvalds 已提交
793

794 795
	error = _xfs_buf_map_pages(bp, XBF_MAPPED);
	if (unlikely(error)) {
796 797
		xfs_warn(target->bt_mount,
			"%s: failed to map pages\n", __func__);
L
Linus Torvalds 已提交
798
		goto fail_free_mem;
799
	}
L
Linus Torvalds 已提交
800

801
	xfs_buf_unlock(bp);
L
Linus Torvalds 已提交
802

803
	trace_xfs_buf_get_uncached(bp, _RET_IP_);
L
Linus Torvalds 已提交
804
	return bp;
805

L
Linus Torvalds 已提交
806
 fail_free_mem:
807 808
	while (--i >= 0)
		__free_page(bp->b_pages[i]);
809
	_xfs_buf_free_pages(bp);
L
Linus Torvalds 已提交
810
 fail_free_buf:
811
	xfs_buf_deallocate(bp);
L
Linus Torvalds 已提交
812 813 814 815 816 817 818 819 820 821
 fail:
	return NULL;
}

/*
 *	Increment reference count on buffer, to hold the buffer concurrently
 *	with another thread which may release (free) the buffer asynchronously.
 *	Must hold the buffer already to call this function.
 */
void
822 823
xfs_buf_hold(
	xfs_buf_t		*bp)
L
Linus Torvalds 已提交
824
{
C
Christoph Hellwig 已提交
825
	trace_xfs_buf_hold(bp, _RET_IP_);
826
	atomic_inc(&bp->b_hold);
L
Linus Torvalds 已提交
827 828 829
}

/*
830 831
 *	Releases a hold on the specified buffer.  If the
 *	the hold count is 1, calls xfs_buf_free.
L
Linus Torvalds 已提交
832 833
 */
void
834 835
xfs_buf_rele(
	xfs_buf_t		*bp)
L
Linus Torvalds 已提交
836
{
837
	struct xfs_perag	*pag = bp->b_pag;
L
Linus Torvalds 已提交
838

C
Christoph Hellwig 已提交
839
	trace_xfs_buf_rele(bp, _RET_IP_);
L
Linus Torvalds 已提交
840

841
	if (!pag) {
842
		ASSERT(list_empty(&bp->b_lru));
843
		ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
844 845 846 847 848
		if (atomic_dec_and_test(&bp->b_hold))
			xfs_buf_free(bp);
		return;
	}

849
	ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
850

851
	ASSERT(atomic_read(&bp->b_hold) > 0);
852
	if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
853
		if (!(bp->b_flags & XBF_STALE) &&
854 855 856
			   atomic_read(&bp->b_lru_ref)) {
			xfs_buf_lru_add(bp);
			spin_unlock(&pag->pag_buf_lock);
L
Linus Torvalds 已提交
857
		} else {
858
			xfs_buf_lru_del(bp);
859
			ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
860 861 862
			rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
			spin_unlock(&pag->pag_buf_lock);
			xfs_perag_put(pag);
863
			xfs_buf_free(bp);
L
Linus Torvalds 已提交
864 865 866 867 868 869
		}
	}
}


/*
870
 *	Lock a buffer object, if it is not already locked.
871 872 873 874 875 876 877 878
 *
 *	If we come across a stale, pinned, locked buffer, we know that we are
 *	being asked to lock a buffer that has been reallocated. Because it is
 *	pinned, we know that the log has not been pushed to disk and hence it
 *	will still be locked.  Rather than continuing to have trylock attempts
 *	fail until someone else pushes the log, push it ourselves before
 *	returning.  This means that the xfsaild will not get stuck trying
 *	to push on stale inode buffers.
L
Linus Torvalds 已提交
879 880
 */
int
881 882
xfs_buf_cond_lock(
	xfs_buf_t		*bp)
L
Linus Torvalds 已提交
883 884 885
{
	int			locked;

886
	locked = down_trylock(&bp->b_sema) == 0;
C
Christoph Hellwig 已提交
887
	if (locked)
888
		XB_SET_OWNER(bp);
889 890
	else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
		xfs_log_force(bp->b_target->bt_mount, 0);
C
Christoph Hellwig 已提交
891 892

	trace_xfs_buf_cond_lock(bp, _RET_IP_);
893
	return locked ? 0 : -EBUSY;
L
Linus Torvalds 已提交
894 895 896
}

int
897 898
xfs_buf_lock_value(
	xfs_buf_t		*bp)
L
Linus Torvalds 已提交
899
{
900
	return bp->b_sema.count;
L
Linus Torvalds 已提交
901 902 903
}

/*
904
 *	Lock a buffer object.
905 906 907 908 909 910
 *
 *	If we come across a stale, pinned, locked buffer, we know that we
 *	are being asked to lock a buffer that has been reallocated. Because
 *	it is pinned, we know that the log has not been pushed to disk and
 *	hence it will still be locked. Rather than sleeping until someone
 *	else pushes the log, push it ourselves before trying to get the lock.
L
Linus Torvalds 已提交
911
 */
912 913 914
void
xfs_buf_lock(
	xfs_buf_t		*bp)
L
Linus Torvalds 已提交
915
{
C
Christoph Hellwig 已提交
916 917
	trace_xfs_buf_lock(bp, _RET_IP_);

918
	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
919
		xfs_log_force(bp->b_target->bt_mount, 0);
920
	if (atomic_read(&bp->b_io_remaining))
J
Jens Axboe 已提交
921
		blk_flush_plug(current);
922 923
	down(&bp->b_sema);
	XB_SET_OWNER(bp);
C
Christoph Hellwig 已提交
924 925

	trace_xfs_buf_lock_done(bp, _RET_IP_);
L
Linus Torvalds 已提交
926 927 928
}

/*
929
 *	Releases the lock on the buffer object.
930
 *	If the buffer is marked delwri but is not queued, do so before we
931
 *	unlock the buffer as we need to set flags correctly.  We also need to
932 933
 *	take a reference for the delwri queue because the unlocker is going to
 *	drop their's and they don't know we just queued it.
L
Linus Torvalds 已提交
934 935
 */
void
936 937
xfs_buf_unlock(
	xfs_buf_t		*bp)
L
Linus Torvalds 已提交
938
{
939 940 941 942
	if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) {
		atomic_inc(&bp->b_hold);
		bp->b_flags |= XBF_ASYNC;
		xfs_buf_delwri_queue(bp, 0);
943 944
	}

945 946
	XB_CLEAR_OWNER(bp);
	up(&bp->b_sema);
C
Christoph Hellwig 已提交
947 948

	trace_xfs_buf_unlock(bp, _RET_IP_);
L
Linus Torvalds 已提交
949 950
}

951 952 953
/*
 * Sleep uninterruptibly on b_waiters until the buffer's pin count drops
 * to zero.  Returns immediately if the buffer is not pinned.
 */
STATIC void
xfs_buf_wait_unpin(
	xfs_buf_t		*bp)
{
	DECLARE_WAITQUEUE	(wait, current);

	if (atomic_read(&bp->b_pin_count) == 0)
		return;

	add_wait_queue(&bp->b_waiters, &wait);

	/* set the task state before each re-check of the pin count */
	set_current_state(TASK_UNINTERRUPTIBLE);
	while (atomic_read(&bp->b_pin_count) != 0) {
		io_schedule();
		set_current_state(TASK_UNINTERRUPTIBLE);
	}

	remove_wait_queue(&bp->b_waiters, &wait);
	set_current_state(TASK_RUNNING);
}

/*
 *	Buffer Utility Routines
 */

STATIC void
976
xfs_buf_iodone_work(
D
David Howells 已提交
977
	struct work_struct	*work)
L
Linus Torvalds 已提交
978
{
D
David Howells 已提交
979 980
	xfs_buf_t		*bp =
		container_of(work, xfs_buf_t, b_iodone_work);
L
Linus Torvalds 已提交
981

982
	if (bp->b_iodone)
983 984
		(*(bp->b_iodone))(bp);
	else if (bp->b_flags & XBF_ASYNC)
L
Linus Torvalds 已提交
985 986 987 988
		xfs_buf_relse(bp);
}

void
989 990
xfs_buf_ioend(
	xfs_buf_t		*bp,
L
Linus Torvalds 已提交
991 992
	int			schedule)
{
C
Christoph Hellwig 已提交
993 994
	trace_xfs_buf_iodone(bp, _RET_IP_);

995
	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
996 997
	if (bp->b_error == 0)
		bp->b_flags |= XBF_DONE;
L
Linus Torvalds 已提交
998

999
	if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
L
Linus Torvalds 已提交
1000
		if (schedule) {
D
David Howells 已提交
1001
			INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
1002
			queue_work(xfslogd_workqueue, &bp->b_iodone_work);
L
Linus Torvalds 已提交
1003
		} else {
D
David Howells 已提交
1004
			xfs_buf_iodone_work(&bp->b_iodone_work);
L
Linus Torvalds 已提交
1005 1006
		}
	} else {
1007
		complete(&bp->b_iowait);
L
Linus Torvalds 已提交
1008 1009 1010 1011
	}
}

void
1012 1013 1014
xfs_buf_ioerror(
	xfs_buf_t		*bp,
	int			error)
L
Linus Torvalds 已提交
1015 1016
{
	ASSERT(error >= 0 && error <= 0xffff);
1017
	bp->b_error = (unsigned short)error;
C
Christoph Hellwig 已提交
1018
	trace_xfs_buf_ioerror(bp, error, _RET_IP_);
L
Linus Torvalds 已提交
1019 1020 1021
}

int
C
Christoph Hellwig 已提交
1022 1023
xfs_bwrite(
	struct xfs_mount	*mp,
C
Christoph Hellwig 已提交
1024
	struct xfs_buf		*bp)
L
Linus Torvalds 已提交
1025
{
1026
	int			error;
L
Linus Torvalds 已提交
1027

C
Christoph Hellwig 已提交
1028
	bp->b_flags |= XBF_WRITE;
1029
	bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
L
Linus Torvalds 已提交
1030

C
Christoph Hellwig 已提交
1031
	xfs_buf_delwri_dequeue(bp);
1032
	xfs_bdstrat_cb(bp);
L
Linus Torvalds 已提交
1033

1034 1035 1036 1037
	error = xfs_buf_iowait(bp);
	if (error)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	xfs_buf_relse(bp);
C
Christoph Hellwig 已提交
1038
	return error;
C
Christoph Hellwig 已提交
1039
}
L
Linus Torvalds 已提交
1040

C
Christoph Hellwig 已提交
1041 1042 1043 1044 1045
void
xfs_bdwrite(
	void			*mp,
	struct xfs_buf		*bp)
{
C
Christoph Hellwig 已提交
1046
	trace_xfs_buf_bdwrite(bp, _RET_IP_);
L
Linus Torvalds 已提交
1047

C
Christoph Hellwig 已提交
1048 1049 1050 1051
	bp->b_flags &= ~XBF_READ;
	bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);

	xfs_buf_delwri_queue(bp, 1);
L
Linus Torvalds 已提交
1052 1053
}

1054 1055
/*
 * Called when we want to stop a buffer from getting written or read.
 * We attach the EIO error, adjust the buffer's flags, and call
 * xfs_buf_ioend so that the proper iodone callbacks get called.
 * Always returns EIO.
 */
STATIC int
xfs_bioerror(
	xfs_buf_t *bp)
{
#ifdef XFSERRORDEBUG
	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
#endif

	/*
	 * No need to wait until the buffer is unpinned, we aren't flushing it.
	 */
	XFS_BUF_ERROR(bp, EIO);

	/*
	 * We're calling xfs_buf_ioend, so clear the read, delayed-write
	 * and done state and mark the buffer stale first.
	 */
	XFS_BUF_UNREAD(bp);
	XFS_BUF_UNDELAYWRITE(bp);
	XFS_BUF_UNDONE(bp);
	XFS_BUF_STALE(bp);

	xfs_buf_ioend(bp, 0);

	return EIO;
}

/*
 * Same as xfs_bioerror, except that we are releasing the buffer
C
Christoph Hellwig 已提交
1087
 * here ourselves, and avoiding the xfs_buf_ioend call.
1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108
 * This is meant for userdata errors; metadata bufs come with
 * iodone functions attached, so that we can track down errors.
 */
STATIC int
xfs_bioerror_relse(
	struct xfs_buf	*bp)
{
	int64_t		fl = XFS_BUF_BFLAGS(bp);
	/*
	 * No need to wait until the buffer is unpinned.
	 * We aren't flushing it.
	 *
	 * chunkhold expects B_DONE to be set, whether
	 * we actually finish the I/O or not. We don't want to
	 * change that interface.
	 */
	XFS_BUF_UNREAD(bp);
	XFS_BUF_UNDELAYWRITE(bp);
	XFS_BUF_DONE(bp);
	XFS_BUF_STALE(bp);
	XFS_BUF_CLR_IODONE_FUNC(bp);
1109
	if (!(fl & XBF_ASYNC)) {
1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135
		/*
		 * Mark b_error and B_ERROR _both_.
		 * Lot's of chunkcache code assumes that.
		 * There's no reason to mark error for
		 * ASYNC buffers.
		 */
		XFS_BUF_ERROR(bp, EIO);
		XFS_BUF_FINISH_IOWAIT(bp);
	} else {
		xfs_buf_relse(bp);
	}

	return EIO;
}


/*
 * All xfs metadata buffers except log state machine buffers
 * get this attached as their b_bdstrat callback function.
 * This is so that we can catch a buffer
 * after prematurely unpinning it to forcibly shutdown the filesystem.
 */
int
xfs_bdstrat_cb(
	struct xfs_buf	*bp)
{
1136
	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171
		trace_xfs_bdstrat_shut(bp, _RET_IP_);
		/*
		 * Metadata write that didn't get logged but
		 * written delayed anyway. These aren't associated
		 * with a transaction, and can be ignored.
		 */
		if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
			return xfs_bioerror_relse(bp);
		else
			return xfs_bioerror(bp);
	}

	xfs_buf_iorequest(bp);
	return 0;
}

/*
 * Shutdown-aware front end to xfs_buf_iorequest().  When the filesystem has
 * been forcibly shut down the buffer is not sent to disk; it is traced and
 * handed to xfs_bioerror_relse() instead.  Typically user data goes thru
 * this path; one of the exceptions is the superblock.
 */
void
xfsbdstrat(
	struct xfs_mount	*mp,
	struct xfs_buf		*bp)
{
	/* Normal case: the mount is healthy, so just issue the I/O. */
	if (!XFS_FORCED_SHUTDOWN(mp)) {
		xfs_buf_iorequest(bp);
		return;
	}

	trace_xfs_bdstrat_shut(bp, _RET_IP_);
	xfs_bioerror_relse(bp);
}

1172
STATIC void
1173 1174
_xfs_buf_ioend(
	xfs_buf_t		*bp,
L
Linus Torvalds 已提交
1175 1176
	int			schedule)
{
1177
	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
1178
		xfs_buf_ioend(bp, schedule);
L
Linus Torvalds 已提交
1179 1180
}

A
Al Viro 已提交
1181
STATIC void
1182
xfs_buf_bio_end_io(
L
Linus Torvalds 已提交
1183 1184 1185
	struct bio		*bio,
	int			error)
{
1186
	xfs_buf_t		*bp = (xfs_buf_t *)bio->bi_private;
L
Linus Torvalds 已提交
1187

1188
	xfs_buf_ioerror(bp, -error);
L
Linus Torvalds 已提交
1189

1190 1191 1192
	if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));

1193
	_xfs_buf_ioend(bp, 1);
L
Linus Torvalds 已提交
1194 1195 1196 1197
	bio_put(bio);
}

STATIC void
1198 1199
_xfs_buf_ioapply(
	xfs_buf_t		*bp)
L
Linus Torvalds 已提交
1200
{
1201
	int			rw, map_i, total_nr_pages, nr_pages;
L
Linus Torvalds 已提交
1202
	struct bio		*bio;
1203 1204 1205
	int			offset = bp->b_offset;
	int			size = bp->b_count_desired;
	sector_t		sector = bp->b_bn;
L
Linus Torvalds 已提交
1206

1207
	total_nr_pages = bp->b_page_count;
L
Linus Torvalds 已提交
1208 1209
	map_i = 0;

1210 1211
	if (bp->b_flags & XBF_ORDERED) {
		ASSERT(!(bp->b_flags & XBF_READ));
1212
		rw = WRITE_FLUSH_FUA;
1213
	} else if (bp->b_flags & XBF_LOG_BUFFER) {
1214 1215 1216
		ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
		bp->b_flags &= ~_XBF_RUN_QUEUES;
		rw = (bp->b_flags & XBF_WRITE) ? WRITE_SYNC : READ_SYNC;
1217 1218 1219 1220
	} else if (bp->b_flags & _XBF_RUN_QUEUES) {
		ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
		bp->b_flags &= ~_XBF_RUN_QUEUES;
		rw = (bp->b_flags & XBF_WRITE) ? WRITE_META : READ_META;
1221 1222 1223
	} else {
		rw = (bp->b_flags & XBF_WRITE) ? WRITE :
		     (bp->b_flags & XBF_READ_AHEAD) ? READA : READ;
1224 1225
	}

L
Linus Torvalds 已提交
1226 1227

next_chunk:
1228
	atomic_inc(&bp->b_io_remaining);
L
Linus Torvalds 已提交
1229 1230 1231 1232 1233
	nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
	if (nr_pages > total_nr_pages)
		nr_pages = total_nr_pages;

	bio = bio_alloc(GFP_NOIO, nr_pages);
1234
	bio->bi_bdev = bp->b_target->bt_bdev;
L
Linus Torvalds 已提交
1235
	bio->bi_sector = sector;
1236 1237
	bio->bi_end_io = xfs_buf_bio_end_io;
	bio->bi_private = bp;
L
Linus Torvalds 已提交
1238

1239

L
Linus Torvalds 已提交
1240
	for (; size && nr_pages; nr_pages--, map_i++) {
1241
		int	rbytes, nbytes = PAGE_SIZE - offset;
L
Linus Torvalds 已提交
1242 1243 1244 1245

		if (nbytes > size)
			nbytes = size;

1246 1247
		rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset);
		if (rbytes < nbytes)
L
Linus Torvalds 已提交
1248 1249 1250 1251 1252 1253 1254 1255 1256
			break;

		offset = 0;
		sector += nbytes >> BBSHIFT;
		size -= nbytes;
		total_nr_pages--;
	}

	if (likely(bio->bi_size)) {
1257 1258 1259 1260
		if (xfs_buf_is_vmapped(bp)) {
			flush_kernel_vmap_range(bp->b_addr,
						xfs_buf_vmap_len(bp));
		}
L
Linus Torvalds 已提交
1261 1262 1263 1264
		submit_bio(rw, bio);
		if (size)
			goto next_chunk;
	} else {
1265
		xfs_buf_ioerror(bp, EIO);
1266
		bio_put(bio);
L
Linus Torvalds 已提交
1267 1268 1269 1270
	}
}

int
1271 1272
xfs_buf_iorequest(
	xfs_buf_t		*bp)
L
Linus Torvalds 已提交
1273
{
C
Christoph Hellwig 已提交
1274
	trace_xfs_buf_iorequest(bp, _RET_IP_);
L
Linus Torvalds 已提交
1275

1276 1277
	if (bp->b_flags & XBF_DELWRI) {
		xfs_buf_delwri_queue(bp, 1);
L
Linus Torvalds 已提交
1278 1279 1280
		return 0;
	}

1281 1282
	if (bp->b_flags & XBF_WRITE) {
		xfs_buf_wait_unpin(bp);
L
Linus Torvalds 已提交
1283 1284
	}

1285
	xfs_buf_hold(bp);
L
Linus Torvalds 已提交
1286 1287 1288

	/* Set the count to 1 initially, this will stop an I/O
	 * completion callout which happens before we have started
1289
	 * all the I/O from calling xfs_buf_ioend too early.
L
Linus Torvalds 已提交
1290
	 */
1291 1292 1293
	atomic_set(&bp->b_io_remaining, 1);
	_xfs_buf_ioapply(bp);
	_xfs_buf_ioend(bp, 0);
L
Linus Torvalds 已提交
1294

1295
	xfs_buf_rele(bp);
L
Linus Torvalds 已提交
1296 1297 1298 1299
	return 0;
}

/*
1300 1301 1302
 *	Waits for I/O to complete on the buffer supplied.
 *	It returns immediately if no I/O is pending.
 *	It returns the I/O error code, if any, or 0 if there was no error.
L
Linus Torvalds 已提交
1303 1304
 */
int
1305 1306
xfs_buf_iowait(
	xfs_buf_t		*bp)
L
Linus Torvalds 已提交
1307
{
C
Christoph Hellwig 已提交
1308 1309
	trace_xfs_buf_iowait(bp, _RET_IP_);

1310
	if (atomic_read(&bp->b_io_remaining))
J
Jens Axboe 已提交
1311
		blk_flush_plug(current);
1312
	wait_for_completion(&bp->b_iowait);
C
Christoph Hellwig 已提交
1313 1314

	trace_xfs_buf_iowait_done(bp, _RET_IP_);
1315
	return bp->b_error;
L
Linus Torvalds 已提交
1316 1317
}

1318 1319 1320
xfs_caddr_t
xfs_buf_offset(
	xfs_buf_t		*bp,
L
Linus Torvalds 已提交
1321 1322 1323 1324
	size_t			offset)
{
	struct page		*page;

1325 1326
	if (bp->b_flags & XBF_MAPPED)
		return XFS_BUF_PTR(bp) + offset;
L
Linus Torvalds 已提交
1327

1328
	offset += bp->b_offset;
1329 1330
	page = bp->b_pages[offset >> PAGE_SHIFT];
	return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
L
Linus Torvalds 已提交
1331 1332 1333 1334 1335 1336
}

/*
 *	Move data into or out of a buffer.
 */
void
1337 1338
xfs_buf_iomove(
	xfs_buf_t		*bp,	/* buffer to process		*/
L
Linus Torvalds 已提交
1339 1340
	size_t			boff,	/* starting buffer offset	*/
	size_t			bsize,	/* length to copy		*/
1341
	void			*data,	/* data address			*/
1342
	xfs_buf_rw_t		mode)	/* read/write/zero flag		*/
L
Linus Torvalds 已提交
1343 1344 1345 1346 1347 1348
{
	size_t			bend, cpoff, csize;
	struct page		*page;

	bend = boff + bsize;
	while (boff < bend) {
1349 1350
		page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
		cpoff = xfs_buf_poff(boff + bp->b_offset);
L
Linus Torvalds 已提交
1351
		csize = min_t(size_t,
1352
			      PAGE_SIZE-cpoff, bp->b_count_desired-boff);
L
Linus Torvalds 已提交
1353

1354
		ASSERT(((csize + cpoff) <= PAGE_SIZE));
L
Linus Torvalds 已提交
1355 1356

		switch (mode) {
1357
		case XBRW_ZERO:
L
Linus Torvalds 已提交
1358 1359
			memset(page_address(page) + cpoff, 0, csize);
			break;
1360
		case XBRW_READ:
L
Linus Torvalds 已提交
1361 1362
			memcpy(data, page_address(page) + cpoff, csize);
			break;
1363
		case XBRW_WRITE:
L
Linus Torvalds 已提交
1364 1365 1366 1367 1368 1369 1370 1371 1372
			memcpy(page_address(page) + cpoff, data, csize);
		}

		boff += csize;
		data += csize;
	}
}

/*
1373
 *	Handling of buffer targets (buftargs).
L
Linus Torvalds 已提交
1374 1375 1376
 */

/*
1377 1378 1379
 * Wait for any bufs with callbacks that have been submitted but have not yet
 * returned. These buffers will have an elevated hold count, so wait on those
 * while freeing all the buffers only held by the LRU.
L
Linus Torvalds 已提交
1380 1381 1382
 */
void
xfs_wait_buftarg(
1383
	struct xfs_buftarg	*btp)
L
Linus Torvalds 已提交
1384
{
1385 1386 1387 1388 1389 1390 1391 1392
	struct xfs_buf		*bp;

restart:
	spin_lock(&btp->bt_lru_lock);
	while (!list_empty(&btp->bt_lru)) {
		bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
		if (atomic_read(&bp->b_hold) > 1) {
			spin_unlock(&btp->bt_lru_lock);
D
Dave Chinner 已提交
1393
			delay(100);
1394
			goto restart;
L
Linus Torvalds 已提交
1395
		}
1396 1397 1398 1399 1400 1401 1402 1403
		/*
		 * clear the LRU reference count so the bufer doesn't get
		 * ignored in xfs_buf_rele().
		 */
		atomic_set(&bp->b_lru_ref, 0);
		spin_unlock(&btp->bt_lru_lock);
		xfs_buf_rele(bp);
		spin_lock(&btp->bt_lru_lock);
L
Linus Torvalds 已提交
1404
	}
1405
	spin_unlock(&btp->bt_lru_lock);
L
Linus Torvalds 已提交
1406 1407
}

1408 1409 1410 1411 1412
/*
 * Shrinker callback for a buffer target's LRU.
 *
 * @shrink:     shrinker embedded in the xfs_buftarg (bt_shrinker)
 * @nr_to_scan: number of LRU entries to examine; 0 means "just report"
 * @mask:       allocation context of the caller (unused here)
 *
 * Returns the number of buffers on the LRU (bt_lru_nr).
 */
int
xfs_buftarg_shrink(
	struct shrinker		*shrink,
	int			nr_to_scan,
	gfp_t			mask)
{
	struct xfs_buftarg	*btp = container_of(shrink,
					struct xfs_buftarg, bt_shrinker);
	struct xfs_buf		*bp;
	LIST_HEAD(dispose);

	if (!nr_to_scan)
		return btp->bt_lru_nr;

	spin_lock(&btp->bt_lru_lock);
	while (!list_empty(&btp->bt_lru)) {
		if (nr_to_scan-- <= 0)
			break;

		bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);

		/*
		 * Decrement the b_lru_ref count unless the value is already
		 * zero. If the value is already zero, we need to reclaim the
		 * buffer, otherwise it gets another trip through the LRU.
		 *
		 * atomic_add_unless() returns non-zero when the decrement
		 * was performed, i.e. when the count was not yet zero.
		 */
		if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
			list_move_tail(&bp->b_lru, &btp->bt_lru);
			continue;
		}

		/*
		 * remove the buffer from the LRU now to avoid needing another
		 * lock round trip inside xfs_buf_rele().
		 */
		list_move(&bp->b_lru, &dispose);
		btp->bt_lru_nr--;
	}
	spin_unlock(&btp->bt_lru_lock);

	/* release the collected buffers with the LRU lock dropped */
	while (!list_empty(&dispose)) {
		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
		list_del_init(&bp->b_lru);
		xfs_buf_rele(bp);
	}

	return btp->bt_lru_nr;
}

L
Linus Torvalds 已提交
1457 1458
void
xfs_free_buftarg(
1459 1460
	struct xfs_mount	*mp,
	struct xfs_buftarg	*btp)
L
Linus Torvalds 已提交
1461
{
1462 1463
	unregister_shrinker(&btp->bt_shrinker);

L
Linus Torvalds 已提交
1464
	xfs_flush_buftarg(btp, 1);
1465 1466
	if (mp->m_flags & XFS_MOUNT_BARRIER)
		xfs_blkdev_issue_flush(btp);
1467 1468

	kthread_stop(btp->bt_task);
1469
	kmem_free(btp);
L
Linus Torvalds 已提交
1470 1471 1472 1473 1474 1475 1476 1477 1478
}

/*
 * Record the block and sector geometry on a buffer target and apply the
 * sector size to the underlying block device.
 *
 * @blocksize:  stored in bt_bsize
 * @sectorsize: used to derive bt_sshift and bt_smask, and passed to
 *              set_blocksize()
 * @verbose:    not consulted by this function
 *
 * Returns 0 on success, or EINVAL (positive) if set_blocksize() fails.
 */
STATIC int
xfs_setsize_buftarg_flags(
	xfs_buftarg_t		*btp,
	unsigned int		blocksize,
	unsigned int		sectorsize,
	int			verbose)
{
	btp->bt_bsize = blocksize;
	btp->bt_sshift = ffs(sectorsize) - 1;
	btp->bt_smask = sectorsize - 1;

	if (set_blocksize(btp->bt_bdev, sectorsize)) {
		xfs_warn(btp->bt_mount,
			"Cannot set_blocksize to %u on device %s\n",
			sectorsize, XFS_BUFTARG_NAME(btp));
		return EINVAL;
	}

	return 0;
}

/*
1494 1495 1496 1497
 *	When allocating the initial buffer target we have not yet
 *	read in the superblock, so don't know what sized sectors
 *	are being used is at this early stage.  Play safe.
 */
L
Linus Torvalds 已提交
1498 1499 1500 1501 1502 1503
STATIC int
xfs_setsize_buftarg_early(
	xfs_buftarg_t		*btp,
	struct block_device	*bdev)
{
	return xfs_setsize_buftarg_flags(btp,
1504
			PAGE_SIZE, bdev_logical_block_size(bdev), 0);
L
Linus Torvalds 已提交
1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515
}

/*
 * Set the block and sector size of a buffer target.  Thin wrapper that
 * forwards to xfs_setsize_buftarg_flags() with the verbose argument set
 * to 1 and returns its result.
 */
int
xfs_setsize_buftarg(
	xfs_buftarg_t		*btp,
	unsigned int		blocksize,
	unsigned int		sectorsize)
{
	return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
}

1516 1517
STATIC int
xfs_alloc_delwrite_queue(
1518 1519
	xfs_buftarg_t		*btp,
	const char		*fsname)
1520 1521
{
	INIT_LIST_HEAD(&btp->bt_delwrite_queue);
E
Eric Sandeen 已提交
1522
	spin_lock_init(&btp->bt_delwrite_lock);
1523
	btp->bt_flags = 0;
1524
	btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
1525 1526 1527
	if (IS_ERR(btp->bt_task))
		return PTR_ERR(btp->bt_task);
	return 0;
1528 1529
}

L
Linus Torvalds 已提交
1530 1531
xfs_buftarg_t *
xfs_alloc_buftarg(
1532
	struct xfs_mount	*mp,
L
Linus Torvalds 已提交
1533
	struct block_device	*bdev,
1534 1535
	int			external,
	const char		*fsname)
L
Linus Torvalds 已提交
1536 1537 1538 1539 1540
{
	xfs_buftarg_t		*btp;

	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);

1541
	btp->bt_mount = mp;
1542 1543
	btp->bt_dev =  bdev->bd_dev;
	btp->bt_bdev = bdev;
1544 1545 1546 1547
	btp->bt_bdi = blk_get_backing_dev_info(bdev);
	if (!btp->bt_bdi)
		goto error;

1548 1549
	INIT_LIST_HEAD(&btp->bt_lru);
	spin_lock_init(&btp->bt_lru_lock);
L
Linus Torvalds 已提交
1550 1551
	if (xfs_setsize_buftarg_early(btp, bdev))
		goto error;
1552
	if (xfs_alloc_delwrite_queue(btp, fsname))
1553
		goto error;
1554 1555 1556
	btp->bt_shrinker.shrink = xfs_buftarg_shrink;
	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
	register_shrinker(&btp->bt_shrinker);
L
Linus Torvalds 已提交
1557 1558 1559
	return btp;

error:
1560
	kmem_free(btp);
L
Linus Torvalds 已提交
1561 1562 1563 1564 1565
	return NULL;
}


/*
1566
 *	Delayed write buffer handling
L
Linus Torvalds 已提交
1567 1568
 */
STATIC void
1569 1570
xfs_buf_delwri_queue(
	xfs_buf_t		*bp,
L
Linus Torvalds 已提交
1571 1572
	int			unlock)
{
1573 1574
	struct list_head	*dwq = &bp->b_target->bt_delwrite_queue;
	spinlock_t		*dwlk = &bp->b_target->bt_delwrite_lock;
1575

C
Christoph Hellwig 已提交
1576 1577
	trace_xfs_buf_delwri_queue(bp, _RET_IP_);

1578
	ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC));
L
Linus Torvalds 已提交
1579

1580
	spin_lock(dwlk);
L
Linus Torvalds 已提交
1581
	/* If already in the queue, dequeue and place at tail */
1582 1583 1584 1585 1586
	if (!list_empty(&bp->b_list)) {
		ASSERT(bp->b_flags & _XBF_DELWRI_Q);
		if (unlock)
			atomic_dec(&bp->b_hold);
		list_del(&bp->b_list);
L
Linus Torvalds 已提交
1587 1588
	}

D
Dave Chinner 已提交
1589 1590 1591 1592 1593
	if (list_empty(dwq)) {
		/* start xfsbufd as it is about to have something to do */
		wake_up_process(bp->b_target->bt_task);
	}

1594 1595 1596
	bp->b_flags |= _XBF_DELWRI_Q;
	list_add_tail(&bp->b_list, dwq);
	bp->b_queuetime = jiffies;
1597
	spin_unlock(dwlk);
L
Linus Torvalds 已提交
1598 1599

	if (unlock)
1600
		xfs_buf_unlock(bp);
L
Linus Torvalds 已提交
1601 1602 1603
}

void
1604 1605
xfs_buf_delwri_dequeue(
	xfs_buf_t		*bp)
L
Linus Torvalds 已提交
1606
{
1607
	spinlock_t		*dwlk = &bp->b_target->bt_delwrite_lock;
L
Linus Torvalds 已提交
1608 1609
	int			dequeued = 0;

1610
	spin_lock(dwlk);
1611 1612 1613
	if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) {
		ASSERT(bp->b_flags & _XBF_DELWRI_Q);
		list_del_init(&bp->b_list);
L
Linus Torvalds 已提交
1614 1615
		dequeued = 1;
	}
1616
	bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
1617
	spin_unlock(dwlk);
L
Linus Torvalds 已提交
1618 1619

	if (dequeued)
1620
		xfs_buf_rele(bp);
L
Linus Torvalds 已提交
1621

C
Christoph Hellwig 已提交
1622
	trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
L
Linus Torvalds 已提交
1623 1624
}

1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653
/*
 * If a delwri buffer needs to be pushed before it has aged out, then promote
 * it to the head of the delwri queue so that it will be flushed on the next
 * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
 * than the age currently needed to flush the buffer. Hence the next time the
 * xfsbufd sees it is guaranteed to be considered old enough to flush.
 */
void
xfs_buf_delwri_promote(
	struct xfs_buf	*bp)
{
	struct xfs_buftarg *btp = bp->b_target;
	long		age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;

	ASSERT(bp->b_flags & XBF_DELWRI);
	ASSERT(bp->b_flags & _XBF_DELWRI_Q);

	/*
	 * Check the buffer age before locking the delayed write queue as we
	 * don't need to promote buffers that are already past the flush age.
	 *
	 * jiffies values wrap, so they must be compared with time_before()
	 * rather than a plain '<'.
	 */
	if (time_before(bp->b_queuetime, jiffies - age))
		return;
	bp->b_queuetime = jiffies - age;
	spin_lock(&btp->bt_delwrite_lock);
	list_move(&bp->b_list, &btp->bt_delwrite_queue);
	spin_unlock(&btp->bt_delwrite_lock);
}

L
Linus Torvalds 已提交
1654
/*
 * Flush the given workqueue by calling flush_workqueue() on it.
 */
STATIC void
xfs_buf_runall_queues(
	struct workqueue_struct	*queue)
{
	flush_workqueue(queue);
}

1661 1662 1663 1664 1665 1666 1667 1668
/*
 * Move as many buffers as specified to the supplied list
 * indicating if we skipped any buffers to prevent deadlocks.
 *
 * @target: buffer target whose delayed write queue is scanned
 * @list:   initialised here; receives the buffers selected for writing
 * @age:    minimum time in jiffies a buffer must have been queued, unless
 *          XBT_FORCE_FLUSH was set on the target (the bit is cleared here)
 *
 * Buffers moved to @list are locked, have their delwri flags cleared and
 * XBF_WRITE set.  The scan stops at the first buffer that is too young.
 *
 * Returns the number of buffers skipped because they were pinned or could
 * not be locked.
 */
STATIC int
xfs_buf_delwri_split(
	xfs_buftarg_t	*target,
	struct list_head *list,
	unsigned long	age)
{
	xfs_buf_t	*bp, *n;
	struct list_head *dwq = &target->bt_delwrite_queue;
	spinlock_t	*dwlk = &target->bt_delwrite_lock;
	int		skipped = 0;
	int		force;

	force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
	INIT_LIST_HEAD(list);
	spin_lock(dwlk);
	list_for_each_entry_safe(bp, n, dwq, b_list) {
		ASSERT(bp->b_flags & XBF_DELWRI);

		if (!XFS_BUF_ISPINNED(bp) && !xfs_buf_cond_lock(bp)) {
			if (!force &&
			    time_before(jiffies, bp->b_queuetime + age)) {
				xfs_buf_unlock(bp);
				break;
			}

			bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q|
					 _XBF_RUN_QUEUES);
			bp->b_flags |= XBF_WRITE;
			list_move_tail(&bp->b_list, list);
			trace_xfs_buf_delwri_split(bp, _RET_IP_);
		} else
			skipped++;
	}
	spin_unlock(dwlk);

	return skipped;

}

1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734
/*
 * list_sort() comparison callback ordering buffers by their b_bn value.
 * The block numbers are wider than the int return value, so the difference
 * cannot simply be returned; it is reduced to -1, 0 or 1 instead.
 */
static int
xfs_buf_cmp(
	void		*priv,
	struct list_head *a,
	struct list_head *b)
{
	struct xfs_buf	*lhs = container_of(a, struct xfs_buf, b_list);
	struct xfs_buf	*rhs = container_of(b, struct xfs_buf, b_list);
	xfs_daddr_t	delta = lhs->b_bn - rhs->b_bn;

	if (delta == 0)
		return 0;
	return (delta < 0) ? -1 : 1;
}

/*
 * Sort a list of buffers (linked through b_list) using xfs_buf_cmp as the
 * ordering function.  @target is not used.
 */
void
xfs_buf_delwri_sort(
	xfs_buftarg_t	*target,
	struct list_head *list)
{
	list_sort(NULL, list, xfs_buf_cmp);
}

L
Linus Torvalds 已提交
1735
STATIC int
1736
xfsbufd(
1737
	void		*data)
L
Linus Torvalds 已提交
1738
{
1739
	xfs_buftarg_t   *target = (xfs_buftarg_t *)data;
L
Linus Torvalds 已提交
1740 1741 1742

	current->flags |= PF_MEMALLOC;

1743 1744
	set_freezable();

L
Linus Torvalds 已提交
1745
	do {
D
Dave Chinner 已提交
1746 1747
		long	age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
		long	tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
1748 1749
		int	count = 0;
		struct list_head tmp;
D
Dave Chinner 已提交
1750

1751
		if (unlikely(freezing(current))) {
1752
			set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
1753
			refrigerator();
1754
		} else {
1755
			clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
1756
		}
L
Linus Torvalds 已提交
1757

D
Dave Chinner 已提交
1758 1759 1760 1761
		/* sleep for a long time if there is nothing to do. */
		if (list_empty(&target->bt_delwrite_queue))
			tout = MAX_SCHEDULE_TIMEOUT;
		schedule_timeout_interruptible(tout);
L
Linus Torvalds 已提交
1762

D
Dave Chinner 已提交
1763
		xfs_buf_delwri_split(target, &tmp, age);
1764
		list_sort(NULL, &tmp, xfs_buf_cmp);
L
Linus Torvalds 已提交
1765
		while (!list_empty(&tmp)) {
1766 1767
			struct xfs_buf *bp;
			bp = list_first_entry(&tmp, struct xfs_buf, b_list);
1768
			list_del_init(&bp->b_list);
1769
			xfs_bdstrat_cb(bp);
1770
			count++;
L
Linus Torvalds 已提交
1771
		}
1772
		if (count)
J
Jens Axboe 已提交
1773
			blk_flush_plug(current);
L
Linus Torvalds 已提交
1774

1775
	} while (!kthread_should_stop());
L
Linus Torvalds 已提交
1776

1777
	return 0;
L
Linus Torvalds 已提交
1778 1779 1780
}

/*
1781 1782 1783
 *	Go through all incore buffers, and release buffers if they belong to
 *	the given device. This is used in filesystem error handling to
 *	preserve the consistency of its metadata.
L
Linus Torvalds 已提交
1784 1785 1786
 */
int
xfs_flush_buftarg(
1787 1788
	xfs_buftarg_t	*target,
	int		wait)
L
Linus Torvalds 已提交
1789
{
1790
	xfs_buf_t	*bp;
1791
	int		pincount = 0;
1792 1793
	LIST_HEAD(tmp_list);
	LIST_HEAD(wait_list);
L
Linus Torvalds 已提交
1794

1795
	xfs_buf_runall_queues(xfsconvertd_workqueue);
1796 1797
	xfs_buf_runall_queues(xfsdatad_workqueue);
	xfs_buf_runall_queues(xfslogd_workqueue);
L
Linus Torvalds 已提交
1798

1799
	set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
1800
	pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
L
Linus Torvalds 已提交
1801 1802

	/*
1803 1804 1805
	 * Dropped the delayed write list lock, now walk the temporary list.
	 * All I/O is issued async and then if we need to wait for completion
	 * we do that after issuing all the IO.
L
Linus Torvalds 已提交
1806
	 */
1807 1808 1809
	list_sort(NULL, &tmp_list, xfs_buf_cmp);
	while (!list_empty(&tmp_list)) {
		bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
1810
		ASSERT(target == bp->b_target);
1811 1812
		list_del_init(&bp->b_list);
		if (wait) {
1813
			bp->b_flags &= ~XBF_ASYNC;
1814 1815
			list_add(&bp->b_list, &wait_list);
		}
1816
		xfs_bdstrat_cb(bp);
L
Linus Torvalds 已提交
1817 1818
	}

1819 1820
	if (wait) {
		/* Expedite and wait for IO to complete. */
J
Jens Axboe 已提交
1821
		blk_flush_plug(current);
1822 1823
		while (!list_empty(&wait_list)) {
			bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
1824

1825
			list_del_init(&bp->b_list);
C
Christoph Hellwig 已提交
1826
			xfs_buf_iowait(bp);
1827 1828
			xfs_buf_relse(bp);
		}
L
Linus Torvalds 已提交
1829 1830 1831 1832 1833
	}

	return pincount;
}

1834
int __init
1835
xfs_buf_init(void)
L
Linus Torvalds 已提交
1836
{
1837 1838
	xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
						KM_ZONE_HWALIGN, NULL);
1839
	if (!xfs_buf_zone)
C
Christoph Hellwig 已提交
1840
		goto out;
1841

1842
	xfslogd_workqueue = alloc_workqueue("xfslogd",
1843
					WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
1844
	if (!xfslogd_workqueue)
1845
		goto out_free_buf_zone;
L
Linus Torvalds 已提交
1846

T
Tejun Heo 已提交
1847
	xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1);
1848 1849
	if (!xfsdatad_workqueue)
		goto out_destroy_xfslogd_workqueue;
L
Linus Torvalds 已提交
1850

T
Tejun Heo 已提交
1851 1852
	xfsconvertd_workqueue = alloc_workqueue("xfsconvertd",
						WQ_MEM_RECLAIM, 1);
1853 1854 1855
	if (!xfsconvertd_workqueue)
		goto out_destroy_xfsdatad_workqueue;

1856
	return 0;
L
Linus Torvalds 已提交
1857

1858 1859
 out_destroy_xfsdatad_workqueue:
	destroy_workqueue(xfsdatad_workqueue);
1860 1861 1862
 out_destroy_xfslogd_workqueue:
	destroy_workqueue(xfslogd_workqueue);
 out_free_buf_zone:
1863
	kmem_zone_destroy(xfs_buf_zone);
C
Christoph Hellwig 已提交
1864
 out:
1865
	return -ENOMEM;
L
Linus Torvalds 已提交
1866 1867 1868
}

void
1869
xfs_buf_terminate(void)
L
Linus Torvalds 已提交
1870
{
1871
	destroy_workqueue(xfsconvertd_workqueue);
1872 1873
	destroy_workqueue(xfsdatad_workqueue);
	destroy_workqueue(xfslogd_workqueue);
1874
	kmem_zone_destroy(xfs_buf_zone);
L
Linus Torvalds 已提交
1875
}
1876 1877 1878 1879 1880 1881 1882 1883

#ifdef CONFIG_KDB_MODULES
/*
 * Return the address of xfs_buftarg_list.  Only built when
 * CONFIG_KDB_MODULES is defined.
 * NOTE(review): xfs_buftarg_list is not defined in the visible part of
 * this file - confirm it still exists before enabling this configuration.
 */
struct list_head *
xfs_get_buftarg_list(void)
{
	return &xfs_buftarg_list;
}
#endif