xfs_buf.c 38.3 KB
Newer Older
L
Linus Torvalds 已提交
1
/*
2
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3
 * All Rights Reserved.
L
Linus Torvalds 已提交
4
 *
5 6
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
L
Linus Torvalds 已提交
7 8
 * published by the Free Software Foundation.
 *
9 10 11 12
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
L
Linus Torvalds 已提交
13
 *
14 15 16
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
L
Linus Torvalds 已提交
17
 */
18
#include "xfs.h"
L
Linus Torvalds 已提交
19 20
#include <linux/stddef.h>
#include <linux/errno.h>
21
#include <linux/gfp.h>
L
Linus Torvalds 已提交
22 23 24 25 26 27 28 29 30 31
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/bio.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include <linux/workqueue.h>
#include <linux/percpu.h>
#include <linux/blkdev.h>
#include <linux/hash.h>
32
#include <linux/kthread.h>
C
Christoph Lameter 已提交
33
#include <linux/migrate.h>
34
#include <linux/backing-dev.h>
35
#include <linux/freezer.h>
L
Linus Torvalds 已提交
36

37 38
#include "xfs_sb.h"
#include "xfs_inum.h"
39
#include "xfs_log.h"
40 41
#include "xfs_ag.h"
#include "xfs_mount.h"
C
Christoph Hellwig 已提交
42
#include "xfs_trace.h"
43

44
static kmem_zone_t *xfs_buf_zone;
45

46
static struct workqueue_struct *xfslogd_workqueue;
L
Linus Torvalds 已提交
47

48 49 50 51
#ifdef XFS_BUF_LOCK_TRACKING
# define XB_SET_OWNER(bp)	((bp)->b_last_holder = current->pid)
# define XB_CLEAR_OWNER(bp)	((bp)->b_last_holder = -1)
# define XB_GET_OWNER(bp)	((bp)->b_last_holder)
L
Linus Torvalds 已提交
52
#else
53 54 55
# define XB_SET_OWNER(bp)	do { } while (0)
# define XB_CLEAR_OWNER(bp)	do { } while (0)
# define XB_GET_OWNER(bp)	do { } while (0)
L
Linus Torvalds 已提交
56 57
#endif

58 59 60
#define xb_to_gfp(flags) \
	((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \
	  ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN)
L
Linus Torvalds 已提交
61

62 63
#define xb_to_km(flags) \
	 (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
L
Linus Torvalds 已提交
64 65


66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
static inline int
xfs_buf_is_vmapped(
	struct xfs_buf	*bp)
{
	/*
	 * A buffer only carries a vmap mapping when it was requested mapped
	 * (XBF_MAPPED) *and* it spans more than one page; a single-page
	 * buffer is addressed directly and never goes through vm_map_ram().
	 */
	return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1;
}

static inline int
xfs_buf_vmap_len(
	struct xfs_buf	*bp)
{
	/* Length of the mapped range: whole pages minus the leading offset. */
	return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
}

L
Linus Torvalds 已提交
87
/*
 * xfs_buf_lru_add - put a buffer on the reclaim LRU.
 *
 * The LRU takes its own hold on the buffer so it can only be freed
 * after the shrinker has taken it back off the list.
 */
STATIC void
xfs_buf_lru_add(
	struct xfs_buf	*bp)
{
	struct xfs_buftarg *btp = bp->b_target;

	spin_lock(&btp->bt_lru_lock);
	if (list_empty(&bp->b_lru)) {
		atomic_inc(&bp->b_hold);
		list_add_tail(&bp->b_lru, &btp->bt_lru);
		btp->bt_lru_nr++;
	}
	spin_unlock(&btp->bt_lru_lock);
}

/*
 * xfs_buf_lru_del - remove a buffer from the LRU
 *
 * The unlocked check is safe here because it only occurs when there are not
 * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there
 * to optimise the shrinker removing the buffer from the LRU and calling
L
Lucas De Marchi 已提交
114
 * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
115
 * bt_lru_lock.
L
Linus Torvalds 已提交
116
 */
117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145
STATIC void
xfs_buf_lru_del(
	struct xfs_buf	*bp)
{
	struct xfs_buftarg *btp = bp->b_target;

	if (list_empty(&bp->b_lru))
		return;

	spin_lock(&btp->bt_lru_lock);
	if (!list_empty(&bp->b_lru)) {
		list_del_init(&bp->b_lru);
		btp->bt_lru_nr--;
	}
	spin_unlock(&btp->bt_lru_lock);
}

/*
 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
 * b_lru_ref count so that the buffer is freed immediately when the buffer
 * reference count falls to zero. If the buffer is already on the LRU, we need
 * to remove the reference that LRU holds on the buffer.
 *
 * This prevents build-up of stale buffers on the LRU.
 */
void
xfs_buf_stale(
	struct xfs_buf	*bp)
{
146 147
	ASSERT(xfs_buf_islocked(bp));

148
	bp->b_flags |= XBF_STALE;
149 150 151 152 153 154 155 156

	/*
	 * Clear the delwri status so that a delwri queue walker will not
	 * flush this buffer to disk now that it is stale. The delwri queue has
	 * a reference to the buffer, so this is safe to do.
	 */
	bp->b_flags &= ~_XBF_DELWRI_Q;

157 158 159 160 161 162 163 164 165 166 167 168 169 170
	atomic_set(&(bp)->b_lru_ref, 0);
	if (!list_empty(&bp->b_lru)) {
		struct xfs_buftarg *btp = bp->b_target;

		spin_lock(&btp->bt_lru_lock);
		if (!list_empty(&bp->b_lru)) {
			list_del_init(&bp->b_lru);
			btp->bt_lru_nr--;
			atomic_dec(&bp->b_hold);
		}
		spin_unlock(&btp->bt_lru_lock);
	}
	ASSERT(atomic_read(&bp->b_hold) >= 1);
}
L
Linus Torvalds 已提交
171

172 173 174
struct xfs_buf *
xfs_buf_alloc(
	struct xfs_buftarg	*target,
175 176
	xfs_daddr_t		blkno,
	size_t			numblks,
177
	xfs_buf_flags_t		flags)
L
Linus Torvalds 已提交
178
{
179 180
	struct xfs_buf		*bp;

181
	bp = kmem_zone_zalloc(xfs_buf_zone, xb_to_km(flags));
182 183 184
	if (unlikely(!bp))
		return NULL;

L
Linus Torvalds 已提交
185
	/*
186
	 * We don't want certain flags to appear in b_flags.
L
Linus Torvalds 已提交
187
	 */
188 189 190
	flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD);

	atomic_set(&bp->b_hold, 1);
191
	atomic_set(&bp->b_lru_ref, 1);
192
	init_completion(&bp->b_iowait);
193
	INIT_LIST_HEAD(&bp->b_lru);
194
	INIT_LIST_HEAD(&bp->b_list);
195
	RB_CLEAR_NODE(&bp->b_rbnode);
T
Thomas Gleixner 已提交
196
	sema_init(&bp->b_sema, 0); /* held, no waiters */
197 198
	XB_SET_OWNER(bp);
	bp->b_target = target;
199
	bp->b_file_offset = blkno << BBSHIFT;
L
Linus Torvalds 已提交
200 201 202 203 204
	/*
	 * Set buffer_length and count_desired to the same value initially.
	 * I/O routines should use count_desired, which will be the same in
	 * most cases but may be reset (e.g. XFS recovery).
	 */
205
	bp->b_buffer_length = bp->b_count_desired = numblks << BBSHIFT;
206
	bp->b_flags = flags;
207 208 209 210 211 212 213

	/*
	 * We do not set the block number here in the buffer because we have not
	 * finished initialising the buffer. We insert the buffer into the cache
	 * in this state, so this ensures that we are unable to do IO on a
	 * buffer that hasn't been fully initialised.
	 */
214 215 216 217 218
	bp->b_bn = XFS_BUF_DADDR_NULL;
	atomic_set(&bp->b_pin_count, 0);
	init_waitqueue_head(&bp->b_waiters);

	XFS_STATS_INC(xb_create);
C
Christoph Hellwig 已提交
219
	trace_xfs_buf_init(bp, _RET_IP_);
220 221

	return bp;
L
Linus Torvalds 已提交
222 223 224
}

/*
225 226
 *	Allocate a page array capable of holding a specified number
 *	of pages, and point the page buf at it.
L
Linus Torvalds 已提交
227 228
 */
STATIC int
229 230
_xfs_buf_get_pages(
	xfs_buf_t		*bp,
L
Linus Torvalds 已提交
231
	int			page_count,
232
	xfs_buf_flags_t		flags)
L
Linus Torvalds 已提交
233 234
{
	/* Make sure that we have a page list */
235 236 237 238
	if (bp->b_pages == NULL) {
		bp->b_page_count = page_count;
		if (page_count <= XB_PAGES) {
			bp->b_pages = bp->b_page_array;
L
Linus Torvalds 已提交
239
		} else {
240 241 242
			bp->b_pages = kmem_alloc(sizeof(struct page *) *
					page_count, xb_to_km(flags));
			if (bp->b_pages == NULL)
L
Linus Torvalds 已提交
243 244
				return -ENOMEM;
		}
245
		memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
L
Linus Torvalds 已提交
246 247 248 249 250
	}
	return 0;
}

/*
251
 *	Frees b_pages if it was allocated.
L
Linus Torvalds 已提交
252 253
 */
STATIC void
254
_xfs_buf_free_pages(
L
Linus Torvalds 已提交
255 256
	xfs_buf_t	*bp)
{
257
	if (bp->b_pages != bp->b_page_array) {
258
		kmem_free(bp->b_pages);
259
		bp->b_pages = NULL;
L
Linus Torvalds 已提交
260 261 262 263 264 265 266
	}
}

/*
 *	Releases the specified buffer.
 *
 * 	The modification state of any associated pages is left unchanged.
267
 * 	The buffer most not be on any hash - use xfs_buf_rele instead for
L
Linus Torvalds 已提交
268 269 270
 * 	hashed and refcounted buffers
 */
void
271
xfs_buf_free(
L
Linus Torvalds 已提交
272 273
	xfs_buf_t		*bp)
{
C
Christoph Hellwig 已提交
274
	trace_xfs_buf_free(bp, _RET_IP_);
L
Linus Torvalds 已提交
275

276 277
	ASSERT(list_empty(&bp->b_lru));

278
	if (bp->b_flags & _XBF_PAGES) {
L
Linus Torvalds 已提交
279 280
		uint		i;

281
		if (xfs_buf_is_vmapped(bp))
A
Alex Elder 已提交
282 283
			vm_unmap_ram(bp->b_addr - bp->b_offset,
					bp->b_page_count);
L
Linus Torvalds 已提交
284

285 286 287
		for (i = 0; i < bp->b_page_count; i++) {
			struct page	*page = bp->b_pages[i];

288
			__free_page(page);
289
		}
290 291
	} else if (bp->b_flags & _XBF_KMEM)
		kmem_free(bp->b_addr);
292
	_xfs_buf_free_pages(bp);
293
	kmem_zone_free(xfs_buf_zone, bp);
L
Linus Torvalds 已提交
294 295 296
}

/*
297
 * Allocates all the pages for buffer in question and builds it's page list.
L
Linus Torvalds 已提交
298 299
 */
STATIC int
300
xfs_buf_allocate_memory(
L
Linus Torvalds 已提交
301 302 303
	xfs_buf_t		*bp,
	uint			flags)
{
304
	size_t			size = bp->b_count_desired;
L
Linus Torvalds 已提交
305
	size_t			nbytes, offset;
306
	gfp_t			gfp_mask = xb_to_gfp(flags);
L
Linus Torvalds 已提交
307
	unsigned short		page_count, i;
308
	xfs_off_t		end;
L
Linus Torvalds 已提交
309 310
	int			error;

311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339
	/*
	 * for buffers that are contained within a single page, just allocate
	 * the memory from the heap - there's no need for the complexity of
	 * page arrays to keep allocation down to order 0.
	 */
	if (bp->b_buffer_length < PAGE_SIZE) {
		bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags));
		if (!bp->b_addr) {
			/* low memory - use alloc_page loop instead */
			goto use_alloc_page;
		}

		if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) &
								PAGE_MASK) !=
		    ((unsigned long)bp->b_addr & PAGE_MASK)) {
			/* b_addr spans two pages - use alloc_page instead */
			kmem_free(bp->b_addr);
			bp->b_addr = NULL;
			goto use_alloc_page;
		}
		bp->b_offset = offset_in_page(bp->b_addr);
		bp->b_pages = bp->b_page_array;
		bp->b_pages[0] = virt_to_page(bp->b_addr);
		bp->b_page_count = 1;
		bp->b_flags |= XBF_MAPPED | _XBF_KMEM;
		return 0;
	}

use_alloc_page:
340 341 342
	end = bp->b_file_offset + bp->b_buffer_length;
	page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
	error = _xfs_buf_get_pages(bp, page_count, flags);
L
Linus Torvalds 已提交
343 344 345
	if (unlikely(error))
		return error;

346
	offset = bp->b_offset;
347
	bp->b_flags |= _XBF_PAGES;
L
Linus Torvalds 已提交
348

349
	for (i = 0; i < bp->b_page_count; i++) {
L
Linus Torvalds 已提交
350 351
		struct page	*page;
		uint		retries = 0;
352 353
retry:
		page = alloc_page(gfp_mask);
L
Linus Torvalds 已提交
354
		if (unlikely(page == NULL)) {
355 356
			if (flags & XBF_READ_AHEAD) {
				bp->b_page_count = i;
357 358
				error = ENOMEM;
				goto out_free_pages;
L
Linus Torvalds 已提交
359 360 361 362 363 364 365 366 367
			}

			/*
			 * This could deadlock.
			 *
			 * But until all the XFS lowlevel code is revamped to
			 * handle buffer allocation failures we can't do much.
			 */
			if (!(++retries % 100))
368 369
				xfs_err(NULL,
		"possible memory allocation deadlock in %s (mode:0x%x)",
370
					__func__, gfp_mask);
L
Linus Torvalds 已提交
371

372
			XFS_STATS_INC(xb_page_retries);
373
			congestion_wait(BLK_RW_ASYNC, HZ/50);
L
Linus Torvalds 已提交
374 375 376
			goto retry;
		}

377
		XFS_STATS_INC(xb_page_found);
L
Linus Torvalds 已提交
378

379
		nbytes = min_t(size_t, size, PAGE_SIZE - offset);
L
Linus Torvalds 已提交
380
		size -= nbytes;
381
		bp->b_pages[i] = page;
L
Linus Torvalds 已提交
382 383
		offset = 0;
	}
384
	return 0;
L
Linus Torvalds 已提交
385

386 387 388
out_free_pages:
	for (i = 0; i < bp->b_page_count; i++)
		__free_page(bp->b_pages[i]);
L
Linus Torvalds 已提交
389 390 391 392
	return error;
}

/*
L
Lucas De Marchi 已提交
393
 *	Map buffer into kernel address-space if necessary.
L
Linus Torvalds 已提交
394 395
 */
STATIC int
396
_xfs_buf_map_pages(
L
Linus Torvalds 已提交
397 398 399
	xfs_buf_t		*bp,
	uint			flags)
{
400
	ASSERT(bp->b_flags & _XBF_PAGES);
401
	if (bp->b_page_count == 1) {
402
		/* A single page buffer is always mappable */
403 404 405
		bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
		bp->b_flags |= XBF_MAPPED;
	} else if (flags & XBF_MAPPED) {
406 407 408 409 410 411 412 413 414 415 416
		int retried = 0;

		do {
			bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
						-1, PAGE_KERNEL);
			if (bp->b_addr)
				break;
			vm_unmap_aliases();
		} while (retried++ <= 1);

		if (!bp->b_addr)
L
Linus Torvalds 已提交
417
			return -ENOMEM;
418 419
		bp->b_addr += bp->b_offset;
		bp->b_flags |= XBF_MAPPED;
L
Linus Torvalds 已提交
420 421 422 423 424 425 426 427 428 429
	}

	return 0;
}

/*
 *	Finding and Reading Buffers
 */

/*
430
 *	Look up, and creates if absent, a lockable buffer for
L
Linus Torvalds 已提交
431
 *	a given range of an inode.  The buffer is returned
432
 *	locked.	No I/O is implied by this call.
L
Linus Torvalds 已提交
433 434
 */
xfs_buf_t *
435
_xfs_buf_find(
436 437 438
	struct xfs_buftarg	*btp,
	xfs_daddr_t		blkno,
	size_t			numblks,
439 440
	xfs_buf_flags_t		flags,
	xfs_buf_t		*new_bp)
L
Linus Torvalds 已提交
441
{
442 443
	xfs_off_t		offset;
	size_t			numbytes;
444 445 446 447
	struct xfs_perag	*pag;
	struct rb_node		**rbp;
	struct rb_node		*parent;
	xfs_buf_t		*bp;
L
Linus Torvalds 已提交
448

449 450
	offset = BBTOB(blkno);
	numbytes = BBTOB(numblks);
L
Linus Torvalds 已提交
451 452

	/* Check for IOs smaller than the sector size / not sector aligned */
453 454
	ASSERT(!(numbytes < (1 << btp->bt_sshift)));
	ASSERT(!(offset & (xfs_off_t)btp->bt_smask));
L
Linus Torvalds 已提交
455

456 457
	/* get tree root */
	pag = xfs_perag_get(btp->bt_mount,
458
				xfs_daddr_to_agno(btp->bt_mount, blkno));
459 460 461 462 463 464 465 466 467 468

	/* walk tree */
	spin_lock(&pag->pag_buf_lock);
	rbp = &pag->pag_buf_tree.rb_node;
	parent = NULL;
	bp = NULL;
	while (*rbp) {
		parent = *rbp;
		bp = rb_entry(parent, struct xfs_buf, b_rbnode);

469
		if (offset < bp->b_file_offset)
470
			rbp = &(*rbp)->rb_left;
471
		else if (offset > bp->b_file_offset)
472 473 474 475 476 477 478 479 480 481
			rbp = &(*rbp)->rb_right;
		else {
			/*
			 * found a block offset match. If the range doesn't
			 * match, the only way this is allowed is if the buffer
			 * in the cache is stale and the transaction that made
			 * it stale has not yet committed. i.e. we are
			 * reallocating a busy extent. Skip this buffer and
			 * continue searching to the right for an exact match.
			 */
482
			if (bp->b_buffer_length != numbytes) {
483 484 485 486
				ASSERT(bp->b_flags & XBF_STALE);
				rbp = &(*rbp)->rb_right;
				continue;
			}
487
			atomic_inc(&bp->b_hold);
L
Linus Torvalds 已提交
488 489 490 491 492
			goto found;
		}
	}

	/* No match found */
493
	if (new_bp) {
494 495 496 497 498
		rb_link_node(&new_bp->b_rbnode, parent, rbp);
		rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
		/* the buffer keeps the perag reference until it is freed */
		new_bp->b_pag = pag;
		spin_unlock(&pag->pag_buf_lock);
L
Linus Torvalds 已提交
499
	} else {
500
		XFS_STATS_INC(xb_miss_locked);
501 502
		spin_unlock(&pag->pag_buf_lock);
		xfs_perag_put(pag);
L
Linus Torvalds 已提交
503
	}
504
	return new_bp;
L
Linus Torvalds 已提交
505 506

found:
507 508
	spin_unlock(&pag->pag_buf_lock);
	xfs_perag_put(pag);
L
Linus Torvalds 已提交
509

510 511
	if (!xfs_buf_trylock(bp)) {
		if (flags & XBF_TRYLOCK) {
512 513 514
			xfs_buf_rele(bp);
			XFS_STATS_INC(xb_busy_locked);
			return NULL;
L
Linus Torvalds 已提交
515
		}
516 517
		xfs_buf_lock(bp);
		XFS_STATS_INC(xb_get_locked_waited);
L
Linus Torvalds 已提交
518 519
	}

520 521 522 523 524
	/*
	 * if the buffer is stale, clear all the external state associated with
	 * it. We need to keep flags such as how we allocated the buffer memory
	 * intact here.
	 */
525 526
	if (bp->b_flags & XBF_STALE) {
		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
527
		bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES;
528
	}
C
Christoph Hellwig 已提交
529 530

	trace_xfs_buf_find(bp, flags, _RET_IP_);
531 532
	XFS_STATS_INC(xb_get_locked);
	return bp;
L
Linus Torvalds 已提交
533 534 535
}

/*
536 537 538
 * Assembles a buffer covering the specified range. The code is optimised for
 * cache hits, as metadata intensive workloads will see 3 orders of magnitude
 * more hits than misses.
L
Linus Torvalds 已提交
539
 */
540
struct xfs_buf *
541
xfs_buf_get(
542 543 544
	xfs_buftarg_t		*target,
	xfs_daddr_t		blkno,
	size_t			numblks,
545
	xfs_buf_flags_t		flags)
L
Linus Torvalds 已提交
546
{
547 548
	struct xfs_buf		*bp;
	struct xfs_buf		*new_bp;
549
	int			error = 0;
L
Linus Torvalds 已提交
550

551
	bp = _xfs_buf_find(target, blkno, numblks, flags, NULL);
552 553 554
	if (likely(bp))
		goto found;

555
	new_bp = xfs_buf_alloc(target, blkno, numblks, flags);
556
	if (unlikely(!new_bp))
L
Linus Torvalds 已提交
557 558
		return NULL;

559 560 561 562 563 564
	error = xfs_buf_allocate_memory(new_bp, flags);
	if (error) {
		kmem_zone_free(xfs_buf_zone, new_bp);
		return NULL;
	}

565
	bp = _xfs_buf_find(target, blkno, numblks, flags, new_bp);
566
	if (!bp) {
567
		xfs_buf_free(new_bp);
568 569 570
		return NULL;
	}

571 572
	if (bp != new_bp)
		xfs_buf_free(new_bp);
L
Linus Torvalds 已提交
573

574 575 576 577
	/*
	 * Now we have a workable buffer, fill in the block number so
	 * that we can do IO on it.
	 */
578
	bp->b_bn = blkno;
579 580 581
	bp->b_count_desired = bp->b_buffer_length;

found:
582 583
	if (!(bp->b_flags & XBF_MAPPED)) {
		error = _xfs_buf_map_pages(bp, flags);
L
Linus Torvalds 已提交
584
		if (unlikely(error)) {
585 586
			xfs_warn(target->bt_mount,
				"%s: failed to map pages\n", __func__);
L
Linus Torvalds 已提交
587 588 589 590
			goto no_buffer;
		}
	}

591
	XFS_STATS_INC(xb_get);
C
Christoph Hellwig 已提交
592
	trace_xfs_buf_get(bp, flags, _RET_IP_);
593
	return bp;
L
Linus Torvalds 已提交
594

595
no_buffer:
596 597 598
	if (flags & (XBF_LOCK | XBF_TRYLOCK))
		xfs_buf_unlock(bp);
	xfs_buf_rele(bp);
L
Linus Torvalds 已提交
599 600 601
	return NULL;
}

C
Christoph Hellwig 已提交
602 603 604 605 606
/*
 * Issue a read against a buffer that already has a valid disk address.
 * Waits for completion unless XBF_ASYNC was requested.
 */
STATIC int
_xfs_buf_read(
	xfs_buf_t		*bp,
	xfs_buf_flags_t		flags)
{
	ASSERT(!(flags & XBF_WRITE));
	ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);

	/* replace any stale direction/readahead state with the caller's */
	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);

	xfs_buf_iorequest(bp);
	if (flags & XBF_ASYNC)
		return 0;
	return xfs_buf_iowait(bp);
}

L
Linus Torvalds 已提交
619
xfs_buf_t *
620
xfs_buf_read(
L
Linus Torvalds 已提交
621
	xfs_buftarg_t		*target,
622 623
	xfs_daddr_t		blkno,
	size_t			numblks,
624
	xfs_buf_flags_t		flags)
L
Linus Torvalds 已提交
625
{
626 627 628 629
	xfs_buf_t		*bp;

	flags |= XBF_READ;

630
	bp = xfs_buf_get(target, blkno, numblks, flags);
631
	if (bp) {
C
Christoph Hellwig 已提交
632 633
		trace_xfs_buf_read(bp, flags, _RET_IP_);

634 635
		if (!XFS_BUF_ISDONE(bp)) {
			XFS_STATS_INC(xb_get_read);
C
Christoph Hellwig 已提交
636
			_xfs_buf_read(bp, flags);
637
		} else if (flags & XBF_ASYNC) {
L
Linus Torvalds 已提交
638 639 640 641 642 643 644
			/*
			 * Read ahead call which is already satisfied,
			 * drop the buffer
			 */
			goto no_buffer;
		} else {
			/* We do not want read in the flags */
645
			bp->b_flags &= ~XBF_READ;
L
Linus Torvalds 已提交
646 647 648
		}
	}

649
	return bp;
L
Linus Torvalds 已提交
650 651

 no_buffer:
652 653 654
	if (flags & (XBF_LOCK | XBF_TRYLOCK))
		xfs_buf_unlock(bp);
	xfs_buf_rele(bp);
L
Linus Torvalds 已提交
655 656 657 658
	return NULL;
}

/*
659 660
 *	If we are not low on memory then do the readahead in a deadlock
 *	safe manner.
L
Linus Torvalds 已提交
661 662
 */
void
663
xfs_buf_readahead(
L
Linus Torvalds 已提交
664
	xfs_buftarg_t		*target,
665 666
	xfs_daddr_t		blkno,
	size_t			numblks)
L
Linus Torvalds 已提交
667
{
668
	if (bdi_read_congested(target->bt_bdi))
L
Linus Torvalds 已提交
669 670
		return;

671
	xfs_buf_read(target, blkno, numblks,
C
Christoph Hellwig 已提交
672
		     XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK);
L
Linus Torvalds 已提交
673 674
}

675 676 677 678 679 680 681 682
/*
 * Read an uncached buffer from disk. Allocates and returns a locked
 * buffer containing the disk contents or nothing.
 */
struct xfs_buf *
xfs_buf_read_uncached(
	struct xfs_buftarg	*target,
	xfs_daddr_t		daddr,
683
	size_t			numblks,
684 685 686 687 688
	int			flags)
{
	xfs_buf_t		*bp;
	int			error;

689
	bp = xfs_buf_get_uncached(target, numblks, flags);
690 691 692 693 694 695 696
	if (!bp)
		return NULL;

	/* set up the buffer for a read IO */
	XFS_BUF_SET_ADDR(bp, daddr);
	XFS_BUF_READ(bp);

697
	xfsbdstrat(target->bt_mount, bp);
C
Christoph Hellwig 已提交
698
	error = xfs_buf_iowait(bp);
699
	if (error) {
700 701 702 703
		xfs_buf_relse(bp);
		return NULL;
	}
	return bp;
L
Linus Torvalds 已提交
704 705
}

706 707 708 709 710 711 712
/*
 * Return a buffer allocated as an empty buffer and associated to external
 * memory via xfs_buf_associate_memory() back to it's empty state.
 */
void
xfs_buf_set_empty(
	struct xfs_buf		*bp,
713
	size_t			numblks)
714 715 716 717 718 719 720 721
{
	if (bp->b_pages)
		_xfs_buf_free_pages(bp);

	bp->b_pages = NULL;
	bp->b_page_count = 0;
	bp->b_addr = NULL;
	bp->b_file_offset = 0;
722
	bp->b_buffer_length = bp->b_count_desired = numblks << BBSHIFT;
723 724 725 726
	bp->b_bn = XFS_BUF_DADDR_NULL;
	bp->b_flags &= ~XBF_MAPPED;
}

L
Linus Torvalds 已提交
727 728 729 730
/*
 * Translate a kernel virtual address - heap or vmalloc based - to its
 * backing struct page.
 */
static inline struct page *
mem_to_page(
	void			*addr)
{
	if (is_vmalloc_addr(addr))
		return vmalloc_to_page(addr);
	return virt_to_page(addr);
}

int
739 740
xfs_buf_associate_memory(
	xfs_buf_t		*bp,
L
Linus Torvalds 已提交
741 742 743 744 745
	void			*mem,
	size_t			len)
{
	int			rval;
	int			i = 0;
746 747 748
	unsigned long		pageaddr;
	unsigned long		offset;
	size_t			buflen;
L
Linus Torvalds 已提交
749 750
	int			page_count;

751
	pageaddr = (unsigned long)mem & PAGE_MASK;
752
	offset = (unsigned long)mem - pageaddr;
753 754
	buflen = PAGE_ALIGN(len + offset);
	page_count = buflen >> PAGE_SHIFT;
L
Linus Torvalds 已提交
755 756

	/* Free any previous set of page pointers */
757 758
	if (bp->b_pages)
		_xfs_buf_free_pages(bp);
L
Linus Torvalds 已提交
759

760 761
	bp->b_pages = NULL;
	bp->b_addr = mem;
L
Linus Torvalds 已提交
762

763
	rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK);
L
Linus Torvalds 已提交
764 765 766
	if (rval)
		return rval;

767
	bp->b_offset = offset;
768 769 770

	for (i = 0; i < bp->b_page_count; i++) {
		bp->b_pages[i] = mem_to_page((void *)pageaddr);
771
		pageaddr += PAGE_SIZE;
L
Linus Torvalds 已提交
772 773
	}

774 775
	bp->b_count_desired = len;
	bp->b_buffer_length = buflen;
776
	bp->b_flags |= XBF_MAPPED;
L
Linus Torvalds 已提交
777 778 779 780 781

	return 0;
}

xfs_buf_t *
782 783
xfs_buf_get_uncached(
	struct xfs_buftarg	*target,
784
	size_t			numblks,
785
	int			flags)
L
Linus Torvalds 已提交
786
{
787
	unsigned long		page_count;
788
	int			error, i;
L
Linus Torvalds 已提交
789 790
	xfs_buf_t		*bp;

791
	bp = xfs_buf_alloc(target, 0, numblks, 0);
L
Linus Torvalds 已提交
792 793 794
	if (unlikely(bp == NULL))
		goto fail;

795
	page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
796 797
	error = _xfs_buf_get_pages(bp, page_count, 0);
	if (error)
L
Linus Torvalds 已提交
798 799
		goto fail_free_buf;

800
	for (i = 0; i < page_count; i++) {
801
		bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
802 803
		if (!bp->b_pages[i])
			goto fail_free_mem;
L
Linus Torvalds 已提交
804
	}
805
	bp->b_flags |= _XBF_PAGES;
L
Linus Torvalds 已提交
806

807 808
	error = _xfs_buf_map_pages(bp, XBF_MAPPED);
	if (unlikely(error)) {
809 810
		xfs_warn(target->bt_mount,
			"%s: failed to map pages\n", __func__);
L
Linus Torvalds 已提交
811
		goto fail_free_mem;
812
	}
L
Linus Torvalds 已提交
813

814
	trace_xfs_buf_get_uncached(bp, _RET_IP_);
L
Linus Torvalds 已提交
815
	return bp;
816

L
Linus Torvalds 已提交
817
 fail_free_mem:
818 819
	while (--i >= 0)
		__free_page(bp->b_pages[i]);
820
	_xfs_buf_free_pages(bp);
L
Linus Torvalds 已提交
821
 fail_free_buf:
822
	kmem_zone_free(xfs_buf_zone, bp);
L
Linus Torvalds 已提交
823 824 825 826 827 828 829 830 831 832
 fail:
	return NULL;
}

/*
 *	Increment reference count on buffer, to hold the buffer concurrently
 *	with another thread which may release (free) the buffer asynchronously.
 *	Must hold the buffer already to call this function.
 */
void
833 834
xfs_buf_hold(
	xfs_buf_t		*bp)
L
Linus Torvalds 已提交
835
{
C
Christoph Hellwig 已提交
836
	trace_xfs_buf_hold(bp, _RET_IP_);
837
	atomic_inc(&bp->b_hold);
L
Linus Torvalds 已提交
838 839 840
}

/*
841 842
 *	Releases a hold on the specified buffer.  If the
 *	the hold count is 1, calls xfs_buf_free.
L
Linus Torvalds 已提交
843 844
 */
void
845 846
xfs_buf_rele(
	xfs_buf_t		*bp)
L
Linus Torvalds 已提交
847
{
848
	struct xfs_perag	*pag = bp->b_pag;
L
Linus Torvalds 已提交
849

C
Christoph Hellwig 已提交
850
	trace_xfs_buf_rele(bp, _RET_IP_);
L
Linus Torvalds 已提交
851

852
	if (!pag) {
853
		ASSERT(list_empty(&bp->b_lru));
854
		ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
855 856 857 858 859
		if (atomic_dec_and_test(&bp->b_hold))
			xfs_buf_free(bp);
		return;
	}

860
	ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
861

862
	ASSERT(atomic_read(&bp->b_hold) > 0);
863
	if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
864
		if (!(bp->b_flags & XBF_STALE) &&
865 866 867
			   atomic_read(&bp->b_lru_ref)) {
			xfs_buf_lru_add(bp);
			spin_unlock(&pag->pag_buf_lock);
L
Linus Torvalds 已提交
868
		} else {
869
			xfs_buf_lru_del(bp);
870
			ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
871 872 873
			rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
			spin_unlock(&pag->pag_buf_lock);
			xfs_perag_put(pag);
874
			xfs_buf_free(bp);
L
Linus Torvalds 已提交
875 876 877 878 879 880
		}
	}
}


/*
881
 *	Lock a buffer object, if it is not already locked.
882 883 884 885 886 887 888 889
 *
 *	If we come across a stale, pinned, locked buffer, we know that we are
 *	being asked to lock a buffer that has been reallocated. Because it is
 *	pinned, we know that the log has not been pushed to disk and hence it
 *	will still be locked.  Rather than continuing to have trylock attempts
 *	fail until someone else pushes the log, push it ourselves before
 *	returning.  This means that the xfsaild will not get stuck trying
 *	to push on stale inode buffers.
L
Linus Torvalds 已提交
890 891
 */
int
892 893
xfs_buf_trylock(
	struct xfs_buf		*bp)
L
Linus Torvalds 已提交
894 895 896
{
	int			locked;

897
	locked = down_trylock(&bp->b_sema) == 0;
C
Christoph Hellwig 已提交
898
	if (locked)
899
		XB_SET_OWNER(bp);
900 901
	else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
		xfs_log_force(bp->b_target->bt_mount, 0);
C
Christoph Hellwig 已提交
902

903 904
	trace_xfs_buf_trylock(bp, _RET_IP_);
	return locked;
L
Linus Torvalds 已提交
905 906 907
}

/*
908
 *	Lock a buffer object.
909 910 911 912 913 914
 *
 *	If we come across a stale, pinned, locked buffer, we know that we
 *	are being asked to lock a buffer that has been reallocated. Because
 *	it is pinned, we know that the log has not been pushed to disk and
 *	hence it will still be locked. Rather than sleeping until someone
 *	else pushes the log, push it ourselves before trying to get the lock.
L
Linus Torvalds 已提交
915
 */
916 917
void
xfs_buf_lock(
918
	struct xfs_buf		*bp)
L
Linus Torvalds 已提交
919
{
C
Christoph Hellwig 已提交
920 921
	trace_xfs_buf_lock(bp, _RET_IP_);

922
	if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
923
		xfs_log_force(bp->b_target->bt_mount, 0);
924 925
	down(&bp->b_sema);
	XB_SET_OWNER(bp);
C
Christoph Hellwig 已提交
926 927

	trace_xfs_buf_lock_done(bp, _RET_IP_);
L
Linus Torvalds 已提交
928 929 930
}

void
931
xfs_buf_unlock(
932
	struct xfs_buf		*bp)
L
Linus Torvalds 已提交
933
{
934 935
	XB_CLEAR_OWNER(bp);
	up(&bp->b_sema);
C
Christoph Hellwig 已提交
936 937

	trace_xfs_buf_unlock(bp, _RET_IP_);
L
Linus Torvalds 已提交
938 939
}

940 941 942
/*
 * Sleep until the buffer's pin count drops to zero, using an
 * open-coded wait loop on b_waiters.
 */
STATIC void
xfs_buf_wait_unpin(
	xfs_buf_t		*bp)
{
	DECLARE_WAITQUEUE	(wait, current);

	if (atomic_read(&bp->b_pin_count) == 0)
		return;

	add_wait_queue(&bp->b_waiters, &wait);
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (atomic_read(&bp->b_pin_count) == 0)
			break;
		io_schedule();
	}
	remove_wait_queue(&bp->b_waiters, &wait);
	set_current_state(TASK_RUNNING);
}

/*
 *	Buffer Utility Routines
 */

STATIC void
965
xfs_buf_iodone_work(
D
David Howells 已提交
966
	struct work_struct	*work)
L
Linus Torvalds 已提交
967
{
D
David Howells 已提交
968 969
	xfs_buf_t		*bp =
		container_of(work, xfs_buf_t, b_iodone_work);
L
Linus Torvalds 已提交
970

971
	if (bp->b_iodone)
972 973
		(*(bp->b_iodone))(bp);
	else if (bp->b_flags & XBF_ASYNC)
L
Linus Torvalds 已提交
974 975 976 977
		xfs_buf_relse(bp);
}

void
978 979
xfs_buf_ioend(
	xfs_buf_t		*bp,
L
Linus Torvalds 已提交
980 981
	int			schedule)
{
C
Christoph Hellwig 已提交
982 983
	trace_xfs_buf_iodone(bp, _RET_IP_);

984
	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
985 986
	if (bp->b_error == 0)
		bp->b_flags |= XBF_DONE;
L
Linus Torvalds 已提交
987

988
	if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
L
Linus Torvalds 已提交
989
		if (schedule) {
D
David Howells 已提交
990
			INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
991
			queue_work(xfslogd_workqueue, &bp->b_iodone_work);
L
Linus Torvalds 已提交
992
		} else {
D
David Howells 已提交
993
			xfs_buf_iodone_work(&bp->b_iodone_work);
L
Linus Torvalds 已提交
994 995
		}
	} else {
996
		complete(&bp->b_iowait);
L
Linus Torvalds 已提交
997 998 999 1000
	}
}

void
1001 1002 1003
xfs_buf_ioerror(
	xfs_buf_t		*bp,
	int			error)
L
Linus Torvalds 已提交
1004 1005
{
	ASSERT(error >= 0 && error <= 0xffff);
1006
	bp->b_error = (unsigned short)error;
C
Christoph Hellwig 已提交
1007
	trace_xfs_buf_ioerror(bp, error, _RET_IP_);
L
Linus Torvalds 已提交
1008 1009
}

1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020
/*
 * Log a metadata I/O error against the buffer's mount, tagged with the
 * name of the function that detected it.
 */
void
xfs_buf_ioerror_alert(
	struct xfs_buf		*bp,
	const char		*func)
{
	xfs_alert(bp->b_target->bt_mount,
"metadata I/O error: block 0x%llx (\"%s\") error %d buf count %zd",
		(__uint64_t)XFS_BUF_ADDR(bp), func,
		bp->b_error, XFS_BUF_COUNT(bp));
}

L
Linus Torvalds 已提交
1021
int
C
Christoph Hellwig 已提交
1022
xfs_bwrite(
C
Christoph Hellwig 已提交
1023
	struct xfs_buf		*bp)
L
Linus Torvalds 已提交
1024
{
1025
	int			error;
L
Linus Torvalds 已提交
1026

1027 1028
	ASSERT(xfs_buf_islocked(bp));

C
Christoph Hellwig 已提交
1029
	bp->b_flags |= XBF_WRITE;
1030
	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q);
L
Linus Torvalds 已提交
1031

1032
	xfs_bdstrat_cb(bp);
L
Linus Torvalds 已提交
1033

1034
	error = xfs_buf_iowait(bp);
1035 1036 1037 1038
	if (error) {
		xfs_force_shutdown(bp->b_target->bt_mount,
				   SHUTDOWN_META_IO_ERROR);
	}
C
Christoph Hellwig 已提交
1039
	return error;
C
Christoph Hellwig 已提交
1040
}
L
Linus Torvalds 已提交
1041

1042 1043
/*
 * Abort a buffer I/O: attach EIO, clear the read/done state, mark the
 * buffer stale and call xfs_buf_ioend() so the normal iodone callbacks
 * still run.
 */
STATIC int
xfs_bioerror(
	xfs_buf_t *bp)
{
#ifdef XFSERRORDEBUG
	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
#endif

	/*
	 * No need to wait until the buffer is unpinned, we aren't flushing it.
	 */
	xfs_buf_ioerror(bp, EIO);

	/*
	 * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
	 */
	XFS_BUF_UNREAD(bp);
	XFS_BUF_UNDONE(bp);
	xfs_buf_stale(bp);

	xfs_buf_ioend(bp, 0);

	return EIO;
}

/*
 * Same as xfs_bioerror, except that we are releasing the buffer
 * here ourselves, and avoiding the xfs_buf_ioend call.
 * This is meant for userdata errors; metadata bufs come with
 * iodone functions attached, so that we can track down errors.
 */
STATIC int
xfs_bioerror_relse(
	struct xfs_buf	*bp)
{
	int64_t		fl = bp->b_flags;	/* sample flags before we change them */
	/*
	 * No need to wait until the buffer is unpinned.
	 * We aren't flushing it.
	 *
	 * chunkhold expects B_DONE to be set, whether
	 * we actually finish the I/O or not. We don't want to
	 * change that interface.
	 */
	XFS_BUF_UNREAD(bp);
	XFS_BUF_DONE(bp);
	xfs_buf_stale(bp);
	bp->b_iodone = NULL;
	if (!(fl & XBF_ASYNC)) {
		/*
		 * Mark b_error and B_ERROR _both_.
		 * Lots of chunkcache code assumes that.
		 * There's no reason to mark error for
		 * ASYNC buffers.
		 */
		xfs_buf_ioerror(bp, EIO);
		complete(&bp->b_iowait);
	} else {
		xfs_buf_relse(bp);
	}

	return EIO;
}


/*
 * All xfs metadata buffers except log state machine buffers
 * get this attached as their b_bdstrat callback function.
 * This is so that we can catch a buffer
 * after prematurely unpinning it to forcibly shutdown the filesystem.
 *
 * Returns 0 once the I/O is submitted, or EIO when the filesystem is
 * already shut down and the buffer is errored out instead.
 */
int
xfs_bdstrat_cb(
	struct xfs_buf	*bp)
{
	if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
		trace_xfs_bdstrat_shut(bp, _RET_IP_);
		/*
		 * Metadata write that didn't get logged but
		 * written delayed anyway. These aren't associated
		 * with a transaction, and can be ignored.
		 */
		if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
			return xfs_bioerror_relse(bp);
		else
			return xfs_bioerror(bp);
	}

	xfs_buf_iorequest(bp);
	return 0;
}

/*
 * Wrapper around bdstrat so that we can stop data from going to disk in case
 * we are shutting down the filesystem.  Typically user data goes thru this
 * path; one of the exceptions is the superblock.
 */
void
xfsbdstrat(
	struct xfs_mount	*mp,
	struct xfs_buf		*bp)
{
	if (!XFS_FORCED_SHUTDOWN(mp)) {
		xfs_buf_iorequest(bp);
		return;
	}

	/* filesystem is going down: error the buffer and release it */
	trace_xfs_bdstrat_shut(bp, _RET_IP_);
	xfs_bioerror_relse(bp);
}

1158
STATIC void
1159 1160
_xfs_buf_ioend(
	xfs_buf_t		*bp,
L
Linus Torvalds 已提交
1161 1162
	int			schedule)
{
1163
	if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
1164
		xfs_buf_ioend(bp, schedule);
L
Linus Torvalds 已提交
1165 1166
}

A
Al Viro 已提交
1167
/*
 * Per-bio completion handler.
 *
 * Records any error (bio errors are negative; xfs_buf_ioerror stores a
 * positive value), invalidates the vmap alias after a successful read
 * into a vmapped buffer, then drops this bio's reference on
 * b_io_remaining so the buffer can complete once all chunks are done.
 */
STATIC void
xfs_buf_bio_end_io(
	struct bio		*bio,
	int			error)
{
	xfs_buf_t		*bp = (xfs_buf_t *)bio->bi_private;

	xfs_buf_ioerror(bp, -error);

	if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));

	/* schedule == 1: completion work may be punted to a workqueue */
	_xfs_buf_ioend(bp, 1);
	bio_put(bio);
}

/*
 * Map the buffer's pages into one or more bios and submit them.
 *
 * The request flags are derived from the buffer flags (sync/FUA/flush
 * for writes, readahead vs plain read) and REQ_META is always set since
 * the buffer cache only holds metadata.  Buffers larger than one bio
 * can carry are split across multiple bios; each submitted bio takes a
 * reference on b_io_remaining that is dropped in xfs_buf_bio_end_io().
 */
STATIC void
_xfs_buf_ioapply(
	xfs_buf_t		*bp)
{
	int			rw, map_i, total_nr_pages, nr_pages;
	struct bio		*bio;
	int			offset = bp->b_offset;
	int			size = bp->b_count_desired;
	sector_t		sector = bp->b_bn;

	total_nr_pages = bp->b_page_count;
	map_i = 0;

	if (bp->b_flags & XBF_WRITE) {
		if (bp->b_flags & XBF_SYNCIO)
			rw = WRITE_SYNC;
		else
			rw = WRITE;
		if (bp->b_flags & XBF_FUA)
			rw |= REQ_FUA;
		if (bp->b_flags & XBF_FLUSH)
			rw |= REQ_FLUSH;
	} else if (bp->b_flags & XBF_READ_AHEAD) {
		rw = READA;
	} else {
		rw = READ;
	}

	/* we only use the buffer cache for meta-data */
	rw |= REQ_META;

next_chunk:
	/* one I/O reference per bio, dropped by the bio completion handler */
	atomic_inc(&bp->b_io_remaining);
	nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
	if (nr_pages > total_nr_pages)
		nr_pages = total_nr_pages;

	bio = bio_alloc(GFP_NOIO, nr_pages);
	bio->bi_bdev = bp->b_target->bt_bdev;
	bio->bi_sector = sector;
	bio->bi_end_io = xfs_buf_bio_end_io;
	bio->bi_private = bp;


	for (; size && nr_pages; nr_pages--, map_i++) {
		int	rbytes, nbytes = PAGE_SIZE - offset;

		if (nbytes > size)
			nbytes = size;

		/* bio_add_page may take less than asked once the bio is full */
		rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset);
		if (rbytes < nbytes)
			break;

		offset = 0;
		sector += nbytes >> BBSHIFT;
		size -= nbytes;
		total_nr_pages--;
	}

	if (likely(bio->bi_size)) {
		if (xfs_buf_is_vmapped(bp)) {
			flush_kernel_vmap_range(bp->b_addr,
						xfs_buf_vmap_len(bp));
		}
		submit_bio(rw, bio);
		if (size)
			goto next_chunk;
	} else {
		/* couldn't add any pages at all: fail the buffer */
		xfs_buf_ioerror(bp, EIO);
		bio_put(bio);
	}
}

1257
/*
 * Kick off I/O on a buffer.
 *
 * Writes wait for the buffer to be unpinned first.  A hold reference
 * protects the buffer across submission, and b_io_remaining is primed
 * to 1 so completion cannot fire before all chunks have been issued.
 */
void
xfs_buf_iorequest(
	xfs_buf_t		*bp)
{
	trace_xfs_buf_iorequest(bp, _RET_IP_);

	/* delwri-queued buffers are submitted via the delwri paths instead */
	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));

	if (bp->b_flags & XBF_WRITE)
		xfs_buf_wait_unpin(bp);
	xfs_buf_hold(bp);

	/* Set the count to 1 initially, this will stop an I/O
	 * completion callout which happens before we have started
	 * all the I/O from calling xfs_buf_ioend too early.
	 */
	atomic_set(&bp->b_io_remaining, 1);
	_xfs_buf_ioapply(bp);
	/* drop the initial reference; the buffer may complete right here */
	_xfs_buf_ioend(bp, 0);

	xfs_buf_rele(bp);
}

/*
1281 1282 1283
 * Waits for I/O to complete on the buffer supplied.  It returns immediately if
 * no I/O is pending or there is already a pending error on the buffer.  It
 * returns the I/O error code, if any, or 0 if there was no error.
L
Linus Torvalds 已提交
1284 1285
 */
int
1286 1287
xfs_buf_iowait(
	xfs_buf_t		*bp)
L
Linus Torvalds 已提交
1288
{
C
Christoph Hellwig 已提交
1289 1290
	trace_xfs_buf_iowait(bp, _RET_IP_);

1291 1292
	if (!bp->b_error)
		wait_for_completion(&bp->b_iowait);
C
Christoph Hellwig 已提交
1293 1294

	trace_xfs_buf_iowait_done(bp, _RET_IP_);
1295
	return bp->b_error;
L
Linus Torvalds 已提交
1296 1297
}

1298 1299 1300
xfs_caddr_t
xfs_buf_offset(
	xfs_buf_t		*bp,
L
Linus Torvalds 已提交
1301 1302 1303 1304
	size_t			offset)
{
	struct page		*page;

1305
	if (bp->b_flags & XBF_MAPPED)
1306
		return bp->b_addr + offset;
L
Linus Torvalds 已提交
1307

1308
	offset += bp->b_offset;
1309 1310
	page = bp->b_pages[offset >> PAGE_SHIFT];
	return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
L
Linus Torvalds 已提交
1311 1312 1313 1314 1315 1316
}

/*
 *	Move data into or out of a buffer.
 */
void
1317 1318
xfs_buf_iomove(
	xfs_buf_t		*bp,	/* buffer to process		*/
L
Linus Torvalds 已提交
1319 1320
	size_t			boff,	/* starting buffer offset	*/
	size_t			bsize,	/* length to copy		*/
1321
	void			*data,	/* data address			*/
1322
	xfs_buf_rw_t		mode)	/* read/write/zero flag		*/
L
Linus Torvalds 已提交
1323 1324 1325 1326 1327 1328
{
	size_t			bend, cpoff, csize;
	struct page		*page;

	bend = boff + bsize;
	while (boff < bend) {
1329 1330
		page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
		cpoff = xfs_buf_poff(boff + bp->b_offset);
L
Linus Torvalds 已提交
1331
		csize = min_t(size_t,
1332
			      PAGE_SIZE-cpoff, bp->b_count_desired-boff);
L
Linus Torvalds 已提交
1333

1334
		ASSERT(((csize + cpoff) <= PAGE_SIZE));
L
Linus Torvalds 已提交
1335 1336

		switch (mode) {
1337
		case XBRW_ZERO:
L
Linus Torvalds 已提交
1338 1339
			memset(page_address(page) + cpoff, 0, csize);
			break;
1340
		case XBRW_READ:
L
Linus Torvalds 已提交
1341 1342
			memcpy(data, page_address(page) + cpoff, csize);
			break;
1343
		case XBRW_WRITE:
L
Linus Torvalds 已提交
1344 1345 1346 1347 1348 1349 1350 1351 1352
			memcpy(page_address(page) + cpoff, data, csize);
		}

		boff += csize;
		data += csize;
	}
}

/*
1353
 *	Handling of buffer targets (buftargs).
L
Linus Torvalds 已提交
1354 1355 1356
 */

/*
 * Wait for any bufs with callbacks that have been submitted but have not yet
 * returned. These buffers will have an elevated hold count, so wait on those
 * while freeing all the buffers only held by the LRU.
 */
void
xfs_wait_buftarg(
	struct xfs_buftarg	*btp)
{
	struct xfs_buf		*bp;

restart:
	spin_lock(&btp->bt_lru_lock);
	while (!list_empty(&btp->bt_lru)) {
		bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
		if (atomic_read(&bp->b_hold) > 1) {
			/* someone else still holds a reference: back off,
			 * sleep briefly, and rescan from the start */
			spin_unlock(&btp->bt_lru_lock);
			delay(100);
			goto restart;
		}
		/*
		 * clear the LRU reference count so the buffer doesn't get
		 * ignored in xfs_buf_rele().
		 */
		atomic_set(&bp->b_lru_ref, 0);
		spin_unlock(&btp->bt_lru_lock);
		/* drop the LRU's reference outside the lock, then retake it */
		xfs_buf_rele(bp);
		spin_lock(&btp->bt_lru_lock);
	}
	spin_unlock(&btp->bt_lru_lock);
}

1388 1389 1390
/*
 * Memory-shrinker callback for a buffer target.
 *
 * With nr_to_scan == 0 this is only a count query.  Otherwise walk the
 * LRU: buffers that still have b_lru_ref left get another trip through
 * the list; expired buffers are moved to a private dispose list and
 * released outside the lru lock.  Returns the number of buffers left
 * on the LRU.
 */
int
xfs_buftarg_shrink(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_buftarg	*btp = container_of(shrink,
					struct xfs_buftarg, bt_shrinker);
	struct xfs_buf		*bp;
	int nr_to_scan = sc->nr_to_scan;
	LIST_HEAD(dispose);

	if (!nr_to_scan)
		return btp->bt_lru_nr;

	spin_lock(&btp->bt_lru_lock);
	while (!list_empty(&btp->bt_lru)) {
		if (nr_to_scan-- <= 0)
			break;

		bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);

		/*
		 * Decrement the b_lru_ref count unless the value is already
		 * zero. If the value is already zero, we need to reclaim the
		 * buffer, otherwise it gets another trip through the LRU.
		 */
		if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
			list_move_tail(&bp->b_lru, &btp->bt_lru);
			continue;
		}

		/*
		 * remove the buffer from the LRU now to avoid needing another
		 * lock round trip inside xfs_buf_rele().
		 */
		list_move(&bp->b_lru, &dispose);
		btp->bt_lru_nr--;
	}
	spin_unlock(&btp->bt_lru_lock);

	while (!list_empty(&dispose)) {
		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
		list_del_init(&bp->b_lru);
		xfs_buf_rele(bp);
	}

	return btp->bt_lru_nr;
}

L
Linus Torvalds 已提交
1437 1438
/*
 * Tear down a buffer target: unhook the memory shrinker, flush the
 * device's write cache if barriers are in use, then free the target.
 */
void
xfs_free_buftarg(
	struct xfs_mount	*mp,
	struct xfs_buftarg	*btp)
{
	unregister_shrinker(&btp->bt_shrinker);

	if (mp->m_flags & XFS_MOUNT_BARRIER)
		xfs_blkdev_issue_flush(btp);

	kmem_free(btp);
}

/*
 * Apply block and sector geometry to a buffer target and push the
 * sector size down to the underlying block device.
 *
 * Returns 0 on success or EINVAL if the device rejects the sector size.
 * NOTE(review): the 'verbose' argument is currently unused here —
 * confirm against callers before removing it from the interface.
 */
STATIC int
xfs_setsize_buftarg_flags(
	xfs_buftarg_t		*btp,
	unsigned int		blocksize,
	unsigned int		sectorsize,
	int			verbose)
{
	btp->bt_bsize = blocksize;
	btp->bt_sshift = ffs(sectorsize) - 1;	/* sector size is a power of 2 */
	btp->bt_smask = sectorsize - 1;

	if (set_blocksize(btp->bt_bdev, sectorsize)) {
		char name[BDEVNAME_SIZE];

		bdevname(btp->bt_bdev, name);

		/*
		 * The xfs printk wrappers append a newline themselves, so
		 * the old trailing "\n" in this format produced a blank
		 * line in the log; it has been dropped.
		 */
		xfs_warn(btp->bt_mount,
			"Cannot set_blocksize to %u on device %s",
			sectorsize, name);
		return EINVAL;
	}

	return 0;
}

/*
 *	When allocating the initial buffer target we have not yet
 *	read in the superblock, so don't know what sized sectors
 *	are being used at this early stage.  Play safe.
 */
STATIC int
xfs_setsize_buftarg_early(
	xfs_buftarg_t		*btp,
	struct block_device	*bdev)
{
	/* conservative defaults: PAGE_SIZE blocks, device's logical sectors */
	return xfs_setsize_buftarg_flags(btp,
			PAGE_SIZE, bdev_logical_block_size(bdev), 0);
}

/*
 * Re-size a buffer target once the real geometry is known, with
 * verbose error reporting enabled.
 */
int
xfs_setsize_buftarg(
	xfs_buftarg_t		*btp,
	unsigned int		blocksize,
	unsigned int		sectorsize)
{
	const int		verbose = 1;

	return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, verbose);
}

/*
 * Allocate and initialise a buffer target for a block device.
 *
 * Sets up the backing device info, the buffer LRU and the memory
 * shrinker, and sizes the target conservatively until the superblock
 * has been read.  Returns NULL on failure.
 *
 * NOTE(review): 'external' and 'fsname' are not used in this body —
 * confirm against callers/history before relying on them.
 */
xfs_buftarg_t *
xfs_alloc_buftarg(
	struct xfs_mount	*mp,
	struct block_device	*bdev,
	int			external,
	const char		*fsname)
{
	xfs_buftarg_t		*btp;

	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);

	btp->bt_mount = mp;
	btp->bt_dev =  bdev->bd_dev;
	btp->bt_bdev = bdev;
	btp->bt_bdi = blk_get_backing_dev_info(bdev);
	if (!btp->bt_bdi)
		goto error;

	INIT_LIST_HEAD(&btp->bt_lru);
	spin_lock_init(&btp->bt_lru_lock);
	if (xfs_setsize_buftarg_early(btp, bdev))
		goto error;
	btp->bt_shrinker.shrink = xfs_buftarg_shrink;
	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
	register_shrinker(&btp->bt_shrinker);
	return btp;

error:
	kmem_free(btp);
	return NULL;
}

/*
 * Add a buffer to the delayed write list.
 *
 * This queues a buffer for writeout if it hasn't already been.  Note that
 * neither this routine nor the buffer list submission functions perform
 * any internal synchronization.  It is expected that the lists are thread-local
 * to the callers.
 *
 * Returns true if we queued up the buffer, or false if it already had
 * been on the buffer list.
 */
bool
xfs_buf_delwri_queue(
	struct xfs_buf		*bp,
	struct list_head	*list)
{
	ASSERT(xfs_buf_islocked(bp));
	ASSERT(!(bp->b_flags & XBF_READ));

	/*
	 * If the buffer is already marked delwri it already is queued up
	 * by someone else for immediate writeout.  Just ignore it in that
	 * case.
	 */
	if (bp->b_flags & _XBF_DELWRI_Q) {
		trace_xfs_buf_delwri_queued(bp, _RET_IP_);
		return false;
	}

	trace_xfs_buf_delwri_queue(bp, _RET_IP_);

	/*
	 * If a buffer gets written out synchronously or marked stale while it
	 * is on a delwri list we lazily remove it. To do this, the other party
	 * clears the  _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
	 * It remains referenced and on the list.  In a rare corner case it
	 * might get readded to a delwri list after the synchronous writeout, in
	 * which case we need just need to re-add the flag here.
	 */
	bp->b_flags |= _XBF_DELWRI_Q;
	if (list_empty(&bp->b_list)) {
		/* the list holds its own reference on the buffer */
		atomic_inc(&bp->b_hold);
		list_add_tail(&bp->b_list, list);
	}

	return true;
}

1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600
/*
 * list_sort() comparison callback: order buffers by starting disk
 * block number so delwri writeback is issued in ascending order.
 *
 * The previous implementation computed a 64-bit difference and tested
 * its sign, which is needlessly convoluted (and subtraction can
 * overflow for extreme block numbers); compare the values directly.
 */
static int
xfs_buf_cmp(
	void		*priv,
	struct list_head *a,
	struct list_head *b)
{
	struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
	struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);

	if (ap->b_bn < bp->b_bn)
		return -1;
	if (ap->b_bn > bp->b_bn)
		return 1;
	return 0;
}

1601 1602 1603 1604 1605
/*
 * Core delwri list submission.
 *
 * Moves queued buffers from @buffer_list to @io_list and submits them for
 * write.  With !wait, pinned or lock-contended buffers are skipped (pinned
 * ones are counted) and async writes are detached from the list; with wait,
 * every buffer is locked (blocking), written synchronously and left on
 * @io_list for the caller to wait on.  Returns the number of pinned
 * buffers skipped.
 */
static int
__xfs_buf_delwri_submit(
	struct list_head	*buffer_list,
	struct list_head	*io_list,
	bool			wait)
{
	struct blk_plug		plug;
	struct xfs_buf		*bp, *n;
	int			pinned = 0;

	list_for_each_entry_safe(bp, n, buffer_list, b_list) {
		if (!wait) {
			if (xfs_buf_ispinned(bp)) {
				pinned++;
				continue;
			}
			if (!xfs_buf_trylock(bp))
				continue;
		} else {
			xfs_buf_lock(bp);
		}

		/*
		 * Someone else might have written the buffer synchronously or
		 * marked it stale in the meantime.  In that case only the
		 * _XBF_DELWRI_Q flag got cleared, and we have to drop the
		 * reference and remove it from the list here.
		 */
		if (!(bp->b_flags & _XBF_DELWRI_Q)) {
			list_del_init(&bp->b_list);
			xfs_buf_relse(bp);
			continue;
		}

		list_move_tail(&bp->b_list, io_list);
		trace_xfs_buf_delwri_split(bp, _RET_IP_);
	}

	/* issue I/O in ascending block order to minimise seeks */
	list_sort(NULL, io_list, xfs_buf_cmp);

	blk_start_plug(&plug);
	list_for_each_entry_safe(bp, n, io_list, b_list) {
		bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC);
		bp->b_flags |= XBF_WRITE;

		if (!wait) {
			/* async writes complete on their own: drop them now */
			bp->b_flags |= XBF_ASYNC;
			list_del_init(&bp->b_list);
		}
		xfs_bdstrat_cb(bp);
	}
	blk_finish_plug(&plug);

	return pinned;
}
/*
1658 1659 1660 1661 1662 1663 1664
 * Write out a buffer list asynchronously.
 *
 * This will take the @buffer_list, write all non-locked and non-pinned buffers
 * out and not wait for I/O completion on any of the buffers.  This interface
 * is only safely useable for callers that can track I/O completion by higher
 * level means, e.g. AIL pushing as the @buffer_list is consumed in this
 * function.
L
Linus Torvalds 已提交
1665 1666
 */
int
1667 1668
xfs_buf_delwri_submit_nowait(
	struct list_head	*buffer_list)
L
Linus Torvalds 已提交
1669
{
1670 1671 1672
	LIST_HEAD		(io_list);
	return __xfs_buf_delwri_submit(buffer_list, &io_list, false);
}
L
Linus Torvalds 已提交
1673

1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688
/*
 * Write out a buffer list synchronously.
 *
 * This will take the @buffer_list, write all buffers out and wait for I/O
 * completion on all of the buffers. @buffer_list is consumed by the function,
 * so callers must have some other way of tracking buffers if they require such
 * functionality.
 *
 * Returns the first I/O error encountered, or 0 if all writes succeeded.
 */
int
xfs_buf_delwri_submit(
	struct list_head	*buffer_list)
{
	LIST_HEAD		(io_list);
	int			error = 0, error2;
	struct xfs_buf		*bp;

	__xfs_buf_delwri_submit(buffer_list, &io_list, true);

	/* Wait for IO to complete. */
	while (!list_empty(&io_list)) {
		bp = list_first_entry(&io_list, struct xfs_buf, b_list);

		list_del_init(&bp->b_list);
		error2 = xfs_buf_iowait(bp);
		xfs_buf_relse(bp);
		if (!error)
			error = error2;		/* remember the first error */
	}

	return error;
}

1706
/*
 * Initialise the buffer cache subsystem: the xfs_buf slab zone and the
 * high-priority log-completion workqueue.  Returns 0 or -ENOMEM.
 */
int __init
xfs_buf_init(void)
{
	xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
						KM_ZONE_HWALIGN, NULL);
	if (!xfs_buf_zone)
		goto out;

	/* single-threaded, high-priority, usable during memory reclaim */
	xfslogd_workqueue = alloc_workqueue("xfslogd",
					WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
	if (!xfslogd_workqueue)
		goto out_free_buf_zone;

	return 0;

 out_free_buf_zone:
	kmem_zone_destroy(xfs_buf_zone);
 out:
	return -ENOMEM;
}

/*
 * Tear down the buffer cache subsystem, in reverse order of
 * xfs_buf_init().
 */
void
xfs_buf_terminate(void)
{
	destroy_workqueue(xfslogd_workqueue);
	kmem_zone_destroy(xfs_buf_zone);
}