/*
 * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
 * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU General Public License version 2.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
#include <linux/gfs2_ondisk.h>
#include <linux/crc32.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/bio.h>

#include "gfs2.h"
#include "incore.h"
#include "bmap.h"
#include "glock.h"
#include "log.h"
#include "lops.h"
#include "meta_io.h"
#include "util.h"
#include "dir.h"
#include "trace_gfs2.h"

#define PULL 1

/**
 * gfs2_struct2blk - compute the number of log descriptor blocks needed
 * @sdp: the filesystem
 * @nstruct: the number of structures
 * @ssize: the size of the structures
 *
 * Compute the number of log descriptor blocks needed to hold a certain number
 * of structures of a certain size.
 *
 * Returns: the number of blocks needed (minimum is always 1)
 */

unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
			     unsigned int ssize)
{
	unsigned int blks;
	unsigned int first, second;

	blks = 1;
	first = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / ssize;

	if (nstruct > first) {
		second = (sdp->sd_sb.sb_bsize -
			  sizeof(struct gfs2_meta_header)) / ssize;
		blks += DIV_ROUND_UP(nstruct - first, second);
	}

	return blks;
}
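
/*
 * Worked example (illustrative only, not taken from the code): with a
 * 4096-byte block size, roughly 500 eight-byte revoke entries fit in the
 * first descriptor block and slightly more in each continuation block,
 * so around 1000 revokes need two blocks: one descriptor block plus one
 * continuation block.
 */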

/**
 * gfs2_remove_from_ail - Remove an entry from the ail lists, updating counters
 * @bd: The gfs2_bufdata to remove
 *
 * The ail lock _must_ be held when calling this function
 *
 */

void gfs2_remove_from_ail(struct gfs2_bufdata *bd)
{
	bd->bd_ail = NULL;
	list_del_init(&bd->bd_ail_st_list);
	list_del_init(&bd->bd_ail_gl_list);
	atomic_dec(&bd->bd_gl->gl_ail_count);
	brelse(bd->bd_bh);
}

/**
 * gfs2_ail1_start_one - Start I/O on a part of the AIL
 * @sdp: the filesystem
 * @ai: the AIL entry to start I/O on
 *
 */

static void gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
__releases(&sdp->sd_ail_lock)
__acquires(&sdp->sd_ail_lock)
{
	struct gfs2_glock *gl = NULL;
	struct gfs2_bufdata *bd, *s;
	struct buffer_head *bh;
	int retry;

	do {
		retry = 0;

		list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
						 bd_ail_st_list) {
			bh = bd->bd_bh;

			gfs2_assert(sdp, bd->bd_ail == ai);

			if (!buffer_busy(bh)) {
				if (!buffer_uptodate(bh))
					gfs2_io_error_bh(sdp, bh);
				list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
				continue;
			}

			if (!buffer_dirty(bh))
				continue;
			if (gl == bd->bd_gl)
				continue;
			gl = bd->bd_gl;
			list_move(&bd->bd_ail_st_list, &ai->ai_ail1_list);

			spin_unlock(&sdp->sd_ail_lock);
			filemap_fdatawrite(gfs2_glock2aspace(gl));
			spin_lock(&sdp->sd_ail_lock);

			retry = 1;
			break;
		}
	} while (retry);
}

/**
 * gfs2_ail1_empty_one - Check whether or not a trans in the AIL has been synced
 * @sdp: the filesystem
 * @ai: the AIL entry
 * @flags: DIO_ALL to keep scanning past busy buffers instead of stopping
 *
 */

static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai, int flags)
{
	struct gfs2_bufdata *bd, *s;
	struct buffer_head *bh;

	list_for_each_entry_safe_reverse(bd, s, &ai->ai_ail1_list,
					 bd_ail_st_list) {
		bh = bd->bd_bh;

		gfs2_assert(sdp, bd->bd_ail == ai);

		if (buffer_busy(bh)) {
			if (flags & DIO_ALL)
				continue;
			else
				break;
		}

		if (!buffer_uptodate(bh))
			gfs2_io_error_bh(sdp, bh);

		list_move(&bd->bd_ail_st_list, &ai->ai_ail2_list);
	}

	return list_empty(&ai->ai_ail1_list);
}

static void gfs2_ail1_start(struct gfs2_sbd *sdp)
{
	struct list_head *head;
	u64 sync_gen;
	struct gfs2_ail *ai;
	int done = 0;

	spin_lock(&sdp->sd_ail_lock);
	head = &sdp->sd_ail1_list;
	if (list_empty(head)) {
		spin_unlock(&sdp->sd_ail_lock);
		return;
	}
	sync_gen = sdp->sd_ail_sync_gen++;

	while(!done) {
		done = 1;
		list_for_each_entry_reverse(ai, head, ai_list) {
			if (ai->ai_sync_gen >= sync_gen)
				continue;
			ai->ai_sync_gen = sync_gen;
			gfs2_ail1_start_one(sdp, ai); /* This may drop ail lock */
			done = 0;
			break;
		}
	}

	spin_unlock(&sdp->sd_ail_lock);
}

static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int flags)
{
	struct gfs2_ail *ai, *s;
	int ret;

	spin_lock(&sdp->sd_ail_lock);

	list_for_each_entry_safe_reverse(ai, s, &sdp->sd_ail1_list, ai_list) {
		if (gfs2_ail1_empty_one(sdp, ai, flags))
			list_move(&ai->ai_list, &sdp->sd_ail2_list);
		else if (!(flags & DIO_ALL))
			break;
	}

	ret = list_empty(&sdp->sd_ail1_list);

	spin_unlock(&sdp->sd_ail_lock);

	return ret;
}


/**
 * gfs2_ail2_empty_one - Remove the remaining buffers from an AIL entry's ail2 list
 * @sdp: the filesystem
 * @ai: the AIL entry
 *
 */

static void gfs2_ail2_empty_one(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
{
	struct list_head *head = &ai->ai_ail2_list;
	struct gfs2_bufdata *bd;

	while (!list_empty(head)) {
		bd = list_entry(head->prev, struct gfs2_bufdata,
				bd_ail_st_list);
		gfs2_assert(sdp, bd->bd_ail == ai);
		gfs2_remove_from_ail(bd);
	}
}
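
/*
 * ail2_empty() below discards every AIL entry whose first block falls in
 * the journal region being freed, i.e. in [old_tail, new_tail).
 * Illustrative example, assuming a 100-block journal: with old_tail = 90
 * and new_tail = 10 the range wraps, so entries with ai_first >= 90 or
 * ai_first < 10 are removed; without a wrap (say old_tail = 10,
 * new_tail = 90) both conditions must hold instead.
 */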

static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
{
	struct gfs2_ail *ai, *safe;
	unsigned int old_tail = sdp->sd_log_tail;
	int wrap = (new_tail < old_tail);
	int a, b, rm;

	spin_lock(&sdp->sd_ail_lock);

	list_for_each_entry_safe(ai, safe, &sdp->sd_ail2_list, ai_list) {
		a = (old_tail <= ai->ai_first);
		b = (ai->ai_first < new_tail);
		rm = (wrap) ? (a || b) : (a && b);
		if (!rm)
			continue;

		gfs2_ail2_empty_one(sdp, ai);
		list_del(&ai->ai_list);
		gfs2_assert_warn(sdp, list_empty(&ai->ai_ail1_list));
		gfs2_assert_warn(sdp, list_empty(&ai->ai_ail2_list));
		kfree(ai);
	}

	spin_unlock(&sdp->sd_ail_lock);
}

/**
 * gfs2_log_reserve - Make a log reservation
 * @sdp: The GFS2 superblock
 * @blks: The number of blocks to reserve
 *
 * Note that we never give out the last few blocks of the journal. That's
 * because there is a small number of header blocks
 * associated with each log flush. The exact number can't be known until
 * flush time, so we ensure that we have just enough free blocks at all
 * times to avoid running out during a log flush.
 *
 * We no longer flush the log here, instead we wake up logd to do that
 * for us. To avoid the thundering herd and to ensure that we deal fairly
 * with queued waiters, we use an exclusive wait. This means that when we
 * get woken with enough journal space to get our reservation, we need to
 * wake the next waiter on the list.
 *
 * Returns: errno
 */

int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks)
{
	unsigned reserved_blks = 6 * (4096 / sdp->sd_vfs->s_blocksize);
	unsigned wanted = blks + reserved_blks;
	DEFINE_WAIT(wait);
	int did_wait = 0;
	unsigned int free_blocks;

	if (gfs2_assert_warn(sdp, blks) ||
	    gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks))
		return -EINVAL;
retry:
	free_blocks = atomic_read(&sdp->sd_log_blks_free);
	if (unlikely(free_blocks <= wanted)) {
		do {
			prepare_to_wait_exclusive(&sdp->sd_log_waitq, &wait,
					TASK_UNINTERRUPTIBLE);
			wake_up(&sdp->sd_logd_waitq);
			did_wait = 1;
			if (atomic_read(&sdp->sd_log_blks_free) <= wanted)
				io_schedule();
			free_blocks = atomic_read(&sdp->sd_log_blks_free);
		} while(free_blocks <= wanted);
		finish_wait(&sdp->sd_log_waitq, &wait);
	}
	if (atomic_cmpxchg(&sdp->sd_log_blks_free, free_blocks,
				free_blocks - blks) != free_blocks)
		goto retry;
	trace_gfs2_log_blocks(sdp, -blks);

	/*
	 * If we waited, then so might others, wake them up _after_ we get
	 * our share of the log.
	 */
	if (unlikely(did_wait))
		wake_up(&sdp->sd_log_waitq);

	down_read(&sdp->sd_log_flush_lock);

	return 0;
}
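
/*
 * Note on pairing (based on this file alone): a successful
 * gfs2_log_reserve() returns with sd_log_flush_lock held for read, and
 * the matching up_read() is done in gfs2_log_commit(), so a reservation
 * is normally released by committing the transaction that made it.
 */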

static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn)
{
	struct gfs2_journal_extent *je;

	list_for_each_entry(je, &sdp->sd_jdesc->extent_list, extent_list) {
		if (lbn >= je->lblock && lbn < je->lblock + je->blocks)
			return je->dblock + lbn - je->lblock;
	}

	return -1;
}
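
/*
 * Illustrative mapping, assuming a journal built from two extents,
 * {lblock 0, dblock 1000, blocks 64} and {lblock 64, dblock 5000,
 * blocks 64}: log_bmap(sdp, 70) falls in the second extent and returns
 * 5000 + 70 - 64 = 5006.  A logical block outside every extent yields -1.
 */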

/**
 * log_distance - Compute distance between two journal blocks
 * @sdp: The GFS2 superblock
 * @newer: The most recent journal block of the pair
 * @older: The older journal block of the pair
 *
 *   Compute the distance (in the journal direction) between two
 *   blocks in the journal
 *
 * Returns: the distance in blocks
 */

static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer,
					unsigned int older)
{
	int dist;

	dist = newer - older;
	if (dist < 0)
		dist += sdp->sd_jdesc->jd_blocks;

	return dist;
}
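
/*
 * Example (illustrative): with a 1000-block journal, log_distance(sdp,
 * 10, 990) = 10 - 990 + 1000 = 20, i.e. the newer block has wrapped past
 * the end of the journal and sits 20 blocks ahead of the older one.
 */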

/**
 * calc_reserved - Calculate the number of blocks to reserve when
 *                 refunding a transaction's unused buffers.
 * @sdp: The GFS2 superblock
 *
 * This is complex.  We need to reserve room for all our currently used
 * metadata buffers (e.g. normal file I/O rewriting file time stamps) and 
 * all our journaled data buffers for journaled files (e.g. files in the 
 * meta_fs like rindex, or files for which chattr +j was done.)
 * If we don't reserve enough space, gfs2_log_refund and gfs2_log_flush
 * will count it as free space (sd_log_blks_free) and corruption will follow.
 *
 * We can have metadata bufs and jdata bufs in the same journal.  So each
 * type gets its own log header, for which we need to reserve a block.
 * In fact, each type has the potential for needing more than one header 
 * in cases where we have more buffers than will fit on a journal page.
 * Metadata journal entries take up half the space of journaled buffer entries.
 * Thus, metadata entries have buf_limit (502) and journaled buffers have
 * databuf_limit (251) before they cause a wrap around.
 *
 * Also, we need to reserve blocks for revoke journal entries and one for an
 * overall header for the lot.
 *
 * Returns: the number of blocks reserved
 */
static unsigned int calc_reserved(struct gfs2_sbd *sdp)
{
	unsigned int reserved = 0;
	unsigned int mbuf_limit, metabufhdrs_needed;
	unsigned int dbuf_limit, databufhdrs_needed;
	unsigned int revokes = 0;

	mbuf_limit = buf_limit(sdp);
	metabufhdrs_needed = (sdp->sd_log_commited_buf +
			      (mbuf_limit - 1)) / mbuf_limit;
	dbuf_limit = databuf_limit(sdp);
	databufhdrs_needed = (sdp->sd_log_commited_databuf +
			      (dbuf_limit - 1)) / dbuf_limit;

	if (sdp->sd_log_commited_revoke > 0)
		revokes = gfs2_struct2blk(sdp, sdp->sd_log_commited_revoke,
					  sizeof(u64));

	reserved = sdp->sd_log_commited_buf + metabufhdrs_needed +
		sdp->sd_log_commited_databuf + databufhdrs_needed +
		revokes;
	/* One for the overall header */
	if (reserved)
		reserved++;
	return reserved;
}
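
/*
 * Worked example (illustrative, using the limits quoted above): with 600
 * committed metadata buffers, 100 committed journaled data buffers and
 * no revokes, metabufhdrs_needed = (600 + 501) / 502 = 2 and
 * databufhdrs_needed = (100 + 250) / 251 = 1, so calc_reserved() returns
 * 600 + 2 + 100 + 1 + 1 = 704 blocks, the final +1 being the overall
 * log header.
 */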

static unsigned int current_tail(struct gfs2_sbd *sdp)
{
	struct gfs2_ail *ai;
	unsigned int tail;

	spin_lock(&sdp->sd_ail_lock);

	if (list_empty(&sdp->sd_ail1_list)) {
		tail = sdp->sd_log_head;
	} else {
		ai = list_entry(sdp->sd_ail1_list.prev, struct gfs2_ail, ai_list);
		tail = ai->ai_first;
	}

	spin_unlock(&sdp->sd_ail_lock);

	return tail;
}

void gfs2_log_incr_head(struct gfs2_sbd *sdp)
{
	if (sdp->sd_log_flush_head == sdp->sd_log_tail)
		BUG_ON(sdp->sd_log_flush_head != sdp->sd_log_head);

	if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) {
		sdp->sd_log_flush_head = 0;
		sdp->sd_log_flush_wrapped = 1;
	}
}
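
/*
 * Example (illustrative): with jd_blocks = 1000 and sd_log_flush_head at
 * 999, the increment above wraps the flush head back to block 0 and sets
 * sd_log_flush_wrapped.
 */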

/**
 * gfs2_log_write_endio - End of I/O for a log buffer
 * @bh: The buffer head
 * @uptodate: I/O Status
 *
 */

static void gfs2_log_write_endio(struct buffer_head *bh, int uptodate)
{
	struct gfs2_sbd *sdp = bh->b_private;
	bh->b_private = NULL;

	end_buffer_write_sync(bh, uptodate);
	if (atomic_dec_and_test(&sdp->sd_log_in_flight))
		wake_up(&sdp->sd_log_flush_wait);
}

/**
 * gfs2_log_get_buf - Get and initialize a buffer to use for log control data
 * @sdp: The GFS2 superblock
 *
 * Returns: the buffer_head
 */

struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp)
{
	u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
	struct buffer_head *bh;

	bh = sb_getblk(sdp->sd_vfs, blkno);
	lock_buffer(bh);
	memset(bh->b_data, 0, bh->b_size);
	set_buffer_uptodate(bh);
	clear_buffer_dirty(bh);
	gfs2_log_incr_head(sdp);
	atomic_inc(&sdp->sd_log_in_flight);
	bh->b_private = sdp;
	bh->b_end_io = gfs2_log_write_endio;

	return bh;
}

/**
 * gfs2_fake_write_endio - End of I/O for a fake log buffer head
 * @bh: The buffer head
 * @uptodate: The I/O Status
 *
 */

static void gfs2_fake_write_endio(struct buffer_head *bh, int uptodate)
{
	struct buffer_head *real_bh = bh->b_private;
	struct gfs2_bufdata *bd = real_bh->b_private;
	struct gfs2_sbd *sdp = bd->bd_gl->gl_sbd;

	end_buffer_write_sync(bh, uptodate);
	free_buffer_head(bh);
	unlock_buffer(real_bh);
	brelse(real_bh);
	if (atomic_dec_and_test(&sdp->sd_log_in_flight))
		wake_up(&sdp->sd_log_flush_wait);
}

/**
 * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log
 * @sdp: the filesystem
 * @real: the in-place buffer_head whose data is to be written to the log
 *
 * Returns: the log buffer descriptor
 */

struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
				      struct buffer_head *real)
{
	u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
	struct buffer_head *bh;

	bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL);
	atomic_set(&bh->b_count, 1);
	bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate) | (1 << BH_Lock);
	set_bh_page(bh, real->b_page, bh_offset(real));
	bh->b_blocknr = blkno;
	bh->b_size = sdp->sd_sb.sb_bsize;
	bh->b_bdev = sdp->sd_vfs->s_bdev;
	bh->b_private = real;
	bh->b_end_io = gfs2_fake_write_endio;

	gfs2_log_incr_head(sdp);
	atomic_inc(&sdp->sd_log_in_flight);

	return bh;
}

static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
{
	unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail);

	ail2_empty(sdp, new_tail);

	atomic_add(dist, &sdp->sd_log_blks_free);
	trace_gfs2_log_blocks(sdp, dist);
	gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
			     sdp->sd_jdesc->jd_blocks);

	sdp->sd_log_tail = new_tail;
}

/**
 * log_write_header - Get and initialize a journal header buffer
 * @sdp: The GFS2 superblock
 *
 * Returns: the initialized log buffer descriptor
 */

static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
{
	u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head);
	struct buffer_head *bh;
	struct gfs2_log_header *lh;
	unsigned int tail;
	u32 hash;

	bh = sb_getblk(sdp->sd_vfs, blkno);
	lock_buffer(bh);
	memset(bh->b_data, 0, bh->b_size);
	set_buffer_uptodate(bh);
	clear_buffer_dirty(bh);

	gfs2_ail1_empty(sdp, 0);
	tail = current_tail(sdp);

	lh = (struct gfs2_log_header *)bh->b_data;
	memset(lh, 0, sizeof(struct gfs2_log_header));
	lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
	lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH);
	lh->lh_header.__pad0 = cpu_to_be64(0);
	lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH);
	lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid);
	lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++);
	lh->lh_flags = cpu_to_be32(flags);
	lh->lh_tail = cpu_to_be32(tail);
	lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head);
	hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header));
	lh->lh_hash = cpu_to_be32(hash);

	bh->b_end_io = end_buffer_write_sync;
	get_bh(bh);
	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
		submit_bh(WRITE_SYNC | REQ_META, bh);
	else
		submit_bh(WRITE_FLUSH_FUA | REQ_META, bh);
	wait_on_buffer(bh);

	if (!buffer_uptodate(bh))
		gfs2_io_error_bh(sdp, bh);
	brelse(bh);

	if (sdp->sd_log_tail != tail)
		log_pull_tail(sdp, tail);
	else
		gfs2_assert_withdraw(sdp, !pull);

	sdp->sd_log_idle = (tail == sdp->sd_log_flush_head);
	gfs2_log_incr_head(sdp);
}

static void log_flush_commit(struct gfs2_sbd *sdp)
{
	DEFINE_WAIT(wait);

	if (atomic_read(&sdp->sd_log_in_flight)) {
		do {
			prepare_to_wait(&sdp->sd_log_flush_wait, &wait,
					TASK_UNINTERRUPTIBLE);
			if (atomic_read(&sdp->sd_log_in_flight))
				io_schedule();
		} while(atomic_read(&sdp->sd_log_in_flight));
		finish_wait(&sdp->sd_log_flush_wait, &wait);
	}

	log_write_header(sdp, 0, 0);
}

static void gfs2_ordered_write(struct gfs2_sbd *sdp)
{
	struct gfs2_bufdata *bd;
	struct buffer_head *bh;
	LIST_HEAD(written);

	gfs2_log_lock(sdp);
	while (!list_empty(&sdp->sd_log_le_ordered)) {
		bd = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_bufdata, bd_le.le_list);
		list_move(&bd->bd_le.le_list, &written);
		bh = bd->bd_bh;
		if (!buffer_dirty(bh))
			continue;
		get_bh(bh);
		gfs2_log_unlock(sdp);
		lock_buffer(bh);
		if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
			bh->b_end_io = end_buffer_write_sync;
			submit_bh(WRITE_SYNC, bh);
		} else {
			unlock_buffer(bh);
			brelse(bh);
		}
		gfs2_log_lock(sdp);
	}
	list_splice(&written, &sdp->sd_log_le_ordered);
	gfs2_log_unlock(sdp);
}

static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
{
	struct gfs2_bufdata *bd;
	struct buffer_head *bh;

	gfs2_log_lock(sdp);
	while (!list_empty(&sdp->sd_log_le_ordered)) {
		bd = list_entry(sdp->sd_log_le_ordered.prev, struct gfs2_bufdata, bd_le.le_list);
		bh = bd->bd_bh;
		if (buffer_locked(bh)) {
			get_bh(bh);
			gfs2_log_unlock(sdp);
			wait_on_buffer(bh);
			brelse(bh);
			gfs2_log_lock(sdp);
			continue;
		}
		list_del_init(&bd->bd_le.le_list);
	}
	gfs2_log_unlock(sdp);
}

/**
 * gfs2_log_flush - flush incore transaction(s)
 * @sdp: the filesystem
 * @gl: The glock structure to flush.  If NULL, flush the whole incore log
 *
 */

void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
{
	struct gfs2_ail *ai;

	down_write(&sdp->sd_log_flush_lock);

	/* Log might have been flushed while we waited for the flush lock */
	if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags)) {
		up_write(&sdp->sd_log_flush_lock);
		return;
	}
	trace_gfs2_log_flush(sdp, 1);

	ai = kzalloc(sizeof(struct gfs2_ail), GFP_NOFS | __GFP_NOFAIL);
	INIT_LIST_HEAD(&ai->ai_ail1_list);
	INIT_LIST_HEAD(&ai->ai_ail2_list);

	if (sdp->sd_log_num_buf != sdp->sd_log_commited_buf) {
		printk(KERN_INFO "GFS2: log buf %u %u\n", sdp->sd_log_num_buf,
		       sdp->sd_log_commited_buf);
		gfs2_assert_withdraw(sdp, 0);
	}
	if (sdp->sd_log_num_databuf != sdp->sd_log_commited_databuf) {
		printk(KERN_INFO "GFS2: log databuf %u %u\n",
		       sdp->sd_log_num_databuf, sdp->sd_log_commited_databuf);
		gfs2_assert_withdraw(sdp, 0);
	}
	gfs2_assert_withdraw(sdp,
			sdp->sd_log_num_revoke == sdp->sd_log_commited_revoke);

	sdp->sd_log_flush_head = sdp->sd_log_head;
	sdp->sd_log_flush_wrapped = 0;
	ai->ai_first = sdp->sd_log_flush_head;

	gfs2_ordered_write(sdp);
	lops_before_commit(sdp);
	gfs2_ordered_wait(sdp);

	if (sdp->sd_log_head != sdp->sd_log_flush_head)
		log_flush_commit(sdp);
	else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){
		gfs2_log_lock(sdp);
		atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
		trace_gfs2_log_blocks(sdp, -1);
		gfs2_log_unlock(sdp);
		log_write_header(sdp, 0, PULL);
	}
	lops_after_commit(sdp, ai);

	gfs2_log_lock(sdp);
	sdp->sd_log_head = sdp->sd_log_flush_head;
	sdp->sd_log_blks_reserved = 0;
	sdp->sd_log_commited_buf = 0;
	sdp->sd_log_commited_databuf = 0;
	sdp->sd_log_commited_revoke = 0;

	spin_lock(&sdp->sd_ail_lock);
	if (!list_empty(&ai->ai_ail1_list)) {
		list_add(&ai->ai_list, &sdp->sd_ail1_list);
		ai = NULL;
	}
	spin_unlock(&sdp->sd_ail_lock);
	gfs2_log_unlock(sdp);
	trace_gfs2_log_flush(sdp, 0);
	up_write(&sdp->sd_log_flush_lock);

	kfree(ai);
}
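
/*
 * Summary of the flush sequence above, as implemented by the code:
 * ordered data buffers are written first, lops_before_commit() writes
 * the journaled blocks, gfs2_ordered_wait() waits for the ordered I/O,
 * a log header is written to commit the transaction, and
 * lops_after_commit() moves the buffers onto the new AIL entry before
 * the log head is advanced.
 */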

static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
	unsigned int reserved;
	unsigned int unused;

	gfs2_log_lock(sdp);

	sdp->sd_log_commited_buf += tr->tr_num_buf_new - tr->tr_num_buf_rm;
	sdp->sd_log_commited_databuf += tr->tr_num_databuf_new -
		tr->tr_num_databuf_rm;
	gfs2_assert_withdraw(sdp, (((int)sdp->sd_log_commited_buf) >= 0) ||
			     (((int)sdp->sd_log_commited_databuf) >= 0));
	sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
	reserved = calc_reserved(sdp);
	gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved);
	unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved;
	atomic_add(unused, &sdp->sd_log_blks_free);
	trace_gfs2_log_blocks(sdp, unused);
	gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
			     sdp->sd_jdesc->jd_blocks);
	sdp->sd_log_blks_reserved = reserved;

	gfs2_log_unlock(sdp);
}

static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
	struct list_head *head = &tr->tr_list_buf;
	struct gfs2_bufdata *bd;

	gfs2_log_lock(sdp);
	while (!list_empty(head)) {
		bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr);
		list_del_init(&bd->bd_list_tr);
		tr->tr_num_buf--;
	}
	gfs2_log_unlock(sdp);
	gfs2_assert_warn(sdp, !tr->tr_num_buf);
}

/**
 * gfs2_log_commit - Commit a transaction to the log
 * @sdp: the filesystem
 * @tr: the transaction
 *
 * We wake up gfs2_logd if the number of pinned blocks exceeds thresh1
 * or the total number of used blocks (pinned blocks plus AIL blocks)
 * is greater than thresh2.
 *
 * At mount time thresh1 is 1/3rd of journal size, thresh2 is 2/3rd of
 * journal size.
 *
 */

void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
{
	log_refund(sdp, tr);
	buf_lo_incore_commit(sdp, tr);

	up_read(&sdp->sd_log_flush_lock);

	if (atomic_read(&sdp->sd_log_pinned) > atomic_read(&sdp->sd_log_thresh1) ||
	    ((sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free)) >
	    atomic_read(&sdp->sd_log_thresh2)))
		wake_up(&sdp->sd_logd_waitq);
}
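
/*
 * Illustrative numbers for the wake-up test above, assuming a
 * 32768-block journal: per the comment before gfs2_log_commit(), thresh1
 * and thresh2 are set at mount to 1/3 and 2/3 of the journal size
 * (roughly 10922 and 21845 blocks), so logd is woken once either the
 * pinned block count passes ~10922 or the journal is more than about
 * two thirds full.
 */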

/**
 * gfs2_log_shutdown - write a shutdown header into a journal
 * @sdp: the filesystem
 *
 */

void gfs2_log_shutdown(struct gfs2_sbd *sdp)
{
	down_write(&sdp->sd_log_flush_lock);

	gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_buf);
	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_rg);
	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_databuf);
	gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list));

	sdp->sd_log_flush_head = sdp->sd_log_head;
	sdp->sd_log_flush_wrapped = 0;

	log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT,
			 (sdp->sd_log_tail == current_tail(sdp)) ? 0 : PULL);

	gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks);
	gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail);
	gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list));

	sdp->sd_log_head = sdp->sd_log_flush_head;
	sdp->sd_log_tail = sdp->sd_log_head;

	up_write(&sdp->sd_log_flush_lock);
}


/**
 * gfs2_meta_syncfs - sync all the buffers in a filesystem
 * @sdp: the filesystem
 *
 */

void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
{
	gfs2_log_flush(sdp, NULL);
	for (;;) {
		gfs2_ail1_start(sdp);
		if (gfs2_ail1_empty(sdp, DIO_ALL))
			break;
		msleep(10);
	}
}

static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp)
{
	return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1));
}

static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp)
{
	unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free);
	return used_blocks >= atomic_read(&sdp->sd_log_thresh2);
}

/**
 * gfs2_logd - Update log tail as Active Items get flushed to in-place blocks
 * @sdp: Pointer to GFS2 superblock
 *
 * Also, periodically check to make sure that we're using the most recent
 * journal index.
 */

int gfs2_logd(void *data)
{
	struct gfs2_sbd *sdp = data;
	unsigned long t = 1;
	DEFINE_WAIT(wait);
	unsigned preflush;

	while (!kthread_should_stop()) {

		preflush = atomic_read(&sdp->sd_log_pinned);
		if (gfs2_jrnl_flush_reqd(sdp) || t == 0) {
			gfs2_ail1_empty(sdp, DIO_ALL);
			gfs2_log_flush(sdp, NULL);
			gfs2_ail1_empty(sdp, DIO_ALL);
		}

		if (gfs2_ail_flush_reqd(sdp)) {
			gfs2_ail1_start(sdp);
			io_schedule();
			gfs2_ail1_empty(sdp, 0);
			gfs2_log_flush(sdp, NULL);
			gfs2_ail1_empty(sdp, DIO_ALL);
		}

		wake_up(&sdp->sd_log_waitq);
		t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
		if (freezing(current))
			refrigerator();

		do {
			prepare_to_wait(&sdp->sd_logd_waitq, &wait,
					TASK_INTERRUPTIBLE);
			if (!gfs2_ail_flush_reqd(sdp) &&
			    !gfs2_jrnl_flush_reqd(sdp) &&
			    !kthread_should_stop())
				t = schedule_timeout(t);
		} while(t && !gfs2_ail_flush_reqd(sdp) &&
			!gfs2_jrnl_flush_reqd(sdp) &&
			!kthread_should_stop());
		finish_wait(&sdp->sd_logd_waitq, &wait);
	}

	return 0;
}