transaction.c 36.1 KB
Newer Older
C
Chris Mason 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

C
Chris Mason 已提交
19
#include <linux/fs.h>
20
#include <linux/slab.h>
C
Chris Mason 已提交
21
#include <linux/sched.h>
22
#include <linux/writeback.h>
23
#include <linux/pagemap.h>
24
#include <linux/blkdev.h>
C
Chris Mason 已提交
25 26 27
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
28
#include "locking.h"
29
#include "tree-log.h"
30
#include "inode-map.h"
C
Chris Mason 已提交
31

32 33
#define BTRFS_ROOT_TRANS_TAG 0

34
static noinline void put_transaction(struct btrfs_transaction *transaction)
C
Chris Mason 已提交
35
{
36 37
	WARN_ON(atomic_read(&transaction->use_count) == 0);
	if (atomic_dec_and_test(&transaction->use_count)) {
J
Josef Bacik 已提交
38
		BUG_ON(!list_empty(&transaction->list));
C
Chris Mason 已提交
39 40
		memset(transaction, 0, sizeof(*transaction));
		kmem_cache_free(btrfs_transaction_cachep, transaction);
C
Chris Mason 已提交
41
	}
C
Chris Mason 已提交
42 43
}

J
Josef Bacik 已提交
44 45 46 47 48 49
/*
 * Replace root->commit_root with whatever btrfs_root_node() returns for
 * @root, dropping the reference that was held on the previous commit root.
 */
static noinline void switch_commit_root(struct btrfs_root *root)
{
	struct extent_buffer *stale = root->commit_root;

	free_extent_buffer(stale);
	root->commit_root = btrfs_root_node(root);
}

C
Chris Mason 已提交
50 51 52
/*
 * either allocate a new transaction or hop into the existing one
 */
J
Josef Bacik 已提交
53
static noinline int join_transaction(struct btrfs_root *root, int nofail)
C
Chris Mason 已提交
54 55
{
	struct btrfs_transaction *cur_trans;
J
Josef Bacik 已提交
56 57 58 59 60 61 62 63 64

	spin_lock(&root->fs_info->trans_lock);
	if (root->fs_info->trans_no_join) {
		if (!nofail) {
			spin_unlock(&root->fs_info->trans_lock);
			return -EBUSY;
		}
	}

C
Chris Mason 已提交
65
	cur_trans = root->fs_info->running_transaction;
J
Josef Bacik 已提交
66 67
	if (cur_trans) {
		atomic_inc(&cur_trans->use_count);
68
		atomic_inc(&cur_trans->num_writers);
69
		cur_trans->num_joined++;
J
Josef Bacik 已提交
70 71
		spin_unlock(&root->fs_info->trans_lock);
		return 0;
C
Chris Mason 已提交
72
	}
J
Josef Bacik 已提交
73 74 75 76 77 78 79 80 81 82
	spin_unlock(&root->fs_info->trans_lock);

	cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
	if (!cur_trans)
		return -ENOMEM;
	spin_lock(&root->fs_info->trans_lock);
	if (root->fs_info->running_transaction) {
		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
		cur_trans = root->fs_info->running_transaction;
		atomic_inc(&cur_trans->use_count);
83
		atomic_inc(&cur_trans->num_writers);
84
		cur_trans->num_joined++;
J
Josef Bacik 已提交
85 86
		spin_unlock(&root->fs_info->trans_lock);
		return 0;
C
Chris Mason 已提交
87
	}
J
Josef Bacik 已提交
88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
	atomic_set(&cur_trans->num_writers, 1);
	cur_trans->num_joined = 0;
	init_waitqueue_head(&cur_trans->writer_wait);
	init_waitqueue_head(&cur_trans->commit_wait);
	cur_trans->in_commit = 0;
	cur_trans->blocked = 0;
	/*
	 * One for this trans handle, one so it will live on until we
	 * commit the transaction.
	 */
	atomic_set(&cur_trans->use_count, 2);
	cur_trans->commit_done = 0;
	cur_trans->start_time = get_seconds();

	cur_trans->delayed_refs.root = RB_ROOT;
	cur_trans->delayed_refs.num_entries = 0;
	cur_trans->delayed_refs.num_heads_ready = 0;
	cur_trans->delayed_refs.num_heads = 0;
	cur_trans->delayed_refs.flushing = 0;
	cur_trans->delayed_refs.run_delayed_start = 0;
	spin_lock_init(&cur_trans->commit_lock);
	spin_lock_init(&cur_trans->delayed_refs.lock);

	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
	list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
	extent_io_tree_init(&cur_trans->dirty_pages,
C
Chris Mason 已提交
114
			     root->fs_info->btree_inode->i_mapping);
J
Josef Bacik 已提交
115 116 117 118
	root->fs_info->generation++;
	cur_trans->transid = root->fs_info->generation;
	root->fs_info->running_transaction = cur_trans;
	spin_unlock(&root->fs_info->trans_lock);
119

C
Chris Mason 已提交
120 121 122
	return 0;
}

C
Chris Mason 已提交
123
/*
C
Chris Mason 已提交
124 125 126 127
 * this does all the record keeping required to make sure that a reference
 * counted root is properly recorded in a given transaction.  This is required
 * to make sure the old root from before we joined the transaction is deleted
 * when the transaction commits
C
Chris Mason 已提交
128
 */
C
Chris Mason 已提交
129
static int record_root_in_trans(struct btrfs_trans_handle *trans,
J
Josef Bacik 已提交
130
			       struct btrfs_root *root)
131
{
132
	if (root->ref_cows && root->last_trans < trans->transid) {
133
		WARN_ON(root == root->fs_info->extent_root);
134 135
		WARN_ON(root->commit_root != root->node);

C
Chris Mason 已提交
136 137 138 139 140 141 142 143 144 145 146 147
		/*
		 * see below for in_trans_setup usage rules
		 * we have the reloc mutex held now, so there
		 * is only one writer in this function
		 */
		root->in_trans_setup = 1;

		/* make sure readers find in_trans_setup before
		 * they find our root->last_trans update
		 */
		smp_wmb();

J
Josef Bacik 已提交
148 149 150 151 152
		spin_lock(&root->fs_info->fs_roots_radix_lock);
		if (root->last_trans == trans->transid) {
			spin_unlock(&root->fs_info->fs_roots_radix_lock);
			return 0;
		}
153 154 155
		radix_tree_tag_set(&root->fs_info->fs_roots_radix,
			   (unsigned long)root->root_key.objectid,
			   BTRFS_ROOT_TRANS_TAG);
J
Josef Bacik 已提交
156
		spin_unlock(&root->fs_info->fs_roots_radix_lock);
C
Chris Mason 已提交
157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177
		root->last_trans = trans->transid;

		/* this is pretty tricky.  We don't want to
		 * take the relocation lock in btrfs_record_root_in_trans
		 * unless we're really doing the first setup for this root in
		 * this transaction.
		 *
		 * Normally we'd use root->last_trans as a flag to decide
		 * if we want to take the expensive mutex.
		 *
		 * But, we have to set root->last_trans before we
		 * init the relocation root, otherwise, we trip over warnings
		 * in ctree.c.  The solution used here is to flag ourselves
		 * with root->in_trans_setup.  When this is 1, we're still
		 * fixing up the reloc trees and everyone must wait.
		 *
		 * When this is zero, they can trust root->last_trans and fly
		 * through btrfs_record_root_in_trans without having to take the
		 * lock.  smp_wmb() makes sure that all the writes above are
		 * done before we pop in the zero below
		 */
178
		btrfs_init_reloc_root(trans, root);
C
Chris Mason 已提交
179 180
		smp_wmb();
		root->in_trans_setup = 0;
181 182 183
	}
	return 0;
}
184

C
Chris Mason 已提交
185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207

/*
 * Record @root in the transaction behind @trans, taking reloc_mutex only
 * when the lockless check cannot prove the root is already recorded.
 * Roots without ref_cows are ignored.  Always returns 0.
 */
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	if (root->ref_cows) {
		/*
		 * record_root_in_trans documents how in_trans_setup and the
		 * barriers are meant to be used
		 */
		smp_rmb();
		if (root->last_trans != trans->transid ||
		    root->in_trans_setup) {
			mutex_lock(&root->fs_info->reloc_mutex);
			record_root_in_trans(trans, root);
			mutex_unlock(&root->fs_info->reloc_mutex);
		}
	}

	return 0;
}

C
Chris Mason 已提交
208 209 210 211
/* wait for commit against the current transaction to become unblocked
 * when this is done, it is safe to start a new transaction, but the current
 * transaction might not be fully on disk.
 */
C
Chris Mason 已提交
212
static void wait_current_trans(struct btrfs_root *root)
C
Chris Mason 已提交
213
{
214
	struct btrfs_transaction *cur_trans;
C
Chris Mason 已提交
215

J
Josef Bacik 已提交
216
	spin_lock(&root->fs_info->trans_lock);
217
	cur_trans = root->fs_info->running_transaction;
C
Chris Mason 已提交
218
	if (cur_trans && cur_trans->blocked) {
219
		atomic_inc(&cur_trans->use_count);
J
Josef Bacik 已提交
220
		spin_unlock(&root->fs_info->trans_lock);
L
Li Zefan 已提交
221 222 223

		wait_event(root->fs_info->transaction_wait,
			   !cur_trans->blocked);
224
		put_transaction(cur_trans);
J
Josef Bacik 已提交
225 226
	} else {
		spin_unlock(&root->fs_info->trans_lock);
227
	}
C
Chris Mason 已提交
228 229
}

230 231 232 233
/* how a caller wants to enter a transaction; consumed by start_transaction() */
enum btrfs_trans_type {
	TRANS_START,		/* btrfs_start_transaction() */
	TRANS_JOIN,		/* btrfs_join_transaction() */
	TRANS_USERSPACE,	/* btrfs_start_ioctl_transaction() */
	TRANS_JOIN_NOLOCK,	/* btrfs_join_transaction_nolock() */
};

237 238
/*
 * Decide whether a caller of the given @type should wait on a blocked
 * transaction.
 *
 * Returns 0 while log_root_recovering is set.  Otherwise returns 1 for
 * TRANS_USERSPACE, and for TRANS_START when open_ioctl_trans is zero;
 * 0 for everything else.
 */
static int may_wait_transaction(struct btrfs_root *root, int type)
{
	if (root->fs_info->log_root_recovering)
		return 0;

	if (type == TRANS_USERSPACE)
		return 1;

	if (type == TRANS_START &&
	    !atomic_read(&root->fs_info->open_ioctl_trans))
		return 1;

	return 0;
}

252
static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
253
						    u64 num_items, int type)
C
Chris Mason 已提交
254
{
255 256
	struct btrfs_trans_handle *h;
	struct btrfs_transaction *cur_trans;
257
	u64 num_bytes = 0;
C
Chris Mason 已提交
258
	int ret;
L
liubo 已提交
259 260 261

	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
		return ERR_PTR(-EROFS);
262 263 264 265 266 267 268 269 270

	if (current->journal_info) {
		WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
		h = current->journal_info;
		h->use_count++;
		h->orig_rsv = h->block_rsv;
		h->block_rsv = NULL;
		goto got_it;
	}
271 272 273 274 275 276 277

	/*
	 * Do the reservation before we join the transaction so we can do all
	 * the appropriate flushing if need be.
	 */
	if (num_items > 0 && root != root->fs_info->chunk_root) {
		num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
278
		ret = btrfs_block_rsv_add(root,
279 280 281 282 283
					  &root->fs_info->trans_block_rsv,
					  num_bytes);
		if (ret)
			return ERR_PTR(ret);
	}
284 285 286 287
again:
	h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
	if (!h)
		return ERR_PTR(-ENOMEM);
C
Chris Mason 已提交
288

289
	if (may_wait_transaction(root, type))
C
Chris Mason 已提交
290
		wait_current_trans(root);
291

J
Josef Bacik 已提交
292 293 294 295 296 297
	do {
		ret = join_transaction(root, type == TRANS_JOIN_NOLOCK);
		if (ret == -EBUSY)
			wait_current_trans(root);
	} while (ret == -EBUSY);

T
Tsutomu Itoh 已提交
298
	if (ret < 0) {
299
		kmem_cache_free(btrfs_trans_handle_cachep, h);
T
Tsutomu Itoh 已提交
300 301
		return ERR_PTR(ret);
	}
302

303 304 305 306
	cur_trans = root->fs_info->running_transaction;

	h->transid = cur_trans->transid;
	h->transaction = cur_trans;
C
Chris Mason 已提交
307
	h->blocks_used = 0;
308
	h->bytes_reserved = 0;
309
	h->delayed_ref_updates = 0;
310
	h->use_count = 1;
311
	h->block_rsv = NULL;
312
	h->orig_rsv = NULL;
313

314 315 316 317 318 319
	smp_mb();
	if (cur_trans->blocked && may_wait_transaction(root, type)) {
		btrfs_commit_transaction(h, root);
		goto again;
	}

320 321 322
	if (num_bytes) {
		h->block_rsv = &root->fs_info->trans_block_rsv;
		h->bytes_reserved = num_bytes;
323
	}
J
Josef Bacik 已提交
324

325
got_it:
J
Josef Bacik 已提交
326
	btrfs_record_root_in_trans(h, root);
327 328 329

	if (!current->journal_info && type != TRANS_USERSPACE)
		current->journal_info = h;
C
Chris Mason 已提交
330 331 332
	return h;
}

333
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
334
						   int num_items)
335
{
336
	return start_transaction(root, num_items, TRANS_START);
337
}
338
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
339
{
340
	return start_transaction(root, 0, TRANS_JOIN);
341 342
}

343
struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
344 345 346 347
{
	return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
}

348
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
349
{
350
	return start_transaction(root, 0, TRANS_USERSPACE);
351 352
}

C
Chris Mason 已提交
353
/* wait for a transaction commit to be fully complete */
354
static noinline void wait_for_commit(struct btrfs_root *root,
355 356
				    struct btrfs_transaction *commit)
{
L
Li Zefan 已提交
357
	wait_event(commit->commit_wait, commit->commit_done);
358 359
}

360 361 362 363 364 365 366 367
/*
 * Wait for a transaction to finish committing.
 *
 * @transid: transaction to wait for, or 0 to wait for the newest transaction
 *           on trans_list that is in commit but not yet done
 *
 * Returns 0 when the wait completed or there was nothing to wait for, and
 * -EINVAL when a non-zero @transid is newer than last_trans_committed but
 * is not found on trans_list.
 */
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
{
	struct btrfs_transaction *cur_trans = NULL, *t;
	int ret;

	ret = 0;
	if (transid) {
		if (transid <= root->fs_info->last_trans_committed)
			goto out;

		/* find specified transaction */
		spin_lock(&root->fs_info->trans_lock);
		list_for_each_entry(t, &root->fs_info->trans_list, list) {
			if (t->transid == transid) {
				cur_trans = t;
				atomic_inc(&cur_trans->use_count);
				break;
			}
			if (t->transid > transid)
				break;
		}
		spin_unlock(&root->fs_info->trans_lock);
		ret = -EINVAL;
		if (!cur_trans)
			goto out;  /* bad transid */
	} else {
		/* find newest transaction that is committing | committed */
		spin_lock(&root->fs_info->trans_lock);
		list_for_each_entry_reverse(t, &root->fs_info->trans_list,
					    list) {
			if (t->in_commit) {
				if (t->commit_done)
					break;
				cur_trans = t;
				atomic_inc(&cur_trans->use_count);
				break;
			}
		}
		spin_unlock(&root->fs_info->trans_lock);
		if (!cur_trans)
			goto out;  /* nothing committing|committed */
	}

	wait_for_commit(root, cur_trans);

	/* drop the reference taken while trans_lock was held */
	put_transaction(cur_trans);
	ret = 0;
out:
	return ret;
}

C
Chris Mason 已提交
411 412
void btrfs_throttle(struct btrfs_root *root)
{
J
Josef Bacik 已提交
413
	if (!atomic_read(&root->fs_info->open_ioctl_trans))
414
		wait_current_trans(root);
C
Chris Mason 已提交
415 416
}

417 418 419 420
static int should_end_transaction(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root)
{
	int ret;
421 422
	ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 0,
				    5, 0);
423 424 425 426 427 428 429
	return ret ? 1 : 0;
}

int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans = trans->transaction;
430
	struct btrfs_block_rsv *rsv = trans->block_rsv;
431 432
	int updates;

J
Josef Bacik 已提交
433
	smp_mb();
434 435 436
	if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
		return 1;

437 438 439 440 441 442
	/*
	 * We need to do this in case we're deleting csums so the global block
	 * rsv get's used instead of the csum block rsv.
	 */
	trans->block_rsv = NULL;

443 444 445 446 447
	updates = trans->delayed_ref_updates;
	trans->delayed_ref_updates = 0;
	if (updates)
		btrfs_run_delayed_refs(trans, root, updates);

448 449
	trans->block_rsv = rsv;

450 451 452
	return should_end_transaction(trans, root);
}

453
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
454
			  struct btrfs_root *root, int throttle, int lock)
C
Chris Mason 已提交
455
{
456
	struct btrfs_transaction *cur_trans = trans->transaction;
457
	struct btrfs_fs_info *info = root->fs_info;
458 459
	int count = 0;

460 461 462 463 464
	if (--trans->use_count) {
		trans->block_rsv = trans->orig_rsv;
		return 0;
	}

465
	trans->block_rsv = NULL;
466 467 468 469 470 471
	while (count < 4) {
		unsigned long cur = trans->delayed_ref_updates;
		trans->delayed_ref_updates = 0;
		if (cur &&
		    trans->transaction->delayed_refs.num_heads_ready > 64) {
			trans->delayed_ref_updates = 0;
472 473 474 475 476 477 478

			/*
			 * do a full flush if the transaction is trying
			 * to close
			 */
			if (trans->transaction->delayed_refs.flushing)
				cur = 0;
479 480 481 482 483
			btrfs_run_delayed_refs(trans, root, cur);
		} else {
			break;
		}
		count++;
484 485
	}

486 487
	btrfs_trans_release_metadata(trans, root);

J
Josef Bacik 已提交
488 489
	if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
	    should_end_transaction(trans, root)) {
490
		trans->transaction->blocked = 1;
J
Josef Bacik 已提交
491 492
		smp_wmb();
	}
493

494
	if (lock && cur_trans->blocked && !cur_trans->in_commit) {
495 496 497 498 499 500 501
		if (throttle) {
			/*
			 * We may race with somebody else here so end up having
			 * to call end_transaction on ourselves again, so inc
			 * our use_count.
			 */
			trans->use_count++;
502
			return btrfs_commit_transaction(trans, root);
503
		} else {
504
			wake_up_process(info->transaction_kthread);
505
		}
506 507 508
	}

	WARN_ON(cur_trans != info->running_transaction);
509 510
	WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
	atomic_dec(&cur_trans->num_writers);
511

512
	smp_mb();
C
Chris Mason 已提交
513 514 515
	if (waitqueue_active(&cur_trans->writer_wait))
		wake_up(&cur_trans->writer_wait);
	put_transaction(cur_trans);
J
Josef Bacik 已提交
516 517 518

	if (current->journal_info == trans)
		current->journal_info = NULL;
C
Chris Mason 已提交
519
	memset(trans, 0, sizeof(*trans));
C
Chris Mason 已提交
520
	kmem_cache_free(btrfs_trans_handle_cachep, trans);
521

Y
Yan, Zheng 已提交
522 523 524
	if (throttle)
		btrfs_run_delayed_iputs(root);

C
Chris Mason 已提交
525 526 527
	return 0;
}

528 529 530
/* End @trans without throttling (throttle = 0, lock = 1). */
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 0, 1);
}

/* End @trans with throttling enabled (throttle = 1, lock = 1). */
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 1, 1);
}

/* End @trans with both throttle and lock disabled (throttle = 0, lock = 0). */
int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 0, 0);
}

/*
 * End @trans with throttle = 1, lock = 1; currently the same arguments as
 * btrfs_end_transaction_throttle().
 */
int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
				struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 1, 1);
}

C
Chris Mason 已提交
567 568 569
/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
570
 * those extents are sent to disk but does not wait on them
C
Chris Mason 已提交
571
 */
572
int btrfs_write_marked_extents(struct btrfs_root *root,
573
			       struct extent_io_tree *dirty_pages, int mark)
C
Chris Mason 已提交
574
{
575
	int err = 0;
576
	int werr = 0;
J
Josef Bacik 已提交
577
	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
578
	u64 start = 0;
579
	u64 end;
580

J
Josef Bacik 已提交
581 582 583 584 585 586 587 588 589
	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
				      mark)) {
		convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
				   GFP_NOFS);
		err = filemap_fdatawrite_range(mapping, start, end);
		if (err)
			werr = err;
		cond_resched();
		start = end + 1;
590
	}
591 592 593 594 595 596 597 598 599 600 601 602
	if (err)
		werr = err;
	return werr;
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit.  We wait
 * on all the pages and clear them from the dirty pages state tree
 */
int btrfs_wait_marked_extents(struct btrfs_root *root,
603
			      struct extent_io_tree *dirty_pages, int mark)
604 605 606
{
	int err = 0;
	int werr = 0;
J
Josef Bacik 已提交
607
	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
608 609
	u64 start = 0;
	u64 end;
610

J
Josef Bacik 已提交
611 612 613 614 615 616 617 618
	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
				      EXTENT_NEED_WAIT)) {
		clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
		err = filemap_fdatawait_range(mapping, start, end);
		if (err)
			werr = err;
		cond_resched();
		start = end + 1;
619
	}
620 621 622
	if (err)
		werr = err;
	return werr;
C
Chris Mason 已提交
623 624
}

625 626 627 628 629 630
/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit
 *
 * The wait is always performed, even if submitting the writes failed.
 * Returns 0 on success, otherwise the write error if there was one, else
 * the wait error.
 */
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
				struct extent_io_tree *dirty_pages, int mark)
{
	int ret;
	int ret2;

	ret = btrfs_write_marked_extents(root, dirty_pages, mark);
	ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);

	/* hand back the real error code instead of collapsing it to 1 */
	if (ret)
		return ret;
	return ret2;
}

641 642 643 644 645 646 647 648 649
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root)
{
	if (!trans || !trans->transaction) {
		struct inode *btree_inode;
		btree_inode = root->fs_info->btree_inode;
		return filemap_write_and_wait(btree_inode->i_mapping);
	}
	return btrfs_write_and_wait_marked_extents(root,
650 651
					   &trans->transaction->dirty_pages,
					   EXTENT_DIRTY);
652 653
}

C
Chris Mason 已提交
654 655 656 657 658 659 660 661 662 663
/*
 * this is used to update the root pointer in the tree of tree roots.
 *
 * But, in the case of the extent allocation tree, updating the root
 * pointer may allocate blocks which may change the root of the extent
 * allocation tree.
 *
 * So, this loops and repeats and makes sure the cowonly root didn't
 * change while the root pointer was being updated in the metadata.
 */
664 665
static int update_cowonly_root(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
C
Chris Mason 已提交
666 667
{
	int ret;
668
	u64 old_root_bytenr;
669
	u64 old_root_used;
670
	struct btrfs_root *tree_root = root->fs_info->tree_root;
C
Chris Mason 已提交
671

672
	old_root_used = btrfs_root_used(&root->root_item);
673
	btrfs_write_dirty_block_groups(trans, root);
674

C
Chris Mason 已提交
675
	while (1) {
676
		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
677 678
		if (old_root_bytenr == root->node->start &&
		    old_root_used == btrfs_root_used(&root->root_item))
C
Chris Mason 已提交
679
			break;
680

681
		btrfs_set_root_node(&root->root_item, root->node);
C
Chris Mason 已提交
682
		ret = btrfs_update_root(trans, tree_root,
683 684
					&root->root_key,
					&root->root_item);
C
Chris Mason 已提交
685
		BUG_ON(ret);
686

687
		old_root_used = btrfs_root_used(&root->root_item);
688
		ret = btrfs_write_dirty_block_groups(trans, root);
689
		BUG_ON(ret);
690
	}
691 692 693 694

	if (root != root->fs_info->extent_root)
		switch_commit_root(root);

695 696 697
	return 0;
}

C
Chris Mason 已提交
698 699 700
/*
 * update all the cowonly tree roots on disk
 */
701 702
static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root)
703 704 705
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct list_head *next;
706
	struct extent_buffer *eb;
707
	int ret;
708

709 710
	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);
711

712
	eb = btrfs_lock_root_node(fs_info->tree_root);
713
	btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
714 715
	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);
716

717 718
	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);
719

C
Chris Mason 已提交
720
	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
721 722 723
		next = fs_info->dirty_cowonly_roots.next;
		list_del_init(next);
		root = list_entry(next, struct btrfs_root, dirty_list);
724

725
		update_cowonly_root(trans, root);
C
Chris Mason 已提交
726
	}
727 728 729 730 731

	down_write(&fs_info->extent_commit_sem);
	switch_commit_root(fs_info->extent_root);
	up_write(&fs_info->extent_commit_sem);

C
Chris Mason 已提交
732 733 734
	return 0;
}

C
Chris Mason 已提交
735 736 737 738 739
/*
 * dead roots are old snapshots that need to be deleted.  This allocates
 * a dirty root struct and adds it into the list of dead roots that need to
 * be deleted
 */
740
int btrfs_add_dead_root(struct btrfs_root *root)
741
{
J
Josef Bacik 已提交
742
	spin_lock(&root->fs_info->trans_lock);
743
	list_add(&root->root_list, &root->fs_info->dead_roots);
J
Josef Bacik 已提交
744
	spin_unlock(&root->fs_info->trans_lock);
745 746 747
	return 0;
}

C
Chris Mason 已提交
748
/*
749
 * update all the cowonly tree roots on disk
C
Chris Mason 已提交
750
 */
751 752
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root)
753 754
{
	struct btrfs_root *gang[8];
755
	struct btrfs_fs_info *fs_info = root->fs_info;
756 757
	int i;
	int ret;
758 759
	int err = 0;

J
Josef Bacik 已提交
760
	spin_lock(&fs_info->fs_roots_radix_lock);
C
Chris Mason 已提交
761
	while (1) {
762 763
		ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
						 (void **)gang, 0,
764 765 766 767 768 769
						 ARRAY_SIZE(gang),
						 BTRFS_ROOT_TRANS_TAG);
		if (ret == 0)
			break;
		for (i = 0; i < ret; i++) {
			root = gang[i];
770 771 772
			radix_tree_tag_clear(&fs_info->fs_roots_radix,
					(unsigned long)root->root_key.objectid,
					BTRFS_ROOT_TRANS_TAG);
J
Josef Bacik 已提交
773
			spin_unlock(&fs_info->fs_roots_radix_lock);
Y
Yan Zheng 已提交
774

775
			btrfs_free_log(trans, root);
776
			btrfs_update_reloc_root(trans, root);
777
			btrfs_orphan_commit_root(trans, root);
778

779 780
			btrfs_save_ino_cache(root, trans);

781
			if (root->commit_root != root->node) {
782
				mutex_lock(&root->fs_commit_mutex);
J
Josef Bacik 已提交
783
				switch_commit_root(root);
784 785 786
				btrfs_unpin_free_ino(root);
				mutex_unlock(&root->fs_commit_mutex);

787 788 789
				btrfs_set_root_node(&root->root_item,
						    root->node);
			}
790 791

			err = btrfs_update_root(trans, fs_info->tree_root,
792 793
						&root->root_key,
						&root->root_item);
J
Josef Bacik 已提交
794
			spin_lock(&fs_info->fs_roots_radix_lock);
795 796
			if (err)
				break;
797 798
		}
	}
J
Josef Bacik 已提交
799
	spin_unlock(&fs_info->fs_roots_radix_lock);
800
	return err;
801 802
}

C
Chris Mason 已提交
803 804 805 806
/*
 * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
 * otherwise every leaf in the btree is read and defragged.
 */
807 808 809 810
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
{
	struct btrfs_fs_info *info = root->fs_info;
	struct btrfs_trans_handle *trans;
811
	int ret;
812
	unsigned long nr;
813

814
	if (xchg(&root->defrag_running, 1))
815
		return 0;
816

817
	while (1) {
818 819 820 821
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

822
		ret = btrfs_defrag_leaves(trans, root, cacheonly);
823

824
		nr = trans->blocks_used;
825
		btrfs_end_transaction(trans, root);
826
		btrfs_btree_balance_dirty(info->tree_root, nr);
827 828
		cond_resched();

829
		if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
830 831 832
			break;
	}
	root->defrag_running = 0;
833
	return ret;
834 835
}

C
Chris Mason 已提交
836 837 838 839
/*
 * new snapshots need to be created at a very specific time in the
 * transaction commit.  This does the actual creation
 */
840
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
841 842 843 844
				   struct btrfs_fs_info *fs_info,
				   struct btrfs_pending_snapshot *pending)
{
	struct btrfs_key key;
845
	struct btrfs_root_item *new_root_item;
846 847
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root = pending->root;
848
	struct btrfs_root *parent_root;
L
Liu Bo 已提交
849
	struct btrfs_block_rsv *rsv;
850
	struct inode *parent_inode;
851
	struct dentry *parent;
852
	struct dentry *dentry;
853
	struct extent_buffer *tmp;
854
	struct extent_buffer *old;
855
	int ret;
856
	u64 to_reserve = 0;
857
	u64 index = 0;
858
	u64 objectid;
L
Li Zefan 已提交
859
	u64 root_flags;
860

L
Liu Bo 已提交
861 862
	rsv = trans->block_rsv;

863 864
	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
	if (!new_root_item) {
865
		pending->error = -ENOMEM;
866 867
		goto fail;
	}
868

869
	ret = btrfs_find_free_objectid(tree_root, &objectid);
870 871
	if (ret) {
		pending->error = ret;
872
		goto fail;
873
	}
874

875
	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
876 877

	if (to_reserve > 0) {
878
		ret = btrfs_block_rsv_add(root, &pending->block_rsv,
879
					  to_reserve);
880 881 882 883 884 885
		if (ret) {
			pending->error = ret;
			goto fail;
		}
	}

886
	key.objectid = objectid;
887 888
	key.offset = (u64)-1;
	key.type = BTRFS_ROOT_ITEM_KEY;
889

890
	trans->block_rsv = &pending->block_rsv;
891

892
	dentry = pending->dentry;
893 894
	parent = dget_parent(dentry);
	parent_inode = parent->d_inode;
895
	parent_root = BTRFS_I(parent_inode)->root;
C
Chris Mason 已提交
896
	record_root_in_trans(trans, parent_root);
897

898 899 900
	/*
	 * insert the directory item
	 */
901
	ret = btrfs_set_inode_index(parent_inode, &index);
902
	BUG_ON(ret);
903
	ret = btrfs_insert_dir_item(trans, parent_root,
904
				dentry->d_name.name, dentry->d_name.len,
905
				parent_inode, &key,
906
				BTRFS_FT_DIR, index);
907
	BUG_ON(ret);
908

909 910
	btrfs_i_size_write(parent_inode, parent_inode->i_size +
					 dentry->d_name.len * 2);
911 912 913
	ret = btrfs_update_inode(trans, parent_root, parent_inode);
	BUG_ON(ret);

914 915 916 917 918 919 920 921 922
	/*
	 * pull in the delayed directory update
	 * and the delayed inode item
	 * otherwise we corrupt the FS during
	 * snapshot
	 */
	ret = btrfs_run_delayed_items(trans, root);
	BUG_ON(ret);

C
Chris Mason 已提交
923
	record_root_in_trans(trans, root);
924 925
	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
926
	btrfs_check_and_init_root_item(new_root_item);
927

L
Li Zefan 已提交
928 929 930 931 932 933 934
	root_flags = btrfs_root_flags(new_root_item);
	if (pending->readonly)
		root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
	else
		root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
	btrfs_set_root_flags(new_root_item, root_flags);

935 936 937 938 939 940 941 942 943
	old = btrfs_lock_root_node(root);
	btrfs_cow_block(trans, root, old, NULL, 0, &old);
	btrfs_set_lock_blocking(old);

	btrfs_copy_root(trans, root, old, &tmp, objectid);
	btrfs_tree_unlock(old);
	free_extent_buffer(old);

	btrfs_set_root_node(new_root_item, tmp);
944 945 946
	/* record when the snapshot was created in key.offset */
	key.offset = trans->transid;
	ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
947 948
	btrfs_tree_unlock(tmp);
	free_extent_buffer(tmp);
949
	BUG_ON(ret);
950

951 952 953 954
	/*
	 * insert root back/forward references
	 */
	ret = btrfs_add_root_ref(trans, tree_root, objectid,
955
				 parent_root->root_key.objectid,
L
Li Zefan 已提交
956
				 btrfs_ino(parent_inode), index,
957
				 dentry->d_name.name, dentry->d_name.len);
958
	BUG_ON(ret);
959
	dput(parent);
960

961 962 963
	key.offset = (u64)-1;
	pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
	BUG_ON(IS_ERR(pending->snap));
964

965
	btrfs_reloc_post_snapshot(trans, pending);
966
fail:
967
	kfree(new_root_item);
L
Liu Bo 已提交
968
	trans->block_rsv = rsv;
969 970
	btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
	return 0;
971 972
}

C
Chris Mason 已提交
973 974 975
/*
 * create all the snapshots we've scheduled for creation
 */
976 977
static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
					     struct btrfs_fs_info *fs_info)
978 979 980 981 982
{
	struct btrfs_pending_snapshot *pending;
	struct list_head *head = &trans->transaction->pending_snapshots;
	int ret;

Q
Qinghuang Feng 已提交
983
	list_for_each_entry(pending, head, list) {
984 985 986 987 988 989
		ret = create_pending_snapshot(trans, fs_info, pending);
		BUG_ON(ret);
	}
	return 0;
}

990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005
static void update_super_roots(struct btrfs_root *root)
{
	struct btrfs_root_item *root_item;
	struct btrfs_super_block *super;

	super = &root->fs_info->super_copy;

	root_item = &root->fs_info->chunk_root->root_item;
	super->chunk_root = root_item->bytenr;
	super->chunk_root_generation = root_item->generation;
	super->chunk_root_level = root_item->level;

	root_item = &root->fs_info->tree_root->root_item;
	super->root = root_item->bytenr;
	super->generation = root_item->generation;
	super->root_level = root_item->level;
1006 1007
	if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE))
		super->cache_generation = root_item->generation;
1008 1009
}

1010 1011 1012
int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
{
	int ret = 0;
J
Josef Bacik 已提交
1013
	spin_lock(&info->trans_lock);
1014 1015
	if (info->running_transaction)
		ret = info->running_transaction->in_commit;
J
Josef Bacik 已提交
1016
	spin_unlock(&info->trans_lock);
1017 1018 1019
	return ret;
}

1020 1021 1022
int btrfs_transaction_blocked(struct btrfs_fs_info *info)
{
	int ret = 0;
J
Josef Bacik 已提交
1023
	spin_lock(&info->trans_lock);
1024 1025
	if (info->running_transaction)
		ret = info->running_transaction->blocked;
J
Josef Bacik 已提交
1026
	spin_unlock(&info->trans_lock);
1027 1028 1029
	return ret;
}

S
Sage Weil 已提交
1030 1031 1032 1033 1034 1035 1036
/*
 * wait for the current transaction commit to start and block subsequent
 * transaction joins
 */
static void wait_current_trans_commit_start(struct btrfs_root *root,
					    struct btrfs_transaction *trans)
{
L
Li Zefan 已提交
1037
	wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit);
S
Sage Weil 已提交
1038 1039 1040 1041 1042 1043 1044 1045 1046
}

/*
 * wait for the current transaction to start and then become unblocked.
 * caller holds ref.
 */
static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
					 struct btrfs_transaction *trans)
{
L
Li Zefan 已提交
1047 1048
	wait_event(root->fs_info->transaction_wait,
		   trans->commit_done || (trans->in_commit && !trans->blocked));
S
Sage Weil 已提交
1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077
}

/*
 * commit transactions asynchronously. once btrfs_commit_transaction_async
 * returns, any subsequent transaction will not be allowed to join.
 */
/*
 * Context handed to the work item that performs an asynchronous commit.
 * Allocated in btrfs_commit_transaction_async() and freed by
 * do_async_commit() once the commit call returns.
 */
struct btrfs_async_commit {
	/* handle obtained via btrfs_join_transaction(); committed by the worker */
	struct btrfs_trans_handle *newtrans;
	/* root passed through to btrfs_commit_transaction() */
	struct btrfs_root *root;
	/* delayed work item whose handler is do_async_commit() */
	struct delayed_work work;
};

/*
 * Work handler for an asynchronous commit: recover the btrfs_async_commit
 * that embeds this work item, commit its transaction handle, then free the
 * context.
 */
static void do_async_commit(struct work_struct *work)
{
	struct delayed_work *dwork;
	struct btrfs_async_commit *async;

	/* work_struct -> enclosing delayed_work -> enclosing async context */
	dwork = container_of(work, struct delayed_work, work);
	async = container_of(dwork, struct btrfs_async_commit, work);

	btrfs_commit_transaction(async->newtrans, async->root);
	kfree(async);
}

int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   int wait_for_unblock)
{
	struct btrfs_async_commit *ac;
	struct btrfs_transaction *cur_trans;

	ac = kmalloc(sizeof(*ac), GFP_NOFS);
T
Tsutomu Itoh 已提交
1078 1079
	if (!ac)
		return -ENOMEM;
S
Sage Weil 已提交
1080 1081 1082

	INIT_DELAYED_WORK(&ac->work, do_async_commit);
	ac->root = root;
1083
	ac->newtrans = btrfs_join_transaction(root);
1084 1085 1086 1087 1088
	if (IS_ERR(ac->newtrans)) {
		int err = PTR_ERR(ac->newtrans);
		kfree(ac);
		return err;
	}
S
Sage Weil 已提交
1089 1090 1091

	/* take transaction reference */
	cur_trans = trans->transaction;
1092
	atomic_inc(&cur_trans->use_count);
S
Sage Weil 已提交
1093 1094 1095 1096 1097 1098 1099 1100 1101 1102

	btrfs_end_transaction(trans, root);
	schedule_delayed_work(&ac->work, 0);

	/* wait for transaction to start and unblock */
	if (wait_for_unblock)
		wait_current_trans_commit_start_and_unblock(root, cur_trans);
	else
		wait_current_trans_commit_start(root, cur_trans);

1103 1104 1105 1106
	if (current->journal_info == trans)
		current->journal_info = NULL;

	put_transaction(cur_trans);
S
Sage Weil 已提交
1107 1108 1109 1110 1111 1112 1113 1114 1115 1116
	return 0;
}

/*
 * btrfs_transaction state sequence:
 *    in_commit = 0, blocked = 0  (initial)
 *    in_commit = 1, blocked = 1
 *    blocked = 0
 *    commit_done = 1
 */
C
Chris Mason 已提交
1117 1118 1119
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root)
{
1120
	unsigned long joined = 0;
C
Chris Mason 已提交
1121
	struct btrfs_transaction *cur_trans;
C
Chris Mason 已提交
1122
	struct btrfs_transaction *prev_trans = NULL;
C
Chris Mason 已提交
1123
	DEFINE_WAIT(wait);
1124
	int ret;
1125 1126
	int should_grow = 0;
	unsigned long now = get_seconds();
1127
	int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
C
Chris Mason 已提交
1128

1129 1130
	btrfs_run_ordered_operations(root, 0);

1131 1132
	trans->block_rsv = NULL;

1133 1134 1135 1136 1137 1138
	/* make a pass through all the delayed refs we have so far
	 * any runnings procs may add more while we are here
	 */
	ret = btrfs_run_delayed_refs(trans, root, 0);
	BUG_ON(ret);

1139 1140
	btrfs_trans_release_metadata(trans, root);

1141
	cur_trans = trans->transaction;
1142 1143 1144 1145
	/*
	 * set the flushing flag so procs in this transaction have to
	 * start sending their work down.
	 */
1146
	cur_trans->delayed_refs.flushing = 1;
1147

1148
	ret = btrfs_run_delayed_refs(trans, root, 0);
1149 1150
	BUG_ON(ret);

J
Josef Bacik 已提交
1151
	spin_lock(&cur_trans->commit_lock);
1152
	if (cur_trans->in_commit) {
J
Josef Bacik 已提交
1153
		spin_unlock(&cur_trans->commit_lock);
1154
		atomic_inc(&cur_trans->use_count);
C
Chris Mason 已提交
1155
		btrfs_end_transaction(trans, root);
C
Chris Mason 已提交
1156

1157
		wait_for_commit(root, cur_trans);
1158

C
Chris Mason 已提交
1159
		put_transaction(cur_trans);
1160

C
Chris Mason 已提交
1161 1162
		return 0;
	}
1163

C
Chris Mason 已提交
1164
	trans->transaction->in_commit = 1;
1165
	trans->transaction->blocked = 1;
J
Josef Bacik 已提交
1166
	spin_unlock(&cur_trans->commit_lock);
S
Sage Weil 已提交
1167 1168
	wake_up(&root->fs_info->transaction_blocked_wait);

J
Josef Bacik 已提交
1169
	spin_lock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1170 1171 1172 1173
	if (cur_trans->list.prev != &root->fs_info->trans_list) {
		prev_trans = list_entry(cur_trans->list.prev,
					struct btrfs_transaction, list);
		if (!prev_trans->commit_done) {
1174
			atomic_inc(&prev_trans->use_count);
J
Josef Bacik 已提交
1175
			spin_unlock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1176 1177 1178

			wait_for_commit(root, prev_trans);

1179
			put_transaction(prev_trans);
J
Josef Bacik 已提交
1180 1181
		} else {
			spin_unlock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1182
		}
J
Josef Bacik 已提交
1183 1184
	} else {
		spin_unlock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1185
	}
1186

1187 1188 1189
	if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
		should_grow = 1;

1190
	do {
1191
		int snap_pending = 0;
J
Josef Bacik 已提交
1192

1193
		joined = cur_trans->num_joined;
1194 1195 1196
		if (!list_empty(&trans->transaction->pending_snapshots))
			snap_pending = 1;

C
Chris Mason 已提交
1197
		WARN_ON(cur_trans != trans->transaction);
1198

1199
		if (flush_on_commit || snap_pending) {
Y
Yan, Zheng 已提交
1200 1201
			btrfs_start_delalloc_inodes(root, 1);
			ret = btrfs_wait_ordered_extents(root, 0, 1);
1202
			BUG_ON(ret);
1203 1204
		}

1205 1206 1207
		ret = btrfs_run_delayed_items(trans, root);
		BUG_ON(ret);

1208 1209 1210 1211 1212 1213 1214 1215 1216
		/*
		 * rename don't use btrfs_join_transaction, so, once we
		 * set the transaction to blocked above, we aren't going
		 * to get any new ordered operations.  We can safely run
		 * it here and no for sure that nothing new will be added
		 * to the list
		 */
		btrfs_run_ordered_operations(root, 1);

1217 1218 1219
		prepare_to_wait(&cur_trans->writer_wait, &wait,
				TASK_UNINTERRUPTIBLE);

1220
		if (atomic_read(&cur_trans->num_writers) > 1)
1221 1222 1223
			schedule_timeout(MAX_SCHEDULE_TIMEOUT);
		else if (should_grow)
			schedule_timeout(1);
1224 1225

		finish_wait(&cur_trans->writer_wait, &wait);
1226
	} while (atomic_read(&cur_trans->num_writers) > 1 ||
1227
		 (should_grow && cur_trans->num_joined != joined));
1228

1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239
	/*
	 * Ok now we need to make sure to block out any other joins while we
	 * commit the transaction.  We could have started a join before setting
	 * no_join so make sure to wait for num_writers to == 1 again.
	 */
	spin_lock(&root->fs_info->trans_lock);
	root->fs_info->trans_no_join = 1;
	spin_unlock(&root->fs_info->trans_lock);
	wait_event(cur_trans->writer_wait,
		   atomic_read(&cur_trans->num_writers) == 1);

C
Chris Mason 已提交
1240 1241 1242 1243 1244 1245 1246
	/*
	 * the reloc mutex makes sure that we stop
	 * the balancing code from coming in and moving
	 * extents around in the middle of the commit
	 */
	mutex_lock(&root->fs_info->reloc_mutex);

1247
	ret = btrfs_run_delayed_items(trans, root);
1248 1249
	BUG_ON(ret);

1250
	ret = create_pending_snapshots(trans, root->fs_info);
1251 1252
	BUG_ON(ret);

1253 1254 1255
	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

1256 1257 1258 1259 1260 1261
	/*
	 * make sure none of the code above managed to slip in a
	 * delayed item
	 */
	btrfs_assert_delayed_root_empty(root);

C
Chris Mason 已提交
1262
	WARN_ON(cur_trans != trans->transaction);
C
Chris Mason 已提交
1263

A
Arne Jansen 已提交
1264
	btrfs_scrub_pause(root);
1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279
	/* btrfs_commit_tree_roots is responsible for getting the
	 * various roots consistent with each other.  Every pointer
	 * in the tree of tree roots has to point to the most up to date
	 * root for every subvolume and other tree.  So, we have to keep
	 * the tree logging code from jumping in and changing any
	 * of the trees.
	 *
	 * At this point in the commit, there can't be any tree-log
	 * writers, but a little lower down we drop the trans mutex
	 * and let new people in.  By holding the tree_log_mutex
	 * from now until after the super is written, we avoid races
	 * with the tree-log code.
	 */
	mutex_lock(&root->fs_info->tree_log_mutex);

1280
	ret = commit_fs_roots(trans, root);
1281 1282
	BUG_ON(ret);

1283
	/* commit_fs_roots gets rid of all the tree log roots, it is now
1284 1285 1286 1287
	 * safe to free the root of tree log roots
	 */
	btrfs_free_log_root_tree(trans, root->fs_info);

1288
	ret = commit_cowonly_roots(trans, root);
C
Chris Mason 已提交
1289
	BUG_ON(ret);
1290

1291 1292
	btrfs_prepare_extent_commit(trans, root);

C
Chris Mason 已提交
1293
	cur_trans = root->fs_info->running_transaction;
1294 1295 1296

	btrfs_set_root_node(&root->fs_info->tree_root->root_item,
			    root->fs_info->tree_root->node);
J
Josef Bacik 已提交
1297
	switch_commit_root(root->fs_info->tree_root);
1298 1299 1300

	btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
			    root->fs_info->chunk_root->node);
J
Josef Bacik 已提交
1301
	switch_commit_root(root->fs_info->chunk_root);
1302 1303

	update_super_roots(root);
1304 1305 1306 1307 1308 1309

	if (!root->fs_info->log_root_recovering) {
		btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
		btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
	}

1310 1311
	memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
	       sizeof(root->fs_info->super_copy));
C
Chris Mason 已提交
1312

1313
	trans->transaction->blocked = 0;
J
Josef Bacik 已提交
1314 1315 1316 1317
	spin_lock(&root->fs_info->trans_lock);
	root->fs_info->running_transaction = NULL;
	root->fs_info->trans_no_join = 0;
	spin_unlock(&root->fs_info->trans_lock);
C
Chris Mason 已提交
1318
	mutex_unlock(&root->fs_info->reloc_mutex);
1319

1320
	wake_up(&root->fs_info->transaction_wait);
1321

C
Chris Mason 已提交
1322 1323
	ret = btrfs_write_and_wait_transaction(trans, root);
	BUG_ON(ret);
Y
Yan Zheng 已提交
1324
	write_ctree_super(trans, root, 0);
1325

1326 1327 1328 1329 1330 1331
	/*
	 * the super is written, we can safely allow the tree-loggers
	 * to go about their business
	 */
	mutex_unlock(&root->fs_info->tree_log_mutex);

1332
	btrfs_finish_extent_commit(trans, root);
1333

C
Chris Mason 已提交
1334
	cur_trans->commit_done = 1;
1335

1336
	root->fs_info->last_trans_committed = cur_trans->transid;
J
Josef Bacik 已提交
1337

C
Chris Mason 已提交
1338
	wake_up(&cur_trans->commit_wait);
1339

J
Josef Bacik 已提交
1340
	spin_lock(&root->fs_info->trans_lock);
1341
	list_del_init(&cur_trans->list);
J
Josef Bacik 已提交
1342 1343
	spin_unlock(&root->fs_info->trans_lock);

C
Chris Mason 已提交
1344
	put_transaction(cur_trans);
C
Chris Mason 已提交
1345
	put_transaction(cur_trans);
1346

1347 1348
	trace_btrfs_transaction_commit(root);

A
Arne Jansen 已提交
1349 1350
	btrfs_scrub_continue(root);

J
Josef Bacik 已提交
1351 1352 1353
	if (current->journal_info == trans)
		current->journal_info = NULL;

C
Chris Mason 已提交
1354
	kmem_cache_free(btrfs_trans_handle_cachep, trans);
Y
Yan, Zheng 已提交
1355 1356 1357 1358

	if (current != root->fs_info->transaction_kthread)
		btrfs_run_delayed_iputs(root);

C
Chris Mason 已提交
1359 1360 1361
	return ret;
}

C
Chris Mason 已提交
1362 1363 1364
/*
 * interface function to delete all the snapshots we have scheduled for deletion
 */
1365 1366
int btrfs_clean_old_snapshots(struct btrfs_root *root)
{
1367 1368 1369
	LIST_HEAD(list);
	struct btrfs_fs_info *fs_info = root->fs_info;

J
Josef Bacik 已提交
1370
	spin_lock(&fs_info->trans_lock);
1371
	list_splice_init(&fs_info->dead_roots, &list);
J
Josef Bacik 已提交
1372
	spin_unlock(&fs_info->trans_lock);
1373

1374 1375
	while (!list_empty(&list)) {
		root = list_entry(list.next, struct btrfs_root, root_list);
1376 1377
		list_del(&root->root_list);

1378 1379
		btrfs_kill_all_delayed_nodes(root);

1380 1381
		if (btrfs_header_backref_rev(root->node) <
		    BTRFS_MIXED_BACKREF_REV)
1382
			btrfs_drop_snapshot(root, NULL, 0);
1383
		else
1384
			btrfs_drop_snapshot(root, NULL, 1);
1385 1386 1387
	}
	return 0;
}