/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "locking.h"
#include "tree-log.h"

#define BTRFS_ROOT_TRANS_TAG 0

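/*
 * drop one reference on a transaction.  The final put frees the struct;
 * by that point the transaction must already be off fs_info->trans_list.
 */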
static noinline void put_transaction(struct btrfs_transaction *transaction)
{
	WARN_ON(atomic_read(&transaction->use_count) == 0);
	if (atomic_dec_and_test(&transaction->use_count)) {
		BUG_ON(!list_empty(&transaction->list));
		memset(transaction, 0, sizeof(*transaction));
		kmem_cache_free(btrfs_transaction_cachep, transaction);
	}
}

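/*
 * make the current root node the new commit root, dropping the reference
 * held on the old commit root
 */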
static noinline void switch_commit_root(struct btrfs_root *root)
{
	free_extent_buffer(root->commit_root);
	root->commit_root = btrfs_root_node(root);
}

/*
 * either allocate a new transaction or hop into the existing one
 */
static noinline int join_transaction(struct btrfs_root *root, int nofail)
{
	struct btrfs_transaction *cur_trans;

	spin_lock(&root->fs_info->trans_lock);
	if (root->fs_info->trans_no_join) {
		if (!nofail) {
			spin_unlock(&root->fs_info->trans_lock);
			return -EBUSY;
		}
	}

	cur_trans = root->fs_info->running_transaction;
	if (cur_trans) {
		atomic_inc(&cur_trans->use_count);
		atomic_inc(&cur_trans->num_writers);
		cur_trans->num_joined++;
		spin_unlock(&root->fs_info->trans_lock);
		return 0;
	}
	spin_unlock(&root->fs_info->trans_lock);

	cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
	if (!cur_trans)
		return -ENOMEM;
	spin_lock(&root->fs_info->trans_lock);
	if (root->fs_info->running_transaction) {
		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
		cur_trans = root->fs_info->running_transaction;
		atomic_inc(&cur_trans->use_count);
		atomic_inc(&cur_trans->num_writers);
		cur_trans->num_joined++;
		spin_unlock(&root->fs_info->trans_lock);
		return 0;
	}
	atomic_set(&cur_trans->num_writers, 1);
	cur_trans->num_joined = 0;
	init_waitqueue_head(&cur_trans->writer_wait);
	init_waitqueue_head(&cur_trans->commit_wait);
	cur_trans->in_commit = 0;
	cur_trans->blocked = 0;
	/*
	 * One for this trans handle, one so it will live on until we
	 * commit the transaction.
	 */
	atomic_set(&cur_trans->use_count, 2);
	cur_trans->commit_done = 0;
	cur_trans->start_time = get_seconds();

	cur_trans->delayed_refs.root = RB_ROOT;
	cur_trans->delayed_refs.num_entries = 0;
	cur_trans->delayed_refs.num_heads_ready = 0;
	cur_trans->delayed_refs.num_heads = 0;
	cur_trans->delayed_refs.flushing = 0;
	cur_trans->delayed_refs.run_delayed_start = 0;
	spin_lock_init(&cur_trans->commit_lock);
	spin_lock_init(&cur_trans->delayed_refs.lock);

	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
	list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
	extent_io_tree_init(&cur_trans->dirty_pages,
			     root->fs_info->btree_inode->i_mapping,
			     GFP_NOFS);
	root->fs_info->generation++;
	cur_trans->transid = root->fs_info->generation;
	root->fs_info->running_transaction = cur_trans;
	spin_unlock(&root->fs_info->trans_lock);

	return 0;
}

/*
 * this does all the record keeping required to make sure that a reference
 * counted root is properly recorded in a given transaction.  This is required
 * to make sure the old root from before we joined the transaction is deleted
 * when the transaction commits
 */
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	if (root->ref_cows && root->last_trans < trans->transid) {
		WARN_ON(root == root->fs_info->extent_root);
		WARN_ON(root->commit_root != root->node);

		spin_lock(&root->fs_info->fs_roots_radix_lock);
		if (root->last_trans == trans->transid) {
			spin_unlock(&root->fs_info->fs_roots_radix_lock);
			return 0;
		}
		root->last_trans = trans->transid;
		radix_tree_tag_set(&root->fs_info->fs_roots_radix,
			   (unsigned long)root->root_key.objectid,
			   BTRFS_ROOT_TRANS_TAG);
		spin_unlock(&root->fs_info->fs_roots_radix_lock);
		btrfs_init_reloc_root(trans, root);
	}
	return 0;
}

/* wait for commit against the current transaction to become unblocked
 * when this is done, it is safe to start a new transaction, but the current
 * transaction might not be fully on disk.
 */
static void wait_current_trans(struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans;

	spin_lock(&root->fs_info->trans_lock);
	cur_trans = root->fs_info->running_transaction;
	if (cur_trans && cur_trans->blocked) {
		DEFINE_WAIT(wait);
		atomic_inc(&cur_trans->use_count);
		spin_unlock(&root->fs_info->trans_lock);
		while (1) {
			prepare_to_wait(&root->fs_info->transaction_wait, &wait,
					TASK_UNINTERRUPTIBLE);
			if (!cur_trans->blocked)
				break;
			schedule();
		}
		finish_wait(&root->fs_info->transaction_wait, &wait);
		put_transaction(cur_trans);
	} else {
		spin_unlock(&root->fs_info->trans_lock);
	}
}

enum btrfs_trans_type {
	TRANS_START,
	TRANS_JOIN,
	TRANS_USERSPACE,
	TRANS_JOIN_NOLOCK,
};

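/*
 * returns 1 when the caller should wait for a blocked transaction before
 * joining; log replay never waits, and plain joins never wait here
 */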
static int may_wait_transaction(struct btrfs_root *root, int type)
{
	if (root->fs_info->log_root_recovering)
		return 0;

	if (type == TRANS_USERSPACE)
		return 1;

	if (type == TRANS_START &&
	    !atomic_read(&root->fs_info->open_ioctl_trans))
		return 1;

	return 0;
}

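/*
 * common helper behind the btrfs_start/join_transaction wrappers: allocate a
 * handle, join (or create) the running transaction and, when num_items > 0,
 * reserve metadata space for that many items.  A nested start from the same
 * task just bumps the handle already stashed in current->journal_info.
 */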
static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
						    u64 num_items, int type)
{
	struct btrfs_trans_handle *h;
	struct btrfs_transaction *cur_trans;
	int retries = 0;
	int ret;

	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
		return ERR_PTR(-EROFS);

	if (current->journal_info) {
		WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
		h = current->journal_info;
		h->use_count++;
		h->orig_rsv = h->block_rsv;
		h->block_rsv = NULL;
		goto got_it;
	}
again:
	h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
	if (!h)
		return ERR_PTR(-ENOMEM);

	if (may_wait_transaction(root, type))
		wait_current_trans(root);

	do {
		ret = join_transaction(root, type == TRANS_JOIN_NOLOCK);
		if (ret == -EBUSY)
			wait_current_trans(root);
	} while (ret == -EBUSY);

	if (ret < 0) {
		kmem_cache_free(btrfs_trans_handle_cachep, h);
		return ERR_PTR(ret);
	}

	cur_trans = root->fs_info->running_transaction;

	h->transid = cur_trans->transid;
	h->transaction = cur_trans;
	h->blocks_used = 0;
	h->block_group = 0;
	h->bytes_reserved = 0;
	h->delayed_ref_updates = 0;
	h->use_count = 1;
	h->block_rsv = NULL;
	h->orig_rsv = NULL;

	smp_mb();
	if (cur_trans->blocked && may_wait_transaction(root, type)) {
		btrfs_commit_transaction(h, root);
		goto again;
	}

	if (num_items > 0) {
		ret = btrfs_trans_reserve_metadata(h, root, num_items);
		if (ret == -EAGAIN && !retries) {
			retries++;
			btrfs_commit_transaction(h, root);
			goto again;
		} else if (ret == -EAGAIN) {
			/*
			 * We have already retried and got EAGAIN, so really we
			 * don't have space, so set ret to -ENOSPC.
			 */
			ret = -ENOSPC;
		}

		if (ret < 0) {
			btrfs_end_transaction(h, root);
			return ERR_PTR(ret);
		}
	}

got_it:
	btrfs_record_root_in_trans(h, root);

	if (!current->journal_info && type != TRANS_USERSPACE)
		current->journal_info = h;
	return h;
}

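/*
 * Typical caller pattern (an illustrative sketch, not taken verbatim from
 * any caller):
 *
 *	trans = btrfs_start_transaction(root, 1);
 *	if (IS_ERR(trans))
 *		return PTR_ERR(trans);
 *	... modify up to one metadata item ...
 *	btrfs_end_transaction(trans, root);
 *
 * btrfs_join_transaction() hops into the already running transaction
 * without waiting for it to unblock and without reserving space.
 */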
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
						   int num_items)
{
	return start_transaction(root, num_items, TRANS_START);
}
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
{
	return start_transaction(root, 0, TRANS_JOIN);
}

struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
{
	return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
}

struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
{
	return start_transaction(root, 0, TRANS_USERSPACE);
}

/* wait for a transaction commit to be fully complete */
static noinline int wait_for_commit(struct btrfs_root *root,
				    struct btrfs_transaction *commit)
{
	DEFINE_WAIT(wait);
	while (!commit->commit_done) {
		prepare_to_wait(&commit->commit_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		if (commit->commit_done)
			break;
		schedule();
	}
	finish_wait(&commit->commit_wait, &wait);
	return 0;
}

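/*
 * wait for a commit to complete.  With a non-zero transid, wait for that
 * specific transaction (returning -EINVAL if it was never started);
 * otherwise wait for the newest transaction that is currently committing.
 */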
int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
{
	struct btrfs_transaction *cur_trans = NULL, *t;
	int ret;

	ret = 0;
	if (transid) {
		if (transid <= root->fs_info->last_trans_committed)
			goto out;

		/* find specified transaction */
		spin_lock(&root->fs_info->trans_lock);
		list_for_each_entry(t, &root->fs_info->trans_list, list) {
			if (t->transid == transid) {
				cur_trans = t;
				atomic_inc(&cur_trans->use_count);
				break;
			}
			if (t->transid > transid)
				break;
		}
		spin_unlock(&root->fs_info->trans_lock);
		ret = -EINVAL;
		if (!cur_trans)
			goto out;  /* bad transid */
	} else {
		/* find newest transaction that is committing | committed */
		spin_lock(&root->fs_info->trans_lock);
		list_for_each_entry_reverse(t, &root->fs_info->trans_list,
					    list) {
			if (t->in_commit) {
				if (t->commit_done)
					goto out;
				cur_trans = t;
				atomic_inc(&cur_trans->use_count);
				break;
			}
		}
		spin_unlock(&root->fs_info->trans_lock);
		if (!cur_trans)
			goto out;  /* nothing committing|committed */
	}

	wait_for_commit(root, cur_trans);

	put_transaction(cur_trans);
	ret = 0;
out:
	return ret;
}

#if 0
/*
 * rate limit against the drop_snapshot code.  This helps to slow down new
 * operations if the drop_snapshot code isn't able to keep up.
 */
static void throttle_on_drops(struct btrfs_root *root)
{
	struct btrfs_fs_info *info = root->fs_info;
	int harder_count = 0;

harder:
	if (atomic_read(&info->throttles)) {
		DEFINE_WAIT(wait);
		int thr;
		thr = atomic_read(&info->throttle_gen);

		do {
			prepare_to_wait(&info->transaction_throttle,
					&wait, TASK_UNINTERRUPTIBLE);
			if (!atomic_read(&info->throttles)) {
				finish_wait(&info->transaction_throttle, &wait);
				break;
			}
			schedule();
			finish_wait(&info->transaction_throttle, &wait);
		} while (thr == atomic_read(&info->throttle_gen));
		harder_count++;

		if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
		    harder_count < 2)
			goto harder;

		if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
		    harder_count < 10)
			goto harder;

		if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
		    harder_count < 20)
			goto harder;
	}
}
#endif

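/*
 * throttle new work: unless a userspace ioctl transaction is open, wait for
 * a blocked running transaction to become unblocked before returning
 */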
void btrfs_throttle(struct btrfs_root *root)
{
	if (!atomic_read(&root->fs_info->open_ioctl_trans))
		wait_current_trans(root);
}

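/*
 * helpers that tell callers when the running transaction should be ended
 * and committed: either the global block reserve is running low, or the
 * transaction is already blocked / flushing its delayed refs
 */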
static int should_end_transaction(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root)
{
	int ret;
	ret = btrfs_block_rsv_check(trans, root,
				    &root->fs_info->global_block_rsv, 0, 5);
	return ret ? 1 : 0;
}

int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans = trans->transaction;
	int updates;

	smp_mb();
	if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
		return 1;

	updates = trans->delayed_ref_updates;
	trans->delayed_ref_updates = 0;
	if (updates)
		btrfs_run_delayed_refs(trans, root, updates);

	return should_end_transaction(trans, root);
}

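/*
 * common handle teardown: run a batch of pending delayed refs, release the
 * metadata reservation, possibly mark the transaction blocked (or commit it
 * when throttling), then drop this writer's count and free the handle
 */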
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, int throttle, int lock)
{
	struct btrfs_transaction *cur_trans = trans->transaction;
	struct btrfs_fs_info *info = root->fs_info;
	int count = 0;

	if (--trans->use_count) {
		trans->block_rsv = trans->orig_rsv;
		return 0;
	}

	while (count < 4) {
		unsigned long cur = trans->delayed_ref_updates;
		trans->delayed_ref_updates = 0;
		if (cur &&
		    trans->transaction->delayed_refs.num_heads_ready > 64) {
			trans->delayed_ref_updates = 0;

			/*
			 * do a full flush if the transaction is trying
			 * to close
			 */
			if (trans->transaction->delayed_refs.flushing)
				cur = 0;
			btrfs_run_delayed_refs(trans, root, cur);
		} else {
			break;
		}
		count++;
	}

	btrfs_trans_release_metadata(trans, root);

	if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
	    should_end_transaction(trans, root)) {
		trans->transaction->blocked = 1;
		smp_wmb();
	}

	if (lock && cur_trans->blocked && !cur_trans->in_commit) {
		if (throttle)
			return btrfs_commit_transaction(trans, root);
		else
			wake_up_process(info->transaction_kthread);
	}

	WARN_ON(cur_trans != info->running_transaction);
	WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
	atomic_dec(&cur_trans->num_writers);

	smp_mb();
	if (waitqueue_active(&cur_trans->writer_wait))
		wake_up(&cur_trans->writer_wait);
	put_transaction(cur_trans);

	if (current->journal_info == trans)
		current->journal_info = NULL;
	memset(trans, 0, sizeof(*trans));
	kmem_cache_free(btrfs_trans_handle_cachep, trans);

	if (throttle)
		btrfs_run_delayed_iputs(root);

	return 0;
}

int btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 0, 1);
}

int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 1, 1);
}

int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 0, 0);
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are sent to disk but does not wait on them
 */
int btrfs_write_marked_extents(struct btrfs_root *root,
			       struct extent_io_tree *dirty_pages, int mark)
{
	int ret;
	int err = 0;
	int werr = 0;
	struct page *page;
	struct inode *btree_inode = root->fs_info->btree_inode;
	u64 start = 0;
	u64 end;
	unsigned long index;

	while (1) {
		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
					    mark);
		if (ret)
			break;
		while (start <= end) {
			cond_resched();

			index = start >> PAGE_CACHE_SHIFT;
			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
			page = find_get_page(btree_inode->i_mapping, index);
			if (!page)
				continue;

			btree_lock_page_hook(page);
			if (!page->mapping) {
				unlock_page(page);
				page_cache_release(page);
				continue;
			}

			if (PageWriteback(page)) {
				if (PageDirty(page))
					wait_on_page_writeback(page);
				else {
					unlock_page(page);
					page_cache_release(page);
					continue;
				}
			}
			err = write_one_page(page, 0);
			if (err)
				werr = err;
			page_cache_release(page);
		}
	}
	if (err)
		werr = err;
	return werr;
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit.  We wait
 * on all the pages and clear them from the dirty pages state tree
 */
int btrfs_wait_marked_extents(struct btrfs_root *root,
			      struct extent_io_tree *dirty_pages, int mark)
{
	int ret;
	int err = 0;
	int werr = 0;
	struct page *page;
	struct inode *btree_inode = root->fs_info->btree_inode;
	u64 start = 0;
	u64 end;
	unsigned long index;

	while (1) {
		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
					    mark);
		if (ret)
			break;

		clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
		while (start <= end) {
			index = start >> PAGE_CACHE_SHIFT;
			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
			page = find_get_page(btree_inode->i_mapping, index);
			if (!page)
				continue;
			if (PageDirty(page)) {
				btree_lock_page_hook(page);
				wait_on_page_writeback(page);
				err = write_one_page(page, 0);
				if (err)
					werr = err;
			}
			wait_on_page_writeback(page);
			page_cache_release(page);
			cond_resched();
		}
	}
	if (err)
		werr = err;
	return werr;
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit
 */
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
				struct extent_io_tree *dirty_pages, int mark)
{
	int ret;
	int ret2;

	ret = btrfs_write_marked_extents(root, dirty_pages, mark);
	ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
	return ret || ret2;
}

int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root)
{
	if (!trans || !trans->transaction) {
		struct inode *btree_inode;
		btree_inode = root->fs_info->btree_inode;
		return filemap_write_and_wait(btree_inode->i_mapping);
	}
	return btrfs_write_and_wait_marked_extents(root,
					   &trans->transaction->dirty_pages,
					   EXTENT_DIRTY);
}

/*
 * this is used to update the root pointer in the tree of tree roots.
 *
 * But, in the case of the extent allocation tree, updating the root
 * pointer may allocate blocks which may change the root of the extent
 * allocation tree.
 *
 * So, this loops and repeats and makes sure the cowonly root didn't
 * change while the root pointer was being updated in the metadata.
 */
static int update_cowonly_root(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	int ret;
	u64 old_root_bytenr;
	u64 old_root_used;
	struct btrfs_root *tree_root = root->fs_info->tree_root;

	old_root_used = btrfs_root_used(&root->root_item);
	btrfs_write_dirty_block_groups(trans, root);

	while (1) {
		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
		if (old_root_bytenr == root->node->start &&
		    old_root_used == btrfs_root_used(&root->root_item))
			break;

		btrfs_set_root_node(&root->root_item, root->node);
		ret = btrfs_update_root(trans, tree_root,
					&root->root_key,
					&root->root_item);
		BUG_ON(ret);

		old_root_used = btrfs_root_used(&root->root_item);
		ret = btrfs_write_dirty_block_groups(trans, root);
		BUG_ON(ret);
	}

	if (root != root->fs_info->extent_root)
		switch_commit_root(root);

	return 0;
}

/*
 * update all the cowonly tree roots on disk
 */
static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct list_head *next;
	struct extent_buffer *eb;
	int ret;

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

	eb = btrfs_lock_root_node(fs_info->tree_root);
	btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
		next = fs_info->dirty_cowonly_roots.next;
		list_del_init(next);
		root = list_entry(next, struct btrfs_root, dirty_list);

		update_cowonly_root(trans, root);
	}

	down_write(&fs_info->extent_commit_sem);
	switch_commit_root(fs_info->extent_root);
	up_write(&fs_info->extent_commit_sem);

	return 0;
}

/*
 * dead roots are old snapshots that need to be deleted.  This allocates
 * a dirty root struct and adds it into the list of dead roots that need to
 * be deleted
 */
int btrfs_add_dead_root(struct btrfs_root *root)
{
	spin_lock(&root->fs_info->trans_lock);
	list_add(&root->root_list, &root->fs_info->dead_roots);
	spin_unlock(&root->fs_info->trans_lock);
	return 0;
}

/*
 * commit all of the dirty fs tree roots into the tree of tree roots
 */
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root)
{
	struct btrfs_root *gang[8];
	struct btrfs_fs_info *fs_info = root->fs_info;
	int i;
	int ret;
	int err = 0;

	spin_lock(&fs_info->fs_roots_radix_lock);
	while (1) {
		ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
						 (void **)gang, 0,
						 ARRAY_SIZE(gang),
						 BTRFS_ROOT_TRANS_TAG);
		if (ret == 0)
			break;
		for (i = 0; i < ret; i++) {
			root = gang[i];
			radix_tree_tag_clear(&fs_info->fs_roots_radix,
					(unsigned long)root->root_key.objectid,
					BTRFS_ROOT_TRANS_TAG);
			spin_unlock(&fs_info->fs_roots_radix_lock);

			btrfs_free_log(trans, root);
			btrfs_update_reloc_root(trans, root);
			btrfs_orphan_commit_root(trans, root);

			if (root->commit_root != root->node) {
				switch_commit_root(root);
				btrfs_set_root_node(&root->root_item,
						    root->node);
			}

			err = btrfs_update_root(trans, fs_info->tree_root,
						&root->root_key,
						&root->root_item);
			spin_lock(&fs_info->fs_roots_radix_lock);
			if (err)
				break;
		}
	}
	spin_unlock(&fs_info->fs_roots_radix_lock);
	return err;
}

/*
 * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
 * otherwise every leaf in the btree is read and defragged.
 */
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
{
	struct btrfs_fs_info *info = root->fs_info;
	struct btrfs_trans_handle *trans;
	int ret;
	unsigned long nr;

	if (xchg(&root->defrag_running, 1))
		return 0;

	while (1) {
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

		ret = btrfs_defrag_leaves(trans, root, cacheonly);

		nr = trans->blocks_used;
		btrfs_end_transaction(trans, root);
		btrfs_btree_balance_dirty(info->tree_root, nr);
		cond_resched();

		if (root->fs_info->closing || ret != -EAGAIN)
			break;
	}
	root->defrag_running = 0;
	return ret;
}

#if 0
/*
 * when dropping snapshots, we generate a ton of delayed refs, and it makes
 * sense not to join the transaction while it is trying to flush the current
 * queue of delayed refs out.
 *
 * This is used by the drop snapshot code only
 */
static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
{
	DEFINE_WAIT(wait);

	mutex_lock(&info->trans_mutex);
	while (info->running_transaction &&
	       info->running_transaction->delayed_refs.flushing) {
		prepare_to_wait(&info->transaction_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		mutex_unlock(&info->trans_mutex);

		schedule();

		mutex_lock(&info->trans_mutex);
		finish_wait(&info->transaction_wait, &wait);
	}
	mutex_unlock(&info->trans_mutex);
	return 0;
}

/*
 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
 * all of them
 */
int btrfs_drop_dead_root(struct btrfs_root *root)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *tree_root = root->fs_info->tree_root;
	unsigned long nr;
	int ret;

	while (1) {
		/*
		 * we don't want to jump in and create a bunch of
		 * delayed refs if the transaction is starting to close
		 */
		wait_transaction_pre_flush(tree_root->fs_info);
		trans = btrfs_start_transaction(tree_root, 1);

		/*
		 * we've joined a transaction, make sure it isn't
		 * closing right now
		 */
		if (trans->transaction->delayed_refs.flushing) {
			btrfs_end_transaction(trans, tree_root);
			continue;
		}

		ret = btrfs_drop_snapshot(trans, root);
		if (ret != -EAGAIN)
			break;

		ret = btrfs_update_root(trans, tree_root,
					&root->root_key,
					&root->root_item);
		if (ret)
			break;

		nr = trans->blocks_used;
		ret = btrfs_end_transaction(trans, tree_root);
		BUG_ON(ret);

		btrfs_btree_balance_dirty(tree_root, nr);
		cond_resched();
	}
	BUG_ON(ret);

	ret = btrfs_del_root(trans, tree_root, &root->root_key);
	BUG_ON(ret);

	nr = trans->blocks_used;
	ret = btrfs_end_transaction(trans, tree_root);
	BUG_ON(ret);

	free_extent_buffer(root->node);
	free_extent_buffer(root->commit_root);
	kfree(root);

	btrfs_btree_balance_dirty(tree_root, nr);
	return ret;
}
#endif

/*
 * new snapshots need to be created at a very specific time in the
 * transaction commit.  This does the actual creation
 */
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
				   struct btrfs_fs_info *fs_info,
				   struct btrfs_pending_snapshot *pending)
{
	struct btrfs_key key;
	struct btrfs_root_item *new_root_item;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root = pending->root;
	struct btrfs_root *parent_root;
	struct inode *parent_inode;
	struct dentry *parent;
	struct dentry *dentry;
	struct extent_buffer *tmp;
	struct extent_buffer *old;
	int ret;
	u64 to_reserve = 0;
	u64 index = 0;
	u64 objectid;
	u64 root_flags;

	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
	if (!new_root_item) {
		pending->error = -ENOMEM;
		goto fail;
	}

	ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
	if (ret) {
		pending->error = ret;
		goto fail;
	}

	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
	btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);

	if (to_reserve > 0) {
		ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
					  to_reserve);
		if (ret) {
			pending->error = ret;
			goto fail;
		}
	}

	key.objectid = objectid;
	key.offset = (u64)-1;
	key.type = BTRFS_ROOT_ITEM_KEY;

	trans->block_rsv = &pending->block_rsv;

	dentry = pending->dentry;
	parent = dget_parent(dentry);
	parent_inode = parent->d_inode;
	parent_root = BTRFS_I(parent_inode)->root;
	btrfs_record_root_in_trans(trans, parent_root);

	/*
	 * insert the directory item
	 */
	ret = btrfs_set_inode_index(parent_inode, &index);
	BUG_ON(ret);
	ret = btrfs_insert_dir_item(trans, parent_root,
				dentry->d_name.name, dentry->d_name.len,
				parent_inode->i_ino, &key,
				BTRFS_FT_DIR, index);
	BUG_ON(ret);

	btrfs_i_size_write(parent_inode, parent_inode->i_size +
					 dentry->d_name.len * 2);
	ret = btrfs_update_inode(trans, parent_root, parent_inode);
	BUG_ON(ret);

	btrfs_record_root_in_trans(trans, root);
	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
	btrfs_check_and_init_root_item(new_root_item);

	root_flags = btrfs_root_flags(new_root_item);
	if (pending->readonly)
		root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
	else
		root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
	btrfs_set_root_flags(new_root_item, root_flags);

	old = btrfs_lock_root_node(root);
	btrfs_cow_block(trans, root, old, NULL, 0, &old);
	btrfs_set_lock_blocking(old);

	btrfs_copy_root(trans, root, old, &tmp, objectid);
	btrfs_tree_unlock(old);
	free_extent_buffer(old);

	btrfs_set_root_node(new_root_item, tmp);
	/* record when the snapshot was created in key.offset */
	key.offset = trans->transid;
	ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
	btrfs_tree_unlock(tmp);
	free_extent_buffer(tmp);
	BUG_ON(ret);

	/*
	 * insert root back/forward references
	 */
	ret = btrfs_add_root_ref(trans, tree_root, objectid,
				 parent_root->root_key.objectid,
				 parent_inode->i_ino, index,
				 dentry->d_name.name, dentry->d_name.len);
	BUG_ON(ret);
	dput(parent);

	key.offset = (u64)-1;
	pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
	BUG_ON(IS_ERR(pending->snap));

	btrfs_reloc_post_snapshot(trans, pending);
	btrfs_orphan_post_snapshot(trans, pending);
fail:
	kfree(new_root_item);
	btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
	return 0;
}

/*
 * create all the snapshots we've scheduled for creation
 */
static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
					     struct btrfs_fs_info *fs_info)
{
	struct btrfs_pending_snapshot *pending;
	struct list_head *head = &trans->transaction->pending_snapshots;
	int ret;

	list_for_each_entry(pending, head, list) {
		ret = create_pending_snapshot(trans, fs_info, pending);
		BUG_ON(ret);
	}
	return 0;
}

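/*
 * copy the tree root and chunk root pointers, generations and levels into
 * the in-memory super block copy that will be written at commit time
 */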
static void update_super_roots(struct btrfs_root *root)
{
	struct btrfs_root_item *root_item;
	struct btrfs_super_block *super;

	super = &root->fs_info->super_copy;

	root_item = &root->fs_info->chunk_root->root_item;
	super->chunk_root = root_item->bytenr;
	super->chunk_root_generation = root_item->generation;
	super->chunk_root_level = root_item->level;

	root_item = &root->fs_info->tree_root->root_item;
	super->root = root_item->bytenr;
	super->generation = root_item->generation;
	super->root_level = root_item->level;
	if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE))
		super->cache_generation = root_item->generation;
}

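/*
 * report whether the currently running transaction is in its commit
 * phase, or is blocked
 */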
int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
{
	int ret = 0;
	spin_lock(&info->trans_lock);
	if (info->running_transaction)
		ret = info->running_transaction->in_commit;
	spin_unlock(&info->trans_lock);
	return ret;
}

int btrfs_transaction_blocked(struct btrfs_fs_info *info)
{
	int ret = 0;
	spin_lock(&info->trans_lock);
	if (info->running_transaction)
		ret = info->running_transaction->blocked;
	spin_unlock(&info->trans_lock);
	return ret;
}

/*
 * wait for the current transaction commit to start and block subsequent
 * transaction joins
 */
static void wait_current_trans_commit_start(struct btrfs_root *root,
					    struct btrfs_transaction *trans)
{
	DEFINE_WAIT(wait);

	if (trans->in_commit)
		return;

	while (1) {
		prepare_to_wait(&root->fs_info->transaction_blocked_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		if (trans->in_commit) {
			finish_wait(&root->fs_info->transaction_blocked_wait,
				    &wait);
			break;
		}
		schedule();
		finish_wait(&root->fs_info->transaction_blocked_wait, &wait);
	}
}

/*
 * wait for the current transaction to start and then become unblocked.
 * caller holds ref.
 */
static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
					 struct btrfs_transaction *trans)
{
	DEFINE_WAIT(wait);

	if (trans->commit_done || (trans->in_commit && !trans->blocked))
		return;

	while (1) {
		prepare_to_wait(&root->fs_info->transaction_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		if (trans->commit_done ||
		    (trans->in_commit && !trans->blocked)) {
			finish_wait(&root->fs_info->transaction_wait,
				    &wait);
			break;
		}
		schedule();
		finish_wait(&root->fs_info->transaction_wait,
			    &wait);
	}
}

/*
 * commit transactions asynchronously. once btrfs_commit_transaction_async
 * returns, any subsequent transaction will not be allowed to join.
 */
struct btrfs_async_commit {
	struct btrfs_trans_handle *newtrans;
	struct btrfs_root *root;
	struct delayed_work work;
};

static void do_async_commit(struct work_struct *work)
{
	struct btrfs_async_commit *ac =
		container_of(work, struct btrfs_async_commit, work.work);

	btrfs_commit_transaction(ac->newtrans, ac->root);
	kfree(ac);
}

int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   int wait_for_unblock)
{
	struct btrfs_async_commit *ac;
	struct btrfs_transaction *cur_trans;

	ac = kmalloc(sizeof(*ac), GFP_NOFS);
	if (!ac)
		return -ENOMEM;

	INIT_DELAYED_WORK(&ac->work, do_async_commit);
	ac->root = root;
	ac->newtrans = btrfs_join_transaction(root);
	if (IS_ERR(ac->newtrans)) {
		int err = PTR_ERR(ac->newtrans);
		kfree(ac);
		return err;
	}

	/* take transaction reference */
	cur_trans = trans->transaction;
	atomic_inc(&cur_trans->use_count);

	btrfs_end_transaction(trans, root);
	schedule_delayed_work(&ac->work, 0);

	/* wait for transaction to start and unblock */
	if (wait_for_unblock)
		wait_current_trans_commit_start_and_unblock(root, cur_trans);
	else
		wait_current_trans_commit_start(root, cur_trans);
	put_transaction(cur_trans);

	return 0;
}

/*
 * btrfs_transaction state sequence:
 *    in_commit = 0, blocked = 0  (initial)
 *    in_commit = 1, blocked = 1
 *    blocked = 0
 *    commit_done = 1
 */
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root)
{
	unsigned long joined = 0;
	struct btrfs_transaction *cur_trans;
	struct btrfs_transaction *prev_trans = NULL;
	DEFINE_WAIT(wait);
	int ret;
	int should_grow = 0;
	unsigned long now = get_seconds();
	int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);

	btrfs_run_ordered_operations(root, 0);

	/* make a pass through all the delayed refs we have so far
	 * any running procs may add more while we are here
	 */
	ret = btrfs_run_delayed_refs(trans, root, 0);
	BUG_ON(ret);

	btrfs_trans_release_metadata(trans, root);

	cur_trans = trans->transaction;
	/*
	 * set the flushing flag so procs in this transaction have to
	 * start sending their work down.
	 */
	cur_trans->delayed_refs.flushing = 1;

	ret = btrfs_run_delayed_refs(trans, root, 0);
	BUG_ON(ret);

	spin_lock(&cur_trans->commit_lock);
	if (cur_trans->in_commit) {
		spin_unlock(&cur_trans->commit_lock);
		atomic_inc(&cur_trans->use_count);
		btrfs_end_transaction(trans, root);

		ret = wait_for_commit(root, cur_trans);
		BUG_ON(ret);

		put_transaction(cur_trans);

		return 0;
	}

	trans->transaction->in_commit = 1;
	trans->transaction->blocked = 1;
	spin_unlock(&cur_trans->commit_lock);
	wake_up(&root->fs_info->transaction_blocked_wait);

	spin_lock(&root->fs_info->trans_lock);
	if (cur_trans->list.prev != &root->fs_info->trans_list) {
		prev_trans = list_entry(cur_trans->list.prev,
					struct btrfs_transaction, list);
		if (!prev_trans->commit_done) {
			atomic_inc(&prev_trans->use_count);
			spin_unlock(&root->fs_info->trans_lock);

			wait_for_commit(root, prev_trans);

			put_transaction(prev_trans);
		} else {
			spin_unlock(&root->fs_info->trans_lock);
		}
	} else {
		spin_unlock(&root->fs_info->trans_lock);
	}

	if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
		should_grow = 1;

	do {
		int snap_pending = 0;

		joined = cur_trans->num_joined;
		if (!list_empty(&trans->transaction->pending_snapshots))
			snap_pending = 1;

		WARN_ON(cur_trans != trans->transaction);

		if (flush_on_commit || snap_pending) {
			btrfs_start_delalloc_inodes(root, 1);
			ret = btrfs_wait_ordered_extents(root, 0, 1);
			BUG_ON(ret);
		}

		/*
		 * rename doesn't use btrfs_join_transaction, so once we
		 * set the transaction to blocked above, we aren't going
		 * to get any new ordered operations.  We can safely run
		 * it here and know for sure that nothing new will be added
		 * to the list
		 */
		btrfs_run_ordered_operations(root, 1);

		prepare_to_wait(&cur_trans->writer_wait, &wait,
				TASK_UNINTERRUPTIBLE);

		if (atomic_read(&cur_trans->num_writers) > 1)
			schedule_timeout(MAX_SCHEDULE_TIMEOUT);
		else if (should_grow)
			schedule_timeout(1);

		finish_wait(&cur_trans->writer_wait, &wait);
		spin_lock(&root->fs_info->trans_lock);
		root->fs_info->trans_no_join = 1;
		spin_unlock(&root->fs_info->trans_lock);
	} while (atomic_read(&cur_trans->num_writers) > 1 ||
		 (should_grow && cur_trans->num_joined != joined));

	ret = create_pending_snapshots(trans, root->fs_info);
	BUG_ON(ret);

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

	WARN_ON(cur_trans != trans->transaction);

	/* btrfs_commit_tree_roots is responsible for getting the
	 * various roots consistent with each other.  Every pointer
	 * in the tree of tree roots has to point to the most up to date
	 * root for every subvolume and other tree.  So, we have to keep
	 * the tree logging code from jumping in and changing any
	 * of the trees.
	 *
	 * At this point in the commit, there can't be any tree-log
	 * writers, but a little lower down we drop the trans mutex
	 * and let new people in.  By holding the tree_log_mutex
	 * from now until after the super is written, we avoid races
	 * with the tree-log code.
	 */
	mutex_lock(&root->fs_info->tree_log_mutex);

	ret = commit_fs_roots(trans, root);
	BUG_ON(ret);

	/* commit_fs_roots gets rid of all the tree log roots, it is now
	 * safe to free the root of tree log roots
	 */
	btrfs_free_log_root_tree(trans, root->fs_info);

	ret = commit_cowonly_roots(trans, root);
	BUG_ON(ret);

	btrfs_prepare_extent_commit(trans, root);

	cur_trans = root->fs_info->running_transaction;

	btrfs_set_root_node(&root->fs_info->tree_root->root_item,
			    root->fs_info->tree_root->node);
	switch_commit_root(root->fs_info->tree_root);

	btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
			    root->fs_info->chunk_root->node);
	switch_commit_root(root->fs_info->chunk_root);

	update_super_roots(root);

	if (!root->fs_info->log_root_recovering) {
		btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
		btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
	}

	memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
	       sizeof(root->fs_info->super_copy));

	trans->transaction->blocked = 0;
	spin_lock(&root->fs_info->trans_lock);
	root->fs_info->running_transaction = NULL;
	root->fs_info->trans_no_join = 0;
	spin_unlock(&root->fs_info->trans_lock);

	wake_up(&root->fs_info->transaction_wait);

	ret = btrfs_write_and_wait_transaction(trans, root);
	BUG_ON(ret);
	write_ctree_super(trans, root, 0);

	/*
	 * the super is written, we can safely allow the tree-loggers
	 * to go about their business
	 */
	mutex_unlock(&root->fs_info->tree_log_mutex);

	btrfs_finish_extent_commit(trans, root);

	cur_trans->commit_done = 1;

	root->fs_info->last_trans_committed = cur_trans->transid;

	wake_up(&cur_trans->commit_wait);

	spin_lock(&root->fs_info->trans_lock);
	list_del_init(&cur_trans->list);
	spin_unlock(&root->fs_info->trans_lock);

	put_transaction(cur_trans);
	put_transaction(cur_trans);

	trace_btrfs_transaction_commit(root);

	if (current->journal_info == trans)
		current->journal_info = NULL;

	kmem_cache_free(btrfs_trans_handle_cachep, trans);

	if (current != root->fs_info->transaction_kthread)
		btrfs_run_delayed_iputs(root);

	return ret;
}

/*
 * interface function to delete all the snapshots we have scheduled for deletion
 */
int btrfs_clean_old_snapshots(struct btrfs_root *root)
{
	LIST_HEAD(list);
	struct btrfs_fs_info *fs_info = root->fs_info;

	spin_lock(&fs_info->trans_lock);
	list_splice_init(&fs_info->dead_roots, &list);
	spin_unlock(&fs_info->trans_lock);

	while (!list_empty(&list)) {
		root = list_entry(list.next, struct btrfs_root, root_list);
		list_del(&root->root_list);

		if (btrfs_header_backref_rev(root->node) <
		    BTRFS_MIXED_BACKREF_REV)
			btrfs_drop_snapshot(root, NULL, 0);
		else
			btrfs_drop_snapshot(root, NULL, 1);
	}
	return 0;
}