transaction.c 65.7 KB
Newer Older
C
Chris Mason 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

C
Chris Mason 已提交
19
#include <linux/fs.h>
20
#include <linux/slab.h>
C
Chris Mason 已提交
21
#include <linux/sched.h>
22
#include <linux/writeback.h>
23
#include <linux/pagemap.h>
24
#include <linux/blkdev.h>
25
#include <linux/uuid.h>
C
Chris Mason 已提交
26 27 28
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
29
#include "locking.h"
30
#include "tree-log.h"
31
#include "inode-map.h"
32
#include "volumes.h"
33
#include "dev-replace.h"
J
Josef Bacik 已提交
34
#include "qgroup.h"
C
Chris Mason 已提交
35

36 37
#define BTRFS_ROOT_TRANS_TAG 0

38
static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
	[TRANS_STATE_RUNNING]		= 0U,
	[TRANS_STATE_BLOCKED]		= (__TRANS_USERSPACE |
					   __TRANS_START),
	[TRANS_STATE_COMMIT_START]	= (__TRANS_USERSPACE |
					   __TRANS_START |
					   __TRANS_ATTACH),
	[TRANS_STATE_COMMIT_DOING]	= (__TRANS_USERSPACE |
					   __TRANS_START |
					   __TRANS_ATTACH |
					   __TRANS_JOIN),
	[TRANS_STATE_UNBLOCKED]		= (__TRANS_USERSPACE |
					   __TRANS_START |
					   __TRANS_ATTACH |
					   __TRANS_JOIN |
					   __TRANS_JOIN_NOLOCK),
	[TRANS_STATE_COMPLETED]		= (__TRANS_USERSPACE |
					   __TRANS_START |
					   __TRANS_ATTACH |
					   __TRANS_JOIN |
					   __TRANS_JOIN_NOLOCK),
};

61
void btrfs_put_transaction(struct btrfs_transaction *transaction)
C
Chris Mason 已提交
62
{
63 64
	WARN_ON(refcount_read(&transaction->use_count) == 0);
	if (refcount_dec_and_test(&transaction->use_count)) {
J
Josef Bacik 已提交
65
		BUG_ON(!list_empty(&transaction->list));
L
Liu Bo 已提交
66
		WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root));
67
		if (transaction->delayed_refs.pending_csums)
68 69 70
			btrfs_err(transaction->fs_info,
				  "pending csums is %llu",
				  transaction->delayed_refs.pending_csums);
71 72 73 74 75 76 77 78
		while (!list_empty(&transaction->pending_chunks)) {
			struct extent_map *em;

			em = list_first_entry(&transaction->pending_chunks,
					      struct extent_map, list);
			list_del_init(&em->list);
			free_extent_map(em);
		}
79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
		/*
		 * If any block groups are found in ->deleted_bgs then it's
		 * because the transaction was aborted and a commit did not
		 * happen (things failed before writing the new superblock
		 * and calling btrfs_finish_extent_commit()), so we can not
		 * discard the physical locations of the block groups.
		 */
		while (!list_empty(&transaction->deleted_bgs)) {
			struct btrfs_block_group_cache *cache;

			cache = list_first_entry(&transaction->deleted_bgs,
						 struct btrfs_block_group_cache,
						 bg_list);
			list_del_init(&cache->bg_list);
			btrfs_put_block_group_trimming(cache);
			btrfs_put_block_group(cache);
		}
C
Chris Mason 已提交
96
		kmem_cache_free(btrfs_transaction_cachep, transaction);
C
Chris Mason 已提交
97
	}
C
Chris Mason 已提交
98 99
}

100 101 102
static void clear_btree_io_tree(struct extent_io_tree *tree)
{
	spin_lock(&tree->lock);
103 104 105 106 107 108
	/*
	 * Do a single barrier for the waitqueue_active check here, the state
	 * of the waitqueue should not change once clear_btree_io_tree is
	 * called.
	 */
	smp_mb();
109 110 111 112 113 114 115 116 117 118 119 120 121 122
	while (!RB_EMPTY_ROOT(&tree->state)) {
		struct rb_node *node;
		struct extent_state *state;

		node = rb_first(&tree->state);
		state = rb_entry(node, struct extent_state, rb_node);
		rb_erase(&state->rb_node, &tree->state);
		RB_CLEAR_NODE(&state->rb_node);
		/*
		 * btree io trees aren't supposed to have tasks waiting for
		 * changes in the flags of extent states ever.
		 */
		ASSERT(!waitqueue_active(&state->wq));
		free_extent_state(state);
123 124

		cond_resched_lock(&tree->lock);
125 126 127 128
	}
	spin_unlock(&tree->lock);
}

129 130
static noinline void switch_commit_roots(struct btrfs_transaction *trans,
					 struct btrfs_fs_info *fs_info)
J
Josef Bacik 已提交
131
{
132 133 134 135 136 137 138 139 140 141
	struct btrfs_root *root, *tmp;

	down_write(&fs_info->commit_root_sem);
	list_for_each_entry_safe(root, tmp, &trans->switch_commits,
				 dirty_list) {
		list_del_init(&root->dirty_list);
		free_extent_buffer(root->commit_root);
		root->commit_root = btrfs_root_node(root);
		if (is_fstree(root->objectid))
			btrfs_unpin_free_ino(root);
142
		clear_btree_io_tree(&root->dirty_log_pages);
143
	}
144 145 146 147 148 149 150 151 152 153 154 155

	/* We can free old roots now. */
	spin_lock(&trans->dropped_roots_lock);
	while (!list_empty(&trans->dropped_roots)) {
		root = list_first_entry(&trans->dropped_roots,
					struct btrfs_root, root_list);
		list_del_init(&root->root_list);
		spin_unlock(&trans->dropped_roots_lock);
		btrfs_drop_and_free_fs_root(fs_info, root);
		spin_lock(&trans->dropped_roots_lock);
	}
	spin_unlock(&trans->dropped_roots_lock);
156
	up_write(&fs_info->commit_root_sem);
J
Josef Bacik 已提交
157 158
}

159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181
/*
 * Helpers tracking the number of "external writer" handles (types in
 * TRANS_EXTWRITERS) attached to a transaction; the commit path waits for
 * this counter to drain.
 */
static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
					 unsigned int type)
{
	if (type & TRANS_EXTWRITERS)
		atomic_inc(&trans->num_extwriters);
}

static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
					 unsigned int type)
{
	if (type & TRANS_EXTWRITERS)
		atomic_dec(&trans->num_extwriters);
}

static inline void extwriter_counter_init(struct btrfs_transaction *trans,
					  unsigned int type)
{
	atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0));
}

static inline int extwriter_counter_read(struct btrfs_transaction *trans)
{
	return atomic_read(&trans->num_extwriters);
}

C
Chris Mason 已提交
184 185 186
/*
 * either allocate a new transaction or hop into the existing one
 */
187 188
static noinline int join_transaction(struct btrfs_fs_info *fs_info,
				     unsigned int type)
C
Chris Mason 已提交
189 190
{
	struct btrfs_transaction *cur_trans;
J
Josef Bacik 已提交
191

192
	spin_lock(&fs_info->trans_lock);
193
loop:
194
	/* The file system has been taken offline. No new transactions. */
195
	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
196
		spin_unlock(&fs_info->trans_lock);
197 198 199
		return -EROFS;
	}

200
	cur_trans = fs_info->running_transaction;
J
Josef Bacik 已提交
201
	if (cur_trans) {
202
		if (cur_trans->aborted) {
203
			spin_unlock(&fs_info->trans_lock);
204
			return cur_trans->aborted;
205
		}
206
		if (btrfs_blocked_trans_types[cur_trans->state] & type) {
207 208 209
			spin_unlock(&fs_info->trans_lock);
			return -EBUSY;
		}
210
		refcount_inc(&cur_trans->use_count);
211
		atomic_inc(&cur_trans->num_writers);
212
		extwriter_counter_inc(cur_trans, type);
213
		spin_unlock(&fs_info->trans_lock);
J
Josef Bacik 已提交
214
		return 0;
C
Chris Mason 已提交
215
	}
216
	spin_unlock(&fs_info->trans_lock);
J
Josef Bacik 已提交
217

218 219 220 221 222 223 224
	/*
	 * If we are ATTACH, we just want to catch the current transaction,
	 * and commit it. If there is no transaction, just return ENOENT.
	 */
	if (type == TRANS_ATTACH)
		return -ENOENT;

225 226 227 228 229 230
	/*
	 * JOIN_NOLOCK only happens during the transaction commit, so
	 * it is impossible that ->running_transaction is NULL
	 */
	BUG_ON(type == TRANS_JOIN_NOLOCK);

J
Josef Bacik 已提交
231 232 233
	cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
	if (!cur_trans)
		return -ENOMEM;
234

235 236
	spin_lock(&fs_info->trans_lock);
	if (fs_info->running_transaction) {
237 238
		/*
		 * someone started a transaction after we unlocked.  Make sure
239
		 * to redo the checks above
240
		 */
J
Josef Bacik 已提交
241
		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
242
		goto loop;
243
	} else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
244
		spin_unlock(&fs_info->trans_lock);
245 246
		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
		return -EROFS;
C
Chris Mason 已提交
247
	}
248

249
	cur_trans->fs_info = fs_info;
J
Josef Bacik 已提交
250
	atomic_set(&cur_trans->num_writers, 1);
251
	extwriter_counter_init(cur_trans, type);
J
Josef Bacik 已提交
252 253
	init_waitqueue_head(&cur_trans->writer_wait);
	init_waitqueue_head(&cur_trans->commit_wait);
254
	init_waitqueue_head(&cur_trans->pending_wait);
255
	cur_trans->state = TRANS_STATE_RUNNING;
J
Josef Bacik 已提交
256 257 258 259
	/*
	 * One for this trans handle, one so it will live on until we
	 * commit the transaction.
	 */
260
	refcount_set(&cur_trans->use_count, 2);
261
	atomic_set(&cur_trans->pending_ordered, 0);
262
	cur_trans->flags = 0;
J
Josef Bacik 已提交
263 264
	cur_trans->start_time = get_seconds();

265 266
	memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));

L
Liu Bo 已提交
267
	cur_trans->delayed_refs.href_root = RB_ROOT;
268
	cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
269
	atomic_set(&cur_trans->delayed_refs.num_entries, 0);
270 271 272 273 274 275

	/*
	 * although the tree mod log is per file system and not per transaction,
	 * the log must never go across transaction boundaries.
	 */
	smp_mb();
J
Julia Lawall 已提交
276
	if (!list_empty(&fs_info->tree_mod_seq_list))
J
Jeff Mahoney 已提交
277
		WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when creating a fresh transaction\n");
J
Julia Lawall 已提交
278
	if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
J
Jeff Mahoney 已提交
279
		WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when creating a fresh transaction\n");
280
	atomic64_set(&fs_info->tree_mod_seq, 0);
281

J
Josef Bacik 已提交
282 283 284
	spin_lock_init(&cur_trans->delayed_refs.lock);

	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
285
	INIT_LIST_HEAD(&cur_trans->pending_chunks);
286
	INIT_LIST_HEAD(&cur_trans->switch_commits);
287
	INIT_LIST_HEAD(&cur_trans->dirty_bgs);
288
	INIT_LIST_HEAD(&cur_trans->io_bgs);
289
	INIT_LIST_HEAD(&cur_trans->dropped_roots);
290
	mutex_init(&cur_trans->cache_write_mutex);
291
	cur_trans->num_dirty_bgs = 0;
292
	spin_lock_init(&cur_trans->dirty_bgs_lock);
293
	INIT_LIST_HEAD(&cur_trans->deleted_bgs);
294
	spin_lock_init(&cur_trans->dropped_roots_lock);
295
	list_add_tail(&cur_trans->list, &fs_info->trans_list);
J
Josef Bacik 已提交
296
	extent_io_tree_init(&cur_trans->dirty_pages,
297 298 299 300
			     fs_info->btree_inode->i_mapping);
	fs_info->generation++;
	cur_trans->transid = fs_info->generation;
	fs_info->running_transaction = cur_trans;
301
	cur_trans->aborted = 0;
302
	spin_unlock(&fs_info->trans_lock);
303

C
Chris Mason 已提交
304 305 306
	return 0;
}

C
Chris Mason 已提交
307
/*
C
Chris Mason 已提交
308 309 310 311
 * this does all the record keeping required to make sure that a reference
 * counted root is properly recorded in a given transaction.  This is required
 * to make sure the old root from before we joined the transaction is deleted
 * when the transaction commits
C
Chris Mason 已提交
312
 */
C
Chris Mason 已提交
313
static int record_root_in_trans(struct btrfs_trans_handle *trans,
314 315
			       struct btrfs_root *root,
			       int force)
316
{
317 318
	struct btrfs_fs_info *fs_info = root->fs_info;

319 320
	if ((test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
	    root->last_trans < trans->transid) || force) {
321
		WARN_ON(root == fs_info->extent_root);
322 323
		WARN_ON(root->commit_root != root->node);

C
Chris Mason 已提交
324
		/*
325
		 * see below for IN_TRANS_SETUP usage rules
C
Chris Mason 已提交
326 327 328
		 * we have the reloc mutex held now, so there
		 * is only one writer in this function
		 */
329
		set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
C
Chris Mason 已提交
330

331
		/* make sure readers find IN_TRANS_SETUP before
C
Chris Mason 已提交
332 333 334 335
		 * they find our root->last_trans update
		 */
		smp_wmb();

336
		spin_lock(&fs_info->fs_roots_radix_lock);
337
		if (root->last_trans == trans->transid && !force) {
338
			spin_unlock(&fs_info->fs_roots_radix_lock);
J
Josef Bacik 已提交
339 340
			return 0;
		}
341 342 343 344
		radix_tree_tag_set(&fs_info->fs_roots_radix,
				   (unsigned long)root->root_key.objectid,
				   BTRFS_ROOT_TRANS_TAG);
		spin_unlock(&fs_info->fs_roots_radix_lock);
C
Chris Mason 已提交
345 346 347 348 349 350 351 352 353 354 355 356 357
		root->last_trans = trans->transid;

		/* this is pretty tricky.  We don't want to
		 * take the relocation lock in btrfs_record_root_in_trans
		 * unless we're really doing the first setup for this root in
		 * this transaction.
		 *
		 * Normally we'd use root->last_trans as a flag to decide
		 * if we want to take the expensive mutex.
		 *
		 * But, we have to set root->last_trans before we
		 * init the relocation root, otherwise, we trip over warnings
		 * in ctree.c.  The solution used here is to flag ourselves
358
		 * with root IN_TRANS_SETUP.  When this is 1, we're still
C
Chris Mason 已提交
359 360 361 362 363 364 365
		 * fixing up the reloc trees and everyone must wait.
		 *
		 * When this is zero, they can trust root->last_trans and fly
		 * through btrfs_record_root_in_trans without having to take the
		 * lock.  smp_wmb() makes sure that all the writes above are
		 * done before we pop in the zero below
		 */
366
		btrfs_init_reloc_root(trans, root);
367
		smp_mb__before_atomic();
368
		clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
369 370 371
	}
	return 0;
}
372

C
Chris Mason 已提交
373

374 375 376
void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
377
	struct btrfs_fs_info *fs_info = root->fs_info;
378 379 380 381 382 383 384 385
	struct btrfs_transaction *cur_trans = trans->transaction;

	/* Add ourselves to the transaction dropped list */
	spin_lock(&cur_trans->dropped_roots_lock);
	list_add_tail(&root->root_list, &cur_trans->dropped_roots);
	spin_unlock(&cur_trans->dropped_roots_lock);

	/* Make sure we don't try to update the root at commit time */
386 387
	spin_lock(&fs_info->fs_roots_radix_lock);
	radix_tree_tag_clear(&fs_info->fs_roots_radix,
388 389
			     (unsigned long)root->root_key.objectid,
			     BTRFS_ROOT_TRANS_TAG);
390
	spin_unlock(&fs_info->fs_roots_radix_lock);
391 392
}

C
Chris Mason 已提交
393 394 395
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
396 397
	struct btrfs_fs_info *fs_info = root->fs_info;

398
	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
C
Chris Mason 已提交
399 400 401
		return 0;

	/*
402
	 * see record_root_in_trans for comments about IN_TRANS_SETUP usage
C
Chris Mason 已提交
403 404 405 406
	 * and barriers
	 */
	smp_rmb();
	if (root->last_trans == trans->transid &&
407
	    !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))
C
Chris Mason 已提交
408 409
		return 0;

410
	mutex_lock(&fs_info->reloc_mutex);
411
	record_root_in_trans(trans, root, 0);
412
	mutex_unlock(&fs_info->reloc_mutex);
C
Chris Mason 已提交
413 414 415 416

	return 0;
}

417 418 419
static inline int is_transaction_blocked(struct btrfs_transaction *trans)
{
	return (trans->state >= TRANS_STATE_BLOCKED &&
420 421
		trans->state < TRANS_STATE_UNBLOCKED &&
		!trans->aborted);
422 423
}

C
Chris Mason 已提交
424 425 426 427
/* wait for commit against the current transaction to become unblocked
 * when this is done, it is safe to start a new transaction, but the current
 * transaction might not be fully on disk.
 */
428
static void wait_current_trans(struct btrfs_fs_info *fs_info)
C
Chris Mason 已提交
429
{
430
	struct btrfs_transaction *cur_trans;
C
Chris Mason 已提交
431

432 433
	spin_lock(&fs_info->trans_lock);
	cur_trans = fs_info->running_transaction;
434
	if (cur_trans && is_transaction_blocked(cur_trans)) {
435
		refcount_inc(&cur_trans->use_count);
436
		spin_unlock(&fs_info->trans_lock);
L
Li Zefan 已提交
437

438
		wait_event(fs_info->transaction_wait,
439 440
			   cur_trans->state >= TRANS_STATE_UNBLOCKED ||
			   cur_trans->aborted);
441
		btrfs_put_transaction(cur_trans);
J
Josef Bacik 已提交
442
	} else {
443
		spin_unlock(&fs_info->trans_lock);
444
	}
C
Chris Mason 已提交
445 446
}

447
static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type)
448
{
449
	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
J
Josef Bacik 已提交
450 451 452 453 454 455
		return 0;

	if (type == TRANS_USERSPACE)
		return 1;

	if (type == TRANS_START &&
456
	    !atomic_read(&fs_info->open_ioctl_trans))
457
		return 1;
J
Josef Bacik 已提交
458

459 460 461
	return 0;
}

462 463
static inline bool need_reserve_reloc_root(struct btrfs_root *root)
{
464 465 466
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (!fs_info->reloc_ctl ||
467
	    !test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
468 469 470 471 472 473 474
	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
	    root->reloc_root)
		return false;

	return true;
}

M
Miao Xie 已提交
475
static struct btrfs_trans_handle *
476
start_transaction(struct btrfs_root *root, unsigned int num_items,
477 478
		  unsigned int type, enum btrfs_reserve_flush_enum flush,
		  bool enforce_qgroups)
C
Chris Mason 已提交
479
{
480 481
	struct btrfs_fs_info *fs_info = root->fs_info;

482 483
	struct btrfs_trans_handle *h;
	struct btrfs_transaction *cur_trans;
484
	u64 num_bytes = 0;
485
	u64 qgroup_reserved = 0;
486 487
	bool reloc_reserved = false;
	int ret;
L
liubo 已提交
488

489
	/* Send isn't supposed to start transactions. */
490
	ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB);
491

492
	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
L
liubo 已提交
493
		return ERR_PTR(-EROFS);
494

495
	if (current->journal_info) {
496
		WARN_ON(type & TRANS_EXTWRITERS);
497 498
		h = current->journal_info;
		h->use_count++;
499
		WARN_ON(h->use_count > 2);
500 501 502 503
		h->orig_rsv = h->block_rsv;
		h->block_rsv = NULL;
		goto got_it;
	}
504 505 506 507 508

	/*
	 * Do the reservation before we join the transaction so we can do all
	 * the appropriate flushing if need be.
	 */
509
	if (num_items && root != fs_info->chunk_root) {
510
		qgroup_reserved = num_items * fs_info->nodesize;
511 512
		ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved,
						enforce_qgroups);
513 514
		if (ret)
			return ERR_PTR(ret);
515

516
		num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items);
517 518 519
		/*
		 * Do the reservation for the relocation root creation
		 */
520
		if (need_reserve_reloc_root(root)) {
521
			num_bytes += fs_info->nodesize;
522 523 524
			reloc_reserved = true;
		}

525
		ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv,
M
Miao Xie 已提交
526
					  num_bytes, flush);
527
		if (ret)
528
			goto reserve_fail;
529
	}
530
again:
531
	h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS);
532 533 534 535
	if (!h) {
		ret = -ENOMEM;
		goto alloc_fail;
	}
C
Chris Mason 已提交
536

537 538 539 540 541 542
	/*
	 * If we are JOIN_NOLOCK we're already committing a transaction and
	 * waiting on this guy, so we don't need to do the sb_start_intwrite
	 * because we're already holding a ref.  We need this because we could
	 * have raced in and did an fsync() on a file which can kick a commit
	 * and then we deadlock with somebody doing a freeze.
543 544 545
	 *
	 * If we are ATTACH, it means we just want to catch the current
	 * transaction and commit it, so we needn't do sb_start_intwrite(). 
546
	 */
547
	if (type & __TRANS_FREEZABLE)
548
		sb_start_intwrite(fs_info->sb);
549

550 551
	if (may_wait_transaction(fs_info, type))
		wait_current_trans(fs_info);
552

J
Josef Bacik 已提交
553
	do {
554
		ret = join_transaction(fs_info, type);
555
		if (ret == -EBUSY) {
556
			wait_current_trans(fs_info);
557 558 559
			if (unlikely(type == TRANS_ATTACH))
				ret = -ENOENT;
		}
J
Josef Bacik 已提交
560 561
	} while (ret == -EBUSY);

562
	if (ret < 0)
563
		goto join_fail;
564

565
	cur_trans = fs_info->running_transaction;
566 567 568

	h->transid = cur_trans->transid;
	h->transaction = cur_trans;
569
	h->root = root;
570
	h->use_count = 1;
571
	h->fs_info = root->fs_info;
572

573
	h->type = type;
574
	h->can_flush_pending_bgs = true;
575
	INIT_LIST_HEAD(&h->new_bgs);
576

577
	smp_mb();
578
	if (cur_trans->state >= TRANS_STATE_BLOCKED &&
579
	    may_wait_transaction(fs_info, type)) {
580
		current->journal_info = h;
581
		btrfs_commit_transaction(h);
582 583 584
		goto again;
	}

585
	if (num_bytes) {
586
		trace_btrfs_space_reservation(fs_info, "transaction",
587
					      h->transid, num_bytes, 1);
588
		h->block_rsv = &fs_info->trans_block_rsv;
589
		h->bytes_reserved = num_bytes;
590
		h->reloc_reserved = reloc_reserved;
591
	}
J
Josef Bacik 已提交
592

593
got_it:
J
Josef Bacik 已提交
594
	btrfs_record_root_in_trans(h, root);
595 596 597

	if (!current->journal_info && type != TRANS_USERSPACE)
		current->journal_info = h;
C
Chris Mason 已提交
598
	return h;
599 600

join_fail:
601
	if (type & __TRANS_FREEZABLE)
602
		sb_end_intwrite(fs_info->sb);
603 604 605
	kmem_cache_free(btrfs_trans_handle_cachep, h);
alloc_fail:
	if (num_bytes)
606
		btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv,
607 608
					num_bytes);
reserve_fail:
609
	btrfs_qgroup_free_meta(root, qgroup_reserved);
610
	return ERR_PTR(ret);
C
Chris Mason 已提交
611 612
}

613
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
614
						   unsigned int num_items)
615
{
M
Miao Xie 已提交
616
	return start_transaction(root, num_items, TRANS_START,
617
				 BTRFS_RESERVE_FLUSH_ALL, true);
618
}
619

620 621 622 623 624
struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
					struct btrfs_root *root,
					unsigned int num_items,
					int min_factor)
{
625
	struct btrfs_fs_info *fs_info = root->fs_info;
626 627 628 629
	struct btrfs_trans_handle *trans;
	u64 num_bytes;
	int ret;

630 631 632 633 634 635 636 637
	/*
	 * We have two callers: unlink and block group removal.  The
	 * former should succeed even if we will temporarily exceed
	 * quota and the latter operates on the extent root so
	 * qgroup enforcement is ignored anyway.
	 */
	trans = start_transaction(root, num_items, TRANS_START,
				  BTRFS_RESERVE_FLUSH_ALL, false);
638 639 640 641 642 643 644
	if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
		return trans;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans))
		return trans;

645 646 647
	num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items);
	ret = btrfs_cond_migrate_bytes(fs_info, &fs_info->trans_block_rsv,
				       num_bytes, min_factor);
648
	if (ret) {
649
		btrfs_end_transaction(trans);
650 651 652
		return ERR_PTR(ret);
	}

653
	trans->block_rsv = &fs_info->trans_block_rsv;
654
	trans->bytes_reserved = num_bytes;
655
	trace_btrfs_space_reservation(fs_info, "transaction",
656
				      trans->transid, num_bytes, 1);
657 658 659

	return trans;
}
660

M
Miao Xie 已提交
661
struct btrfs_trans_handle *btrfs_start_transaction_lflush(
662 663
					struct btrfs_root *root,
					unsigned int num_items)
664
{
M
Miao Xie 已提交
665
	return start_transaction(root, num_items, TRANS_START,
666
				 BTRFS_RESERVE_FLUSH_LIMIT, true);
667 668
}

669
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
670
{
671 672
	return start_transaction(root, 0, TRANS_JOIN, BTRFS_RESERVE_NO_FLUSH,
				 true);
673 674
}

675
struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
676
{
677
	return start_transaction(root, 0, TRANS_JOIN_NOLOCK,
678
				 BTRFS_RESERVE_NO_FLUSH, true);
679 680
}

681
struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
682
{
683
	return start_transaction(root, 0, TRANS_USERSPACE,
684
				 BTRFS_RESERVE_NO_FLUSH, true);
685 686
}

M
Miao Xie 已提交
687 688 689 690 691 692 693 694 695 696 697 698 699
/*
 * btrfs_attach_transaction() - catch the running transaction
 *
 * It is used when we want to commit the current the transaction, but
 * don't want to start a new one.
 *
 * Note: If this function return -ENOENT, it just means there is no
 * running transaction. But it is possible that the inactive transaction
 * is still in the memory, not fully on disk. If you hope there is no
 * inactive transaction in the fs when -ENOENT is returned, you should
 * invoke
 *     btrfs_attach_transaction_barrier()
 */
700
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
701
{
702
	return start_transaction(root, 0, TRANS_ATTACH,
703
				 BTRFS_RESERVE_NO_FLUSH, true);
704 705
}

M
Miao Xie 已提交
706
/*
707
 * btrfs_attach_transaction_barrier() - catch the running transaction
M
Miao Xie 已提交
708 709 710 711 712 713 714 715 716 717
 *
 * It is similar to the above function, the differentia is this one
 * will wait for all the inactive transactions until they fully
 * complete.
 */
struct btrfs_trans_handle *
btrfs_attach_transaction_barrier(struct btrfs_root *root)
{
	struct btrfs_trans_handle *trans;

718
	trans = start_transaction(root, 0, TRANS_ATTACH,
719
				  BTRFS_RESERVE_NO_FLUSH, true);
M
Miao Xie 已提交
720
	if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT)
721
		btrfs_wait_for_commit(root->fs_info, 0);
M
Miao Xie 已提交
722 723 724 725

	return trans;
}

C
Chris Mason 已提交
726
/* wait for a transaction commit to be fully complete */
727
static noinline void wait_for_commit(struct btrfs_transaction *commit)
728
{
729
	wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED);
730 731
}

732
/*
 * Wait for the commit of a specific transaction (@transid != 0) or, with
 * @transid == 0, for the newest transaction that is currently committing.
 * Returns 0 on success or -EINVAL if the requested transid does not exist.
 */
int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
{
	struct btrfs_transaction *cur_trans = NULL, *t;
	int ret = 0;

	if (transid) {
		if (transid <= fs_info->last_trans_committed)
			goto out;

		/* find specified transaction */
		spin_lock(&fs_info->trans_lock);
		list_for_each_entry(t, &fs_info->trans_list, list) {
			if (t->transid == transid) {
				cur_trans = t;
				refcount_inc(&cur_trans->use_count);
				ret = 0;
				break;
			}
			if (t->transid > transid) {
				ret = 0;
				break;
			}
		}
		spin_unlock(&fs_info->trans_lock);

		/*
		 * The specified transaction doesn't exist, or we
		 * raced with btrfs_commit_transaction
		 */
		if (!cur_trans) {
			if (transid > fs_info->last_trans_committed)
				ret = -EINVAL;
			goto out;
		}
	} else {
		/* find newest transaction that is committing | committed */
		spin_lock(&fs_info->trans_lock);
		list_for_each_entry_reverse(t, &fs_info->trans_list,
					    list) {
			if (t->state >= TRANS_STATE_COMMIT_START) {
				if (t->state == TRANS_STATE_COMPLETED)
					break;
				cur_trans = t;
				refcount_inc(&cur_trans->use_count);
				break;
			}
		}
		spin_unlock(&fs_info->trans_lock);
		if (!cur_trans)
			goto out;  /* nothing committing|committed */
	}

	wait_for_commit(cur_trans);
	btrfs_put_transaction(cur_trans);
out:
	return ret;
}

790
void btrfs_throttle(struct btrfs_fs_info *fs_info)
C
Chris Mason 已提交
791
{
792
	if (!atomic_read(&fs_info->open_ioctl_trans))
793
		wait_current_trans(fs_info);
C
Chris Mason 已提交
794 795
}

796
static int should_end_transaction(struct btrfs_trans_handle *trans)
797
{
798
	struct btrfs_fs_info *fs_info = trans->fs_info;
799 800

	if (fs_info->global_block_rsv.space_info->full &&
801
	    btrfs_check_space_for_delayed_refs(trans, fs_info))
802
		return 1;
803

804
	return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5);
805 806
}

807
int btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
808 809
{
	struct btrfs_transaction *cur_trans = trans->transaction;
810
	struct btrfs_fs_info *fs_info = trans->fs_info;
811
	int updates;
812
	int err;
813

J
Josef Bacik 已提交
814
	smp_mb();
815 816
	if (cur_trans->state >= TRANS_STATE_BLOCKED ||
	    cur_trans->delayed_refs.flushing)
817 818 819 820
		return 1;

	updates = trans->delayed_ref_updates;
	trans->delayed_ref_updates = 0;
821
	if (updates) {
822
		err = btrfs_run_delayed_refs(trans, fs_info, updates * 2);
823 824 825
		if (err) /* Error code will also eval true */
			return err;
	}
826

827
	return should_end_transaction(trans);
828 829
}

830
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
831
				   int throttle)
C
Chris Mason 已提交
832
{
833
	struct btrfs_fs_info *info = trans->fs_info;
834
	struct btrfs_transaction *cur_trans = trans->transaction;
835
	u64 transid = trans->transid;
836
	unsigned long cur = trans->delayed_ref_updates;
837
	int lock = (trans->type != TRANS_JOIN_NOLOCK);
838
	int err = 0;
C
Chris Mason 已提交
839
	int must_run_delayed_refs = 0;
840

841 842
	if (trans->use_count > 1) {
		trans->use_count--;
843 844 845 846
		trans->block_rsv = trans->orig_rsv;
		return 0;
	}

847
	btrfs_trans_release_metadata(trans, info);
848
	trans->block_rsv = NULL;
849

850
	if (!list_empty(&trans->new_bgs))
851
		btrfs_create_pending_block_groups(trans, info);
852

853
	trans->delayed_ref_updates = 0;
C
Chris Mason 已提交
854 855
	if (!trans->sync) {
		must_run_delayed_refs =
856
			btrfs_should_throttle_delayed_refs(trans, info);
857
		cur = max_t(unsigned long, cur, 32);
C
Chris Mason 已提交
858 859 860 861 862 863 864 865

		/*
		 * don't make the caller wait if they are from a NOLOCK
		 * or ATTACH transaction, it will deadlock with commit
		 */
		if (must_run_delayed_refs == 1 &&
		    (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH)))
			must_run_delayed_refs = 2;
866
	}
867

868
	btrfs_trans_release_metadata(trans, info);
869
	trans->block_rsv = NULL;
870

871
	if (!list_empty(&trans->new_bgs))
872
		btrfs_create_pending_block_groups(trans, info);
873

874 875
	btrfs_trans_release_chunk_metadata(trans);

876
	if (lock && !atomic_read(&info->open_ioctl_trans) &&
877
	    should_end_transaction(trans) &&
S
Seraphime Kirkovski 已提交
878
	    READ_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
879 880 881 882
		spin_lock(&info->trans_lock);
		if (cur_trans->state == TRANS_STATE_RUNNING)
			cur_trans->state = TRANS_STATE_BLOCKED;
		spin_unlock(&info->trans_lock);
J
Josef Bacik 已提交
883
	}
884

S
Seraphime Kirkovski 已提交
885
	if (lock && READ_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
886
		if (throttle)
887
			return btrfs_commit_transaction(trans);
888
		else
889 890 891
			wake_up_process(info->transaction_kthread);
	}

892
	if (trans->type & __TRANS_FREEZABLE)
893
		sb_end_intwrite(info->sb);
894

895
	WARN_ON(cur_trans != info->running_transaction);
896 897
	WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
	atomic_dec(&cur_trans->num_writers);
898
	extwriter_counter_dec(cur_trans, trans->type);
899

900 901 902
	/*
	 * Make sure counter is updated before we wake up waiters.
	 */
903
	smp_mb();
C
Chris Mason 已提交
904 905
	if (waitqueue_active(&cur_trans->writer_wait))
		wake_up(&cur_trans->writer_wait);
906
	btrfs_put_transaction(cur_trans);
J
Josef Bacik 已提交
907 908 909

	if (current->journal_info == trans)
		current->journal_info = NULL;
910

Y
Yan, Zheng 已提交
911
	if (throttle)
912
		btrfs_run_delayed_iputs(info);
Y
Yan, Zheng 已提交
913

914
	if (trans->aborted ||
915
	    test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) {
J
Josef Bacik 已提交
916
		wake_up_process(info->transaction_kthread);
917
		err = -EIO;
J
Josef Bacik 已提交
918
	}
919

920
	kmem_cache_free(btrfs_trans_handle_cachep, trans);
C
Chris Mason 已提交
921
	if (must_run_delayed_refs) {
922
		btrfs_async_run_delayed_refs(info, cur, transid,
C
Chris Mason 已提交
923 924
					     must_run_delayed_refs == 1);
	}
925
	return err;
C
Chris Mason 已提交
926 927
}

928
/*
 * End a transaction handle without throttling: drop this handle's writer
 * reference via __btrfs_end_transaction() with throttle == 0.
 */
int btrfs_end_transaction(struct btrfs_trans_handle *trans)
{
	return __btrfs_end_transaction(trans, 0);
}

933
/*
 * End a transaction handle with throttling (throttle == 1): in addition to
 * dropping the handle, the common helper may commit the transaction if it
 * is blocked and will run delayed iputs.
 */
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans)
{
	return __btrfs_end_transaction(trans, 1);
}

C
Chris Mason 已提交
938 939 940
/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are sent to disk but does not wait on them
 *
 * Returns the last error seen while starting writeback (0 on success).
 */
int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
			       struct extent_io_tree *dirty_pages, int mark)
{
	int err = 0;
	int werr = 0;
	struct address_space *mapping = fs_info->btree_inode->i_mapping;
	struct extent_state *cached_state = NULL;
	u64 start = 0;
	u64 end;

	/* Walk every range in @dirty_pages carrying @mark. */
	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
				      mark, &cached_state)) {
		bool wait_writeback = false;

		/* Swap @mark for EXTENT_NEED_WAIT so the wait pass finds it. */
		err = convert_extent_bit(dirty_pages, start, end,
					 EXTENT_NEED_WAIT,
					 mark, &cached_state);
		/*
		 * convert_extent_bit can return -ENOMEM, which is most of the
		 * time a temporary error. So when it happens, ignore the error
		 * and wait for writeback of this range to finish - because we
		 * failed to set the bit EXTENT_NEED_WAIT for the range, a call
		 * to __btrfs_wait_marked_extents() would not know that
		 * writeback for this range started and therefore wouldn't
		 * wait for it to finish - we don't want to commit a
		 * superblock that points to btree nodes/leafs for which
		 * writeback hasn't finished yet (and without errors).
		 * We cleanup any entries left in the io tree when committing
		 * the transaction (through clear_btree_io_tree()).
		 */
		if (err == -ENOMEM) {
			err = 0;
			wait_writeback = true;
		}
		if (!err)
			err = filemap_fdatawrite_range(mapping, start, end);
		if (err)
			werr = err;
		else if (wait_writeback)
			/* -ENOMEM fallback: wait here instead of the wait pass. */
			werr = filemap_fdatawait_range(mapping, start, end);
		free_extent_state(cached_state);
		cached_state = NULL;
		cond_resched();
		start = end + 1;
	}
	return werr;
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit.  We wait
 * on all the pages and clear them from the dirty pages state tree
 */
997 998
static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
				       struct extent_io_tree *dirty_pages)
999 1000 1001
{
	int err = 0;
	int werr = 0;
1002
	struct address_space *mapping = fs_info->btree_inode->i_mapping;
1003
	struct extent_state *cached_state = NULL;
1004 1005
	u64 start = 0;
	u64 end;
1006

J
Josef Bacik 已提交
1007
	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
1008
				      EXTENT_NEED_WAIT, &cached_state)) {
1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023
		/*
		 * Ignore -ENOMEM errors returned by clear_extent_bit().
		 * When committing the transaction, we'll remove any entries
		 * left in the io tree. For a log commit, we don't remove them
		 * after committing the log because the tree can be accessed
		 * concurrently - we do it only at transaction commit time when
		 * it's safe to do it (through clear_btree_io_tree()).
		 */
		err = clear_extent_bit(dirty_pages, start, end,
				       EXTENT_NEED_WAIT,
				       0, 0, &cached_state, GFP_NOFS);
		if (err == -ENOMEM)
			err = 0;
		if (!err)
			err = filemap_fdatawait_range(mapping, start, end);
J
Josef Bacik 已提交
1024 1025
		if (err)
			werr = err;
1026 1027
		free_extent_state(cached_state);
		cached_state = NULL;
J
Josef Bacik 已提交
1028 1029
		cond_resched();
		start = end + 1;
1030
	}
1031 1032
	if (err)
		werr = err;
1033 1034
	return werr;
}
1035

1036 1037 1038 1039 1040
/*
 * Wait for writeback of transaction btree extents and report errors.
 *
 * Combines the wait result with the sticky BTRFS_FS_BTREE_ERR flag
 * (test-and-clear), so a writeback error recorded elsewhere is surfaced
 * as -EIO even if the wait itself succeeded.
 */
int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
		       struct extent_io_tree *dirty_pages)
{
	bool errors = false;
	int err;

	err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
	if (test_and_clear_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags))
		errors = true;

	if (errors && !err)
		err = -EIO;
	return err;
}
1050

1051 1052 1053 1054 1055 1056
/*
 * Wait for writeback of a log tree's marked extents.
 *
 * @mark selects which sticky error flag(s) to check: EXTENT_DIRTY maps to
 * BTRFS_FS_LOG1_ERR and EXTENT_NEW to BTRFS_FS_LOG2_ERR (test-and-clear).
 * A recorded error is surfaced as -EIO even if the wait succeeded.
 */
int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark)
{
	struct btrfs_fs_info *fs_info = log_root->fs_info;
	struct extent_io_tree *dirty_pages = &log_root->dirty_log_pages;
	bool errors = false;
	int err;

	/* Only valid for log trees. */
	ASSERT(log_root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);

	err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
	if ((mark & EXTENT_DIRTY) &&
	    test_and_clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags))
		errors = true;

	if ((mark & EXTENT_NEW) &&
	    test_and_clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags))
		errors = true;

	if (errors && !err)
		err = -EIO;
	return err;
}

1074 1075 1076 1077 1078
/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit
 *
 * Writes are issued under a block plug to batch the I/O; the write error,
 * if any, takes precedence over the wait error.
 */
static int btrfs_write_and_wait_marked_extents(struct btrfs_fs_info *fs_info,
				struct extent_io_tree *dirty_pages, int mark)
{
	int ret;
	int ret2;
	struct blk_plug plug;

	blk_start_plug(&plug);
	ret = btrfs_write_marked_extents(fs_info, dirty_pages, mark);
	blk_finish_plug(&plug);
	ret2 = btrfs_wait_extents(fs_info, dirty_pages);

	if (ret)
		return ret;
	if (ret2)
		return ret2;
	return 0;
}

1098
/*
 * Write out and wait on all EXTENT_DIRTY btree extents of the current
 * transaction, then drop whatever is left in its io tree - the io tree
 * entries are only needed while the commit is in flight.
 */
static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
					    struct btrfs_fs_info *fs_info)
{
	int ret;

	ret = btrfs_write_and_wait_marked_extents(fs_info,
					   &trans->transaction->dirty_pages,
					   EXTENT_DIRTY);
	clear_btree_io_tree(&trans->transaction->dirty_pages);

	return ret;
}

C
Chris Mason 已提交
1111 1112 1113 1114 1115 1116 1117 1118 1119 1120
/*
 * this is used to update the root pointer in the tree of tree roots.
 *
 * But, in the case of the extent allocation tree, updating the root
 * pointer may allocate blocks which may change the root of the extent
 * allocation tree.
 *
 * So, this loops and repeats and makes sure the cowonly root didn't
 * change while the root pointer was being updated in the metadata.
 */
static int update_cowonly_root(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	int ret;
	u64 old_root_bytenr;
	u64 old_root_used;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *tree_root = fs_info->tree_root;

	old_root_used = btrfs_root_used(&root->root_item);

	while (1) {
		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
		/* Stable: neither the root block nor the used bytes moved. */
		if (old_root_bytenr == root->node->start &&
		    old_root_used == btrfs_root_used(&root->root_item))
			break;

		btrfs_set_root_node(&root->root_item, root->node);
		/* This update may itself COW blocks and move the root again. */
		ret = btrfs_update_root(trans, tree_root,
					&root->root_key,
					&root->root_item);
		if (ret)
			return ret;

		old_root_used = btrfs_root_used(&root->root_item);
	}

	return 0;
}

C
Chris Mason 已提交
1151 1152
/*
 * update all the cowonly tree roots on disk
 *
 * The error handling in this function may not be obvious. Any of the
 * failures will cause the file system to go offline. We still need
 * to clean up the delayed refs.
 */
static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
					 struct btrfs_fs_info *fs_info)
{
	struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
	struct list_head *io_bgs = &trans->transaction->io_bgs;
	struct list_head *next;
	struct extent_buffer *eb;
	int ret;

	/* COW the tree root's root node up front so it is part of this commit. */
	eb = btrfs_lock_root_node(fs_info->tree_root);
	ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
			      0, &eb);
	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	if (ret)
		return ret;

	ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
	if (ret)
		return ret;

	/* Flush per-device and replace/qgroup state into their items. */
	ret = btrfs_run_dev_stats(trans, fs_info);
	if (ret)
		return ret;
	ret = btrfs_run_dev_replace(trans, fs_info);
	if (ret)
		return ret;
	ret = btrfs_run_qgroups(trans, fs_info);
	if (ret)
		return ret;

	ret = btrfs_setup_space_cache(trans, fs_info);
	if (ret)
		return ret;

	/* run_qgroups might have added some more refs */
	ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
	if (ret)
		return ret;
again:
	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
		struct btrfs_root *root;
		next = fs_info->dirty_cowonly_roots.next;
		list_del_init(next);
		root = list_entry(next, struct btrfs_root, dirty_list);
		clear_bit(BTRFS_ROOT_DIRTY, &root->state);

		/* The extent root is queued last, after the loop below. */
		if (root != fs_info->extent_root)
			list_add_tail(&root->dirty_list,
				      &trans->transaction->switch_commits);
		ret = update_cowonly_root(trans, root);
		if (ret)
			return ret;
		ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
		if (ret)
			return ret;
	}

	/* Writing block groups can dirty more roots; drain both lists. */
	while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) {
		ret = btrfs_write_dirty_block_groups(trans, fs_info);
		if (ret)
			return ret;
		ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
		if (ret)
			return ret;
	}

	/* The block group pass may have re-dirtied cowonly roots. */
	if (!list_empty(&fs_info->dirty_cowonly_roots))
		goto again;

	list_add_tail(&fs_info->extent_root->dirty_list,
		      &trans->transaction->switch_commits);
	btrfs_after_dev_replace_commit(fs_info);

	return 0;
}

C
Chris Mason 已提交
1236 1237 1238 1239 1240
/*
 * dead roots are old snapshots that need to be deleted.  This adds the
 * given root to the list of dead roots that need to be deleted, unless it
 * is already queued there (list_empty() check under trans_lock).
 */
void btrfs_add_dead_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	spin_lock(&fs_info->trans_lock);
	if (list_empty(&root->root_list))
		list_add_tail(&root->root_list, &fs_info->dead_roots);
	spin_unlock(&fs_info->trans_lock);
}

C
Chris Mason 已提交
1251
/*
 * Commit all fs/subvolume roots that were modified in this transaction.
 * (The old comment said "cowonly tree roots", which was wrong - this walks
 * the fs_roots radix tree, not the cowonly list.)
 *
 * Roots touched in this transaction carry BTRFS_ROOT_TRANS_TAG; for each
 * one we flush its log, reloc, orphan and ino-cache state and write its
 * root item into the tree of tree roots.
 */
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
				    struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *gang[8];
	int i;
	int ret;
	int err = 0;

	spin_lock(&fs_info->fs_roots_radix_lock);
	while (1) {
		ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
						 (void **)gang, 0,
						 ARRAY_SIZE(gang),
						 BTRFS_ROOT_TRANS_TAG);
		if (ret == 0)
			break;
		for (i = 0; i < ret; i++) {
			struct btrfs_root *root = gang[i];
			/* Clear the tag before dropping the lock to work on it. */
			radix_tree_tag_clear(&fs_info->fs_roots_radix,
					(unsigned long)root->root_key.objectid,
					BTRFS_ROOT_TRANS_TAG);
			spin_unlock(&fs_info->fs_roots_radix_lock);

			btrfs_free_log(trans, root);
			btrfs_update_reloc_root(trans, root);
			btrfs_orphan_commit_root(trans, root);

			btrfs_save_ino_cache(root, trans);

			/* see comments in should_cow_block() */
			clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
			smp_mb__after_atomic();

			if (root->commit_root != root->node) {
				list_add_tail(&root->dirty_list,
					&trans->transaction->switch_commits);
				btrfs_set_root_node(&root->root_item,
						    root->node);
			}

			err = btrfs_update_root(trans, fs_info->tree_root,
						&root->root_key,
						&root->root_item);
			spin_lock(&fs_info->fs_roots_radix_lock);
			if (err)
				break;
			btrfs_qgroup_free_meta_all(root);
		}
	}
	spin_unlock(&fs_info->fs_roots_radix_lock);
	return err;
}

C
Chris Mason 已提交
1307
/*
1308 1309
 * defrag a given btree.
 * Every leaf in the btree is read and defragged.
C
Chris Mason 已提交
1310
 */
1311
int btrfs_defrag_root(struct btrfs_root *root)
1312 1313 1314
{
	struct btrfs_fs_info *info = root->fs_info;
	struct btrfs_trans_handle *trans;
1315
	int ret;
1316

1317
	if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
1318
		return 0;
1319

1320
	while (1) {
1321 1322 1323 1324
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

1325
		ret = btrfs_defrag_leaves(trans, root);
1326

1327
		btrfs_end_transaction(trans);
1328
		btrfs_btree_balance_dirty(info);
1329 1330
		cond_resched();

1331
		if (btrfs_fs_closing(info) || ret != -EAGAIN)
1332
			break;
1333

1334 1335
		if (btrfs_defrag_cancelled(info)) {
			btrfs_debug(info, "defrag_root cancelled");
1336 1337 1338
			ret = -EAGAIN;
			break;
		}
1339
	}
1340
	clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
1341
	return ret;
1342 1343
}

1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364
/*
 * Do all special snapshot related qgroup dirty hack.
 *
 * Will do all needed qgroup inherit and dirty hack like switch commit
 * roots inside one transaction and write all btree into disk, to make
 * qgroup works.
 */
static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
				   struct btrfs_root *src,
				   struct btrfs_root *parent,
				   struct btrfs_qgroup_inherit *inherit,
				   u64 dst_objectid)
{
	struct btrfs_fs_info *fs_info = src->fs_info;
	int ret;

	/*
	 * Save some performance in the case that qgroups are not
	 * enabled. If this check races with the ioctl, rescan will
	 * kick in anyway.
	 */
	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return 0;

	/*
	 * We are going to commit transaction, see btrfs_commit_transaction()
	 * comment for reason locking tree_log_mutex
	 */
	mutex_lock(&fs_info->tree_log_mutex);

	ret = commit_fs_roots(trans, fs_info);
	if (ret)
		goto out;
	ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
	if (ret < 0)
		goto out;
	ret = btrfs_qgroup_account_extents(trans, fs_info);
	if (ret < 0)
		goto out;

	/* Now qgroup are all updated, we can inherit it to new qgroups */
	ret = btrfs_qgroup_inherit(trans, fs_info,
				   src->root_key.objectid, dst_objectid,
				   inherit);
	if (ret < 0)
		goto out;

	/*
	 * Now we do a simplified commit transaction, which will:
	 * 1) commit all subvolume and extent tree
	 *    To ensure all subvolume and extent tree have a valid
	 *    commit_root to accounting later insert_dir_item()
	 * 2) write all btree blocks onto disk
	 *    This is to make sure later btree modification will be cowed
	 *    Or commit_root can be populated and cause wrong qgroup numbers
	 * In this simplified commit, we don't really care about other trees
	 * like chunk and root tree, as they won't affect qgroup.
	 * And we don't write super to avoid half committed status.
	 */
	ret = commit_cowonly_roots(trans, fs_info);
	if (ret)
		goto out;
	switch_commit_roots(trans->transaction, fs_info);
	ret = btrfs_write_and_wait_transaction(trans, fs_info);
	if (ret)
		btrfs_handle_fs_error(fs_info, ret,
			"Error while writing out transaction for qgroup");

out:
	mutex_unlock(&fs_info->tree_log_mutex);

	/*
	 * Force parent root to be updated, as we recorded it before so its
	 * last_trans == cur_transid.
	 * Or it won't be committed again onto disk after later
	 * insert_dir_item()
	 */
	if (!ret)
		record_root_in_trans(trans, parent, 1);
	return ret;
}

C
Chris Mason 已提交
1426 1427
/*
 * new snapshots need to be created at a very specific time in the
 * transaction commit.  This does the actual creation.
 *
 * Note:
 * If the error which may affect the commitment of the current transaction
 * happens, we should return the error number. If the error which just affect
 * the creation of the pending snapshots, just return 0.
 */
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
				   struct btrfs_fs_info *fs_info,
				   struct btrfs_pending_snapshot *pending)
{
	struct btrfs_key key;
	struct btrfs_root_item *new_root_item;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root = pending->root;
	struct btrfs_root *parent_root;
	struct btrfs_block_rsv *rsv;
	struct inode *parent_inode;
	struct btrfs_path *path;
	struct btrfs_dir_item *dir_item;
	struct dentry *dentry;
	struct extent_buffer *tmp;
	struct extent_buffer *old;
	struct timespec cur_time;
	int ret = 0;
	u64 to_reserve = 0;
	u64 index = 0;
	u64 objectid;
	u64 root_flags;
	uuid_le new_uuid;

	/* path and root_item were preallocated by the snapshot ioctl. */
	ASSERT(pending->path);
	path = pending->path;

	ASSERT(pending->root_item);
	new_root_item = pending->root_item;

	pending->error = btrfs_find_free_objectid(tree_root, &objectid);
	if (pending->error)
		goto no_free_objectid;

	/*
	 * Make qgroup to skip current new snapshot's qgroupid, as it is
	 * accounted by later btrfs_qgroup_inherit().
	 */
	btrfs_set_skip_qgroup(trans, objectid);

	btrfs_reloc_pre_snapshot(pending, &to_reserve);

	if (to_reserve > 0) {
		pending->error = btrfs_block_rsv_add(root,
						     &pending->block_rsv,
						     to_reserve,
						     BTRFS_RESERVE_NO_FLUSH);
		if (pending->error)
			goto clear_skip_qgroup;
	}

	key.objectid = objectid;
	key.offset = (u64)-1;
	key.type = BTRFS_ROOT_ITEM_KEY;

	/* Swap in the snapshot's reservation; restored on the exit path. */
	rsv = trans->block_rsv;
	trans->block_rsv = &pending->block_rsv;
	trans->bytes_reserved = trans->block_rsv->reserved;
	trace_btrfs_space_reservation(fs_info, "transaction",
				      trans->transid,
				      trans->bytes_reserved, 1);
	dentry = pending->dentry;
	parent_inode = pending->dir;
	parent_root = BTRFS_I(parent_inode)->root;
	record_root_in_trans(trans, parent_root, 0);

	cur_time = current_time(parent_inode);

	/*
	 * insert the directory item
	 */
	ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index);
	BUG_ON(ret); /* -ENOMEM */

	/* check if there is a file/dir which has the same name. */
	dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
					 btrfs_ino(BTRFS_I(parent_inode)),
					 dentry->d_name.name,
					 dentry->d_name.len, 0);
	if (dir_item != NULL && !IS_ERR(dir_item)) {
		pending->error = -EEXIST;
		goto dir_item_existed;
	} else if (IS_ERR(dir_item)) {
		ret = PTR_ERR(dir_item);
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}
	btrfs_release_path(path);

	/*
	 * pull in the delayed directory update
	 * and the delayed inode item
	 * otherwise we corrupt the FS during
	 * snapshot
	 */
	ret = btrfs_run_delayed_items(trans, fs_info);
	if (ret) {	/* Transaction aborted */
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}

	record_root_in_trans(trans, root, 0);
	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
	btrfs_check_and_init_root_item(new_root_item);

	root_flags = btrfs_root_flags(new_root_item);
	if (pending->readonly)
		root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
	else
		root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
	btrfs_set_root_flags(new_root_item, root_flags);

	btrfs_set_root_generation_v2(new_root_item,
			trans->transid);
	uuid_le_gen(&new_uuid);
	memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
	memcpy(new_root_item->parent_uuid, root->root_item.uuid,
			BTRFS_UUID_SIZE);
	/* Writable snapshots do not inherit received/send-time metadata. */
	if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) {
		memset(new_root_item->received_uuid, 0,
		       sizeof(new_root_item->received_uuid));
		memset(&new_root_item->stime, 0, sizeof(new_root_item->stime));
		memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime));
		btrfs_set_root_stransid(new_root_item, 0);
		btrfs_set_root_rtransid(new_root_item, 0);
	}
	btrfs_set_stack_timespec_sec(&new_root_item->otime, cur_time.tv_sec);
	btrfs_set_stack_timespec_nsec(&new_root_item->otime, cur_time.tv_nsec);
	btrfs_set_root_otransid(new_root_item, trans->transid);

	old = btrfs_lock_root_node(root);
	ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
	if (ret) {
		btrfs_tree_unlock(old);
		free_extent_buffer(old);
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}

	btrfs_set_lock_blocking(old);

	ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
	/* clean up in any case */
	btrfs_tree_unlock(old);
	free_extent_buffer(old);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}
	/* see comments in should_cow_block() */
	set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
	smp_wmb();

	btrfs_set_root_node(new_root_item, tmp);
	/* record when the snapshot was created in key.offset */
	key.offset = trans->transid;
	ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
	btrfs_tree_unlock(tmp);
	free_extent_buffer(tmp);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}

	/*
	 * insert root back/forward references
	 */
	ret = btrfs_add_root_ref(trans, fs_info, objectid,
				 parent_root->root_key.objectid,
				 btrfs_ino(BTRFS_I(parent_inode)), index,
				 dentry->d_name.name, dentry->d_name.len);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}

	key.offset = (u64)-1;
	pending->snap = btrfs_read_fs_root_no_name(fs_info, &key);
	if (IS_ERR(pending->snap)) {
		ret = PTR_ERR(pending->snap);
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}

	ret = btrfs_reloc_post_snapshot(trans, pending);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}

	ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}

	/*
	 * Do special qgroup accounting for snapshot, as we do some qgroup
	 * snapshot hack to do fast snapshot.
	 * To co-operate with that hack, we do hack again.
	 * Or snapshot will be greatly slowed down by a subtree qgroup rescan
	 */
	ret = qgroup_account_snapshot(trans, root, parent_root,
				      pending->inherit, objectid);
	if (ret < 0)
		goto fail;

	ret = btrfs_insert_dir_item(trans, parent_root,
				    dentry->d_name.name, dentry->d_name.len,
				    BTRFS_I(parent_inode), &key,
				    BTRFS_FT_DIR, index);
	/* We have checked the name at the beginning, so it is impossible. */
	BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}

	btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size +
					 dentry->d_name.len * 2);
	parent_inode->i_mtime = parent_inode->i_ctime =
		current_time(parent_inode);
	ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}
	ret = btrfs_uuid_tree_add(trans, fs_info, new_uuid.b,
				  BTRFS_UUID_KEY_SUBVOL, objectid);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}
	if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) {
		ret = btrfs_uuid_tree_add(trans, fs_info,
					  new_root_item->received_uuid,
					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
					  objectid);
		if (ret && ret != -EEXIST) {
			btrfs_abort_transaction(trans, ret);
			goto fail;
		}
	}

	ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}

fail:
	pending->error = ret;
dir_item_existed:
	/* Restore the caller's reservation swapped out above. */
	trans->block_rsv = rsv;
	trans->bytes_reserved = 0;
clear_skip_qgroup:
	btrfs_clear_skip_qgroup(trans);
no_free_objectid:
	kfree(new_root_item);
	pending->root_item = NULL;
	btrfs_free_path(path);
	pending->path = NULL;

	return ret;
}

C
Chris Mason 已提交
1702 1703 1704
/*
 * create all the snapshots we've scheduled for creation
 *
 * Stops at the first failure; the remaining pending entries stay queued
 * on the transaction's list.
 */
static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
					     struct btrfs_fs_info *fs_info)
{
	struct btrfs_pending_snapshot *pending, *next;
	struct list_head *head = &trans->transaction->pending_snapshots;
	int ret = 0;

	list_for_each_entry_safe(pending, next, head, list) {
		list_del(&pending->list);
		ret = create_pending_snapshot(trans, fs_info, pending);
		if (ret)
			break;
	}
	return ret;
}

1721
/*
 * Copy the chunk root and tree root pointers (bytenr/generation/level)
 * into the in-memory superblock copy before it is written out.
 */
static void update_super_roots(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root_item *root_item;
	struct btrfs_super_block *super;

	super = fs_info->super_copy;

	root_item = &fs_info->chunk_root->root_item;
	super->chunk_root = root_item->bytenr;
	super->chunk_root_generation = root_item->generation;
	super->chunk_root_level = root_item->level;

	root_item = &fs_info->tree_root->root_item;
	super->root = root_item->bytenr;
	super->generation = root_item->generation;
	super->root_level = root_item->level;
	/* Space cache and uuid-tree generations track the tree root's. */
	if (btrfs_test_opt(fs_info, SPACE_CACHE))
		super->cache_generation = root_item->generation;
	if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))
		super->uuid_tree_generation = root_item->generation;
}

1743 1744
/*
 * Return non-zero if the running transaction has reached at least
 * TRANS_STATE_COMMIT_START. Snapshot of the state under trans_lock.
 */
int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
{
	struct btrfs_transaction *trans;
	int ret = 0;

	spin_lock(&info->trans_lock);
	trans = info->running_transaction;
	if (trans)
		ret = (trans->state >= TRANS_STATE_COMMIT_START);
	spin_unlock(&info->trans_lock);
	return ret;
}

1756 1757
/*
 * Return non-zero if the running transaction is in a blocked state
 * (per is_transaction_blocked()). Snapshot of the state under trans_lock.
 */
int btrfs_transaction_blocked(struct btrfs_fs_info *info)
{
	struct btrfs_transaction *trans;
	int ret = 0;

	spin_lock(&info->trans_lock);
	trans = info->running_transaction;
	if (trans)
		ret = is_transaction_blocked(trans);
	spin_unlock(&info->trans_lock);
	return ret;
}

S
Sage Weil 已提交
1769 1770 1771 1772
/*
 * wait for the current transaction commit to start and block subsequent
 * transaction joins
 *
 * Also wakes up on trans->aborted so callers are not stuck on a dead
 * transaction.
 */
static void wait_current_trans_commit_start(struct btrfs_fs_info *fs_info,
					    struct btrfs_transaction *trans)
{
	wait_event(fs_info->transaction_blocked_wait,
		   trans->state >= TRANS_STATE_COMMIT_START || trans->aborted);
}

/*
 * wait for the current transaction to start and then become unblocked.
 * caller holds ref.
 *
 * Also wakes up on trans->aborted so callers are not stuck on a dead
 * transaction.
 */
static void wait_current_trans_commit_start_and_unblock(
					struct btrfs_fs_info *fs_info,
					struct btrfs_transaction *trans)
{
	wait_event(fs_info->transaction_wait,
		   trans->state >= TRANS_STATE_UNBLOCKED || trans->aborted);
}

/*
 * commit transactions asynchronously. once btrfs_commit_transaction_async
 * returns, any subsequent transaction will not be allowed to join.
 */
struct btrfs_async_commit {
	/* handle the async worker will commit */
	struct btrfs_trans_handle *newtrans;
	/* queued via schedule_work(), runs do_async_commit() */
	struct work_struct work;
};

/*
 * Worker side of the async commit: re-acquire the lockdep freeze
 * annotation handed over by btrfs_commit_transaction_async(), then commit
 * the transaction and free the request.
 */
static void do_async_commit(struct work_struct *work)
{
	struct btrfs_async_commit *ac =
		container_of(work, struct btrfs_async_commit, work);

	/*
	 * We've got freeze protection passed with the transaction.
	 * Tell lockdep about it.
	 */
	if (ac->newtrans->type & __TRANS_FREEZABLE)
		__sb_writers_acquired(ac->newtrans->fs_info->sb, SB_FREEZE_FS);

	current->journal_info = ac->newtrans;

	btrfs_commit_transaction(ac->newtrans);
	kfree(ac);
}

/*
 * Kick off an asynchronous commit of the current transaction via a work
 * item, ending the caller's handle immediately.
 *
 * @wait_for_unblock: if set, wait until the commit reaches
 * TRANS_STATE_UNBLOCKED; otherwise only until TRANS_STATE_COMMIT_START.
 */
int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
				   int wait_for_unblock)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_async_commit *ac;
	struct btrfs_transaction *cur_trans;

	ac = kmalloc(sizeof(*ac), GFP_NOFS);
	if (!ac)
		return -ENOMEM;

	INIT_WORK(&ac->work, do_async_commit);
	ac->newtrans = btrfs_join_transaction(trans->root);
	if (IS_ERR(ac->newtrans)) {
		int err = PTR_ERR(ac->newtrans);
		kfree(ac);
		return err;
	}

	/* take transaction reference */
	cur_trans = trans->transaction;
	refcount_inc(&cur_trans->use_count);

	btrfs_end_transaction(trans);

	/*
	 * Tell lockdep we've released the freeze rwsem, since the
	 * async commit thread will be the one to unlock it.
	 */
	if (ac->newtrans->type & __TRANS_FREEZABLE)
		__sb_writers_release(fs_info->sb, SB_FREEZE_FS);

	schedule_work(&ac->work);

	/* wait for transaction to start and unblock */
	if (wait_for_unblock)
		wait_current_trans_commit_start_and_unblock(fs_info, cur_trans);
	else
		wait_current_trans_commit_start(fs_info, cur_trans);

	if (current->journal_info == trans)
		current->journal_info = NULL;

	/* drop the reference taken above */
	btrfs_put_transaction(cur_trans);
	return 0;
}

1866 1867

/*
 * Tear down a transaction whose commit failed: abort it, pull it off the
 * fs-wide transaction list, run the per-transaction cleanup and free the
 * handle.  Called only from the error paths of btrfs_commit_transaction().
 */
static void cleanup_transaction(struct btrfs_trans_handle *trans,
				struct btrfs_root *root, int err)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_transaction *cur_trans = trans->transaction;
	DEFINE_WAIT(wait);

	WARN_ON(trans->use_count > 1);

	btrfs_abort_transaction(trans, err);

	spin_lock(&fs_info->trans_lock);

	/*
	 * If the transaction is removed from the list, it means this
	 * transaction has been committed successfully, so it is impossible
	 * to call the cleanup function.
	 */
	BUG_ON(list_empty(&cur_trans->list));

	list_del_init(&cur_trans->list);
	if (cur_trans == fs_info->running_transaction) {
		/* Block new joiners, then wait until we are the only writer. */
		cur_trans->state = TRANS_STATE_COMMIT_DOING;
		spin_unlock(&fs_info->trans_lock);
		wait_event(cur_trans->writer_wait,
			   atomic_read(&cur_trans->num_writers) == 1);

		spin_lock(&fs_info->trans_lock);
	}
	spin_unlock(&fs_info->trans_lock);

	btrfs_cleanup_one_transaction(trans->transaction, fs_info);

	spin_lock(&fs_info->trans_lock);
	if (cur_trans == fs_info->running_transaction)
		fs_info->running_transaction = NULL;
	spin_unlock(&fs_info->trans_lock);

	if (trans->type & __TRANS_FREEZABLE)
		sb_end_intwrite(fs_info->sb);
	/*
	 * Two puts: the commit path normally drops two references (see the
	 * matching pair at the end of btrfs_commit_transaction()) —
	 * NOTE(review): presumably one for the trans_list entry removed
	 * above and one for the handle's reference; confirm against
	 * btrfs_put_transaction()'s refcounting.
	 */
	btrfs_put_transaction(cur_trans);
	btrfs_put_transaction(cur_trans);

	trace_btrfs_transaction_commit(root);

	if (current->journal_info == trans)
		current->journal_info = NULL;
	btrfs_scrub_cancel(fs_info);

	kmem_cache_free(btrfs_trans_handle_cachep, trans);
}

1919 1920
static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
{
1921
	if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
1922
		return btrfs_start_delalloc_roots(fs_info, 1, -1);
1923 1924 1925 1926 1927
	return 0;
}

static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
{
1928
	if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
1929
		btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
1930 1931
}

1932
/*
 * Block until the transaction's pending_ordered count drops to zero, i.e.
 * until every ordered extent still attached to this transaction has
 * completed and woken pending_wait.
 */
static inline void
btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans)
{
	wait_event(cur_trans->pending_wait,
		   atomic_read(&cur_trans->pending_ordered) == 0);
}

1939
/*
 * Commit the current transaction: flush delayed refs/items and delalloc,
 * block out new writers, create pending snapshots, commit all fs and
 * COW-only roots, write the transaction and superblocks, then unblock and
 * release the transaction.  Ends/frees @trans in all cases.
 *
 * Returns 0 on success or a negative error; on error the transaction is
 * aborted and cleaned up via cleanup_transaction().
 */
int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_transaction *cur_trans = trans->transaction;
	struct btrfs_transaction *prev_trans = NULL;
	int ret;

	/* Stop the commit early if ->aborted is set */
	if (unlikely(READ_ONCE(cur_trans->aborted))) {
		ret = cur_trans->aborted;
		btrfs_end_transaction(trans);
		return ret;
	}

	/* make a pass through all the delayed refs we have so far
	 * any running procs may add more while we are here
	 */
	ret = btrfs_run_delayed_refs(trans, fs_info, 0);
	if (ret) {
		btrfs_end_transaction(trans);
		return ret;
	}

	btrfs_trans_release_metadata(trans, fs_info);
	trans->block_rsv = NULL;

	cur_trans = trans->transaction;

	/*
	 * set the flushing flag so procs in this transaction have to
	 * start sending their work down.
	 */
	cur_trans->delayed_refs.flushing = 1;
	smp_wmb();

	if (!list_empty(&trans->new_bgs))
		btrfs_create_pending_block_groups(trans, fs_info);

	ret = btrfs_run_delayed_refs(trans, fs_info, 0);
	if (ret) {
		btrfs_end_transaction(trans);
		return ret;
	}

	if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
		int run_it = 0;

		/* this mutex is also taken before trying to set
		 * block groups readonly.  We need to make sure
		 * that nobody has set a block group readonly
		 * after extents from that block group have been
		 * allocated for cache files.  btrfs_set_block_group_ro
		 * will wait for the transaction to commit if it
		 * finds BTRFS_TRANS_DIRTY_BG_RUN set.
		 *
		 * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure
		 * only one process starts all the block group IO.  It wouldn't
		 * hurt to have more than one go through, but there's no
		 * real advantage to it either.
		 */
		mutex_lock(&fs_info->ro_block_group_mutex);
		if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN,
				      &cur_trans->flags))
			run_it = 1;
		mutex_unlock(&fs_info->ro_block_group_mutex);

		if (run_it)
			ret = btrfs_start_dirty_block_groups(trans, fs_info);
	}
	if (ret) {
		btrfs_end_transaction(trans);
		return ret;
	}

	spin_lock(&fs_info->trans_lock);
	if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
		/*
		 * Another task already started this commit; just wait for it
		 * to finish and report its result.
		 */
		spin_unlock(&fs_info->trans_lock);
		refcount_inc(&cur_trans->use_count);
		ret = btrfs_end_transaction(trans);

		wait_for_commit(cur_trans);

		if (unlikely(cur_trans->aborted))
			ret = cur_trans->aborted;

		btrfs_put_transaction(cur_trans);

		return ret;
	}

	cur_trans->state = TRANS_STATE_COMMIT_START;
	wake_up(&fs_info->transaction_blocked_wait);

	/* Make sure the previous transaction (if any) has fully committed. */
	if (cur_trans->list.prev != &fs_info->trans_list) {
		prev_trans = list_entry(cur_trans->list.prev,
					struct btrfs_transaction, list);
		if (prev_trans->state != TRANS_STATE_COMPLETED) {
			refcount_inc(&prev_trans->use_count);
			spin_unlock(&fs_info->trans_lock);

			wait_for_commit(prev_trans);
			ret = prev_trans->aborted;

			btrfs_put_transaction(prev_trans);
			if (ret)
				goto cleanup_transaction;
		} else {
			spin_unlock(&fs_info->trans_lock);
		}
	} else {
		spin_unlock(&fs_info->trans_lock);
	}

	extwriter_counter_dec(cur_trans, trans->type);

	ret = btrfs_start_delalloc_flush(fs_info);
	if (ret)
		goto cleanup_transaction;

	ret = btrfs_run_delayed_items(trans, fs_info);
	if (ret)
		goto cleanup_transaction;

	/* Wait for external writers to finish before flushing again. */
	wait_event(cur_trans->writer_wait,
		   extwriter_counter_read(cur_trans) == 0);

	/* some pending work might be added after the previous flush. */
	ret = btrfs_run_delayed_items(trans, fs_info);
	if (ret)
		goto cleanup_transaction;

	btrfs_wait_delalloc_flush(fs_info);

	btrfs_wait_pending_ordered(cur_trans);

	btrfs_scrub_pause(fs_info);
	/*
	 * Ok now we need to make sure to block out any other joins while we
	 * commit the transaction.  We could have started a join before setting
	 * COMMIT_DOING so make sure to wait for num_writers to == 1 again.
	 */
	spin_lock(&fs_info->trans_lock);
	cur_trans->state = TRANS_STATE_COMMIT_DOING;
	spin_unlock(&fs_info->trans_lock);
	wait_event(cur_trans->writer_wait,
		   atomic_read(&cur_trans->num_writers) == 1);

	/* ->aborted might be set after the previous check, so check it */
	if (unlikely(READ_ONCE(cur_trans->aborted))) {
		ret = cur_trans->aborted;
		goto scrub_continue;
	}
	/*
	 * the reloc mutex makes sure that we stop
	 * the balancing code from coming in and moving
	 * extents around in the middle of the commit
	 */
	mutex_lock(&fs_info->reloc_mutex);

	/*
	 * We needn't worry about the delayed items because we will
	 * deal with them in create_pending_snapshot(), which is the
	 * core function of the snapshot creation.
	 */
	ret = create_pending_snapshots(trans, fs_info);
	if (ret) {
		mutex_unlock(&fs_info->reloc_mutex);
		goto scrub_continue;
	}

	/*
	 * We insert the dir indexes of the snapshots and update the inode
	 * of the snapshots' parents after the snapshot creation, so there
	 * are some delayed items which are not dealt with. Now deal with
	 * them.
	 *
	 * We needn't worry that this operation will corrupt the snapshots,
	 * because all the trees which are snapshotted will be forced to COW
	 * the nodes and leaves.
	 */
	ret = btrfs_run_delayed_items(trans, fs_info);
	if (ret) {
		mutex_unlock(&fs_info->reloc_mutex);
		goto scrub_continue;
	}

	ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1);
	if (ret) {
		mutex_unlock(&fs_info->reloc_mutex);
		goto scrub_continue;
	}

	/* Record old roots for later qgroup accounting */
	ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
	if (ret) {
		mutex_unlock(&fs_info->reloc_mutex);
		goto scrub_continue;
	}

	/*
	 * make sure none of the code above managed to slip in a
	 * delayed item
	 */
	btrfs_assert_delayed_root_empty(fs_info);

	WARN_ON(cur_trans != trans->transaction);

	/* btrfs_commit_tree_roots is responsible for getting the
	 * various roots consistent with each other.  Every pointer
	 * in the tree of tree roots has to point to the most up to date
	 * root for every subvolume and other tree.  So, we have to keep
	 * the tree logging code from jumping in and changing any
	 * of the trees.
	 *
	 * At this point in the commit, there can't be any tree-log
	 * writers, but a little lower down we drop the trans mutex
	 * and let new people in.  By holding the tree_log_mutex
	 * from now until after the super is written, we avoid races
	 * with the tree-log code.
	 */
	mutex_lock(&fs_info->tree_log_mutex);

	ret = commit_fs_roots(trans, fs_info);
	if (ret) {
		mutex_unlock(&fs_info->tree_log_mutex);
		mutex_unlock(&fs_info->reloc_mutex);
		goto scrub_continue;
	}

	/*
	 * Since the transaction is done, we can apply the pending changes
	 * before the next transaction.
	 */
	btrfs_apply_pending_changes(fs_info);

	/* commit_fs_roots gets rid of all the tree log roots, it is now
	 * safe to free the root of tree log roots
	 */
	btrfs_free_log_root_tree(trans, fs_info);

	/*
	 * Since fs roots are all committed, we can get a quite accurate
	 * new_roots. So let's do quota accounting.
	 */
	ret = btrfs_qgroup_account_extents(trans, fs_info);
	if (ret < 0) {
		mutex_unlock(&fs_info->tree_log_mutex);
		mutex_unlock(&fs_info->reloc_mutex);
		goto scrub_continue;
	}

	ret = commit_cowonly_roots(trans, fs_info);
	if (ret) {
		mutex_unlock(&fs_info->tree_log_mutex);
		mutex_unlock(&fs_info->reloc_mutex);
		goto scrub_continue;
	}

	/*
	 * The tasks which save the space cache and inode cache may also
	 * update ->aborted, check it.
	 */
	if (unlikely(READ_ONCE(cur_trans->aborted))) {
		ret = cur_trans->aborted;
		mutex_unlock(&fs_info->tree_log_mutex);
		mutex_unlock(&fs_info->reloc_mutex);
		goto scrub_continue;
	}

	btrfs_prepare_extent_commit(fs_info);

	cur_trans = fs_info->running_transaction;

	/* Point the tree and chunk roots at their new nodes for the super. */
	btrfs_set_root_node(&fs_info->tree_root->root_item,
			    fs_info->tree_root->node);
	list_add_tail(&fs_info->tree_root->dirty_list,
		      &cur_trans->switch_commits);

	btrfs_set_root_node(&fs_info->chunk_root->root_item,
			    fs_info->chunk_root->node);
	list_add_tail(&fs_info->chunk_root->dirty_list,
		      &cur_trans->switch_commits);

	switch_commit_roots(cur_trans, fs_info);

	ASSERT(list_empty(&cur_trans->dirty_bgs));
	ASSERT(list_empty(&cur_trans->io_bgs));
	update_super_roots(fs_info);

	/* The log root is zeroed in the committed super; logs replay only
	 * after an unclean shutdown. */
	btrfs_set_super_log_root(fs_info->super_copy, 0);
	btrfs_set_super_log_root_level(fs_info->super_copy, 0);
	memcpy(fs_info->super_for_commit, fs_info->super_copy,
	       sizeof(*fs_info->super_copy));

	btrfs_update_commit_device_size(fs_info);
	btrfs_update_commit_device_bytes_used(fs_info, cur_trans);

	clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
	clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);

	btrfs_trans_release_chunk_metadata(trans);

	spin_lock(&fs_info->trans_lock);
	cur_trans->state = TRANS_STATE_UNBLOCKED;
	fs_info->running_transaction = NULL;
	spin_unlock(&fs_info->trans_lock);
	mutex_unlock(&fs_info->reloc_mutex);

	wake_up(&fs_info->transaction_wait);

	ret = btrfs_write_and_wait_transaction(trans, fs_info);
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret,
				      "Error while writing out transaction");
		mutex_unlock(&fs_info->tree_log_mutex);
		goto scrub_continue;
	}

	ret = write_all_supers(fs_info, 0);
	if (ret) {
		mutex_unlock(&fs_info->tree_log_mutex);
		goto scrub_continue;
	}

	/*
	 * the super is written, we can safely allow the tree-loggers
	 * to go about their business
	 */
	mutex_unlock(&fs_info->tree_log_mutex);

	btrfs_finish_extent_commit(trans, fs_info);

	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
		btrfs_clear_space_info_full(fs_info);

	fs_info->last_trans_committed = cur_trans->transid;
	/*
	 * We needn't acquire the lock here because there is no other task
	 * which can change it.
	 */
	cur_trans->state = TRANS_STATE_COMPLETED;
	wake_up(&cur_trans->commit_wait);

	spin_lock(&fs_info->trans_lock);
	list_del_init(&cur_trans->list);
	spin_unlock(&fs_info->trans_lock);

	/* Drop both remaining references (list + handle); see
	 * cleanup_transaction() for the matching error-path pair. */
	btrfs_put_transaction(cur_trans);
	btrfs_put_transaction(cur_trans);

	if (trans->type & __TRANS_FREEZABLE)
		sb_end_intwrite(fs_info->sb);

	trace_btrfs_transaction_commit(trans->root);

	btrfs_scrub_continue(fs_info);

	if (current->journal_info == trans)
		current->journal_info = NULL;

	kmem_cache_free(btrfs_trans_handle_cachep, trans);

	/*
	 * If fs has been frozen, we can not handle delayed iputs, otherwise
	 * it'll result in deadlock about SB_FREEZE_FS.
	 */
	if (current != fs_info->transaction_kthread &&
	    current != fs_info->cleaner_kthread && !fs_info->fs_frozen)
		btrfs_run_delayed_iputs(fs_info);

	return ret;

scrub_continue:
	btrfs_scrub_continue(fs_info);
cleanup_transaction:
	btrfs_trans_release_metadata(trans, fs_info);
	btrfs_trans_release_chunk_metadata(trans);
	trans->block_rsv = NULL;
	btrfs_warn(fs_info, "Skipping commit of aborted transaction.");
	if (current->journal_info == trans)
		current->journal_info = NULL;
	cleanup_transaction(trans, trans->root, ret);

	return ret;
}

C
Chris Mason 已提交
2325
/*
D
David Sterba 已提交
2326 2327 2328 2329 2330 2331 2332 2333
 * return < 0 if error
 * 0 if there are no more dead_roots at the time of call
 * 1 there are more to be processed, call me again
 *
 * The return value indicates there are certainly more snapshots to delete, but
 * if there comes a new one during processing, it may return 0. We don't mind,
 * because btrfs_commit_super will poke cleaner thread and it will process it a
 * few seconds later.
C
Chris Mason 已提交
2334
 */
D
David Sterba 已提交
2335
int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
2336
{
D
David Sterba 已提交
2337
	int ret;
2338 2339
	struct btrfs_fs_info *fs_info = root->fs_info;

J
Josef Bacik 已提交
2340
	spin_lock(&fs_info->trans_lock);
D
David Sterba 已提交
2341 2342 2343 2344 2345 2346
	if (list_empty(&fs_info->dead_roots)) {
		spin_unlock(&fs_info->trans_lock);
		return 0;
	}
	root = list_first_entry(&fs_info->dead_roots,
			struct btrfs_root, root_list);
2347
	list_del_init(&root->root_list);
J
Josef Bacik 已提交
2348
	spin_unlock(&fs_info->trans_lock);
2349

2350
	btrfs_debug(fs_info, "cleaner removing %llu", root->objectid);
2351

D
David Sterba 已提交
2352
	btrfs_kill_all_delayed_nodes(root);
2353

D
David Sterba 已提交
2354 2355 2356 2357 2358
	if (btrfs_header_backref_rev(root->node) <
			BTRFS_MIXED_BACKREF_REV)
		ret = btrfs_drop_snapshot(root, NULL, 0, 0);
	else
		ret = btrfs_drop_snapshot(root, NULL, 1, 0);
2359

2360
	return (ret < 0) ? 0 : 1;
2361
}
2362 2363 2364 2365 2366 2367

void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info)
{
	unsigned long prev;
	unsigned long bit;

2368
	prev = xchg(&fs_info->pending_changes, 0);
2369 2370 2371
	if (!prev)
		return;

2372 2373 2374 2375 2376 2377 2378 2379 2380 2381
	bit = 1 << BTRFS_PENDING_SET_INODE_MAP_CACHE;
	if (prev & bit)
		btrfs_set_opt(fs_info->mount_opt, INODE_MAP_CACHE);
	prev &= ~bit;

	bit = 1 << BTRFS_PENDING_CLEAR_INODE_MAP_CACHE;
	if (prev & bit)
		btrfs_clear_opt(fs_info->mount_opt, INODE_MAP_CACHE);
	prev &= ~bit;

2382 2383 2384 2385 2386
	bit = 1 << BTRFS_PENDING_COMMIT;
	if (prev & bit)
		btrfs_debug(fs_info, "pending commit done");
	prev &= ~bit;

2387 2388 2389 2390
	if (prev)
		btrfs_warn(fs_info,
			"unknown pending changes left 0x%lx, ignoring", prev);
}