// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include <linux/uuid.h>
#include <linux/timekeeping.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "locking.h"
#include "tree-log.h"
#include "volumes.h"
#include "dev-replace.h"
#include "qgroup.h"
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "defrag.h"
#include "dir-item.h"
#include "uuid-tree.h"
#include "ioctl.h"
#include "relocation.h"

static struct kmem_cache *btrfs_trans_handle_cachep;

#define BTRFS_ROOT_TRANS_TAG 0

/*
 * Transaction states and transitions
 *
 * No running transaction (fs tree blocks are not modified)
 * |
 * | To next stage:
 * |  Call start_transaction() variants. Except btrfs_join_transaction_nostart().
 * V
 * Transaction N [[TRANS_STATE_RUNNING]]
 * |
 * | New trans handles can be attached to transaction N by calling all
 * | start_transaction() variants.
 * |
 * | To next stage:
 * |  Call btrfs_commit_transaction() on any trans handle attached to
 * |  transaction N
 * V
 * Transaction N [[TRANS_STATE_COMMIT_START]]
 * |
 * | Will wait for previous running transaction to completely finish if there
 * | is one
 * |
 * | Then one of the following happens:
 * | - Wait for all other trans handle holders to release.
 * |   The btrfs_commit_transaction() caller will do the commit work.
 * | - Wait for current transaction to be committed by others.
 * |   Other btrfs_commit_transaction() caller will do the commit work.
 * |
 * | At this stage, only btrfs_join_transaction*() variants can attach
 * | to this running transaction.
 * | All other variants will wait for current one to finish and attach to
 * | transaction N+1.
 * |
 * | To next stage:
 * |  Caller is chosen to commit transaction N, and all other trans handles
 * |  have been released.
 * V
 * Transaction N [[TRANS_STATE_COMMIT_DOING]]
 * |
 * | The heavy lifting transaction work is started.
 * | From running delayed refs (modifying extent tree) to creating pending
 * | snapshots, running qgroups.
 * | In short, modify supporting trees to reflect modifications of subvolume
 * | trees.
 * |
 * | At this stage, all start_transaction() calls will wait for this
 * | transaction to finish and attach to transaction N+1.
 * |
 * | To next stage:
 * |  Until all supporting trees are updated.
 * V
 * Transaction N [[TRANS_STATE_UNBLOCKED]]
 * |						    Transaction N+1
 * | All needed trees are modified, thus we only    [[TRANS_STATE_RUNNING]]
 * | need to write them back to disk and update	    |
 * | super blocks.				    |
 * |						    |
 * | At this stage, new transaction is allowed to   |
 * | start.					    |
 * | All new start_transaction() calls will be	    |
 * | attached to transid N+1.			    |
 * |						    |
 * | To next stage:				    |
 * |  Until all tree blocks and super blocks are    |
 * |  written to block devices			    |
 * V						    |
 * Transaction N [[TRANS_STATE_COMPLETED]]	    V
 *   All tree blocks and super blocks are written.  Transaction N+1
 *   This transaction is finished and all its	    [[TRANS_STATE_COMMIT_START]]
 *   data structures will be cleaned up.	    | Life goes on
 */
static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
	[TRANS_STATE_RUNNING]		= 0U,
	[TRANS_STATE_COMMIT_START]	= (__TRANS_START | __TRANS_ATTACH),
	[TRANS_STATE_COMMIT_DOING]	= (__TRANS_START |
					   __TRANS_ATTACH |
					   __TRANS_JOIN |
					   __TRANS_JOIN_NOSTART),
	[TRANS_STATE_UNBLOCKED]		= (__TRANS_START |
					   __TRANS_ATTACH |
					   __TRANS_JOIN |
					   __TRANS_JOIN_NOLOCK |
					   __TRANS_JOIN_NOSTART),
	[TRANS_STATE_SUPER_COMMITTED]	= (__TRANS_START |
					   __TRANS_ATTACH |
					   __TRANS_JOIN |
					   __TRANS_JOIN_NOLOCK |
					   __TRANS_JOIN_NOSTART),
	[TRANS_STATE_COMPLETED]		= (__TRANS_START |
					   __TRANS_ATTACH |
					   __TRANS_JOIN |
					   __TRANS_JOIN_NOLOCK |
					   __TRANS_JOIN_NOSTART),
};
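
/*
 * Illustrative sketch (not part of the original file): how a typical caller
 * drives the state machine documented above without ever naming the states.
 * All functions referenced exist in this file; error handling is abbreviated.
 *
 *	struct btrfs_trans_handle *trans;
 *
 *	trans = btrfs_start_transaction(root, 1);	// attach to transaction N
 *	if (IS_ERR(trans))
 *		return PTR_ERR(trans);
 *	// ...modify trees through the handle...
 *	ret = btrfs_end_transaction(trans);		// detach; commit happens later
 *	// or: ret = btrfs_commit_transaction(trans);	// drive N through COMMIT_*
 */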

void btrfs_put_transaction(struct btrfs_transaction *transaction)
{
	WARN_ON(refcount_read(&transaction->use_count) == 0);
	if (refcount_dec_and_test(&transaction->use_count)) {
		BUG_ON(!list_empty(&transaction->list));
		WARN_ON(!RB_EMPTY_ROOT(
				&transaction->delayed_refs.href_root.rb_root));
		WARN_ON(!RB_EMPTY_ROOT(
				&transaction->delayed_refs.dirty_extent_root));
		if (transaction->delayed_refs.pending_csums)
			btrfs_err(transaction->fs_info,
				  "pending csums is %llu",
				  transaction->delayed_refs.pending_csums);
		/*
		 * If any block groups are found in ->deleted_bgs then it's
		 * because the transaction was aborted and a commit did not
		 * happen (things failed before writing the new superblock
		 * and calling btrfs_finish_extent_commit()), so we can not
		 * discard the physical locations of the block groups.
		 */
		while (!list_empty(&transaction->deleted_bgs)) {
			struct btrfs_block_group *cache;

			cache = list_first_entry(&transaction->deleted_bgs,
						 struct btrfs_block_group,
						 bg_list);
			list_del_init(&cache->bg_list);
			btrfs_unfreeze_block_group(cache);
			btrfs_put_block_group(cache);
		}
		WARN_ON(!list_empty(&transaction->dev_update_list));
		kfree(transaction);
	}
}

static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
{
	struct btrfs_transaction *cur_trans = trans->transaction;
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root, *tmp;

	/*
	 * At this point no one can be using this transaction to modify any tree
	 * and no one can start another transaction to modify any tree either.
	 */
	ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING);

	down_write(&fs_info->commit_root_sem);

	if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
		fs_info->last_reloc_trans = trans->transid;

	list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits,
				 dirty_list) {
		list_del_init(&root->dirty_list);
		free_extent_buffer(root->commit_root);
		root->commit_root = btrfs_root_node(root);
		extent_io_tree_release(&root->dirty_log_pages);
		btrfs_qgroup_clean_swapped_blocks(root);
	}

	/* We can free old roots now. */
	spin_lock(&cur_trans->dropped_roots_lock);
	while (!list_empty(&cur_trans->dropped_roots)) {
		root = list_first_entry(&cur_trans->dropped_roots,
					struct btrfs_root, root_list);
		list_del_init(&root->root_list);
		spin_unlock(&cur_trans->dropped_roots_lock);
		btrfs_free_log(trans, root);
		btrfs_drop_and_free_fs_root(fs_info, root);
		spin_lock(&cur_trans->dropped_roots_lock);
	}
	spin_unlock(&cur_trans->dropped_roots_lock);

	up_write(&fs_info->commit_root_sem);
}

static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
					 unsigned int type)
{
	if (type & TRANS_EXTWRITERS)
		atomic_inc(&trans->num_extwriters);
}

static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
					 unsigned int type)
{
	if (type & TRANS_EXTWRITERS)
		atomic_dec(&trans->num_extwriters);
}

static inline void extwriter_counter_init(struct btrfs_transaction *trans,
					  unsigned int type)
{
	atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0));
}

static inline int extwriter_counter_read(struct btrfs_transaction *trans)
{
	return atomic_read(&trans->num_extwriters);
}

/*
 * To be called after doing the chunk btree updates right after allocating a new
 * chunk (after btrfs_chunk_alloc_add_chunk_item() is called), when removing a
 * chunk after all chunk btree updates and after finishing the second phase of
 * chunk allocation (btrfs_create_pending_block_groups()) in case some block
 * group had its chunk item insertion delayed to the second phase.
 */
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;

	if (!trans->chunk_bytes_reserved)
		return;

	btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
				trans->chunk_bytes_reserved, NULL);
	trans->chunk_bytes_reserved = 0;
}

/*
 * either allocate a new transaction or hop into the existing one
 */
static noinline int join_transaction(struct btrfs_fs_info *fs_info,
				     unsigned int type)
{
	struct btrfs_transaction *cur_trans;

	spin_lock(&fs_info->trans_lock);
loop:
	/* The file system has been taken offline. No new transactions. */
	if (BTRFS_FS_ERROR(fs_info)) {
		spin_unlock(&fs_info->trans_lock);
		return -EROFS;
	}

	cur_trans = fs_info->running_transaction;
	if (cur_trans) {
		if (TRANS_ABORTED(cur_trans)) {
			spin_unlock(&fs_info->trans_lock);
			return cur_trans->aborted;
		}
		if (btrfs_blocked_trans_types[cur_trans->state] & type) {
			spin_unlock(&fs_info->trans_lock);
			return -EBUSY;
		}
		refcount_inc(&cur_trans->use_count);
		atomic_inc(&cur_trans->num_writers);
		extwriter_counter_inc(cur_trans, type);
		spin_unlock(&fs_info->trans_lock);
		btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers);
		btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters);
		return 0;
	}
	spin_unlock(&fs_info->trans_lock);

	/*
	 * If we are ATTACH, we just want to catch the current transaction,
	 * and commit it. If there is no transaction, just return ENOENT.
	 */
	if (type == TRANS_ATTACH)
		return -ENOENT;

	/*
	 * JOIN_NOLOCK only happens during the transaction commit, so
	 * it is impossible that ->running_transaction is NULL
	 */
	BUG_ON(type == TRANS_JOIN_NOLOCK);

	cur_trans = kmalloc(sizeof(*cur_trans), GFP_NOFS);
	if (!cur_trans)
		return -ENOMEM;

	btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers);
	btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters);

	spin_lock(&fs_info->trans_lock);
	if (fs_info->running_transaction) {
		/*
		 * someone started a transaction after we unlocked.  Make sure
		 * to redo the checks above
		 */
		btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
		btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
		kfree(cur_trans);
		goto loop;
	} else if (BTRFS_FS_ERROR(fs_info)) {
		spin_unlock(&fs_info->trans_lock);
		btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
		btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
		kfree(cur_trans);
		return -EROFS;
	}

	cur_trans->fs_info = fs_info;
	atomic_set(&cur_trans->pending_ordered, 0);
	init_waitqueue_head(&cur_trans->pending_wait);
	atomic_set(&cur_trans->num_writers, 1);
	extwriter_counter_init(cur_trans, type);
	init_waitqueue_head(&cur_trans->writer_wait);
	init_waitqueue_head(&cur_trans->commit_wait);
	cur_trans->state = TRANS_STATE_RUNNING;
	/*
	 * One for this trans handle, one so it will live on until we
	 * commit the transaction.
	 */
	refcount_set(&cur_trans->use_count, 2);
	cur_trans->flags = 0;
	cur_trans->start_time = ktime_get_seconds();

	memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));

	cur_trans->delayed_refs.href_root = RB_ROOT_CACHED;
	cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
	atomic_set(&cur_trans->delayed_refs.num_entries, 0);

	/*
	 * although the tree mod log is per file system and not per transaction,
	 * the log must never go across transaction boundaries.
	 */
	smp_mb();
	if (!list_empty(&fs_info->tree_mod_seq_list))
		WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when creating a fresh transaction\n");
	if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
		WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when creating a fresh transaction\n");
	atomic64_set(&fs_info->tree_mod_seq, 0);

	spin_lock_init(&cur_trans->delayed_refs.lock);

	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
	INIT_LIST_HEAD(&cur_trans->dev_update_list);
	INIT_LIST_HEAD(&cur_trans->switch_commits);
	INIT_LIST_HEAD(&cur_trans->dirty_bgs);
	INIT_LIST_HEAD(&cur_trans->io_bgs);
	INIT_LIST_HEAD(&cur_trans->dropped_roots);
	mutex_init(&cur_trans->cache_write_mutex);
	spin_lock_init(&cur_trans->dirty_bgs_lock);
	INIT_LIST_HEAD(&cur_trans->deleted_bgs);
	spin_lock_init(&cur_trans->dropped_roots_lock);
	INIT_LIST_HEAD(&cur_trans->releasing_ebs);
	spin_lock_init(&cur_trans->releasing_ebs_lock);
	list_add_tail(&cur_trans->list, &fs_info->trans_list);
	extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
			IO_TREE_TRANS_DIRTY_PAGES, NULL);
	extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
			IO_TREE_FS_PINNED_EXTENTS, NULL);
	fs_info->generation++;
	cur_trans->transid = fs_info->generation;
	fs_info->running_transaction = cur_trans;
	cur_trans->aborted = 0;
	spin_unlock(&fs_info->trans_lock);

	return 0;
}

/*
 * This does all the record keeping required to make sure that a shareable root
 * is properly recorded in a given transaction.  This is required to make sure
 * the old root from before we joined the transaction is deleted when the
 * transaction commits.
 */
static int record_root_in_trans(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       int force)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;

	if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
	    root->last_trans < trans->transid) || force) {
		WARN_ON(!force && root->commit_root != root->node);

		/*
		 * see below for IN_TRANS_SETUP usage rules
		 * we have the reloc mutex held now, so there
		 * is only one writer in this function
		 */
		set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);

		/* make sure readers find IN_TRANS_SETUP before
		 * they find our root->last_trans update
		 */
		smp_wmb();

		spin_lock(&fs_info->fs_roots_radix_lock);
		if (root->last_trans == trans->transid && !force) {
			spin_unlock(&fs_info->fs_roots_radix_lock);
			return 0;
		}
		radix_tree_tag_set(&fs_info->fs_roots_radix,
				   (unsigned long)root->root_key.objectid,
				   BTRFS_ROOT_TRANS_TAG);
		spin_unlock(&fs_info->fs_roots_radix_lock);
		root->last_trans = trans->transid;

		/* this is pretty tricky.  We don't want to
		 * take the relocation lock in btrfs_record_root_in_trans
		 * unless we're really doing the first setup for this root in
		 * this transaction.
		 *
		 * Normally we'd use root->last_trans as a flag to decide
		 * if we want to take the expensive mutex.
		 *
		 * But, we have to set root->last_trans before we
		 * init the relocation root, otherwise, we trip over warnings
		 * in ctree.c.  The solution used here is to flag ourselves
		 * with root IN_TRANS_SETUP.  When this is 1, we're still
		 * fixing up the reloc trees and everyone must wait.
		 *
		 * When this is zero, they can trust root->last_trans and fly
		 * through btrfs_record_root_in_trans without having to take the
		 * lock.  smp_wmb() makes sure that all the writes above are
		 * done before we pop in the zero below
		 */
		ret = btrfs_init_reloc_root(trans, root);
		smp_mb__before_atomic();
		clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
	}
	return ret;
}


void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_transaction *cur_trans = trans->transaction;

	/* Add ourselves to the transaction dropped list */
	spin_lock(&cur_trans->dropped_roots_lock);
	list_add_tail(&root->root_list, &cur_trans->dropped_roots);
	spin_unlock(&cur_trans->dropped_roots_lock);

	/* Make sure we don't try to update the root at commit time */
	spin_lock(&fs_info->fs_roots_radix_lock);
	radix_tree_tag_clear(&fs_info->fs_roots_radix,
			     (unsigned long)root->root_key.objectid,
			     BTRFS_ROOT_TRANS_TAG);
	spin_unlock(&fs_info->fs_roots_radix_lock);
}

int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		return 0;

	/*
	 * see record_root_in_trans for comments about IN_TRANS_SETUP usage
	 * and barriers
	 */
	smp_rmb();
	if (root->last_trans == trans->transid &&
	    !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state))
		return 0;

	mutex_lock(&fs_info->reloc_mutex);
	ret = record_root_in_trans(trans, root, 0);
	mutex_unlock(&fs_info->reloc_mutex);

	return ret;
}

static inline int is_transaction_blocked(struct btrfs_transaction *trans)
{
	return (trans->state >= TRANS_STATE_COMMIT_START &&
		trans->state < TRANS_STATE_UNBLOCKED &&
		!TRANS_ABORTED(trans));
}

/* wait for commit against the current transaction to become unblocked
 * when this is done, it is safe to start a new transaction, but the current
 * transaction might not be fully on disk.
 */
static void wait_current_trans(struct btrfs_fs_info *fs_info)
{
	struct btrfs_transaction *cur_trans;

	spin_lock(&fs_info->trans_lock);
	cur_trans = fs_info->running_transaction;
	if (cur_trans && is_transaction_blocked(cur_trans)) {
		refcount_inc(&cur_trans->use_count);
		spin_unlock(&fs_info->trans_lock);

		btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
		wait_event(fs_info->transaction_wait,
			   cur_trans->state >= TRANS_STATE_UNBLOCKED ||
			   TRANS_ABORTED(cur_trans));
		btrfs_put_transaction(cur_trans);
	} else {
		spin_unlock(&fs_info->trans_lock);
	}
}

static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type)
{
	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
		return 0;

	if (type == TRANS_START)
		return 1;

	return 0;
}

static inline bool need_reserve_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (!fs_info->reloc_ctl ||
	    !test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
	    root->reloc_root)
		return false;

	return true;
}

static struct btrfs_trans_handle *
start_transaction(struct btrfs_root *root, unsigned int num_items,
		  unsigned int type, enum btrfs_reserve_flush_enum flush,
		  bool enforce_qgroups)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_trans_handle *h;
	struct btrfs_transaction *cur_trans;
	u64 num_bytes = 0;
	u64 qgroup_reserved = 0;
	bool reloc_reserved = false;
	bool do_chunk_alloc = false;
	int ret;

	if (BTRFS_FS_ERROR(fs_info))
		return ERR_PTR(-EROFS);

	if (current->journal_info) {
		WARN_ON(type & TRANS_EXTWRITERS);
		h = current->journal_info;
		refcount_inc(&h->use_count);
		WARN_ON(refcount_read(&h->use_count) > 2);
		h->orig_rsv = h->block_rsv;
		h->block_rsv = NULL;
		goto got_it;
	}

	/*
	 * Do the reservation before we join the transaction so we can do all
	 * the appropriate flushing if need be.
	 */
	if (num_items && root != fs_info->chunk_root) {
		struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv;
		u64 delayed_refs_bytes = 0;

		qgroup_reserved = num_items * fs_info->nodesize;
		ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved,
				enforce_qgroups);
		if (ret)
			return ERR_PTR(ret);

		/*
		 * We want to reserve all the bytes we may need all at once, so
		 * we only do 1 enospc flushing cycle per transaction start.  We
		 * accomplish this by simply assuming we'll do 2 x num_items
		 * worth of delayed refs updates in this trans handle, and
		 * refill that amount for whatever is missing in the reserve.
		 */
		num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
		if (flush == BTRFS_RESERVE_FLUSH_ALL &&
		    btrfs_block_rsv_full(delayed_refs_rsv) == 0) {
			delayed_refs_bytes = num_bytes;
			num_bytes <<= 1;
		}

		/*
		 * Do the reservation for the relocation root creation
		 */
		if (need_reserve_reloc_root(root)) {
			num_bytes += fs_info->nodesize;
			reloc_reserved = true;
		}

		ret = btrfs_block_rsv_add(fs_info, rsv, num_bytes, flush);
		if (ret)
			goto reserve_fail;
		if (delayed_refs_bytes) {
			btrfs_migrate_to_delayed_refs_rsv(fs_info, rsv,
							  delayed_refs_bytes);
			num_bytes -= delayed_refs_bytes;
		}

		if (rsv->space_info->force_alloc)
			do_chunk_alloc = true;
	} else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL &&
		   !btrfs_block_rsv_full(delayed_refs_rsv)) {
		/*
		 * Some people call with btrfs_start_transaction(root, 0)
		 * because they can be throttled, but have some other mechanism
		 * for reserving space.  We still want these guys to refill the
		 * delayed block_rsv so just add 1 items worth of reservation
		 * here.
		 */
		ret = btrfs_delayed_refs_rsv_refill(fs_info, flush);
		if (ret)
			goto reserve_fail;
	}
again:
	h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS);
	if (!h) {
		ret = -ENOMEM;
		goto alloc_fail;
	}

	/*
	 * If we are JOIN_NOLOCK we're already committing a transaction and
	 * waiting on this guy, so we don't need to do the sb_start_intwrite
	 * because we're already holding a ref.  We need this because we could
	 * have raced in and did an fsync() on a file which can kick a commit
	 * and then we deadlock with somebody doing a freeze.
	 *
	 * If we are ATTACH, it means we just want to catch the current
	 * transaction and commit it, so we needn't do sb_start_intwrite().
	 */
	if (type & __TRANS_FREEZABLE)
		sb_start_intwrite(fs_info->sb);

	if (may_wait_transaction(fs_info, type))
		wait_current_trans(fs_info);

	do {
		ret = join_transaction(fs_info, type);
		if (ret == -EBUSY) {
			wait_current_trans(fs_info);
			if (unlikely(type == TRANS_ATTACH ||
				     type == TRANS_JOIN_NOSTART))
				ret = -ENOENT;
		}
	} while (ret == -EBUSY);

	if (ret < 0)
		goto join_fail;

	cur_trans = fs_info->running_transaction;

	h->transid = cur_trans->transid;
	h->transaction = cur_trans;
	refcount_set(&h->use_count, 1);
	h->fs_info = root->fs_info;

	h->type = type;
	INIT_LIST_HEAD(&h->new_bgs);

	smp_mb();
	if (cur_trans->state >= TRANS_STATE_COMMIT_START &&
	    may_wait_transaction(fs_info, type)) {
		current->journal_info = h;
		btrfs_commit_transaction(h);
		goto again;
	}

	if (num_bytes) {
		trace_btrfs_space_reservation(fs_info, "transaction",
					      h->transid, num_bytes, 1);
		h->block_rsv = &fs_info->trans_block_rsv;
		h->bytes_reserved = num_bytes;
		h->reloc_reserved = reloc_reserved;
	}

got_it:
	if (!current->journal_info)
		current->journal_info = h;

	/*
	 * If the space_info is marked ALLOC_FORCE then we'll get upgraded to
	 * ALLOC_FORCE the first run through, and then we won't allocate for
	 * anybody else who races in later.  We don't care about the return
	 * value here.
	 */
	if (do_chunk_alloc && num_bytes) {
		u64 flags = h->block_rsv->space_info->flags;

		btrfs_chunk_alloc(h, btrfs_get_alloc_profile(fs_info, flags),
				  CHUNK_ALLOC_NO_FORCE);
	}

	/*
	 * btrfs_record_root_in_trans() needs to alloc new extents, and may
	 * call btrfs_join_transaction() while we're also starting a
	 * transaction.
	 *
	 * Thus it needs to be called after current->journal_info is
	 * initialized, or we can deadlock.
	 */
	ret = btrfs_record_root_in_trans(h, root);
	if (ret) {
		/*
		 * The transaction handle is fully initialized and linked with
		 * other structures so it needs to be ended in case of errors,
		 * not just freed.
		 */
		btrfs_end_transaction(h);
		return ERR_PTR(ret);
	}

	return h;

join_fail:
	if (type & __TRANS_FREEZABLE)
		sb_end_intwrite(fs_info->sb);
	kmem_cache_free(btrfs_trans_handle_cachep, h);
alloc_fail:
	if (num_bytes)
		btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv,
					num_bytes, NULL);
reserve_fail:
	btrfs_qgroup_free_meta_pertrans(root, qgroup_reserved);
	return ERR_PTR(ret);
}

struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
						   unsigned int num_items)
{
	return start_transaction(root, num_items, TRANS_START,
				 BTRFS_RESERVE_FLUSH_ALL, true);
}

struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
					struct btrfs_root *root,
					unsigned int num_items)
{
	return start_transaction(root, num_items, TRANS_START,
				 BTRFS_RESERVE_FLUSH_ALL_STEAL, false);
}

struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
{
	return start_transaction(root, 0, TRANS_JOIN, BTRFS_RESERVE_NO_FLUSH,
				 true);
}

struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root)
{
	return start_transaction(root, 0, TRANS_JOIN_NOLOCK,
				 BTRFS_RESERVE_NO_FLUSH, true);
}

/*
 * Similar to regular join but it never starts a transaction when none is
 * running or after waiting for the current one to finish.
 */
struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root)
{
	return start_transaction(root, 0, TRANS_JOIN_NOSTART,
				 BTRFS_RESERVE_NO_FLUSH, true);
}

/*
 * btrfs_attach_transaction() - catch the running transaction
 *
 * It is used when we want to commit the current transaction, but
 * don't want to start a new one.
 *
 * Note: If this function returns -ENOENT, it just means there is no
 * running transaction. But it is possible that the inactive transaction
 * is still in the memory, not fully on disk. If you hope there is no
 * inactive transaction in the fs when -ENOENT is returned, you should
 * invoke
 *     btrfs_attach_transaction_barrier()
 */
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
{
	return start_transaction(root, 0, TRANS_ATTACH,
				 BTRFS_RESERVE_NO_FLUSH, true);
}

/*
 * btrfs_attach_transaction_barrier() - catch the running transaction
 *
 * It is similar to the above function, the difference is that this one
 * will wait for all the inactive transactions until they fully
 * complete.
 */
struct btrfs_trans_handle *
btrfs_attach_transaction_barrier(struct btrfs_root *root)
{
	struct btrfs_trans_handle *trans;

	trans = start_transaction(root, 0, TRANS_ATTACH,
				  BTRFS_RESERVE_NO_FLUSH, true);
	if (trans == ERR_PTR(-ENOENT))
		btrfs_wait_for_commit(root->fs_info, 0);

	return trans;
}

/* Wait for a transaction commit to reach at least the given state. */
static noinline void wait_for_commit(struct btrfs_transaction *commit,
				     const enum btrfs_trans_state min_state)
{
	struct btrfs_fs_info *fs_info = commit->fs_info;
	u64 transid = commit->transid;
	bool put = false;

	/*
	 * At the moment this function is called with min_state either being
	 * TRANS_STATE_COMPLETED or TRANS_STATE_SUPER_COMMITTED.
	 */
	if (min_state == TRANS_STATE_COMPLETED)
		btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
	else
		btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);

	while (1) {
		wait_event(commit->commit_wait, commit->state >= min_state);
		if (put)
			btrfs_put_transaction(commit);

		if (min_state < TRANS_STATE_COMPLETED)
			break;

		/*
		 * A transaction isn't really completed until all of the
		 * previous transactions are completed, but with fsync we can
		 * end up with SUPER_COMMITTED transactions before a COMPLETED
		 * transaction. Wait for those.
		 */

		spin_lock(&fs_info->trans_lock);
		commit = list_first_entry_or_null(&fs_info->trans_list,
						  struct btrfs_transaction,
						  list);
		if (!commit || commit->transid > transid) {
			spin_unlock(&fs_info->trans_lock);
			break;
		}
		refcount_inc(&commit->use_count);
		put = true;
		spin_unlock(&fs_info->trans_lock);
	}
}

int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
{
	struct btrfs_transaction *cur_trans = NULL, *t;
	int ret = 0;

	if (transid) {
		if (transid <= fs_info->last_trans_committed)
			goto out;

		/* find specified transaction */
		spin_lock(&fs_info->trans_lock);
		list_for_each_entry(t, &fs_info->trans_list, list) {
			if (t->transid == transid) {
				cur_trans = t;
				refcount_inc(&cur_trans->use_count);
				ret = 0;
				break;
			}
			if (t->transid > transid) {
				ret = 0;
				break;
			}
		}
		spin_unlock(&fs_info->trans_lock);

		/*
		 * The specified transaction doesn't exist, or we
		 * raced with btrfs_commit_transaction
		 */
		if (!cur_trans) {
			if (transid > fs_info->last_trans_committed)
				ret = -EINVAL;
			goto out;
		}
	} else {
		/* find newest transaction that is committing | committed */
		spin_lock(&fs_info->trans_lock);
		list_for_each_entry_reverse(t, &fs_info->trans_list,
					    list) {
			if (t->state >= TRANS_STATE_COMMIT_START) {
				if (t->state == TRANS_STATE_COMPLETED)
					break;
				cur_trans = t;
				refcount_inc(&cur_trans->use_count);
				break;
			}
		}
		spin_unlock(&fs_info->trans_lock);
		if (!cur_trans)
			goto out;  /* nothing committing|committed */
	}

	wait_for_commit(cur_trans, TRANS_STATE_COMPLETED);
	btrfs_put_transaction(cur_trans);
out:
	return ret;
}

void btrfs_throttle(struct btrfs_fs_info *fs_info)
{
	wait_current_trans(fs_info);
}

static bool should_end_transaction(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;

	if (btrfs_check_space_for_delayed_refs(fs_info))
		return true;

	return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5);
}

bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
{
	struct btrfs_transaction *cur_trans = trans->transaction;

	if (cur_trans->state >= TRANS_STATE_COMMIT_START ||
	    test_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags))
		return true;

	return should_end_transaction(trans);
}
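
/*
 * Illustrative sketch (assumed usage, not taken from this file): a
 * long-running operation holding a handle can periodically check the
 * function above and restart its transaction to avoid pinning too much
 * metadata space:
 *
 *	if (btrfs_should_end_transaction(trans)) {
 *		ret = btrfs_end_transaction(trans);
 *		if (ret)
 *			return ret;
 *		trans = btrfs_start_transaction(root, num_items);
 *		if (IS_ERR(trans))
 *			return PTR_ERR(trans);
 *	}
 */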

static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;

	if (!trans->block_rsv) {
		ASSERT(!trans->bytes_reserved);
		return;
	}

	if (!trans->bytes_reserved)
		return;

	ASSERT(trans->block_rsv == &fs_info->trans_block_rsv);
	trace_btrfs_space_reservation(fs_info, "transaction",
				      trans->transid, trans->bytes_reserved, 0);
	btrfs_block_rsv_release(fs_info, trans->block_rsv,
				trans->bytes_reserved, NULL);
	trans->bytes_reserved = 0;
}

static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
				   int throttle)
{
	struct btrfs_fs_info *info = trans->fs_info;
	struct btrfs_transaction *cur_trans = trans->transaction;
	int err = 0;

	if (refcount_read(&trans->use_count) > 1) {
		refcount_dec(&trans->use_count);
		trans->block_rsv = trans->orig_rsv;
		return 0;
	}

	btrfs_trans_release_metadata(trans);
	trans->block_rsv = NULL;

	btrfs_create_pending_block_groups(trans);

	btrfs_trans_release_chunk_metadata(trans);

	if (trans->type & __TRANS_FREEZABLE)
		sb_end_intwrite(info->sb);

	WARN_ON(cur_trans != info->running_transaction);
	WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
	atomic_dec(&cur_trans->num_writers);
	extwriter_counter_dec(cur_trans, trans->type);

	cond_wake_up(&cur_trans->writer_wait);

	btrfs_lockdep_release(info, btrfs_trans_num_extwriters);
	btrfs_lockdep_release(info, btrfs_trans_num_writers);

	btrfs_put_transaction(cur_trans);

	if (current->journal_info == trans)
		current->journal_info = NULL;

	if (throttle)
		btrfs_run_delayed_iputs(info);

	if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info)) {
		wake_up_process(info->transaction_kthread);
		if (TRANS_ABORTED(trans))
			err = trans->aborted;
		else
			err = -EROFS;
	}

	kmem_cache_free(btrfs_trans_handle_cachep, trans);
	return err;
}

int btrfs_end_transaction(struct btrfs_trans_handle *trans)
{
	return __btrfs_end_transaction(trans, 0);
}

int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans)
{
	return __btrfs_end_transaction(trans, 1);
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are sent to disk but does not wait on them
 */
int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
			       struct extent_io_tree *dirty_pages, int mark)
{
	int err = 0;
	int werr = 0;
	struct address_space *mapping = fs_info->btree_inode->i_mapping;
	struct extent_state *cached_state = NULL;
	u64 start = 0;
	u64 end;

	atomic_inc(&BTRFS_I(fs_info->btree_inode)->sync_writers);
	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
				      mark, &cached_state)) {
		bool wait_writeback = false;

		err = convert_extent_bit(dirty_pages, start, end,
					 EXTENT_NEED_WAIT,
					 mark, &cached_state);
		/*
		 * convert_extent_bit can return -ENOMEM, which is most of the
		 * time a temporary error. So when it happens, ignore the error
		 * and wait for writeback of this range to finish - because we
		 * failed to set the bit EXTENT_NEED_WAIT for the range, a call
		 * to __btrfs_wait_marked_extents() would not know that
		 * writeback for this range started and therefore wouldn't
		 * wait for it to finish - we don't want to commit a
		 * superblock that points to btree nodes/leafs for which
		 * writeback hasn't finished yet (and without errors).
		 * We cleanup any entries left in the io tree when committing
		 * the transaction (through extent_io_tree_release()).
		 */
		if (err == -ENOMEM) {
			err = 0;
			wait_writeback = true;
		}
		if (!err)
			err = filemap_fdatawrite_range(mapping, start, end);
		if (err)
			werr = err;
		else if (wait_writeback)
			werr = filemap_fdatawait_range(mapping, start, end);
		free_extent_state(cached_state);
		cached_state = NULL;
		cond_resched();
		start = end + 1;
	}
	atomic_dec(&BTRFS_I(fs_info->btree_inode)->sync_writers);
	return werr;
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit.  We wait
 * on all the pages and clear them from the dirty pages state tree
 */
static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
				       struct extent_io_tree *dirty_pages)
{
	int err = 0;
	int werr = 0;
	struct address_space *mapping = fs_info->btree_inode->i_mapping;
	struct extent_state *cached_state = NULL;
	u64 start = 0;
	u64 end;

	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
				      EXTENT_NEED_WAIT, &cached_state)) {
		/*
		 * Ignore -ENOMEM errors returned by clear_extent_bit().
		 * When committing the transaction, we'll remove any entries
		 * left in the io tree. For a log commit, we don't remove them
		 * after committing the log because the tree can be accessed
		 * concurrently - we do it only at transaction commit time when
		 * it's safe to do it (through extent_io_tree_release()).
		 */
		err = clear_extent_bit(dirty_pages, start, end,
				       EXTENT_NEED_WAIT, &cached_state);
		if (err == -ENOMEM)
			err = 0;
		if (!err)
			err = filemap_fdatawait_range(mapping, start, end);
		if (err)
			werr = err;
		free_extent_state(cached_state);
		cached_state = NULL;
		cond_resched();
		start = end + 1;
	}
	if (err)
		werr = err;
	return werr;
}

static int btrfs_wait_extents(struct btrfs_fs_info *fs_info,
		       struct extent_io_tree *dirty_pages)
{
	bool errors = false;
	int err;

	err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
	if (test_and_clear_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags))
		errors = true;

	if (errors && !err)
		err = -EIO;
	return err;
}

int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark)
{
	struct btrfs_fs_info *fs_info = log_root->fs_info;
	struct extent_io_tree *dirty_pages = &log_root->dirty_log_pages;
	bool errors = false;
	int err;

	ASSERT(log_root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);

	err = __btrfs_wait_marked_extents(fs_info, dirty_pages);
	if ((mark & EXTENT_DIRTY) &&
	    test_and_clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags))
		errors = true;

	if ((mark & EXTENT_NEW) &&
	    test_and_clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags))
		errors = true;

	if (errors && !err)
		err = -EIO;
	return err;
}

/*
 * When btree blocks are allocated the corresponding extents are marked dirty.
 * This function ensures such extents are persisted on disk for transaction or
 * log commit.
 *
 * @trans: transaction whose dirty pages we'd like to write
 */
static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans)
{
	int ret;
	int ret2;
	struct extent_io_tree *dirty_pages = &trans->transaction->dirty_pages;
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct blk_plug plug;

	blk_start_plug(&plug);
	ret = btrfs_write_marked_extents(fs_info, dirty_pages, EXTENT_DIRTY);
	blk_finish_plug(&plug);
	ret2 = btrfs_wait_extents(fs_info, dirty_pages);

	extent_io_tree_release(&trans->transaction->dirty_pages);

	if (ret)
		return ret;
	else if (ret2)
		return ret2;
	else
		return 0;
}

/*
 * this is used to update the root pointer in the tree of tree roots.
 *
 * But, in the case of the extent allocation tree, updating the root
 * pointer may allocate blocks which may change the root of the extent
 * allocation tree.
 *
 * So, this loops and repeats and makes sure the cowonly root didn't
 * change while the root pointer was being updated in the metadata.
 */
static int update_cowonly_root(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	int ret;
	u64 old_root_bytenr;
	u64 old_root_used;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *tree_root = fs_info->tree_root;

	old_root_used = btrfs_root_used(&root->root_item);

	while (1) {
		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
		if (old_root_bytenr == root->node->start &&
		    old_root_used == btrfs_root_used(&root->root_item))
			break;

		btrfs_set_root_node(&root->root_item, root->node);
		ret = btrfs_update_root(trans, tree_root,
					&root->root_key,
					&root->root_item);
		if (ret)
			return ret;

		old_root_used = btrfs_root_used(&root->root_item);
	}

	return 0;
}

/*
 * update all the cowonly tree roots on disk
 *
 * The error handling in this function may not be obvious. Any of the
 * failures will cause the file system to go offline. We still need
 * to clean up the delayed refs.
 */
static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct list_head *dirty_bgs = &trans->transaction->dirty_bgs;
	struct list_head *io_bgs = &trans->transaction->io_bgs;
	struct list_head *next;
	struct extent_buffer *eb;
	int ret;

	/*
	 * At this point no one can be using this transaction to modify any tree
	 * and no one can start another transaction to modify any tree either.
	 */
	ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);

	eb = btrfs_lock_root_node(fs_info->tree_root);
	ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
			      0, &eb, BTRFS_NESTING_COW);
	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	if (ret)
		return ret;

	ret = btrfs_run_dev_stats(trans);
	if (ret)
		return ret;
	ret = btrfs_run_dev_replace(trans);
	if (ret)
		return ret;
	ret = btrfs_run_qgroups(trans);
	if (ret)
		return ret;

	ret = btrfs_setup_space_cache(trans);
	if (ret)
		return ret;

again:
	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
		struct btrfs_root *root;
		next = fs_info->dirty_cowonly_roots.next;
		list_del_init(next);
		root = list_entry(next, struct btrfs_root, dirty_list);
		clear_bit(BTRFS_ROOT_DIRTY, &root->state);

		list_add_tail(&root->dirty_list,
			      &trans->transaction->switch_commits);
		ret = update_cowonly_root(trans, root);
		if (ret)
			return ret;
	}

	/* Now flush any delayed refs generated by updating all of the roots */
	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
	if (ret)
		return ret;

	while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) {
		ret = btrfs_write_dirty_block_groups(trans);
		if (ret)
			return ret;

		/*
		 * We're writing the dirty block groups, which could generate
		 * delayed refs, which could generate more dirty block groups,
		 * so we want to keep this flushing in this loop to make sure
		 * everything gets run.
		 */
		ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
		if (ret)
			return ret;
	}

	if (!list_empty(&fs_info->dirty_cowonly_roots))
		goto again;

	/* Update dev-replace pointer once everything is committed */
	fs_info->dev_replace.committed_cursor_left =
		fs_info->dev_replace.cursor_left_last_write_of_item;

	return 0;
}

/*
 * If we had a pending drop we need to see if there are any others left in our
 * dead roots list, and if not clear our bit and wake any waiters.
 */
void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
{
	/*
	 * We put the drop in progress roots at the front of the list, so if the
	 * first entry doesn't have UNFINISHED_DROP set we can wake everybody
	 * up.
	 */
	spin_lock(&fs_info->trans_lock);
	if (!list_empty(&fs_info->dead_roots)) {
		struct btrfs_root *root = list_first_entry(&fs_info->dead_roots,
							   struct btrfs_root,
							   root_list);
		if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) {
			spin_unlock(&fs_info->trans_lock);
			return;
		}
	}
	spin_unlock(&fs_info->trans_lock);

	btrfs_wake_unfinished_drop(fs_info);
}

/*
 * dead roots are old snapshots that need to be deleted.  This allocates
 * a dirty root struct and adds it into the list of dead roots that need to
 * be deleted
 */
void btrfs_add_dead_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	spin_lock(&fs_info->trans_lock);
	if (list_empty(&root->root_list)) {
		btrfs_grab_root(root);

		/* We want to process the partially complete drops first. */
		if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state))
			list_add(&root->root_list, &fs_info->dead_roots);
		else
			list_add_tail(&root->root_list, &fs_info->dead_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}

/*
 * Update each subvolume root and its relocation root, if it exists, in the tree
 * of tree roots. Also free log roots if they exist.
 */
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *gang[8];
	int i;
	int ret;

	/*
	 * At this point no one can be using this transaction to modify any tree
	 * and no one can start another transaction to modify any tree either.
	 */
	ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);

	spin_lock(&fs_info->fs_roots_radix_lock);
	while (1) {
		ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
						 (void **)gang, 0,
						 ARRAY_SIZE(gang),
						 BTRFS_ROOT_TRANS_TAG);
		if (ret == 0)
			break;
		for (i = 0; i < ret; i++) {
			struct btrfs_root *root = gang[i];
			int ret2;

			/*
			 * At this point we can neither have tasks logging inodes
			 * from a root nor trying to commit a log tree.
			 */
			ASSERT(atomic_read(&root->log_writers) == 0);
			ASSERT(atomic_read(&root->log_commit[0]) == 0);
			ASSERT(atomic_read(&root->log_commit[1]) == 0);

			radix_tree_tag_clear(&fs_info->fs_roots_radix,
					(unsigned long)root->root_key.objectid,
					BTRFS_ROOT_TRANS_TAG);
			spin_unlock(&fs_info->fs_roots_radix_lock);

			btrfs_free_log(trans, root);
			ret2 = btrfs_update_reloc_root(trans, root);
			if (ret2)
				return ret2;

			/* see comments in should_cow_block() */
			clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
			smp_mb__after_atomic();

			if (root->commit_root != root->node) {
				list_add_tail(&root->dirty_list,
					&trans->transaction->switch_commits);
				btrfs_set_root_node(&root->root_item,
						    root->node);
			}

			ret2 = btrfs_update_root(trans, fs_info->tree_root,
						&root->root_key,
						&root->root_item);
			if (ret2)
				return ret2;
			spin_lock(&fs_info->fs_roots_radix_lock);
			btrfs_qgroup_free_meta_all_pertrans(root);
		}
	}
	spin_unlock(&fs_info->fs_roots_radix_lock);
	return 0;
}

/*
 * defrag a given btree.
 * Every leaf in the btree is read and defragged.
 */
int btrfs_defrag_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *info = root->fs_info;
	struct btrfs_trans_handle *trans;
	int ret;

	if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state))
		return 0;

	while (1) {
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}

		ret = btrfs_defrag_leaves(trans, root);

		btrfs_end_transaction(trans);
		btrfs_btree_balance_dirty(info);
		cond_resched();

		if (btrfs_fs_closing(info) || ret != -EAGAIN)
			break;

		if (btrfs_defrag_cancelled(info)) {
			btrfs_debug(info, "defrag_root cancelled");
			ret = -EAGAIN;
			break;
		}
	}
	clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state);
	return ret;
}

/*
 * Do all special snapshot related qgroup dirty hack.
 *
 * Will do all needed qgroup inherit and dirty hack like switch commit
 * roots inside one transaction and write all btree into disk, to make
 * qgroups work.
 */
static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
				   struct btrfs_root *src,
				   struct btrfs_root *parent,
				   struct btrfs_qgroup_inherit *inherit,
				   u64 dst_objectid)
{
	struct btrfs_fs_info *fs_info = src->fs_info;
	int ret;

	/*
	 * Save some performance in the case that qgroups are not
	 * enabled. If this check races with the ioctl, rescan will
	 * kick in anyway.
	 */
	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return 0;

	/*
	 * Ensure dirty @src will be committed.  Otherwise, after coming
	 * commit_fs_roots() and switch_commit_roots(), any dirty but not
	 * recorded root will never be updated again, causing an outdated root
	 * item.
	 */
	ret = record_root_in_trans(trans, src, 1);
	if (ret)
		return ret;

	/*
	 * btrfs_qgroup_inherit relies on a consistent view of the usage for the
	 * src root, so we must run the delayed refs here.
	 *
	 * However this isn't particularly fool proof, because there's no
	 * synchronization keeping us from changing the tree after this point
	 * before we do the qgroup_inherit, or even from making changes while
	 * we're doing the qgroup_inherit.  But that's a problem for the future,
	 * for now flush the delayed refs to narrow the race window where the
	 * qgroup counters could end up wrong.
	 */
	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	ret = commit_fs_roots(trans);
	if (ret)
		goto out;
	ret = btrfs_qgroup_account_extents(trans);
	if (ret < 0)
		goto out;

	/* Now qgroups are all updated, we can inherit them to new qgroups */
	ret = btrfs_qgroup_inherit(trans, src->root_key.objectid, dst_objectid,
				   inherit);
	if (ret < 0)
		goto out;

	/*
	 * Now we do a simplified commit transaction, which will:
	 * 1) commit all subvolume and extent tree
	 *    To ensure all subvolume and extent tree have a valid
	 *    commit_root to accounting later insert_dir_item()
	 * 2) write all btree blocks onto disk
	 *    This is to make sure later btree modification will be cowed
	 *    Or commit_root can be populated and cause wrong qgroup numbers
	 * In this simplified commit, we don't really care about other trees
	 * like chunk and root tree, as they won't affect qgroup.
	 * And we don't write super to avoid half committed status.
	 */
	ret = commit_cowonly_roots(trans);
	if (ret)
		goto out;
	switch_commit_roots(trans);
	ret = btrfs_write_and_wait_transaction(trans);
	if (ret)
		btrfs_handle_fs_error(fs_info, ret,
			"Error while writing out transaction for qgroup");

out:
	/*
	 * Force parent root to be updated, as we recorded it before so its
	 * last_trans == cur_transid.
	 * Or it won't be committed again onto disk after later
	 * insert_dir_item()
	 */
	if (!ret)
		ret = record_root_in_trans(trans, parent, 1);
	return ret;
}

C
Chris Mason 已提交
1602 1603
/*
 * new snapshots need to be created at a very specific time in the
1604 1605 1606 1607 1608 1609
 * transaction commit.  This does the actual creation.
 *
 * Note:
 * If the error which may affect the commitment of the current transaction
 * happens, we should return the error number. If the error which just affect
 * the creation of the pending snapshots, just return 0.
C
Chris Mason 已提交
1610
 */
1611
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1612 1613
				   struct btrfs_pending_snapshot *pending)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_key key;
	struct btrfs_root_item *new_root_item;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root = pending->root;
	struct btrfs_root *parent_root;
	struct btrfs_block_rsv *rsv;
	struct inode *parent_inode = pending->dir;
	struct btrfs_path *path;
	struct btrfs_dir_item *dir_item;
	struct extent_buffer *tmp;
	struct extent_buffer *old;
	struct timespec64 cur_time;
	int ret = 0;
	u64 to_reserve = 0;
	u64 index = 0;
	u64 objectid;
	u64 root_flags;
	unsigned int nofs_flags;
	struct fscrypt_name fname;

	ASSERT(pending->path);
	path = pending->path;

	ASSERT(pending->root_item);
	new_root_item = pending->root_item;

	/*
	 * We're inside a transaction and must make sure that any potential
	 * allocations with GFP_KERNEL in fscrypt won't recurse back to the
	 * filesystem.
	 */
	nofs_flags = memalloc_nofs_save();
	pending->error = fscrypt_setup_filename(parent_inode,
						&pending->dentry->d_name, 0,
						&fname);
	memalloc_nofs_restore(nofs_flags);
	if (pending->error)
		goto free_pending;

	pending->error = btrfs_get_free_objectid(tree_root, &objectid);
	if (pending->error)
		goto free_fname;

	/*
	 * Make qgroup skip the current new snapshot's qgroupid, as it is
	 * accounted for by the later btrfs_qgroup_inherit().
	 */
	btrfs_set_skip_qgroup(trans, objectid);

	btrfs_reloc_pre_snapshot(pending, &to_reserve);

	if (to_reserve > 0) {
		pending->error = btrfs_block_rsv_add(fs_info,
						     &pending->block_rsv,
						     to_reserve,
						     BTRFS_RESERVE_NO_FLUSH);
		if (pending->error)
			goto clear_skip_qgroup;
	}

	key.objectid = objectid;
	key.offset = (u64)-1;
	key.type = BTRFS_ROOT_ITEM_KEY;

	rsv = trans->block_rsv;
	trans->block_rsv = &pending->block_rsv;
	trans->bytes_reserved = trans->block_rsv->reserved;
	trace_btrfs_space_reservation(fs_info, "transaction",
				      trans->transid,
				      trans->bytes_reserved, 1);
	parent_root = BTRFS_I(parent_inode)->root;
	ret = record_root_in_trans(trans, parent_root, 0);
	if (ret)
		goto fail;
	cur_time = current_time(parent_inode);

	/*
	 * insert the directory item
	 */
	ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index);
	BUG_ON(ret); /* -ENOMEM */

	/* check if there is a file/dir which has the same name. */
	dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
					 btrfs_ino(BTRFS_I(parent_inode)),
					 &fname.disk_name, 0);
	if (dir_item != NULL && !IS_ERR(dir_item)) {
		pending->error = -EEXIST;
		goto dir_item_existed;
	} else if (IS_ERR(dir_item)) {
		ret = PTR_ERR(dir_item);
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}
	btrfs_release_path(path);

	/*
	 * Pull in the delayed directory update and the delayed inode item,
	 * otherwise we corrupt the FS during snapshot creation.
	 */
	ret = btrfs_run_delayed_items(trans);
	if (ret) {	/* Transaction aborted */
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}

	ret = record_root_in_trans(trans, root, 0);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}
	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
	btrfs_check_and_init_root_item(new_root_item);

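	/* Propagate the requested read-only flag into the new root item. */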
	root_flags = btrfs_root_flags(new_root_item);
	if (pending->readonly)
		root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
	else
		root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
	btrfs_set_root_flags(new_root_item, root_flags);

	btrfs_set_root_generation_v2(new_root_item,
			trans->transid);
	generate_random_guid(new_root_item->uuid);
	memcpy(new_root_item->parent_uuid, root->root_item.uuid,
			BTRFS_UUID_SIZE);
	if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) {
		memset(new_root_item->received_uuid, 0,
		       sizeof(new_root_item->received_uuid));
		memset(&new_root_item->stime, 0, sizeof(new_root_item->stime));
		memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime));
		btrfs_set_root_stransid(new_root_item, 0);
		btrfs_set_root_rtransid(new_root_item, 0);
	}
	btrfs_set_stack_timespec_sec(&new_root_item->otime, cur_time.tv_sec);
	btrfs_set_stack_timespec_nsec(&new_root_item->otime, cur_time.tv_nsec);
	btrfs_set_root_otransid(new_root_item, trans->transid);

	old = btrfs_lock_root_node(root);
	ret = btrfs_cow_block(trans, root, old, NULL, 0, &old,
			      BTRFS_NESTING_COW);
	if (ret) {
		btrfs_tree_unlock(old);
		free_extent_buffer(old);
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}

	ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
	/* clean up in any case */
	btrfs_tree_unlock(old);
	free_extent_buffer(old);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}
	/* see comments in should_cow_block() */
	set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
	smp_wmb();

	btrfs_set_root_node(new_root_item, tmp);
	/* record when the snapshot was created in key.offset */
	key.offset = trans->transid;
	ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
	btrfs_tree_unlock(tmp);
	free_extent_buffer(tmp);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}

	/*
	 * insert root back/forward references
	 */
	ret = btrfs_add_root_ref(trans, objectid,
				 parent_root->root_key.objectid,
				 btrfs_ino(BTRFS_I(parent_inode)), index,
				 &fname.disk_name);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}

	key.offset = (u64)-1;
	pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev);
	if (IS_ERR(pending->snap)) {
		ret = PTR_ERR(pending->snap);
		pending->snap = NULL;
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}

	ret = btrfs_reloc_post_snapshot(trans, pending);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}

	/*
	 * Do special qgroup accounting for the snapshot, as we use a qgroup
	 * hack to make snapshot creation fast.
	 * To cooperate with that hack, we apply the hack again here; otherwise
	 * snapshot creation would be greatly slowed down by a subtree qgroup
	 * rescan.
	 */
	ret = qgroup_account_snapshot(trans, root, parent_root,
				      pending->inherit, objectid);
	if (ret < 0)
		goto fail;

	ret = btrfs_insert_dir_item(trans, &fname.disk_name,
				    BTRFS_I(parent_inode), &key, BTRFS_FT_DIR,
				    index);
	/* We already checked the name at the beginning, so this is impossible. */
	BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}

	btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size +
						  fname.disk_name.len * 2);
	parent_inode->i_mtime = current_time(parent_inode);
	parent_inode->i_ctime = parent_inode->i_mtime;
	ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode));
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}
	ret = btrfs_uuid_tree_add(trans, new_root_item->uuid,
				  BTRFS_UUID_KEY_SUBVOL,
				  objectid);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto fail;
	}
	if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) {
		ret = btrfs_uuid_tree_add(trans, new_root_item->received_uuid,
					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
					  objectid);
		if (ret && ret != -EEXIST) {
			btrfs_abort_transaction(trans, ret);
			goto fail;
		}
	}

fail:
	pending->error = ret;
dir_item_existed:
	trans->block_rsv = rsv;
	trans->bytes_reserved = 0;
clear_skip_qgroup:
	btrfs_clear_skip_qgroup(trans);
free_fname:
	fscrypt_free_filename(&fname);
free_pending:
	kfree(new_root_item);
	pending->root_item = NULL;
	btrfs_free_path(path);
	pending->path = NULL;

	return ret;
}

/*
 * create all the snapshots we've scheduled for creation
 */
static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans)
{
	struct btrfs_pending_snapshot *pending, *next;
	struct list_head *head = &trans->transaction->pending_snapshots;
	int ret = 0;

	list_for_each_entry_safe(pending, next, head, list) {
		list_del(&pending->list);
		ret = create_pending_snapshot(trans, pending);
		if (ret)
			break;
	}
	return ret;
}

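/*
 * Copy the bytenr, generation and level of the freshly committed chunk and
 * tree roots into the in-memory superblock, so the superblock we are about
 * to write points at the roots of this transaction.
 */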
static void update_super_roots(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root_item *root_item;
	struct btrfs_super_block *super;

	super = fs_info->super_copy;

	root_item = &fs_info->chunk_root->root_item;
	super->chunk_root = root_item->bytenr;
	super->chunk_root_generation = root_item->generation;
	super->chunk_root_level = root_item->level;

	root_item = &fs_info->tree_root->root_item;
	super->root = root_item->bytenr;
	super->generation = root_item->generation;
	super->root_level = root_item->level;
	if (btrfs_test_opt(fs_info, SPACE_CACHE))
		super->cache_generation = root_item->generation;
	else if (test_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags))
		super->cache_generation = 0;
	if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))
		super->uuid_tree_generation = root_item->generation;
}

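/*
 * Report whether the currently running transaction, if any, has at least
 * entered the commit phase (state >= TRANS_STATE_COMMIT_START).
 */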
int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
{
	struct btrfs_transaction *trans;
	int ret = 0;

	spin_lock(&info->trans_lock);
	trans = info->running_transaction;
	if (trans)
		ret = (trans->state >= TRANS_STATE_COMMIT_START);
	spin_unlock(&info->trans_lock);
	return ret;
}

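/*
 * Report whether the currently running transaction, if any, is at a stage
 * of the commit where joining it would have to block.
 */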
int btrfs_transaction_blocked(struct btrfs_fs_info *info)
{
	struct btrfs_transaction *trans;
	int ret = 0;

	spin_lock(&info->trans_lock);
	trans = info->running_transaction;
	if (trans)
		ret = is_transaction_blocked(trans);
	spin_unlock(&info->trans_lock);
	return ret;
}

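/*
 * Start a commit of the current transaction without waiting for it to
 * complete: wake the transaction kthread to do the actual commit work and
 * return once the commit has started (or the transaction has aborted).
 */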
void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_transaction *cur_trans;

	/* Kick the transaction kthread. */
	set_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags);
	wake_up_process(fs_info->transaction_kthread);

	/* take transaction reference */
	cur_trans = trans->transaction;
	refcount_inc(&cur_trans->use_count);

	btrfs_end_transaction(trans);

	/*
	 * Wait for the current transaction commit to start and block
	 * subsequent transaction joins
	 */
	btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
	wait_event(fs_info->transaction_blocked_wait,
		   cur_trans->state >= TRANS_STATE_COMMIT_START ||
		   TRANS_ABORTED(cur_trans));
	btrfs_put_transaction(cur_trans);
}

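/*
 * Error path of btrfs_commit_transaction(): abort the transaction, wait for
 * any remaining writers to detach, unlink the transaction from fs_info,
 * cancel any running scrub and free both the transaction and the handle.
 */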
static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_transaction *cur_trans = trans->transaction;

	WARN_ON(refcount_read(&trans->use_count) > 1);

	btrfs_abort_transaction(trans, err);

	spin_lock(&fs_info->trans_lock);

	/*
	 * If the transaction is removed from the list, it means this
	 * transaction has been committed successfully, so it is impossible
	 * to call the cleanup function.
	 */
	BUG_ON(list_empty(&cur_trans->list));

	if (cur_trans == fs_info->running_transaction) {
		cur_trans->state = TRANS_STATE_COMMIT_DOING;
		spin_unlock(&fs_info->trans_lock);

		/*
		 * The thread has already released the lockdep map as a reader
		 * in btrfs_commit_transaction().
		 */
		btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers);
		wait_event(cur_trans->writer_wait,
			   atomic_read(&cur_trans->num_writers) == 1);

		spin_lock(&fs_info->trans_lock);
	}

	/*
	 * Now that we know no one else is still using the transaction we can
	 * remove the transaction from the list of transactions. This prevents
	 * the transaction kthread from cleaning up the transaction while some
	 * other task is still using it, which could result in a use-after-free
	 * on things like log trees, as it forces the transaction kthread to
	 * wait for this transaction to be cleaned up by us.
	 */
	list_del_init(&cur_trans->list);

	spin_unlock(&fs_info->trans_lock);

	btrfs_cleanup_one_transaction(trans->transaction, fs_info);

	spin_lock(&fs_info->trans_lock);
	if (cur_trans == fs_info->running_transaction)
		fs_info->running_transaction = NULL;
	spin_unlock(&fs_info->trans_lock);

	if (trans->type & __TRANS_FREEZABLE)
		sb_end_intwrite(fs_info->sb);
	btrfs_put_transaction(cur_trans);
	btrfs_put_transaction(cur_trans);

	trace_btrfs_transaction_commit(fs_info);

	if (current->journal_info == trans)
		current->journal_info = NULL;
	btrfs_scrub_cancel(fs_info);

	kmem_cache_free(btrfs_trans_handle_cachep, trans);
}

/*
 * Release reserved delayed ref space of all pending block groups of the
 * transaction and remove them from the list
 */
static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_block_group *block_group, *tmp;

	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
		btrfs_delayed_refs_rsv_release(fs_info, 1);
		list_del_init(&block_group->bg_list);
	}
}

static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
{
	/*
	 * We use try_to_writeback_inodes_sb() here because if we used
	 * btrfs_start_delalloc_roots we would deadlock with fs freeze.
	 * We are currently holding the fs freeze lock; if we do an async flush
	 * we'll do btrfs_join_transaction() and deadlock because we need to
	 * wait for the fs freeze lock.  Using the direct flushing we benefit
	 * from already being in a transaction and our join_transaction doesn't
	 * have to re-take the fs freeze lock.
	 *
	 * Note that try_to_writeback_inodes_sb() will only trigger writeback
	 * if it can read lock sb->s_umount. It will always be able to lock it,
	 * except when the filesystem is being unmounted or being frozen, but in
	 * those cases sync_filesystem() is called, which results in calling
	 * writeback_inodes_sb() while holding a write lock on sb->s_umount.
	 * Note that we don't call writeback_inodes_sb() directly, because it
	 * will emit a warning if sb->s_umount is not locked.
	 */
	if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
		try_to_writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC);
	return 0;
}

static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
{
	if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
		btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
}

/*
 * Add a pending snapshot associated with the given transaction handle to the
 * respective handle. This must be called after the transaction commit started
 * and while holding fs_info->trans_lock.
 * This serves to guarantee a caller of btrfs_commit_transaction() that it can
 * safely free the pending snapshot pointer in case btrfs_commit_transaction()
 * returns an error.
 */
static void add_pending_snapshot(struct btrfs_trans_handle *trans)
{
	struct btrfs_transaction *cur_trans = trans->transaction;

	if (!trans->pending_snapshot)
		return;

	lockdep_assert_held(&trans->fs_info->trans_lock);
	ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_START);

	list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots);
}

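/* Fold the duration of the commit that just finished into the global stats. */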
static void update_commit_stats(struct btrfs_fs_info *fs_info, ktime_t interval)
{
	fs_info->commit_stats.commit_count++;
	fs_info->commit_stats.last_commit_dur = interval;
	fs_info->commit_stats.max_commit_dur =
			max_t(u64, fs_info->commit_stats.max_commit_dur, interval);
	fs_info->commit_stats.total_commit_dur += interval;
}

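/*
 * Commit the current transaction and write the superblock. A minimal sketch
 * of the typical caller pattern (error handling elided; num_items is the
 * number of tree items the caller expects to modify):
 *
 *	trans = btrfs_start_transaction(root, num_items);
 *	if (IS_ERR(trans))
 *		return PTR_ERR(trans);
 *	... modify btree items under this handle ...
 *	ret = btrfs_commit_transaction(trans);
 */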
int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_transaction *cur_trans = trans->transaction;
	struct btrfs_transaction *prev_trans = NULL;
	int ret;
	ktime_t start_time;
	ktime_t interval;

	ASSERT(refcount_read(&trans->use_count) == 1);
	btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);

	clear_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags);

	/* Stop the commit early if ->aborted is set */
	if (TRANS_ABORTED(cur_trans)) {
		ret = cur_trans->aborted;
		goto lockdep_trans_commit_start_release;
	}

	btrfs_trans_release_metadata(trans);
	trans->block_rsv = NULL;

	/*
	 * We only want one transaction commit doing the flushing so we do not
	 * waste a bunch of time on lock contention on the extent root node.
	 */
	if (!test_and_set_bit(BTRFS_DELAYED_REFS_FLUSHING,
			      &cur_trans->delayed_refs.flags)) {
		/*
		 * Make a pass through all the delayed refs we have so far.
		 * Any running threads may add more while we are here.
		 */
		ret = btrfs_run_delayed_refs(trans, 0);
		if (ret)
			goto lockdep_trans_commit_start_release;
	}

	btrfs_create_pending_block_groups(trans);

	if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
		int run_it = 0;

		/* this mutex is also taken before trying to set
		 * block groups readonly.  We need to make sure
		 * that nobody has set a block group readonly
		 * after extents from that block group have been
		 * allocated for cache files.  btrfs_set_block_group_ro
		 * will wait for the transaction to commit if it
		 * finds BTRFS_TRANS_DIRTY_BG_RUN set.
		 *
		 * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure
		 * only one process starts all the block group IO.  It wouldn't
		 * hurt to have more than one go through, but there's no
		 * real advantage to it either.
		 */
		mutex_lock(&fs_info->ro_block_group_mutex);
		if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN,
				      &cur_trans->flags))
			run_it = 1;
		mutex_unlock(&fs_info->ro_block_group_mutex);

		if (run_it) {
			ret = btrfs_start_dirty_block_groups(trans);
			if (ret)
				goto lockdep_trans_commit_start_release;
		}
	}

	spin_lock(&fs_info->trans_lock);
	if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
		enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;

		add_pending_snapshot(trans);

		spin_unlock(&fs_info->trans_lock);
		refcount_inc(&cur_trans->use_count);

		if (trans->in_fsync)
			want_state = TRANS_STATE_SUPER_COMMITTED;

		btrfs_trans_state_lockdep_release(fs_info,
						  BTRFS_LOCKDEP_TRANS_COMMIT_START);
		ret = btrfs_end_transaction(trans);
		wait_for_commit(cur_trans, want_state);

		if (TRANS_ABORTED(cur_trans))
			ret = cur_trans->aborted;

		btrfs_put_transaction(cur_trans);

		return ret;
	}

	cur_trans->state = TRANS_STATE_COMMIT_START;
	wake_up(&fs_info->transaction_blocked_wait);
	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);

	if (cur_trans->list.prev != &fs_info->trans_list) {
		enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED;

		if (trans->in_fsync)
			want_state = TRANS_STATE_SUPER_COMMITTED;

		prev_trans = list_entry(cur_trans->list.prev,
					struct btrfs_transaction, list);
		if (prev_trans->state < want_state) {
			refcount_inc(&prev_trans->use_count);
			spin_unlock(&fs_info->trans_lock);

			wait_for_commit(prev_trans, want_state);

			ret = READ_ONCE(prev_trans->aborted);

			btrfs_put_transaction(prev_trans);
			if (ret)
				goto lockdep_release;
		} else {
			spin_unlock(&fs_info->trans_lock);
		}
	} else {
		spin_unlock(&fs_info->trans_lock);
		/*
		 * The previous transaction was aborted and was already removed
		 * from the list of transactions at fs_info->trans_list. So we
		 * abort to prevent writing a new superblock that reflects a
		 * corrupt state (pointing to trees with unwritten nodes/leaves).
		 */
		if (BTRFS_FS_ERROR(fs_info)) {
			ret = -EROFS;
			goto lockdep_release;
		}
	}

	/*
	 * Get the time spent on the work done by the commit thread and not
	 * the time spent waiting on a previous commit
	 */
	start_time = ktime_get_ns();

	extwriter_counter_dec(cur_trans, trans->type);

	ret = btrfs_start_delalloc_flush(fs_info);
	if (ret)
		goto lockdep_release;

	ret = btrfs_run_delayed_items(trans);
	if (ret)
		goto lockdep_release;

	/*
	 * The thread has started/joined the transaction thus it holds the
	 * lockdep map as a reader. It has to release it before acquiring the
	 * lockdep map as a writer.
	 */
	btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
	btrfs_might_wait_for_event(fs_info, btrfs_trans_num_extwriters);
	wait_event(cur_trans->writer_wait,
		   extwriter_counter_read(cur_trans) == 0);

	/* Some pending items might have been added after the previous flush. */
	ret = btrfs_run_delayed_items(trans);
	if (ret) {
		btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
		goto cleanup_transaction;
	}

	btrfs_wait_delalloc_flush(fs_info);

	/*
	 * Wait for all ordered extents started by a fast fsync that joined this
	 * transaction. Otherwise if this transaction commits before the ordered
	 * extents complete we lose logged data after a power failure.
	 */
	btrfs_might_wait_for_event(fs_info, btrfs_trans_pending_ordered);
	wait_event(cur_trans->pending_wait,
		   atomic_read(&cur_trans->pending_ordered) == 0);

	btrfs_scrub_pause(fs_info);
	/*
	 * Ok now we need to make sure to block out any other joins while we
	 * commit the transaction.  We could have started a join before setting
	 * COMMIT_DOING so make sure to wait for num_writers to == 1 again.
	 */
	spin_lock(&fs_info->trans_lock);
	add_pending_snapshot(trans);
	cur_trans->state = TRANS_STATE_COMMIT_DOING;
	spin_unlock(&fs_info->trans_lock);

	/*
	 * The thread has started/joined the transaction thus it holds the
	 * lockdep map as a reader. It has to release it before acquiring the
	 * lockdep map as a writer.
	 */
	btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
	btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers);
	wait_event(cur_trans->writer_wait,
		   atomic_read(&cur_trans->num_writers) == 1);

	/*
	 * Make lockdep happy by acquiring the state locks after
	 * btrfs_trans_num_writers is released. If we acquired the state locks
	 * before releasing the btrfs_trans_num_writers lock then lockdep would
	 * complain because we did not follow the reverse order unlocking rule.
	 */
	btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
	btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
	btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);

	/*
	 * We've started the commit, clear the flag in case we were triggered to
	 * do an async commit but somebody else started before the transaction
	 * kthread could do the work.
	 */
	clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags);

	if (TRANS_ABORTED(cur_trans)) {
		ret = cur_trans->aborted;
		btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
		goto scrub_continue;
	}
	/*
	 * the reloc mutex makes sure that we stop
	 * the balancing code from coming in and moving
	 * extents around in the middle of the commit
	 */
	mutex_lock(&fs_info->reloc_mutex);

	/*
	 * We needn't worry about the delayed items because we will
	 * deal with them in create_pending_snapshot(), which is the
	 * core function of the snapshot creation.
	 */
	ret = create_pending_snapshots(trans);
	if (ret)
		goto unlock_reloc;

	/*
	 * We insert the dir indexes of the snapshots and update the inode
	 * of the snapshots' parents after the snapshot creation, so there
	 * are some delayed items which are not dealt with. Now deal with
	 * them.
	 *
	 * We needn't worry that this operation will corrupt the snapshots,
	 * because all the trees which are snapshotted will be forced to COW
	 * their nodes and leaves.
	 */
	ret = btrfs_run_delayed_items(trans);
	if (ret)
		goto unlock_reloc;

	ret = btrfs_run_delayed_refs(trans, (unsigned long)-1);
	if (ret)
		goto unlock_reloc;

	/*
	 * make sure none of the code above managed to slip in a
	 * delayed item
	 */
	btrfs_assert_delayed_root_empty(fs_info);

	WARN_ON(cur_trans != trans->transaction);

	ret = commit_fs_roots(trans);
	if (ret)
		goto unlock_reloc;

	/*
	 * commit_fs_roots() gets rid of all the tree log roots, so it is now
	 * safe to free the log root tree.
	 */
	btrfs_free_log_root_tree(trans, fs_info);

	/*
	 * Since fs roots are all committed, we can get a quite accurate
	 * new_roots. So let's do quota accounting.
	 */
	ret = btrfs_qgroup_account_extents(trans);
	if (ret < 0)
		goto unlock_reloc;

	ret = commit_cowonly_roots(trans);
	if (ret)
		goto unlock_reloc;

	/*
	 * The tasks which save the space cache and inode cache may also
	 * update ->aborted, check it.
	 */
	if (TRANS_ABORTED(cur_trans)) {
		ret = cur_trans->aborted;
		goto unlock_reloc;
	}

	cur_trans = fs_info->running_transaction;

	btrfs_set_root_node(&fs_info->tree_root->root_item,
			    fs_info->tree_root->node);
	list_add_tail(&fs_info->tree_root->dirty_list,
		      &cur_trans->switch_commits);

	btrfs_set_root_node(&fs_info->chunk_root->root_item,
			    fs_info->chunk_root->node);
	list_add_tail(&fs_info->chunk_root->dirty_list,
		      &cur_trans->switch_commits);

	if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
		btrfs_set_root_node(&fs_info->block_group_root->root_item,
				    fs_info->block_group_root->node);
		list_add_tail(&fs_info->block_group_root->dirty_list,
			      &cur_trans->switch_commits);
	}

	switch_commit_roots(trans);

	ASSERT(list_empty(&cur_trans->dirty_bgs));
	ASSERT(list_empty(&cur_trans->io_bgs));
	update_super_roots(fs_info);

	btrfs_set_super_log_root(fs_info->super_copy, 0);
	btrfs_set_super_log_root_level(fs_info->super_copy, 0);
	memcpy(fs_info->super_for_commit, fs_info->super_copy,
	       sizeof(*fs_info->super_copy));

	btrfs_commit_device_sizes(cur_trans);

	clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
	clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);

	btrfs_trans_release_chunk_metadata(trans);

	/*
	 * Before changing the transaction state to TRANS_STATE_UNBLOCKED and
	 * setting fs_info->running_transaction to NULL, lock tree_log_mutex to
	 * make sure that before we commit our superblock, no other task can
	 * start a new transaction and commit a log tree before we commit our
	 * superblock. Anyone trying to commit a log tree locks this mutex before
	 * writing its superblock.
	 */
	mutex_lock(&fs_info->tree_log_mutex);

	spin_lock(&fs_info->trans_lock);
	cur_trans->state = TRANS_STATE_UNBLOCKED;
	fs_info->running_transaction = NULL;
	spin_unlock(&fs_info->trans_lock);
	mutex_unlock(&fs_info->reloc_mutex);

	wake_up(&fs_info->transaction_wait);
	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);

	ret = btrfs_write_and_wait_transaction(trans);
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret,
				      "Error while writing out transaction");
		mutex_unlock(&fs_info->tree_log_mutex);
		goto scrub_continue;
	}

	/*
	 * At this point, we should have written all the tree blocks allocated
	 * in this transaction. So it's now safe to free the redirtied extent
	 * buffers.
	 */
	btrfs_free_redirty_list(cur_trans);

	ret = write_all_supers(fs_info, 0);
	/*
	 * the super is written, we can safely allow the tree-loggers
	 * to go about their business
	 */
	mutex_unlock(&fs_info->tree_log_mutex);
	if (ret)
		goto scrub_continue;

	/*
	 * We needn't acquire the lock here because there is no other task
	 * which can change it.
	 */
	cur_trans->state = TRANS_STATE_SUPER_COMMITTED;
	wake_up(&cur_trans->commit_wait);
	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);

	btrfs_finish_extent_commit(trans);

	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
		btrfs_clear_space_info_full(fs_info);

	fs_info->last_trans_committed = cur_trans->transid;
	/*
	 * We needn't acquire the lock here because there is no other task
	 * which can change it.
	 */
	cur_trans->state = TRANS_STATE_COMPLETED;
	wake_up(&cur_trans->commit_wait);
	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);

	spin_lock(&fs_info->trans_lock);
	list_del_init(&cur_trans->list);
	spin_unlock(&fs_info->trans_lock);

	btrfs_put_transaction(cur_trans);
	btrfs_put_transaction(cur_trans);

	if (trans->type & __TRANS_FREEZABLE)
		sb_end_intwrite(fs_info->sb);

	trace_btrfs_transaction_commit(fs_info);

	interval = ktime_get_ns() - start_time;

	btrfs_scrub_continue(fs_info);

	if (current->journal_info == trans)
		current->journal_info = NULL;

	kmem_cache_free(btrfs_trans_handle_cachep, trans);

	update_commit_stats(fs_info, interval);

	return ret;

unlock_reloc:
	mutex_unlock(&fs_info->reloc_mutex);
	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED);
scrub_continue:
	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED);
	btrfs_scrub_continue(fs_info);
cleanup_transaction:
	btrfs_trans_release_metadata(trans);
	btrfs_cleanup_pending_block_groups(trans);
	btrfs_trans_release_chunk_metadata(trans);
	trans->block_rsv = NULL;
	btrfs_warn(fs_info, "Skipping commit of aborted transaction.");
	if (current->journal_info == trans)
		current->journal_info = NULL;
	cleanup_transaction(trans, ret);

	return ret;

lockdep_release:
	btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters);
	btrfs_lockdep_release(fs_info, btrfs_trans_num_writers);
	goto cleanup_transaction;

lockdep_trans_commit_start_release:
	btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
	btrfs_end_transaction(trans);
	return ret;
}

/*
 * return < 0 if error
 * 0 if there are no more dead_roots at the time of call
 * 1 there are more to be processed, call me again
 *
 * The return value indicates there are certainly more snapshots to delete, but
 * if there comes a new one during processing, it may return 0. We don't mind,
 * because btrfs_commit_super will poke cleaner thread and it will process it a
 * few seconds later.
 */
int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root;
	int ret;

	spin_lock(&fs_info->trans_lock);
	if (list_empty(&fs_info->dead_roots)) {
		spin_unlock(&fs_info->trans_lock);
		return 0;
	}
	root = list_first_entry(&fs_info->dead_roots,
			struct btrfs_root, root_list);
	list_del_init(&root->root_list);
	spin_unlock(&fs_info->trans_lock);

	btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid);

	btrfs_kill_all_delayed_nodes(root);

	if (btrfs_header_backref_rev(root->node) <
			BTRFS_MIXED_BACKREF_REV)
		ret = btrfs_drop_snapshot(root, 0, 0);
	else
		ret = btrfs_drop_snapshot(root, 1, 0);

	btrfs_put_root(root);
	return (ret < 0) ? 0 : 1;
}

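/*
 * Create the slab cache for transaction handles at module init; matched by
 * btrfs_transaction_exit() at module exit.
 */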
int __init btrfs_transaction_init(void)
{
	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
			sizeof(struct btrfs_trans_handle), 0,
			SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
	if (!btrfs_trans_handle_cachep)
		return -ENOMEM;
	return 0;
}

void __cold btrfs_transaction_exit(void)
{
	kmem_cache_destroy(btrfs_trans_handle_cachep);
}