inode.c 299.2 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
C
Chris Mason 已提交
2 3 4 5
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

6
#include <crypto/hash.h>
7
#include <linux/kernel.h>
8
#include <linux/bio.h>
9
#include <linux/blk-cgroup.h>
S
Sage Weil 已提交
10
#include <linux/file.h>
C
Chris Mason 已提交
11 12 13 14 15 16 17 18 19
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/compat.h>
J
Josef Bacik 已提交
20
#include <linux/xattr.h>
J
Josef Bacik 已提交
21
#include <linux/posix_acl.h>
Y
Yan Zheng 已提交
22
#include <linux/falloc.h>
23
#include <linux/slab.h>
24
#include <linux/ratelimit.h>
25
#include <linux/btrfs.h>
D
David Woodhouse 已提交
26
#include <linux/blkdev.h>
27
#include <linux/posix_acl_xattr.h>
28
#include <linux/uio.h>
29
#include <linux/magic.h>
30
#include <linux/iversion.h>
O
Omar Sandoval 已提交
31
#include <linux/swap.h>
32
#include <linux/migrate.h>
33
#include <linux/sched/mm.h>
34
#include <linux/iomap.h>
35
#include <asm/unaligned.h>
B
Boris Burkov 已提交
36
#include <linux/fsverity.h>
37
#include "misc.h"
C
Chris Mason 已提交
38 39 40 41 42
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
43
#include "ordered-data.h"
44
#include "xattr.h"
45
#include "tree-log.h"
46
#include "volumes.h"
C
Chris Mason 已提交
47
#include "compression.h"
48
#include "locking.h"
49
#include "free-space-cache.h"
50
#include "props.h"
51
#include "qgroup.h"
52
#include "delalloc-space.h"
53
#include "block-group.h"
54
#include "space-info.h"
55
#include "zoned.h"
56
#include "subpage.h"
J
Josef Bacik 已提交
57
#include "inode-item.h"
C
Chris Mason 已提交
58 59

struct btrfs_iget_args {
D
David Sterba 已提交
60
	u64 ino;
C
Chris Mason 已提交
61 62 63
	struct btrfs_root *root;
};

64
struct btrfs_dio_data {
65 66
	ssize_t submitted;
	struct extent_changeset *data_reserved;
67 68
};

69 70 71 72 73
struct btrfs_rename_ctx {
	/* Output field. Stores the index number of the old directory entry. */
	u64 index;
};

74 75 76 77
static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
78
static const struct address_space_operations btrfs_aops;
79
static const struct file_operations btrfs_dir_file_operations;
C
Chris Mason 已提交
80 81 82 83

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_path_cachep;
84
struct kmem_cache *btrfs_free_space_cachep;
85
struct kmem_cache *btrfs_free_space_bitmap_cachep;
C
Chris Mason 已提交
86

87
static int btrfs_setsize(struct inode *inode, struct iattr *attr);
88
static int btrfs_truncate(struct inode *inode, bool skip_writeback);
89
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
90
static noinline int cow_file_range(struct btrfs_inode *inode,
91
				   struct page *locked_page,
92
				   u64 start, u64 end, int *page_started,
93
				   unsigned long *nr_written, int unlock);
94 95
static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
				       u64 len, u64 orig_start, u64 block_start,
96 97 98
				       u64 block_len, u64 orig_block_len,
				       u64 ram_bytes, int compress_type,
				       int type);
99

100
static void __endio_write_update_ordered(struct btrfs_inode *inode,
101 102 103
					 const u64 offset, const u64 bytes,
					 const bool uptodate);

104 105 106 107 108 109 110 111
/*
 * btrfs_inode_lock - lock inode i_rwsem based on arguments passed
 *
 * ilock_flags can have the following bit set:
 *
 * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
 * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt
 *		     return -EAGAIN
112
 * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132
 */
int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
{
	if (ilock_flags & BTRFS_ILOCK_SHARED) {
		if (ilock_flags & BTRFS_ILOCK_TRY) {
			if (!inode_trylock_shared(inode))
				return -EAGAIN;
			else
				return 0;
		}
		inode_lock_shared(inode);
	} else {
		if (ilock_flags & BTRFS_ILOCK_TRY) {
			if (!inode_trylock(inode))
				return -EAGAIN;
			else
				return 0;
		}
		inode_lock(inode);
	}
133 134
	if (ilock_flags & BTRFS_ILOCK_MMAP)
		down_write(&BTRFS_I(inode)->i_mmap_lock);
135 136 137 138 139 140 141 142 143 144 145
	return 0;
}

/*
 * btrfs_inode_unlock - unock inode i_rwsem
 *
 * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
 * to decide whether the lock acquired is shared or exclusive.
 */
void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags)
{
146 147
	if (ilock_flags & BTRFS_ILOCK_MMAP)
		up_write(&BTRFS_I(inode)->i_mmap_lock);
148 149 150 151 152 153
	if (ilock_flags & BTRFS_ILOCK_SHARED)
		inode_unlock_shared(inode);
	else
		inode_unlock(inode);
}

154 155
/*
 * Cleanup all submitted ordered extents in specified range to handle errors
156
 * from the btrfs_run_delalloc_range() callback.
157 158 159 160 161
 *
 * NOTE: caller must ensure that when an error happens, it can not call
 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
 * to be released, which we want to happen only when finishing the ordered
162
 * extent (btrfs_finish_ordered_io()).
163
 */
164
static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
165 166
						 struct page *locked_page,
						 u64 offset, u64 bytes)
167
{
168 169
	unsigned long index = offset >> PAGE_SHIFT;
	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
170 171 172
	u64 page_start = page_offset(locked_page);
	u64 page_end = page_start + PAGE_SIZE - 1;

173 174 175
	struct page *page;

	while (index <= end_index) {
176 177 178 179 180 181 182 183 184 185 186 187 188 189 190
		/*
		 * For locked page, we will call end_extent_writepage() on it
		 * in run_delalloc_range() for the error handling.  That
		 * end_extent_writepage() function will call
		 * btrfs_mark_ordered_io_finished() to clear page Ordered and
		 * run the ordered extent accounting.
		 *
		 * Here we can't just clear the Ordered bit, or
		 * btrfs_mark_ordered_io_finished() would skip the accounting
		 * for the page range, and the ordered extent will never finish.
		 */
		if (index == (page_offset(locked_page) >> PAGE_SHIFT)) {
			index++;
			continue;
		}
191
		page = find_get_page(inode->vfs_inode.i_mapping, index);
192 193 194
		index++;
		if (!page)
			continue;
195 196 197 198 199 200

		/*
		 * Here we just clear all Ordered bits for every page in the
		 * range, then __endio_write_update_ordered() will handle
		 * the ordered extent accounting for the range.
		 */
201 202
		btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
					       offset, bytes);
203 204
		put_page(page);
	}
205

206 207 208
	/* The locked page covers the full range, nothing needs to be done */
	if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE)
		return;
209 210 211 212 213 214
	/*
	 * In case this page belongs to the delalloc range being instantiated
	 * then skip it, since the first page of a range is going to be
	 * properly cleaned up by the caller of run_delalloc_range
	 */
	if (page_start >= offset && page_end <= (offset + bytes - 1)) {
215 216
		bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
		offset = page_offset(locked_page) + PAGE_SIZE;
217 218
	}

219
	return __endio_write_update_ordered(inode, offset, bytes, false);
220 221
}

222
static int btrfs_dirty_inode(struct inode *inode);
223

224
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
225 226
				     struct inode *inode,  struct inode *dir,
				     const struct qstr *qstr)
J
Jim Owens 已提交
227 228 229
{
	int err;

230
	err = btrfs_init_acl(trans, inode, dir);
J
Jim Owens 已提交
231
	if (!err)
232
		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
J
Jim Owens 已提交
233 234 235
	return err;
}

C
Chris Mason 已提交
236 237 238 239 240
/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
241
static int insert_inline_extent(struct btrfs_trans_handle *trans,
242
				struct btrfs_path *path, bool extent_inserted,
C
Chris Mason 已提交
243 244
				struct btrfs_root *root, struct inode *inode,
				u64 start, size_t size, size_t compressed_size,
245
				int compress_type,
C
Chris Mason 已提交
246 247 248 249 250 251 252 253 254 255 256
				struct page **compressed_pages)
{
	struct extent_buffer *leaf;
	struct page *page = NULL;
	char *kaddr;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	int ret;
	size_t cur_size = size;
	unsigned long offset;

257 258 259
	ASSERT((compressed_size > 0 && compressed_pages) ||
	       (compressed_size == 0 && !compressed_pages));

260
	if (compressed_size && compressed_pages)
C
Chris Mason 已提交
261 262
		cur_size = compressed_size;

263 264 265
	if (!extent_inserted) {
		struct btrfs_key key;
		size_t datasize;
C
Chris Mason 已提交
266

267
		key.objectid = btrfs_ino(BTRFS_I(inode));
268
		key.offset = start;
269
		key.type = BTRFS_EXTENT_DATA_KEY;
C
Chris Mason 已提交
270

271 272 273
		datasize = btrfs_file_extent_calc_inline_size(cur_size);
		ret = btrfs_insert_empty_item(trans, root, path, &key,
					      datasize);
274
		if (ret)
275
			goto fail;
C
Chris Mason 已提交
276 277 278 279 280 281 282 283 284 285 286
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
	ptr = btrfs_file_extent_inline_start(ei);

287
	if (compress_type != BTRFS_COMPRESS_NONE) {
C
Chris Mason 已提交
288 289
		struct page *cpage;
		int i = 0;
C
Chris Mason 已提交
290
		while (compressed_size > 0) {
C
Chris Mason 已提交
291
			cpage = compressed_pages[i];
292
			cur_size = min_t(unsigned long, compressed_size,
293
				       PAGE_SIZE);
C
Chris Mason 已提交
294

295
			kaddr = kmap_atomic(cpage);
C
Chris Mason 已提交
296
			write_extent_buffer(leaf, kaddr, ptr, cur_size);
297
			kunmap_atomic(kaddr);
C
Chris Mason 已提交
298 299 300 301 302 303

			i++;
			ptr += cur_size;
			compressed_size -= cur_size;
		}
		btrfs_set_file_extent_compression(leaf, ei,
304
						  compress_type);
C
Chris Mason 已提交
305 306
	} else {
		page = find_get_page(inode->i_mapping,
307
				     start >> PAGE_SHIFT);
C
Chris Mason 已提交
308
		btrfs_set_file_extent_compression(leaf, ei, 0);
309
		kaddr = kmap_atomic(page);
310
		offset = offset_in_page(start);
C
Chris Mason 已提交
311
		write_extent_buffer(leaf, kaddr + offset, ptr, size);
312
		kunmap_atomic(kaddr);
313
		put_page(page);
C
Chris Mason 已提交
314 315
	}
	btrfs_mark_buffer_dirty(leaf);
316
	btrfs_release_path(path);
C
Chris Mason 已提交
317

318 319 320 321 322 323 324 325 326
	/*
	 * We align size to sectorsize for inline extents just for simplicity
	 * sake.
	 */
	size = ALIGN(size, root->fs_info->sectorsize);
	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size);
	if (ret)
		goto fail;

327 328 329 330 331 332 333 334 335
	/*
	 * we're an inline extent, so nobody can
	 * extend the file past i_size without locking
	 * a page we already have locked.
	 *
	 * We must do any isize and inode updates
	 * before we unlock the pages.  Otherwise we
	 * could end up racing with unlink.
	 */
C
Chris Mason 已提交
336 337
	BTRFS_I(inode)->disk_i_size = inode->i_size;
fail:
338
	return ret;
C
Chris Mason 已提交
339 340 341 342 343 344 345 346
}


/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
347
static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 start,
348 349 350
					  u64 end, size_t compressed_size,
					  int compress_type,
					  struct page **compressed_pages)
C
Chris Mason 已提交
351
{
352
	struct btrfs_drop_extents_args drop_args = { 0 };
353
	struct btrfs_root *root = inode->root;
354
	struct btrfs_fs_info *fs_info = root->fs_info;
355
	struct btrfs_trans_handle *trans;
356
	u64 isize = i_size_read(&inode->vfs_inode);
C
Chris Mason 已提交
357 358
	u64 actual_end = min(end + 1, isize);
	u64 inline_len = actual_end - start;
359
	u64 aligned_end = ALIGN(end, fs_info->sectorsize);
C
Chris Mason 已提交
360 361
	u64 data_len = inline_len;
	int ret;
362
	struct btrfs_path *path;
C
Chris Mason 已提交
363 364 365 366 367

	if (compressed_size)
		data_len = compressed_size;

	if (start > 0 ||
368 369
	    actual_end > fs_info->sectorsize ||
	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
C
Chris Mason 已提交
370
	    (!compressed_size &&
371
	    (actual_end & (fs_info->sectorsize - 1)) == 0) ||
C
Chris Mason 已提交
372
	    end + 1 < isize ||
373
	    data_len > fs_info->max_inline) {
C
Chris Mason 已提交
374 375 376
		return 1;
	}

377 378 379 380
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

381
	trans = btrfs_join_transaction(root);
382 383
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
384
		return PTR_ERR(trans);
385
	}
386
	trans->block_rsv = &inode->block_rsv;
387

388 389 390 391 392 393
	drop_args.path = path;
	drop_args.start = start;
	drop_args.end = aligned_end;
	drop_args.drop_cache = true;
	drop_args.replace_extent = true;

394
	if (compressed_size && compressed_pages)
395
		drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
396 397
		   compressed_size);
	else
398
		drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(
399 400
		    inline_len);

401
	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
402
	if (ret) {
403
		btrfs_abort_transaction(trans, ret);
404 405
		goto out;
	}
C
Chris Mason 已提交
406 407 408

	if (isize > actual_end)
		inline_len = min_t(u64, isize, actual_end);
409
	ret = insert_inline_extent(trans, path, drop_args.extent_inserted,
410
				   root, &inode->vfs_inode, start,
C
Chris Mason 已提交
411
				   inline_len, compressed_size,
412
				   compress_type, compressed_pages);
413
	if (ret && ret != -ENOSPC) {
414
		btrfs_abort_transaction(trans, ret);
415
		goto out;
416
	} else if (ret == -ENOSPC) {
417 418
		ret = 1;
		goto out;
419
	}
420

421
	btrfs_update_inode_bytes(inode, inline_len, drop_args.bytes_found);
422
	ret = btrfs_update_inode(trans, root, inode);
423 424 425 426 427 428 429 430
	if (ret && ret != -ENOSPC) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	} else if (ret == -ENOSPC) {
		ret = 1;
		goto out;
	}

431
	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
432
out:
433 434 435 436 437 438
	/*
	 * Don't forget to free the reserved space, as for inlined extent
	 * it won't count as data extent, free them directly here.
	 * And at reserve time, it's always aligned to page size, so
	 * just free one page here.
	 */
439
	btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
440
	btrfs_free_path(path);
441
	btrfs_end_transaction(trans);
442
	return ret;
C
Chris Mason 已提交
443 444
}

445 446 447 448 449 450
struct async_extent {
	u64 start;
	u64 ram_size;
	u64 compressed_size;
	struct page **pages;
	unsigned long nr_pages;
451
	int compress_type;
452 453 454
	struct list_head list;
};

455
struct async_chunk {
456 457 458 459
	struct inode *inode;
	struct page *locked_page;
	u64 start;
	u64 end;
460
	unsigned int write_flags;
461
	struct list_head extents;
462
	struct cgroup_subsys_state *blkcg_css;
463
	struct btrfs_work work;
464
	struct async_cow *async_cow;
465 466
};

467 468 469
struct async_cow {
	atomic_t num_chunks;
	struct async_chunk chunks[];
470 471
};

472
static noinline int add_async_extent(struct async_chunk *cow,
473 474 475
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct page **pages,
476 477
				     unsigned long nr_pages,
				     int compress_type)
478 479 480 481
{
	struct async_extent *async_extent;

	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
482
	BUG_ON(!async_extent); /* -ENOMEM */
483 484 485 486 487
	async_extent->start = start;
	async_extent->ram_size = ram_size;
	async_extent->compressed_size = compressed_size;
	async_extent->pages = pages;
	async_extent->nr_pages = nr_pages;
488
	async_extent->compress_type = compress_type;
489 490 491 492
	list_add_tail(&async_extent->list, &cow->extents);
	return 0;
}

493 494 495
/*
 * Check if the inode has flags compatible with compression
 */
496
static inline bool inode_can_compress(struct btrfs_inode *inode)
497
{
498 499
	if (inode->flags & BTRFS_INODE_NODATACOW ||
	    inode->flags & BTRFS_INODE_NODATASUM)
500 501 502 503 504 505 506 507
		return false;
	return true;
}

/*
 * Check if the inode needs to be submitted to compression, based on mount
 * options, defragmentation, properties or heuristics.
 */
508 509
static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
				      u64 end)
510
{
511
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
512

513
	if (!inode_can_compress(inode)) {
514 515
		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
			KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
516
			btrfs_ino(inode));
517 518
		return 0;
	}
519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550
	/*
	 * Special check for subpage.
	 *
	 * We lock the full page then run each delalloc range in the page, thus
	 * for the following case, we will hit some subpage specific corner case:
	 *
	 * 0		32K		64K
	 * |	|///////|	|///////|
	 *		\- A		\- B
	 *
	 * In above case, both range A and range B will try to unlock the full
	 * page [0, 64K), causing the one finished later will have page
	 * unlocked already, triggering various page lock requirement BUG_ON()s.
	 *
	 * So here we add an artificial limit that subpage compression can only
	 * if the range is fully page aligned.
	 *
	 * In theory we only need to ensure the first page is fully covered, but
	 * the tailing partial page will be locked until the full compression
	 * finishes, delaying the write of other range.
	 *
	 * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range
	 * first to prevent any submitted async extent to unlock the full page.
	 * By this, we can ensure for subpage case that only the last async_cow
	 * will unlock the full page.
	 */
	if (fs_info->sectorsize < PAGE_SIZE) {
		if (!IS_ALIGNED(start, PAGE_SIZE) ||
		    !IS_ALIGNED(end + 1, PAGE_SIZE))
			return 0;
	}

551
	/* force compress */
552
	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
553
		return 1;
554
	/* defrag ioctl */
555
	if (inode->defrag_compress)
556
		return 1;
557
	/* bad compression ratios */
558
	if (inode->flags & BTRFS_INODE_NOCOMPRESS)
559
		return 0;
560
	if (btrfs_test_opt(fs_info, COMPRESS) ||
561 562 563
	    inode->flags & BTRFS_INODE_COMPRESS ||
	    inode->prop_compress)
		return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
564 565 566
	return 0;
}

567
static inline void inode_should_defrag(struct btrfs_inode *inode,
568
		u64 start, u64 end, u64 num_bytes, u32 small_write)
569 570 571
{
	/* If this is a small write inside eof, kick off a defrag */
	if (num_bytes < small_write &&
572
	    (start > 0 || end + 1 < inode->disk_i_size))
573
		btrfs_add_inode_defrag(NULL, inode, small_write);
574 575
}

C
Chris Mason 已提交
576
/*
577 578 579
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
C
Chris Mason 已提交
580
 *
581 582 583 584 585
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
C
Chris Mason 已提交
586
 *
587 588 589
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
590 591
 * are written in the same order that the flusher thread sent them
 * down.
C
Chris Mason 已提交
592
 */
593
static noinline int compress_file_range(struct async_chunk *async_chunk)
594
{
595
	struct inode *inode = async_chunk->inode;
596 597
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	u64 blocksize = fs_info->sectorsize;
598 599
	u64 start = async_chunk->start;
	u64 end = async_chunk->end;
C
Chris Mason 已提交
600
	u64 actual_end;
601
	u64 i_size;
602
	int ret = 0;
C
Chris Mason 已提交
603 604 605 606 607 608
	struct page **pages = NULL;
	unsigned long nr_pages;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	int i;
	int will_compress;
609
	int compress_type = fs_info->compress_type;
610
	int compressed_extents = 0;
611
	int redirty = 0;
612

613 614
	inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
			SZ_16K);
C
Chris Mason 已提交
615

616 617 618 619 620 621 622 623 624 625 626 627 628
	/*
	 * We need to save i_size before now because it could change in between
	 * us evaluating the size and assigning it.  This is because we lock and
	 * unlock the page in truncate and fallocate, and then modify the i_size
	 * later on.
	 *
	 * The barriers are to emulate READ_ONCE, remove that once i_size_read
	 * does that for us.
	 */
	barrier();
	i_size = i_size_read(inode);
	barrier();
	actual_end = min_t(u64, i_size, end + 1);
C
Chris Mason 已提交
629 630
again:
	will_compress = 0;
631
	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
632 633
	nr_pages = min_t(unsigned long, nr_pages,
			BTRFS_MAX_COMPRESSED / PAGE_SIZE);
634

635 636 637 638 639 640 641 642 643 644 645 646 647
	/*
	 * we don't want to send crud past the end of i_size through
	 * compression, that's just a waste of CPU time.  So, if the
	 * end of the file is before the start of our current
	 * requested range of bytes, we bail out to the uncompressed
	 * cleanup code that can deal with all of this.
	 *
	 * It isn't really the fastest way to fix things, but this is a
	 * very uncommon corner.
	 */
	if (actual_end <= start)
		goto cleanup_and_bail_uncompressed;

C
Chris Mason 已提交
648 649
	total_compressed = actual_end - start;

650
	/*
651
	 * Skip compression for a small file range(<=blocksize) that
652
	 * isn't an inline extent, since it doesn't save disk space at all.
653 654 655 656 657
	 */
	if (total_compressed <= blocksize &&
	   (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		goto cleanup_and_bail_uncompressed;

658 659 660 661 662 663 664 665 666 667 668
	/*
	 * For subpage case, we require full page alignment for the sector
	 * aligned range.
	 * Thus we must also check against @actual_end, not just @end.
	 */
	if (blocksize < PAGE_SIZE) {
		if (!IS_ALIGNED(start, PAGE_SIZE) ||
		    !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE))
			goto cleanup_and_bail_uncompressed;
	}

669 670
	total_compressed = min_t(unsigned long, total_compressed,
			BTRFS_MAX_UNCOMPRESSED);
C
Chris Mason 已提交
671 672
	total_in = 0;
	ret = 0;
673

674 675 676 677
	/*
	 * we do compression for mount -o compress and when the
	 * inode has not been flagged as nocompress.  This flag can
	 * change at any time if we discover bad compression ratios.
C
Chris Mason 已提交
678
	 */
679
	if (inode_need_compress(BTRFS_I(inode), start, end)) {
C
Chris Mason 已提交
680
		WARN_ON(pages);
681
		pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
682 683
		if (!pages) {
			/* just bail out to the uncompressed code */
684
			nr_pages = 0;
685 686
			goto cont;
		}
C
Chris Mason 已提交
687

688 689 690
		if (BTRFS_I(inode)->defrag_compress)
			compress_type = BTRFS_I(inode)->defrag_compress;
		else if (BTRFS_I(inode)->prop_compress)
691
			compress_type = BTRFS_I(inode)->prop_compress;
692

693 694 695 696 697 698 699 700
		/*
		 * we need to call clear_page_dirty_for_io on each
		 * page in the range.  Otherwise applications with the file
		 * mmap'd can wander in and change the page contents while
		 * we are compressing them.
		 *
		 * If the compression fails for any reason, we set the pages
		 * dirty again later on.
701 702 703
		 *
		 * Note that the remaining part is redirtied, the start pointer
		 * has moved, the end is the original one.
704
		 */
705 706 707 708
		if (!redirty) {
			extent_range_clear_dirty_for_io(inode, start, end);
			redirty = 1;
		}
709 710 711 712

		/* Compression level is applied here and only here */
		ret = btrfs_compress_pages(
			compress_type | (fs_info->compress_level << 4),
713
					   inode->i_mapping, start,
714
					   pages,
715
					   &nr_pages,
716
					   &total_in,
717
					   &total_compressed);
C
Chris Mason 已提交
718 719

		if (!ret) {
720
			unsigned long offset = offset_in_page(total_compressed);
721
			struct page *page = pages[nr_pages - 1];
C
Chris Mason 已提交
722 723 724 725

			/* zero the tail end of the last page, we might be
			 * sending it down to disk
			 */
726 727
			if (offset)
				memzero_page(page, offset, PAGE_SIZE - offset);
C
Chris Mason 已提交
728 729 730
			will_compress = 1;
		}
	}
731
cont:
732 733 734 735 736
	/*
	 * Check cow_file_range() for why we don't even try to create inline
	 * extent for subpage case.
	 */
	if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
C
Chris Mason 已提交
737
		/* lets try to make an inline extent */
738
		if (ret || total_in < actual_end) {
C
Chris Mason 已提交
739
			/* we didn't compress the entire range, try
740
			 * to make an uncompressed inline extent.
C
Chris Mason 已提交
741
			 */
742 743 744
			ret = cow_file_range_inline(BTRFS_I(inode), start, end,
						    0, BTRFS_COMPRESS_NONE,
						    NULL);
C
Chris Mason 已提交
745
		} else {
746
			/* try making a compressed inline extent */
747
			ret = cow_file_range_inline(BTRFS_I(inode), start, end,
748 749
						    total_compressed,
						    compress_type, pages);
C
Chris Mason 已提交
750
		}
751
		if (ret <= 0) {
752
			unsigned long clear_flags = EXTENT_DELALLOC |
J
Josef Bacik 已提交
753 754
				EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
				EXTENT_DO_ACCOUNTING;
755 756 757
			unsigned long page_error_op;

			page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
758

759
			/*
760 761 762
			 * inline extent creation worked or returned error,
			 * we don't need to create any more async work items.
			 * Unlock and free up our temp pages.
J
Josef Bacik 已提交
763 764 765 766 767
			 *
			 * We use DO_ACCOUNTING here because we need the
			 * delalloc_release_metadata to be done _after_ we drop
			 * our outstanding extent for clearing delalloc for this
			 * range.
768
			 */
769 770
			extent_clear_unlock_delalloc(BTRFS_I(inode), start, end,
						     NULL,
771
						     clear_flags,
772
						     PAGE_UNLOCK |
773
						     PAGE_START_WRITEBACK |
774
						     page_error_op |
775
						     PAGE_END_WRITEBACK);
776

777 778 779 780 781 782 783 784 785 786 787
			/*
			 * Ensure we only free the compressed pages if we have
			 * them allocated, as we can still reach here with
			 * inode_need_compress() == false.
			 */
			if (pages) {
				for (i = 0; i < nr_pages; i++) {
					WARN_ON(pages[i]->mapping);
					put_page(pages[i]);
				}
				kfree(pages);
788 789
			}
			return 0;
C
Chris Mason 已提交
790 791 792 793 794 795 796 797 798
		}
	}

	if (will_compress) {
		/*
		 * we aren't doing an inline extent round the compressed size
		 * up to a block size boundary so the allocator does sane
		 * things
		 */
799
		total_compressed = ALIGN(total_compressed, blocksize);
C
Chris Mason 已提交
800 801 802

		/*
		 * one last check to make sure the compression is really a
803 804
		 * win, compare the page count read with the blocks on disk,
		 * compression must free at least one sector size
C
Chris Mason 已提交
805
		 */
806
		total_in = round_up(total_in, fs_info->sectorsize);
807
		if (total_compressed + blocksize <= total_in) {
808
			compressed_extents++;
809 810 811 812 813 814

			/*
			 * The async work queues will take care of doing actual
			 * allocation on disk for these compressed pages, and
			 * will submit them to the elevator.
			 */
815
			add_async_extent(async_chunk, start, total_in,
816
					total_compressed, pages, nr_pages,
817 818
					compress_type);

819 820
			if (start + total_in < end) {
				start += total_in;
821 822 823 824
				pages = NULL;
				cond_resched();
				goto again;
			}
825
			return compressed_extents;
C
Chris Mason 已提交
826 827
		}
	}
828
	if (pages) {
C
Chris Mason 已提交
829 830 831 832
		/*
		 * the compression code ran but failed to make things smaller,
		 * free any pages it allocated and our page pointer array
		 */
833
		for (i = 0; i < nr_pages; i++) {
C
Chris Mason 已提交
834
			WARN_ON(pages[i]->mapping);
835
			put_page(pages[i]);
C
Chris Mason 已提交
836 837 838 839
		}
		kfree(pages);
		pages = NULL;
		total_compressed = 0;
840
		nr_pages = 0;
C
Chris Mason 已提交
841 842

		/* flag the file so we don't compress in the future */
843
		if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
844
		    !(BTRFS_I(inode)->prop_compress)) {
C
Chris Mason 已提交
845
			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
C
Chris Mason 已提交
846
		}
C
Chris Mason 已提交
847
	}
848
cleanup_and_bail_uncompressed:
849 850 851 852 853 854
	/*
	 * No compression, but we still need to write the pages in the file
	 * we've been given so far.  redirty the locked page if it corresponds
	 * to our extent and set things up for the async work queue to run
	 * cow_file_range to do the normal delalloc dance.
	 */
855 856 857
	if (async_chunk->locked_page &&
	    (page_offset(async_chunk->locked_page) >= start &&
	     page_offset(async_chunk->locked_page)) <= end) {
858
		__set_page_dirty_nobuffers(async_chunk->locked_page);
859
		/* unlocked later on in the async handlers */
860
	}
861 862 863

	if (redirty)
		extent_range_redirty_for_io(inode, start, end);
864
	add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
865
			 BTRFS_COMPRESS_NONE);
866
	compressed_extents++;
867

868
	return compressed_extents;
869 870
}

871 872 873 874 875 876 877 878 879
static void free_async_extent_pages(struct async_extent *async_extent)
{
	int i;

	if (!async_extent->pages)
		return;

	for (i = 0; i < async_extent->nr_pages; i++) {
		WARN_ON(async_extent->pages[i]->mapping);
880
		put_page(async_extent->pages[i]);
881 882 883 884
	}
	kfree(async_extent->pages);
	async_extent->nr_pages = 0;
	async_extent->pages = NULL;
885 886
}

887 888 889
static int submit_uncompressed_range(struct btrfs_inode *inode,
				     struct async_extent *async_extent,
				     struct page *locked_page)
890
{
891 892 893 894 895
	u64 start = async_extent->start;
	u64 end = async_extent->start + async_extent->ram_size - 1;
	unsigned long nr_written = 0;
	int page_started = 0;
	int ret;
896

897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915
	/*
	 * Call cow_file_range() to run the delalloc range directly, since we
	 * won't go to NOCOW or async path again.
	 *
	 * Also we call cow_file_range() with @unlock_page == 0, so that we
	 * can directly submit them without interruption.
	 */
	ret = cow_file_range(inode, locked_page, start, end, &page_started,
			     &nr_written, 0);
	/* Inline extent inserted, page gets unlocked and everything is done */
	if (page_started) {
		ret = 0;
		goto out;
	}
	if (ret < 0) {
		if (locked_page)
			unlock_page(locked_page);
		goto out;
	}
916

917 918 919 920 921 922
	ret = extent_write_locked_range(&inode->vfs_inode, start, end);
	/* All pages will be unlocked, including @locked_page */
out:
	kfree(async_extent);
	return ret;
}
923

924 925 926 927
static int submit_one_async_extent(struct btrfs_inode *inode,
				   struct async_chunk *async_chunk,
				   struct async_extent *async_extent,
				   u64 *alloc_hint)
928
{
929 930 931
	struct extent_io_tree *io_tree = &inode->io_tree;
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
932
	struct btrfs_key ins;
933
	struct page *locked_page = NULL;
934
	struct extent_map *em;
935
	int ret = 0;
936 937
	u64 start = async_extent->start;
	u64 end = async_extent->start + async_extent->ram_size - 1;
938

939 940 941 942 943 944 945
	/*
	 * If async_chunk->locked_page is in the async_extent range, we need to
	 * handle it.
	 */
	if (async_chunk->locked_page) {
		u64 locked_page_start = page_offset(async_chunk->locked_page);
		u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;
946

947 948
		if (!(start >= locked_page_end || end <= locked_page_start))
			locked_page = async_chunk->locked_page;
949
	}
950
	lock_extent(io_tree, start, end);
951

952 953 954
	/* We have fall back to uncompressed write */
	if (!async_extent->pages)
		return submit_uncompressed_range(inode, async_extent, locked_page);
955

956 957 958 959 960 961
	ret = btrfs_reserve_extent(root, async_extent->ram_size,
				   async_extent->compressed_size,
				   async_extent->compressed_size,
				   0, *alloc_hint, &ins, 1, 1);
	if (ret) {
		free_async_extent_pages(async_extent);
962
		/*
963 964 965 966 967
		 * Here we used to try again by going back to non-compressed
		 * path for ENOSPC.  But we can't reserve space even for
		 * compressed size, how could it work for uncompressed size
		 * which requires larger size?  So here we directly go error
		 * path.
968
		 */
969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986
		goto out_free;
	}

	/* Here we're doing allocation and writeback of the compressed pages */
	em = create_io_em(inode, start,
			  async_extent->ram_size,	/* len */
			  start,			/* orig_start */
			  ins.objectid,			/* block_start */
			  ins.offset,			/* block_len */
			  ins.offset,			/* orig_block_len */
			  async_extent->ram_size,	/* ram_bytes */
			  async_extent->compress_type,
			  BTRFS_ORDERED_COMPRESSED);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out_free_reserve;
	}
	free_extent_map(em);
987

988 989 990 991 992 993 994 995
	ret = btrfs_add_ordered_extent_compress(inode, start,	/* file_offset */
					ins.objectid,		/* disk_bytenr */
					async_extent->ram_size, /* num_bytes */
					ins.offset,		/* disk_num_bytes */
					async_extent->compress_type);
	if (ret) {
		btrfs_drop_extent_cache(inode, start, end, 0);
		goto out_free_reserve;
996
	}
997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);

	/* Clear dirty, set writeback and unlock the pages. */
	extent_clear_unlock_delalloc(inode, start, end,
			NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
			PAGE_UNLOCK | PAGE_START_WRITEBACK);
	if (btrfs_submit_compressed_write(inode, start,	/* file_offset */
			    async_extent->ram_size,	/* num_bytes */
			    ins.objectid,		/* disk_bytenr */
			    ins.offset,			/* compressed_len */
			    async_extent->pages,	/* compressed_pages */
			    async_extent->nr_pages,
			    async_chunk->write_flags,
			    async_chunk->blkcg_css)) {
		const u64 start = async_extent->start;
		const u64 end = start + async_extent->ram_size - 1;

		btrfs_writepage_endio_finish_ordered(inode, NULL, start, end, 0);

		extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
					     PAGE_END_WRITEBACK | PAGE_SET_ERROR);
		free_async_extent_pages(async_extent);
1019
	}
1020 1021 1022 1023
	*alloc_hint = ins.objectid + ins.offset;
	kfree(async_extent);
	return ret;

1024
out_free_reserve:
1025
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1026
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1027
out_free:
1028
	extent_clear_unlock_delalloc(inode, start, end,
1029
				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
1030
				     EXTENT_DELALLOC_NEW |
1031
				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
1032 1033
				     PAGE_UNLOCK | PAGE_START_WRITEBACK |
				     PAGE_END_WRITEBACK | PAGE_SET_ERROR);
1034
	free_async_extent_pages(async_extent);
1035
	kfree(async_extent);
1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068
	return ret;
}

/*
 * Phase two of compressed writeback.  This is the ordered portion of the code,
 * which only gets called in the order the work was queued.  We walk all the
 * async extents created by compress_file_range and send them down to the disk.
 */
static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
{
	struct btrfs_inode *inode = BTRFS_I(async_chunk->inode);
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct async_extent *async_extent;
	u64 alloc_hint = 0;
	int ret = 0;

	while (!list_empty(&async_chunk->extents)) {
		u64 extent_start;
		u64 ram_size;

		async_extent = list_entry(async_chunk->extents.next,
					  struct async_extent, list);
		list_del(&async_extent->list);
		extent_start = async_extent->start;
		ram_size = async_extent->ram_size;

		ret = submit_one_async_extent(inode, async_chunk, async_extent,
					      &alloc_hint);
		btrfs_debug(fs_info,
"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
			    inode->root->root_key.objectid,
			    btrfs_ino(inode), extent_start, ram_size, ret);
	}
1069 1070
}

1071
static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
1072 1073
				      u64 num_bytes)
{
1074
	struct extent_map_tree *em_tree = &inode->extent_tree;
1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102
	struct extent_map *em;
	u64 alloc_hint = 0;

	read_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, start, num_bytes);
	if (em) {
		/*
		 * if block start isn't an actual block number then find the
		 * first block in this inode and use that as a hint.  If that
		 * block is also bogus then just don't worry about it.
		 */
		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
			free_extent_map(em);
			em = search_extent_mapping(em_tree, 0, 0);
			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
				alloc_hint = em->block_start;
			if (em)
				free_extent_map(em);
		} else {
			alloc_hint = em->block_start;
			free_extent_map(em);
		}
	}
	read_unlock(&em_tree->lock);

	return alloc_hint;
}

1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115
/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the call backs end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
 */
1116
static noinline int cow_file_range(struct btrfs_inode *inode,
1117
				   struct page *locked_page,
1118
				   u64 start, u64 end, int *page_started,
1119
				   unsigned long *nr_written, int unlock)
1120
{
1121 1122
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
1123 1124 1125
	u64 alloc_hint = 0;
	u64 num_bytes;
	unsigned long ram_size;
1126
	u64 cur_alloc_size = 0;
1127
	u64 min_alloc_size;
1128
	u64 blocksize = fs_info->sectorsize;
1129 1130
	struct btrfs_key ins;
	struct extent_map *em;
1131 1132 1133
	unsigned clear_bits;
	unsigned long page_ops;
	bool extent_reserved = false;
1134 1135
	int ret = 0;

1136
	if (btrfs_is_free_space_inode(inode)) {
1137
		WARN_ON_ONCE(1);
1138 1139
		ret = -EINVAL;
		goto out_unlock;
1140
	}
1141

1142
	num_bytes = ALIGN(end - start + 1, blocksize);
1143
	num_bytes = max(blocksize,  num_bytes);
1144
	ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
1145

1146
	inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
C
Chris Mason 已提交
1147

1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158
	/*
	 * Due to the page size limit, for subpage we can only trigger the
	 * writeback for the dirty sectors of page, that means data writeback
	 * is doing more writeback than what we want.
	 *
	 * This is especially unexpected for some call sites like fallocate,
	 * where we only increase i_size after everything is done.
	 * This means we can trigger inline extent even if we didn't want to.
	 * So here we skip inline extent creation completely.
	 */
	if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
1159
		/* lets try to make an inline extent */
1160
		ret = cow_file_range_inline(inode, start, end, 0,
1161
					    BTRFS_COMPRESS_NONE, NULL);
1162
		if (ret == 0) {
J
Josef Bacik 已提交
1163 1164 1165 1166 1167 1168
			/*
			 * We use DO_ACCOUNTING here because we need the
			 * delalloc_release_metadata to be run _after_ we drop
			 * our outstanding extent for clearing delalloc for this
			 * range.
			 */
1169 1170
			extent_clear_unlock_delalloc(inode, start, end,
				     locked_page,
1171
				     EXTENT_LOCKED | EXTENT_DELALLOC |
J
Josef Bacik 已提交
1172 1173
				     EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
				     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1174
				     PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
1175
			*nr_written = *nr_written +
1176
			     (end - start + PAGE_SIZE) / PAGE_SIZE;
1177
			*page_started = 1;
1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190
			/*
			 * locked_page is locked by the caller of
			 * writepage_delalloc(), not locked by
			 * __process_pages_contig().
			 *
			 * We can't let __process_pages_contig() to unlock it,
			 * as it doesn't have any subpage::writers recorded.
			 *
			 * Here we manually unlock the page, since the caller
			 * can't use page_started to determine if it's an
			 * inline extent or a compressed extent.
			 */
			unlock_page(locked_page);
1191
			goto out;
1192 1193
		} else if (ret < 0) {
			goto out_unlock;
1194 1195 1196
		}
	}

1197 1198
	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
1199

1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210
	/*
	 * Relocation relies on the relocated extents to have exactly the same
	 * size as the original extents. Normally writeback for relocation data
	 * extents follows a NOCOW path because relocation preallocates the
	 * extents. However, due to an operation such as scrub turning a block
	 * group to RO mode, it may fallback to COW mode, so we must make sure
	 * an extent allocated during COW has exactly the requested size and can
	 * not be split into smaller extents, otherwise relocation breaks and
	 * fails during the stage where it updates the bytenr of file extent
	 * items.
	 */
1211
	if (btrfs_is_data_reloc_root(root))
1212 1213 1214 1215
		min_alloc_size = num_bytes;
	else
		min_alloc_size = fs_info->sectorsize;

1216 1217
	while (num_bytes > 0) {
		cur_alloc_size = num_bytes;
1218
		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
1219
					   min_alloc_size, 0, alloc_hint,
1220
					   &ins, 1, 1);
1221
		if (ret < 0)
1222
			goto out_unlock;
1223 1224
		cur_alloc_size = ins.offset;
		extent_reserved = true;
C
Chris Mason 已提交
1225

1226
		ram_size = ins.offset;
1227
		em = create_io_em(inode, start, ins.offset, /* len */
1228 1229 1230 1231 1232 1233
				  start, /* orig_start */
				  ins.objectid, /* block_start */
				  ins.offset, /* block_len */
				  ins.offset, /* orig_block_len */
				  ram_size, /* ram_bytes */
				  BTRFS_COMPRESS_NONE, /* compress_type */
1234
				  BTRFS_ORDERED_REGULAR /* type */);
1235 1236
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
1237
			goto out_reserve;
1238
		}
1239
		free_extent_map(em);
1240

1241
		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
1242 1243
					       ram_size, cur_alloc_size,
					       BTRFS_ORDERED_REGULAR);
1244
		if (ret)
1245
			goto out_drop_extent_cache;
C
Chris Mason 已提交
1246

1247
		if (btrfs_is_data_reloc_root(root)) {
1248
			ret = btrfs_reloc_clone_csums(inode, start,
1249
						      cur_alloc_size);
1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260
			/*
			 * Only drop cache here, and process as normal.
			 *
			 * We must not allow extent_clear_unlock_delalloc()
			 * at out_unlock label to free meta of this ordered
			 * extent, as its meta should be freed by
			 * btrfs_finish_ordered_io().
			 *
			 * So we must continue until @start is increased to
			 * skip current ordered extent.
			 */
1261
			if (ret)
1262
				btrfs_drop_extent_cache(inode, start,
1263
						start + ram_size - 1, 0);
1264 1265
		}

1266
		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1267

1268 1269 1270 1271
		/*
		 * We're not doing compressed IO, don't unlock the first page
		 * (which the caller expects to stay locked), don't clear any
		 * dirty bits and don't set any writeback bits
1272
		 *
1273 1274
		 * Do set the Ordered (Private2) bit so we know this page was
		 * properly setup for writepage.
C
Chris Mason 已提交
1275
		 */
1276
		page_ops = unlock ? PAGE_UNLOCK : 0;
1277
		page_ops |= PAGE_SET_ORDERED;
1278

1279
		extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
1280
					     locked_page,
1281
					     EXTENT_LOCKED | EXTENT_DELALLOC,
1282
					     page_ops);
1283 1284
		if (num_bytes < cur_alloc_size)
			num_bytes = 0;
1285
		else
1286
			num_bytes -= cur_alloc_size;
1287 1288
		alloc_hint = ins.objectid + ins.offset;
		start += cur_alloc_size;
1289
		extent_reserved = false;
1290 1291 1292 1293 1294 1295 1296 1297

		/*
		 * btrfs_reloc_clone_csums() error, since start is increased
		 * extent_clear_unlock_delalloc() at out_unlock label won't
		 * free metadata of current ordered extent, we're OK to exit.
		 */
		if (ret)
			goto out_unlock;
1298
	}
1299
out:
1300
	return ret;
1301

1302
out_drop_extent_cache:
1303
	btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
1304
out_reserve:
1305
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1306
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1307
out_unlock:
1308 1309
	clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
		EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
1310
	page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321
	/*
	 * If we reserved an extent for our delalloc range (or a subrange) and
	 * failed to create the respective ordered extent, then it means that
	 * when we reserved the extent we decremented the extent's size from
	 * the data space_info's bytes_may_use counter and incremented the
	 * space_info's bytes_reserved counter by the same amount. We must make
	 * sure extent_clear_unlock_delalloc() does not try to decrement again
	 * the data space_info's bytes_may_use counter, therefore we do not pass
	 * it the flag EXTENT_CLEAR_DATA_RESV.
	 */
	if (extent_reserved) {
1322
		extent_clear_unlock_delalloc(inode, start,
1323
					     start + cur_alloc_size - 1,
1324 1325 1326 1327 1328 1329 1330
					     locked_page,
					     clear_bits,
					     page_ops);
		start += cur_alloc_size;
		if (start >= end)
			goto out;
	}
1331
	extent_clear_unlock_delalloc(inode, start, end, locked_page,
1332 1333
				     clear_bits | EXTENT_CLEAR_DATA_RESV,
				     page_ops);
1334
	goto out;
1335
}
C
Chris Mason 已提交
1336

1337 1338 1339 1340 1341
/*
 * work queue call back to started compression on a file and pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
1342
	struct async_chunk *async_chunk;
1343
	int compressed_extents;
1344

1345
	async_chunk = container_of(work, struct async_chunk, work);
1346

1347 1348
	compressed_extents = compress_file_range(async_chunk);
	if (compressed_extents == 0) {
1349 1350
		btrfs_add_delayed_iput(async_chunk->inode);
		async_chunk->inode = NULL;
1351
	}
1352 1353 1354 1355 1356 1357 1358
}

/*
 * work queue call back to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
1359 1360 1361
	struct async_chunk *async_chunk = container_of(work, struct async_chunk,
						     work);
	struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
1362 1363
	unsigned long nr_pages;

1364
	nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
1365
		PAGE_SHIFT;
1366

1367
	/*
1368
	 * ->inode could be NULL if async_chunk_start has failed to compress,
1369 1370 1371 1372
	 * in which case we don't have anything to submit, yet we need to
	 * always adjust ->async_delalloc_pages as its paired with the init
	 * happening in cow_file_range_async
	 */
1373 1374
	if (async_chunk->inode)
		submit_compressed_extents(async_chunk);
1375 1376 1377 1378 1379

	/* atomic_sub_return implies a barrier */
	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
	    5 * SZ_1M)
		cond_wake_up_nomb(&fs_info->async_submit_wait);
1380
}
C
Chris Mason 已提交
1381

1382 1383
static noinline void async_cow_free(struct btrfs_work *work)
{
1384
	struct async_chunk *async_chunk;
1385
	struct async_cow *async_cow;
1386

1387 1388 1389
	async_chunk = container_of(work, struct async_chunk, work);
	if (async_chunk->inode)
		btrfs_add_delayed_iput(async_chunk->inode);
1390 1391
	if (async_chunk->blkcg_css)
		css_put(async_chunk->blkcg_css);
1392 1393 1394 1395

	async_cow = async_chunk->async_cow;
	if (atomic_dec_and_test(&async_cow->num_chunks))
		kvfree(async_cow);
1396 1397
}

1398
static int cow_file_range_async(struct btrfs_inode *inode,
1399 1400
				struct writeback_control *wbc,
				struct page *locked_page,
1401
				u64 start, u64 end, int *page_started,
1402
				unsigned long *nr_written)
1403
{
1404
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1405
	struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
1406 1407
	struct async_cow *ctx;
	struct async_chunk *async_chunk;
1408 1409
	unsigned long nr_pages;
	u64 cur_end;
1410 1411 1412
	u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
	int i;
	bool should_compress;
1413
	unsigned nofs_flag;
1414
	const unsigned int write_flags = wbc_to_write_flags(wbc);
1415

1416
	unlock_extent(&inode->io_tree, start, end);
1417

1418
	if (inode->flags & BTRFS_INODE_NOCOMPRESS &&
1419 1420 1421 1422 1423 1424 1425
	    !btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
		num_chunks = 1;
		should_compress = false;
	} else {
		should_compress = true;
	}

1426 1427 1428 1429
	nofs_flag = memalloc_nofs_save();
	ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);

1430 1431 1432 1433
	if (!ctx) {
		unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC |
			EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
			EXTENT_DO_ACCOUNTING;
1434 1435
		unsigned long page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK |
					 PAGE_END_WRITEBACK | PAGE_SET_ERROR;
1436

1437 1438
		extent_clear_unlock_delalloc(inode, start, end, locked_page,
					     clear_bits, page_ops);
1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449
		return -ENOMEM;
	}

	async_chunk = ctx->chunks;
	atomic_set(&ctx->num_chunks, num_chunks);

	for (i = 0; i < num_chunks; i++) {
		if (should_compress)
			cur_end = min(end, start + SZ_512K - 1);
		else
			cur_end = end;
1450

1451 1452 1453 1454
		/*
		 * igrab is called higher up in the call chain, take only the
		 * lightweight reference for the callback lifetime
		 */
1455
		ihold(&inode->vfs_inode);
1456
		async_chunk[i].async_cow = ctx;
1457
		async_chunk[i].inode = &inode->vfs_inode;
1458 1459 1460 1461 1462
		async_chunk[i].start = start;
		async_chunk[i].end = cur_end;
		async_chunk[i].write_flags = write_flags;
		INIT_LIST_HEAD(&async_chunk[i].extents);

1463 1464 1465 1466 1467 1468 1469 1470 1471 1472
		/*
		 * The locked_page comes all the way from writepage and its
		 * the original page we were actually given.  As we spread
		 * this large delalloc region across multiple async_chunk
		 * structs, only the first struct needs a pointer to locked_page
		 *
		 * This way we don't need racey decisions about who is supposed
		 * to unlock it.
		 */
		if (locked_page) {
1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483
			/*
			 * Depending on the compressibility, the pages might or
			 * might not go through async.  We want all of them to
			 * be accounted against wbc once.  Let's do it here
			 * before the paths diverge.  wbc accounting is used
			 * only for foreign writeback detection and doesn't
			 * need full accuracy.  Just account the whole thing
			 * against the first page.
			 */
			wbc_account_cgroup_owner(wbc, locked_page,
						 cur_end - start);
1484 1485 1486 1487 1488 1489
			async_chunk[i].locked_page = locked_page;
			locked_page = NULL;
		} else {
			async_chunk[i].locked_page = NULL;
		}

1490 1491 1492 1493 1494 1495 1496
		if (blkcg_css != blkcg_root_css) {
			css_get(blkcg_css);
			async_chunk[i].blkcg_css = blkcg_css;
		} else {
			async_chunk[i].blkcg_css = NULL;
		}

1497 1498
		btrfs_init_work(&async_chunk[i].work, async_cow_start,
				async_cow_submit, async_cow_free);
1499

1500
		nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
1501
		atomic_add(nr_pages, &fs_info->async_delalloc_pages);
1502

1503
		btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
1504 1505 1506 1507 1508 1509

		*nr_written += nr_pages;
		start = cur_end + 1;
	}
	*page_started = 1;
	return 0;
1510 1511
}

1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528
static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
				       struct page *locked_page, u64 start,
				       u64 end, int *page_started,
				       unsigned long *nr_written)
{
	int ret;

	ret = cow_file_range(inode, locked_page, start, end, page_started,
			     nr_written, 0);
	if (ret)
		return ret;

	if (*page_started)
		return 0;

	__set_page_dirty_nobuffers(locked_page);
	account_page_redirty(locked_page);
1529
	extent_write_locked_range(&inode->vfs_inode, start, end);
1530 1531 1532 1533 1534
	*page_started = 1;

	return 0;
}

1535
static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
1536 1537
					u64 bytenr, u64 num_bytes)
{
1538
	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
1539
	struct btrfs_ordered_sum *sums;
1540
	int ret;
1541 1542
	LIST_HEAD(list);

1543
	ret = btrfs_lookup_csums_range(csum_root, bytenr,
A
Arne Jansen 已提交
1544
				       bytenr + num_bytes - 1, &list, 0);
1545 1546 1547 1548 1549 1550 1551 1552
	if (ret == 0 && list_empty(&list))
		return 0;

	while (!list_empty(&list)) {
		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
		list_del(&sums->list);
		kfree(sums);
	}
1553 1554
	if (ret < 0)
		return ret;
1555 1556 1557
	return 1;
}

1558
static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
1559 1560 1561
			   const u64 start, const u64 end,
			   int *page_started, unsigned long *nr_written)
{
1562
	const bool is_space_ino = btrfs_is_free_space_inode(inode);
1563
	const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
1564
	const u64 range_bytes = end + 1 - start;
1565
	struct extent_io_tree *io_tree = &inode->io_tree;
1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591
	u64 range_start = start;
	u64 count;

	/*
	 * If EXTENT_NORESERVE is set it means that when the buffered write was
	 * made we had not enough available data space and therefore we did not
	 * reserve data space for it, since we though we could do NOCOW for the
	 * respective file range (either there is prealloc extent or the inode
	 * has the NOCOW bit set).
	 *
	 * However when we need to fallback to COW mode (because for example the
	 * block group for the corresponding extent was turned to RO mode by a
	 * scrub or relocation) we need to do the following:
	 *
	 * 1) We increment the bytes_may_use counter of the data space info.
	 *    If COW succeeds, it allocates a new data extent and after doing
	 *    that it decrements the space info's bytes_may_use counter and
	 *    increments its bytes_reserved counter by the same amount (we do
	 *    this at btrfs_add_reserved_bytes()). So we need to increment the
	 *    bytes_may_use counter to compensate (when space is reserved at
	 *    buffered write time, the bytes_may_use counter is incremented);
	 *
	 * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
	 *    that if the COW path fails for any reason, it decrements (through
	 *    extent_clear_unlock_delalloc()) the bytes_may_use counter of the
	 *    data space info, which we incremented in the step above.
1592 1593
	 *
	 * If we need to fallback to cow and the inode corresponds to a free
1594 1595 1596
	 * space cache inode or an inode of the data relocation tree, we must
	 * also increment bytes_may_use of the data space_info for the same
	 * reason. Space caches and relocated data extents always get a prealloc
1597
	 * extent for them, however scrub or balance may have set the block
1598 1599
	 * group that contains that extent to RO mode and therefore force COW
	 * when starting writeback.
1600
	 */
1601
	count = count_range_bits(io_tree, &range_start, end, range_bytes,
1602
				 EXTENT_NORESERVE, 0);
1603 1604
	if (count > 0 || is_space_ino || is_reloc_ino) {
		u64 bytes = count;
1605
		struct btrfs_fs_info *fs_info = inode->root->fs_info;
1606 1607
		struct btrfs_space_info *sinfo = fs_info->data_sinfo;

1608 1609 1610
		if (is_space_ino || is_reloc_ino)
			bytes = range_bytes;

1611
		spin_lock(&sinfo->lock);
1612
		btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
1613 1614
		spin_unlock(&sinfo->lock);

1615 1616 1617
		if (count > 0)
			clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
					 0, 0, NULL);
1618 1619
	}

1620 1621
	return cow_file_range(inode, locked_page, start, end, page_started,
			      nr_written, 1);
1622 1623
}

C
Chris Mason 已提交
1624 1625 1626 1627 1628 1629 1630
/*
 * when nowcow writeback call back.  This checks for snapshots or COW copies
 * of the extents that exist in the file, and COWs the file as required.
 *
 * If no cow copies or snapshots exist, we write directly to the existing
 * blocks on disk
 */
1631
static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
1632
				       struct page *locked_page,
1633
				       const u64 start, const u64 end,
1634
				       int *page_started,
1635
				       unsigned long *nr_written)
1636
{
1637 1638
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_root *root = inode->root;
1639
	struct btrfs_path *path;
1640 1641
	u64 cow_start = (u64)-1;
	u64 cur_offset = start;
1642
	int ret;
1643
	bool check_prev = true;
1644 1645
	const bool freespace_inode = btrfs_is_free_space_inode(inode);
	u64 ino = btrfs_ino(inode);
1646 1647
	bool nocow = false;
	u64 disk_bytenr = 0;
1648
	const bool force = inode->flags & BTRFS_INODE_NODATACOW;
1649 1650

	path = btrfs_alloc_path();
1651
	if (!path) {
1652
		extent_clear_unlock_delalloc(inode, start, end, locked_page,
1653
					     EXTENT_LOCKED | EXTENT_DELALLOC |
1654 1655
					     EXTENT_DO_ACCOUNTING |
					     EXTENT_DEFRAG, PAGE_UNLOCK |
1656
					     PAGE_START_WRITEBACK |
1657
					     PAGE_END_WRITEBACK);
1658
		return -ENOMEM;
1659
	}
1660

Y
Yan Zheng 已提交
1661
	while (1) {
1662 1663 1664 1665 1666 1667 1668 1669 1670
		struct btrfs_key found_key;
		struct btrfs_file_extent_item *fi;
		struct extent_buffer *leaf;
		u64 extent_end;
		u64 extent_offset;
		u64 num_bytes = 0;
		u64 disk_num_bytes;
		u64 ram_bytes;
		int extent_type;
1671 1672

		nocow = false;
1673

1674
		ret = btrfs_lookup_file_extent(NULL, root, path, ino,
Y
Yan Zheng 已提交
1675
					       cur_offset, 0);
1676
		if (ret < 0)
1677
			goto error;
1678 1679 1680 1681 1682 1683

		/*
		 * If there is no extent for our range when doing the initial
		 * search, then go back to the previous slot as it will be the
		 * one containing the search offset
		 */
Y
Yan Zheng 已提交
1684 1685 1686 1687
		if (ret > 0 && path->slots[0] > 0 && check_prev) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0] - 1);
L
Li Zefan 已提交
1688
			if (found_key.objectid == ino &&
Y
Yan Zheng 已提交
1689 1690 1691
			    found_key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
1692
		check_prev = false;
Y
Yan Zheng 已提交
1693
next_slot:
1694
		/* Go to next leaf if we have exhausted the current one */
Y
Yan Zheng 已提交
1695 1696 1697
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
1698 1699 1700
			if (ret < 0) {
				if (cow_start != (u64)-1)
					cur_offset = cow_start;
1701
				goto error;
1702
			}
Y
Yan Zheng 已提交
1703 1704 1705 1706
			if (ret > 0)
				break;
			leaf = path->nodes[0];
		}
1707

Y
Yan Zheng 已提交
1708 1709
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

1710
		/* Didn't find anything for our INO */
1711 1712
		if (found_key.objectid > ino)
			break;
1713 1714 1715 1716
		/*
		 * Keep searching until we find an EXTENT_ITEM or there are no
		 * more extents for this inode
		 */
1717 1718 1719 1720 1721
		if (WARN_ON_ONCE(found_key.objectid < ino) ||
		    found_key.type < BTRFS_EXTENT_DATA_KEY) {
			path->slots[0]++;
			goto next_slot;
		}
1722 1723

		/* Found key is not EXTENT_DATA_KEY or starts after req range */
1724
		if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
Y
Yan Zheng 已提交
1725 1726 1727
		    found_key.offset > end)
			break;

1728 1729 1730 1731
		/*
		 * If the found extent starts after requested offset, then
		 * adjust extent_end to be right before this extent begins
		 */
Y
Yan Zheng 已提交
1732 1733
		if (found_key.offset > cur_offset) {
			extent_end = found_key.offset;
1734
			extent_type = 0;
Y
Yan Zheng 已提交
1735 1736 1737
			goto out_check;
		}

1738 1739 1740 1741
		/*
		 * Found extent which begins before our range and potentially
		 * intersect it
		 */
Y
Yan Zheng 已提交
1742 1743 1744 1745
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

J
Josef Bacik 已提交
1746
		ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
Y
Yan Zheng 已提交
1747 1748
		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
Y
Yan Zheng 已提交
1749
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1750
			extent_offset = btrfs_file_extent_offset(leaf, fi);
Y
Yan Zheng 已提交
1751 1752
			extent_end = found_key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
1753 1754
			disk_num_bytes =
				btrfs_file_extent_disk_num_bytes(leaf, fi);
1755
			/*
1756 1757
			 * If the extent we got ends before our current offset,
			 * skip to the next extent.
1758
			 */
1759
			if (extent_end <= cur_offset) {
Y
Yan Zheng 已提交
1760 1761 1762
				path->slots[0]++;
				goto next_slot;
			}
1763
			/* Skip holes */
1764 1765
			if (disk_bytenr == 0)
				goto out_check;
1766
			/* Skip compressed/encrypted/encoded extents */
Y
Yan Zheng 已提交
1767 1768 1769 1770
			if (btrfs_file_extent_compression(leaf, fi) ||
			    btrfs_file_extent_encryption(leaf, fi) ||
			    btrfs_file_extent_other_encoding(leaf, fi))
				goto out_check;
1771
			/*
1772 1773 1774 1775 1776
			 * If extent is created before the last volume's snapshot
			 * this implies the extent is shared, hence we can't do
			 * nocow. This is the same check as in
			 * btrfs_cross_ref_exist but without calling
			 * btrfs_search_slot.
1777
			 */
1778
			if (!freespace_inode &&
1779
			    btrfs_file_extent_generation(leaf, fi) <=
1780 1781
			    btrfs_root_last_snapshot(&root->root_item))
				goto out_check;
Y
Yan Zheng 已提交
1782 1783
			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
				goto out_check;
1784 1785 1786 1787 1788 1789 1790 1791 1792

			/*
			 * The following checks can be expensive, as they need to
			 * take other locks and do btree or rbtree searches, so
			 * release the path to avoid blocking other tasks for too
			 * long.
			 */
			btrfs_release_path(path);

1793 1794
			ret = btrfs_cross_ref_exist(root, ino,
						    found_key.offset -
1795
						    extent_offset, disk_bytenr, false);
1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806
			if (ret) {
				/*
				 * ret could be -EIO if the above fails to read
				 * metadata.
				 */
				if (ret < 0) {
					if (cow_start != (u64)-1)
						cur_offset = cow_start;
					goto error;
				}

1807
				WARN_ON_ONCE(freespace_inode);
1808
				goto out_check;
1809
			}
1810
			disk_bytenr += extent_offset;
1811 1812
			disk_bytenr += cur_offset - found_key.offset;
			num_bytes = min(end + 1, extent_end) - cur_offset;
1813
			/*
1814 1815
			 * If there are pending snapshots for this root, we
			 * fall into common COW way
1816
			 */
1817
			if (!freespace_inode && atomic_read(&root->snapshot_force_cow))
1818
				goto out_check;
1819 1820 1821 1822 1823
			/*
			 * force cow if csum exists in the range.
			 * this ensure that csum for a given extent are
			 * either valid or do not exist.
			 */
1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835
			ret = csum_exist_in_range(fs_info, disk_bytenr,
						  num_bytes);
			if (ret) {
				/*
				 * ret could be -EIO if the above fails to read
				 * metadata.
				 */
				if (ret < 0) {
					if (cow_start != (u64)-1)
						cur_offset = cow_start;
					goto error;
				}
1836
				WARN_ON_ONCE(freespace_inode);
1837
				goto out_check;
1838
			}
1839
			/* If the extent's block group is RO, we must COW */
1840
			if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
1841
				goto out_check;
1842
			nocow = true;
Y
Yan Zheng 已提交
1843
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1844 1845
			extent_end = found_key.offset + ram_bytes;
			extent_end = ALIGN(extent_end, fs_info->sectorsize);
1846 1847 1848 1849 1850
			/* Skip extents outside of our requested range */
			if (extent_end <= start) {
				path->slots[0]++;
				goto next_slot;
			}
Y
Yan Zheng 已提交
1851
		} else {
1852
			/* If this triggers then we have a memory corruption */
1853
			BUG();
Y
Yan Zheng 已提交
1854 1855
		}
out_check:
1856 1857 1858 1859
		/*
		 * If nocow is false then record the beginning of the range
		 * that needs to be COWed
		 */
Y
Yan Zheng 已提交
1860 1861 1862 1863 1864 1865
		if (!nocow) {
			if (cow_start == (u64)-1)
				cow_start = cur_offset;
			cur_offset = extent_end;
			if (cur_offset > end)
				break;
1866 1867
			if (!path->nodes[0])
				continue;
Y
Yan Zheng 已提交
1868 1869
			path->slots[0]++;
			goto next_slot;
1870 1871
		}

1872 1873 1874 1875 1876
		/*
		 * COW range from cow_start to found_key.offset - 1. As the key
		 * will contain the beginning of the first extent that can be
		 * NOCOW, following one which needs to be COW'ed
		 */
Y
Yan Zheng 已提交
1877
		if (cow_start != (u64)-1) {
1878
			ret = fallback_to_cow(inode, locked_page,
1879
					      cow_start, found_key.offset - 1,
1880
					      page_started, nr_written);
1881
			if (ret)
1882
				goto error;
Y
Yan Zheng 已提交
1883
			cow_start = (u64)-1;
1884
		}
Y
Yan Zheng 已提交
1885

Y
Yan Zheng 已提交
1886
		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1887
			u64 orig_start = found_key.offset - extent_offset;
1888
			struct extent_map *em;
1889

1890
			em = create_io_em(inode, cur_offset, num_bytes,
1891 1892 1893 1894 1895 1896 1897 1898 1899
					  orig_start,
					  disk_bytenr, /* block_start */
					  num_bytes, /* block_len */
					  disk_num_bytes, /* orig_block_len */
					  ram_bytes, BTRFS_COMPRESS_NONE,
					  BTRFS_ORDERED_PREALLOC);
			if (IS_ERR(em)) {
				ret = PTR_ERR(em);
				goto error;
Y
Yan Zheng 已提交
1900
			}
1901
			free_extent_map(em);
1902
			ret = btrfs_add_ordered_extent(inode, cur_offset,
1903 1904 1905
						       disk_bytenr, num_bytes,
						       num_bytes,
						       BTRFS_ORDERED_PREALLOC);
1906
			if (ret) {
1907
				btrfs_drop_extent_cache(inode, cur_offset,
1908 1909 1910 1911
							cur_offset + num_bytes - 1,
							0);
				goto error;
			}
Y
Yan Zheng 已提交
1912
		} else {
1913
			ret = btrfs_add_ordered_extent(inode, cur_offset,
1914 1915 1916
						       disk_bytenr, num_bytes,
						       num_bytes,
						       BTRFS_ORDERED_NOCOW);
1917 1918
			if (ret)
				goto error;
Y
Yan Zheng 已提交
1919
		}
Y
Yan Zheng 已提交
1920

1921
		if (nocow)
1922
			btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1923
		nocow = false;
1924

1925
		if (btrfs_is_data_reloc_root(root))
1926 1927 1928 1929 1930
			/*
			 * Error handled later, as we must prevent
			 * extent_clear_unlock_delalloc() in error handler
			 * from freeing metadata of created ordered extent.
			 */
1931
			ret = btrfs_reloc_clone_csums(inode, cur_offset,
1932 1933
						      num_bytes);

1934
		extent_clear_unlock_delalloc(inode, cur_offset,
1935
					     cur_offset + num_bytes - 1,
1936
					     locked_page, EXTENT_LOCKED |
1937 1938
					     EXTENT_DELALLOC |
					     EXTENT_CLEAR_DATA_RESV,
1939
					     PAGE_UNLOCK | PAGE_SET_ORDERED);
1940

Y
Yan Zheng 已提交
1941
		cur_offset = extent_end;
1942 1943 1944 1945 1946 1947 1948 1949

		/*
		 * btrfs_reloc_clone_csums() error, now we're OK to call error
		 * handler, as metadata for created ordered extent will only
		 * be freed by btrfs_finish_ordered_io().
		 */
		if (ret)
			goto error;
Y
Yan Zheng 已提交
1950 1951
		if (cur_offset > end)
			break;
1952
	}
1953
	btrfs_release_path(path);
Y
Yan Zheng 已提交
1954

1955
	if (cur_offset <= end && cow_start == (u64)-1)
Y
Yan Zheng 已提交
1956
		cow_start = cur_offset;
1957

Y
Yan Zheng 已提交
1958
	if (cow_start != (u64)-1) {
1959
		cur_offset = end;
1960 1961
		ret = fallback_to_cow(inode, locked_page, cow_start, end,
				      page_started, nr_written);
1962
		if (ret)
1963
			goto error;
Y
Yan Zheng 已提交
1964 1965
	}

1966
error:
1967 1968 1969
	if (nocow)
		btrfs_dec_nocow_writers(fs_info, disk_bytenr);

1970
	if (ret && cur_offset < end)
1971
		extent_clear_unlock_delalloc(inode, cur_offset, end,
1972
					     locked_page, EXTENT_LOCKED |
1973 1974
					     EXTENT_DELALLOC | EXTENT_DEFRAG |
					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1975
					     PAGE_START_WRITEBACK |
1976
					     PAGE_END_WRITEBACK);
1977
	btrfs_free_path(path);
1978
	return ret;
1979 1980
}

1981
static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
1982
{
1983 1984 1985 1986 1987 1988 1989 1990
	if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
		if (inode->defrag_bytes &&
		    test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG,
				   0, NULL))
			return false;
		return true;
	}
	return false;
1991 1992
}

C
Chris Mason 已提交
1993
/*
1994 1995
 * Function to process delayed allocation (create CoW) for ranges which are
 * being touched for the first time.
C
Chris Mason 已提交
1996
 */
1997
int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
1998 1999
		u64 start, u64 end, int *page_started, unsigned long *nr_written,
		struct writeback_control *wbc)
2000 2001
{
	int ret;
2002
	const bool zoned = btrfs_is_zoned(inode->root->fs_info);
2003

2004 2005 2006 2007 2008 2009 2010
	/*
	 * The range must cover part of the @locked_page, or the returned
	 * @page_started can confuse the caller.
	 */
	ASSERT(!(end <= page_offset(locked_page) ||
		 start >= page_offset(locked_page) + PAGE_SIZE));

2011
	if (should_nocow(inode, start, end)) {
2012 2013 2014 2015 2016 2017 2018 2019 2020
		/*
		 * Normally on a zoned device we're only doing COW writes, but
		 * in case of relocation on a zoned filesystem we have taken
		 * precaution, that we're only writing sequentially. It's safe
		 * to use run_delalloc_nocow() here, like for  regular
		 * preallocated inodes.
		 */
		ASSERT(!zoned ||
		       (zoned && btrfs_is_data_reloc_root(inode->root)));
2021
		ret = run_delalloc_nocow(inode, locked_page, start, end,
2022
					 page_started, nr_written);
2023 2024
	} else if (!inode_can_compress(inode) ||
		   !inode_need_compress(inode, start, end)) {
2025 2026 2027 2028 2029 2030
		if (zoned)
			ret = run_delalloc_zoned(inode, locked_page, start, end,
						 page_started, nr_written);
		else
			ret = cow_file_range(inode, locked_page, start, end,
					     page_started, nr_written, 1);
2031
	} else {
2032 2033
		set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
		ret = cow_file_range_async(inode, wbc, locked_page, start, end,
2034
					   page_started, nr_written);
2035
	}
2036
	ASSERT(ret <= 0);
2037
	if (ret)
2038
		btrfs_cleanup_ordered_extents(inode, locked_page, start,
2039
					      end - start + 1);
2040 2041 2042
	return ret;
}

2043 2044
void btrfs_split_delalloc_extent(struct inode *inode,
				 struct extent_state *orig, u64 split)
J
Josef Bacik 已提交
2045
{
2046 2047
	u64 size;

2048
	/* not delalloc, ignore it */
J
Josef Bacik 已提交
2049
	if (!(orig->state & EXTENT_DELALLOC))
2050
		return;
J
Josef Bacik 已提交
2051

2052 2053
	size = orig->end - orig->start + 1;
	if (size > BTRFS_MAX_EXTENT_SIZE) {
2054
		u32 num_extents;
2055 2056 2057
		u64 new_size;

		/*
2058
		 * See the explanation in btrfs_merge_delalloc_extent, the same
2059
		 * applies here, just in reverse.
2060 2061
		 */
		new_size = orig->end - split + 1;
2062
		num_extents = count_max_extents(new_size);
2063
		new_size = split - orig->start;
2064 2065
		num_extents += count_max_extents(new_size);
		if (count_max_extents(size) >= num_extents)
2066 2067 2068
			return;
	}

2069
	spin_lock(&BTRFS_I(inode)->lock);
J
Josef Bacik 已提交
2070
	btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
2071
	spin_unlock(&BTRFS_I(inode)->lock);
J
Josef Bacik 已提交
2072 2073 2074
}

/*
2075 2076 2077
 * Handle merged delayed allocation extents so we can keep track of new extents
 * that are just merged onto old extents, such as when we are doing sequential
 * writes, so we can properly account for the metadata space we'll need.
J
Josef Bacik 已提交
2078
 */
2079 2080
void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
				 struct extent_state *other)
J
Josef Bacik 已提交
2081
{
2082
	u64 new_size, old_size;
2083
	u32 num_extents;
2084

J
Josef Bacik 已提交
2085 2086
	/* not delalloc, ignore it */
	if (!(other->state & EXTENT_DELALLOC))
2087
		return;
J
Josef Bacik 已提交
2088

J
Josef Bacik 已提交
2089 2090 2091 2092
	if (new->start > other->start)
		new_size = new->end - other->start + 1;
	else
		new_size = other->end - new->start + 1;
2093 2094 2095 2096

	/* we're not bigger than the max, unreserve the space and go */
	if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
		spin_lock(&BTRFS_I(inode)->lock);
J
Josef Bacik 已提交
2097
		btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
2098 2099 2100 2101 2102
		spin_unlock(&BTRFS_I(inode)->lock);
		return;
	}

	/*
2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118
	 * We have to add up either side to figure out how many extents were
	 * accounted for before we merged into one big extent.  If the number of
	 * extents we accounted for is <= the amount we need for the new range
	 * then we can return, otherwise drop.  Think of it like this
	 *
	 * [ 4k][MAX_SIZE]
	 *
	 * So we've grown the extent by a MAX_SIZE extent, this would mean we
	 * need 2 outstanding extents, on one side we have 1 and the other side
	 * we have 1 so they are == and we can return.  But in this case
	 *
	 * [MAX_SIZE+4k][MAX_SIZE+4k]
	 *
	 * Each range on their own accounts for 2 extents, but merged together
	 * they are only 3 extents worth of accounting, so we need to drop in
	 * this case.
2119
	 */
2120
	old_size = other->end - other->start + 1;
2121
	num_extents = count_max_extents(old_size);
2122
	old_size = new->end - new->start + 1;
2123 2124
	num_extents += count_max_extents(old_size);
	if (count_max_extents(new_size) >= num_extents)
2125 2126
		return;

2127
	spin_lock(&BTRFS_I(inode)->lock);
J
Josef Bacik 已提交
2128
	btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
2129
	spin_unlock(&BTRFS_I(inode)->lock);
J
Josef Bacik 已提交
2130 2131
}

2132 2133 2134
static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
				      struct inode *inode)
{
2135 2136
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

2137 2138 2139 2140 2141 2142 2143 2144
	spin_lock(&root->delalloc_lock);
	if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
		list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
			      &root->delalloc_inodes);
		set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
			&BTRFS_I(inode)->runtime_flags);
		root->nr_delalloc_inodes++;
		if (root->nr_delalloc_inodes == 1) {
2145
			spin_lock(&fs_info->delalloc_root_lock);
2146 2147
			BUG_ON(!list_empty(&root->delalloc_root));
			list_add_tail(&root->delalloc_root,
2148 2149
				      &fs_info->delalloc_roots);
			spin_unlock(&fs_info->delalloc_root_lock);
2150 2151 2152 2153 2154
		}
	}
	spin_unlock(&root->delalloc_lock);
}

2155 2156 2157

void __btrfs_del_delalloc_inode(struct btrfs_root *root,
				struct btrfs_inode *inode)
2158
{
2159
	struct btrfs_fs_info *fs_info = root->fs_info;
2160

2161 2162
	if (!list_empty(&inode->delalloc_inodes)) {
		list_del_init(&inode->delalloc_inodes);
2163
		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2164
			  &inode->runtime_flags);
2165 2166
		root->nr_delalloc_inodes--;
		if (!root->nr_delalloc_inodes) {
2167
			ASSERT(list_empty(&root->delalloc_inodes));
2168
			spin_lock(&fs_info->delalloc_root_lock);
2169 2170
			BUG_ON(list_empty(&root->delalloc_root));
			list_del_init(&root->delalloc_root);
2171
			spin_unlock(&fs_info->delalloc_root_lock);
2172 2173
		}
	}
2174 2175 2176 2177 2178 2179 2180
}

static void btrfs_del_delalloc_inode(struct btrfs_root *root,
				     struct btrfs_inode *inode)
{
	spin_lock(&root->delalloc_lock);
	__btrfs_del_delalloc_inode(root, inode);
2181 2182 2183
	spin_unlock(&root->delalloc_lock);
}

C
Chris Mason 已提交
2184
/*
2185 2186
 * Properly track delayed allocation bytes in the inode and to maintain the
 * list of inodes that have pending delalloc work to be done.
C
Chris Mason 已提交
2187
 */
2188 2189
void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
			       unsigned *bits)
2190
{
2191 2192
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

2193 2194
	if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
		WARN_ON(1);
2195 2196
	/*
	 * set_bit and clear bit hooks normally require _irqsave/restore
2197
	 * but in this case, we are only testing for the DELALLOC
2198 2199
	 * bit, which is only set or cleared with irqs on
	 */
2200
	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
2201
		struct btrfs_root *root = BTRFS_I(inode)->root;
2202
		u64 len = state->end + 1 - state->start;
J
Josef Bacik 已提交
2203
		u32 num_extents = count_max_extents(len);
2204
		bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
J
Josef Bacik 已提交
2205

J
Josef Bacik 已提交
2206 2207 2208
		spin_lock(&BTRFS_I(inode)->lock);
		btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents);
		spin_unlock(&BTRFS_I(inode)->lock);
2209

2210
		/* For sanity tests */
2211
		if (btrfs_is_testing(fs_info))
2212 2213
			return;

2214 2215
		percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
					 fs_info->delalloc_batch);
2216
		spin_lock(&BTRFS_I(inode)->lock);
2217
		BTRFS_I(inode)->delalloc_bytes += len;
2218 2219
		if (*bits & EXTENT_DEFRAG)
			BTRFS_I(inode)->defrag_bytes += len;
2220
		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2221 2222
					 &BTRFS_I(inode)->runtime_flags))
			btrfs_add_delalloc_inodes(root, inode);
2223
		spin_unlock(&BTRFS_I(inode)->lock);
2224
	}
2225 2226 2227 2228 2229 2230 2231 2232

	if (!(state->state & EXTENT_DELALLOC_NEW) &&
	    (*bits & EXTENT_DELALLOC_NEW)) {
		spin_lock(&BTRFS_I(inode)->lock);
		BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
			state->start;
		spin_unlock(&BTRFS_I(inode)->lock);
	}
2233 2234
}

C
Chris Mason 已提交
2235
/*
2236 2237
 * Once a range is no longer delalloc this function ensures that proper
 * accounting happens.
C
Chris Mason 已提交
2238
 */
2239 2240
void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
				 struct extent_state *state, unsigned *bits)
2241
{
2242 2243
	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
	struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);
2244
	u64 len = state->end + 1 - state->start;
2245
	u32 num_extents = count_max_extents(len);
2246

2247 2248
	if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
		spin_lock(&inode->lock);
2249
		inode->defrag_bytes -= len;
2250 2251
		spin_unlock(&inode->lock);
	}
2252

2253 2254
	/*
	 * set_bit and clear bit hooks normally require _irqsave/restore
2255
	 * but in this case, we are only testing for the DELALLOC
2256 2257
	 * bit, which is only set or cleared with irqs on
	 */
2258
	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
2259
		struct btrfs_root *root = inode->root;
2260
		bool do_list = !btrfs_is_free_space_inode(inode);
2261

J
Josef Bacik 已提交
2262 2263 2264
		spin_lock(&inode->lock);
		btrfs_mod_outstanding_extents(inode, -num_extents);
		spin_unlock(&inode->lock);
2265

2266 2267
		/*
		 * We don't reserve metadata space for space cache inodes so we
2268
		 * don't need to call delalloc_release_metadata if there is an
2269 2270
		 * error.
		 */
2271
		if (*bits & EXTENT_CLEAR_META_RESV &&
2272
		    root != fs_info->tree_root)
2273
			btrfs_delalloc_release_metadata(inode, len, false);
2274

2275
		/* For sanity tests. */
2276
		if (btrfs_is_testing(fs_info))
2277 2278
			return;

2279
		if (!btrfs_is_data_reloc_root(root) &&
2280 2281
		    do_list && !(state->state & EXTENT_NORESERVE) &&
		    (*bits & EXTENT_CLEAR_DATA_RESV))
2282
			btrfs_free_reserved_data_space_noquota(fs_info, len);
J
Josef Bacik 已提交
2283

2284 2285
		percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
					 fs_info->delalloc_batch);
2286 2287 2288
		spin_lock(&inode->lock);
		inode->delalloc_bytes -= len;
		if (do_list && inode->delalloc_bytes == 0 &&
2289
		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2290
					&inode->runtime_flags))
2291
			btrfs_del_delalloc_inode(root, inode);
2292
		spin_unlock(&inode->lock);
2293
	}
2294 2295 2296 2297 2298 2299

	if ((state->state & EXTENT_DELALLOC_NEW) &&
	    (*bits & EXTENT_DELALLOC_NEW)) {
		spin_lock(&inode->lock);
		ASSERT(inode->new_delalloc_bytes >= len);
		inode->new_delalloc_bytes -= len;
2300 2301
		if (*bits & EXTENT_ADD_INODE_BYTES)
			inode_add_bytes(&inode->vfs_inode, len);
2302 2303
		spin_unlock(&inode->lock);
	}
2304 2305
}

C
Chris Mason 已提交
2306 2307 2308 2309 2310 2311 2312 2313
/*
 * in order to insert checksums into the metadata in large chunks,
 * we wait until bio submission time.   All the pages in the bio are
 * checksummed and sums are attached onto the ordered extent record.
 *
 * At IO completion time the cums attached on the ordered extent record
 * are inserted into the btree
 */
2314
static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
2315
					   u64 dio_file_offset)
2316
{
2317
	return btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
C
Chris Mason 已提交
2318
}
2319

2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362
/*
 * Split an extent_map at [start, start + len]
 *
 * This function is intended to be used only for extract_ordered_extent().
 */
static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len,
			  u64 pre, u64 post)
{
	struct extent_map_tree *em_tree = &inode->extent_tree;
	struct extent_map *em;
	struct extent_map *split_pre = NULL;
	struct extent_map *split_mid = NULL;
	struct extent_map *split_post = NULL;
	int ret = 0;
	unsigned long flags;

	/* Sanity check */
	if (pre == 0 && post == 0)
		return 0;

	split_pre = alloc_extent_map();
	if (pre)
		split_mid = alloc_extent_map();
	if (post)
		split_post = alloc_extent_map();
	if (!split_pre || (pre && !split_mid) || (post && !split_post)) {
		ret = -ENOMEM;
		goto out;
	}

	ASSERT(pre + post < len);

	lock_extent(&inode->io_tree, start, start + len - 1);
	write_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, len);
	if (!em) {
		ret = -EIO;
		goto out_unlock;
	}

	ASSERT(em->len == len);
	ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
	ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
2363 2364 2365
	ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
	ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
	ASSERT(!list_empty(&em->list));
2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381

	flags = em->flags;
	clear_bit(EXTENT_FLAG_PINNED, &em->flags);

	/* First, replace the em with a new extent_map starting from * em->start */
	split_pre->start = em->start;
	split_pre->len = (pre ? pre : em->len - post);
	split_pre->orig_start = split_pre->start;
	split_pre->block_start = em->block_start;
	split_pre->block_len = split_pre->len;
	split_pre->orig_block_len = split_pre->block_len;
	split_pre->ram_bytes = split_pre->len;
	split_pre->flags = flags;
	split_pre->compress_type = em->compress_type;
	split_pre->generation = em->generation;

2382
	replace_extent_mapping(em_tree, em, split_pre, 1);
2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401

	/*
	 * Now we only have an extent_map at:
	 *     [em->start, em->start + pre] if pre != 0
	 *     [em->start, em->start + em->len - post] if pre == 0
	 */

	if (pre) {
		/* Insert the middle extent_map */
		split_mid->start = em->start + pre;
		split_mid->len = em->len - pre - post;
		split_mid->orig_start = split_mid->start;
		split_mid->block_start = em->block_start + pre;
		split_mid->block_len = split_mid->len;
		split_mid->orig_block_len = split_mid->block_len;
		split_mid->ram_bytes = split_mid->len;
		split_mid->flags = flags;
		split_mid->compress_type = em->compress_type;
		split_mid->generation = em->generation;
2402
		add_extent_mapping(em_tree, split_mid, 1);
2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415
	}

	if (post) {
		split_post->start = em->start + em->len - post;
		split_post->len = post;
		split_post->orig_start = split_post->start;
		split_post->block_start = em->block_start + em->len - post;
		split_post->block_len = split_post->len;
		split_post->orig_block_len = split_post->block_len;
		split_post->ram_bytes = split_post->len;
		split_post->flags = flags;
		split_post->compress_type = em->compress_type;
		split_post->generation = em->generation;
2416
		add_extent_mapping(em_tree, split_post, 1);
2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434
	}

	/* Once for us */
	free_extent_map(em);
	/* Once for the tree */
	free_extent_map(em);

out_unlock:
	write_unlock(&em_tree->lock);
	unlock_extent(&inode->io_tree, start, start + len - 1);
out:
	free_extent_map(split_pre);
	free_extent_map(split_mid);
	free_extent_map(split_post);

	return ret;
}

2435 2436 2437 2438 2439
static blk_status_t extract_ordered_extent(struct btrfs_inode *inode,
					   struct bio *bio, loff_t file_offset)
{
	struct btrfs_ordered_extent *ordered;
	u64 start = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT;
2440
	u64 file_len;
2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479
	u64 len = bio->bi_iter.bi_size;
	u64 end = start + len;
	u64 ordered_end;
	u64 pre, post;
	int ret = 0;

	ordered = btrfs_lookup_ordered_extent(inode, file_offset);
	if (WARN_ON_ONCE(!ordered))
		return BLK_STS_IOERR;

	/* No need to split */
	if (ordered->disk_num_bytes == len)
		goto out;

	/* We cannot split once end_bio'd ordered extent */
	if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) {
		ret = -EINVAL;
		goto out;
	}

	/* We cannot split a compressed ordered extent */
	if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) {
		ret = -EINVAL;
		goto out;
	}

	ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes;
	/* bio must be in one ordered extent */
	if (WARN_ON_ONCE(start < ordered->disk_bytenr || end > ordered_end)) {
		ret = -EINVAL;
		goto out;
	}

	/* Checksum list should be empty */
	if (WARN_ON_ONCE(!list_empty(&ordered->list))) {
		ret = -EINVAL;
		goto out;
	}

2480
	file_len = ordered->num_bytes;
2481 2482 2483 2484 2485 2486
	pre = start - ordered->disk_bytenr;
	post = ordered_end - end;

	ret = btrfs_split_ordered_extent(ordered, pre, post);
	if (ret)
		goto out;
2487
	ret = split_zoned_em(inode, file_offset, file_len, pre, post);
2488 2489 2490 2491 2492 2493 2494

out:
	btrfs_put_ordered_extent(ordered);

	return errno_to_blk_status(ret);
}

C
Chris Mason 已提交
2495
/*
2496
 * extent_io.c submission hook. This does the right thing for csum calculation
2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511
 * on write, or reading the csums from the tree before a read.
 *
 * Rules about async/sync submit,
 * a) read:				sync submit
 *
 * b) write without checksum:		sync submit
 *
 * c) write with checksum:
 *    c-1) if bio is issued by fsync:	sync submit
 *         (sync_writers != 0)
 *
 *    c-2) if root is reloc root:	sync submit
 *         (only in case of buffered IO)
 *
 *    c-3) otherwise:			async submit
C
Chris Mason 已提交
2512
 */
2513 2514
blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
				   int mirror_num, unsigned long bio_flags)
2515

2516
{
2517
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2518
	struct btrfs_root *root = BTRFS_I(inode)->root;
2519
	enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
2520
	blk_status_t ret = 0;
2521
	int skip_sum;
2522
	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
2523

2524
	skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
2525
		test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
2526

2527
	if (btrfs_is_free_space_inode(BTRFS_I(inode)))
2528
		metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
2529

2530 2531 2532 2533 2534 2535 2536 2537 2538
	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		struct page *page = bio_first_bvec_all(bio)->bv_page;
		loff_t file_offset = page_offset(page);

		ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset);
		if (ret)
			goto out;
	}

2539
	if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
2540
		ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
2541
		if (ret)
2542
			goto out;
2543

2544
		if (bio_flags & EXTENT_BIO_COMPRESSED) {
2545 2546 2547 2548
			ret = btrfs_submit_compressed_read(inode, bio,
							   mirror_num,
							   bio_flags);
			goto out;
2549 2550 2551 2552 2553 2554
		} else {
			/*
			 * Lookup bio sums does extra checks around whether we
			 * need to csum or not, which is why we ignore skip_sum
			 * here.
			 */
2555
			ret = btrfs_lookup_bio_sums(inode, bio, NULL);
2556
			if (ret)
2557
				goto out;
2558
		}
2559
		goto mapit;
2560
	} else if (async && !skip_sum) {
2561
		/* csum items have already been cloned */
2562
		if (btrfs_is_data_reloc_root(root))
2563
			goto mapit;
2564
		/* we're doing a write, do the async checksumming */
2565 2566
		ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags,
					  0, btrfs_submit_bio_start);
2567
		goto out;
2568
	} else if (!skip_sum) {
2569
		ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, 0, 0);
2570 2571
		if (ret)
			goto out;
2572 2573
	}

2574
mapit:
2575
	ret = btrfs_map_bio(fs_info, bio, mirror_num);
2576 2577

out:
2578 2579
	if (ret) {
		bio->bi_status = ret;
2580 2581
		bio_endio(bio);
	}
2582
	return ret;
2583
}
C
Chris Mason 已提交
2584

C
Chris Mason 已提交
2585 2586 2587 2588
/*
 * given a list of ordered sums record them in the inode.  This happens
 * at IO completion time based on sums calculated at bio submission time.
 */
2589 2590
static int add_pending_csums(struct btrfs_trans_handle *trans,
			     struct list_head *list)
2591 2592
{
	struct btrfs_ordered_sum *sum;
2593
	struct btrfs_root *csum_root = NULL;
2594
	int ret;
2595

Q
Qinghuang Feng 已提交
2596
	list_for_each_entry(sum, list, list) {
2597
		trans->adding_csums = true;
2598 2599 2600 2601
		if (!csum_root)
			csum_root = btrfs_csum_root(trans->fs_info,
						    sum->bytenr);
		ret = btrfs_csum_file_blocks(trans, csum_root, sum);
2602
		trans->adding_csums = false;
2603 2604
		if (ret)
			return ret;
2605 2606 2607 2608
	}
	return 0;
}

2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637
static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
					 const u64 start,
					 const u64 len,
					 struct extent_state **cached_state)
{
	u64 search_start = start;
	const u64 end = start + len - 1;

	while (search_start < end) {
		const u64 search_len = end - search_start + 1;
		struct extent_map *em;
		u64 em_len;
		int ret = 0;

		em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
		if (IS_ERR(em))
			return PTR_ERR(em);

		if (em->block_start != EXTENT_MAP_HOLE)
			goto next;

		em_len = em->len;
		if (em->start < search_start)
			em_len -= search_start - em->start;
		if (em_len > search_len)
			em_len = search_len;

		ret = set_extent_bit(&inode->io_tree, search_start,
				     search_start + em_len - 1,
2638 2639
				     EXTENT_DELALLOC_NEW, 0, NULL, cached_state,
				     GFP_NOFS, NULL);
2640 2641 2642 2643 2644 2645 2646 2647 2648
next:
		search_start = extent_map_end(em);
		free_extent_map(em);
		if (ret)
			return ret;
	}
	return 0;
}

2649
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
2650
			      unsigned int extra_bits,
2651
			      struct extent_state **cached_state)
2652
{
2653
	WARN_ON(PAGE_ALIGNED(end));
2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671

	if (start >= i_size_read(&inode->vfs_inode) &&
	    !(inode->flags & BTRFS_INODE_PREALLOC)) {
		/*
		 * There can't be any extents following eof in this case so just
		 * set the delalloc new bit for the range directly.
		 */
		extra_bits |= EXTENT_DELALLOC_NEW;
	} else {
		int ret;

		ret = btrfs_find_new_delalloc_bytes(inode, start,
						    end + 1 - start,
						    cached_state);
		if (ret)
			return ret;
	}

2672 2673
	return set_extent_delalloc(&inode->io_tree, start, end, extra_bits,
				   cached_state);
2674 2675
}

C
Chris Mason 已提交
2676
/* see btrfs_writepage_start_hook for details on why this is required */
2677 2678
struct btrfs_writepage_fixup {
	struct page *page;
2679
	struct inode *inode;
2680 2681 2682
	struct btrfs_work work;
};

2683
static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2684 2685 2686
{
	struct btrfs_writepage_fixup *fixup;
	struct btrfs_ordered_extent *ordered;
2687
	struct extent_state *cached_state = NULL;
2688
	struct extent_changeset *data_reserved = NULL;
2689
	struct page *page;
2690
	struct btrfs_inode *inode;
2691 2692
	u64 page_start;
	u64 page_end;
2693
	int ret = 0;
2694
	bool free_delalloc_space = true;
2695 2696 2697

	fixup = container_of(work, struct btrfs_writepage_fixup, work);
	page = fixup->page;
2698
	inode = BTRFS_I(fixup->inode);
2699 2700 2701 2702 2703 2704 2705
	page_start = page_offset(page);
	page_end = page_offset(page) + PAGE_SIZE - 1;

	/*
	 * This is similar to page_mkwrite, we need to reserve the space before
	 * we take the page lock.
	 */
2706 2707
	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
					   PAGE_SIZE);
C
Chris Mason 已提交
2708
again:
2709
	lock_page(page);
2710 2711 2712 2713 2714 2715

	/*
	 * Before we queued this fixup, we took a reference on the page.
	 * page->mapping may go NULL, but it shouldn't be moved to a different
	 * address space.
	 */
2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734
	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
		/*
		 * Unfortunately this is a little tricky, either
		 *
		 * 1) We got here and our page had already been dealt with and
		 *    we reserved our space, thus ret == 0, so we need to just
		 *    drop our space reservation and bail.  This can happen the
		 *    first time we come into the fixup worker, or could happen
		 *    while waiting for the ordered extent.
		 * 2) Our page was already dealt with, but we happened to get an
		 *    ENOSPC above from the btrfs_delalloc_reserve_space.  In
		 *    this case we obviously don't have anything to release, but
		 *    because the page was already dealt with we don't want to
		 *    mark the page with an error, so make sure we're resetting
		 *    ret to 0.  This is why we have this check _before_ the ret
		 *    check, because we do not want to have a surprise ENOSPC
		 *    when the page was already properly dealt with.
		 */
		if (!ret) {
2735 2736
			btrfs_delalloc_release_extents(inode, PAGE_SIZE);
			btrfs_delalloc_release_space(inode, data_reserved,
2737 2738 2739 2740
						     page_start, PAGE_SIZE,
						     true);
		}
		ret = 0;
2741
		goto out_page;
2742
	}
2743

2744
	/*
2745 2746
	 * We can't mess with the page state unless it is locked, so now that
	 * it is locked bail if we failed to make our space reservation.
2747
	 */
2748 2749
	if (ret)
		goto out_page;
2750

2751
	lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
C
Chris Mason 已提交
2752 2753

	/* already ordered? We're done */
2754
	if (PageOrdered(page))
2755
		goto out_reserved;
C
Chris Mason 已提交
2756

2757
	ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
C
Chris Mason 已提交
2758
	if (ordered) {
2759 2760
		unlock_extent_cached(&inode->io_tree, page_start, page_end,
				     &cached_state);
C
Chris Mason 已提交
2761
		unlock_page(page);
2762
		btrfs_start_ordered_extent(ordered, 1);
2763
		btrfs_put_ordered_extent(ordered);
C
Chris Mason 已提交
2764 2765
		goto again;
	}
2766

2767
	ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
2768
					&cached_state);
2769
	if (ret)
2770
		goto out_reserved;
2771

2772 2773 2774 2775 2776 2777 2778 2779
	/*
	 * Everything went as planned, we're now the owner of a dirty page with
	 * delayed allocation bits set and space reserved for our COW
	 * destination.
	 *
	 * The page was dirty when we started, nothing should have cleaned it.
	 */
	BUG_ON(!PageDirty(page));
2780
	free_delalloc_space = false;
2781
out_reserved:
2782
	btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2783
	if (free_delalloc_space)
2784 2785 2786
		btrfs_delalloc_release_space(inode, data_reserved, page_start,
					     PAGE_SIZE, true);
	unlock_extent_cached(&inode->io_tree, page_start, page_end,
2787
			     &cached_state);
2788
out_page:
2789 2790 2791 2792 2793 2794 2795 2796 2797 2798
	if (ret) {
		/*
		 * We hit ENOSPC or other errors.  Update the mapping and page
		 * to reflect the errors and clean the page.
		 */
		mapping_set_error(page->mapping, ret);
		end_extent_writepage(page, ret, page_start, page_end);
		clear_page_dirty_for_io(page);
		SetPageError(page);
	}
2799
	btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE);
2800
	unlock_page(page);
2801
	put_page(page);
2802
	kfree(fixup);
2803
	extent_changeset_free(data_reserved);
2804 2805 2806 2807 2808
	/*
	 * As a precaution, do a delayed iput in case it would be the last iput
	 * that could need flushing space. Recursing back to fixup worker would
	 * deadlock.
	 */
2809
	btrfs_add_delayed_iput(&inode->vfs_inode);
2810 2811 2812 2813 2814 2815 2816 2817
}

/*
 * There are a few paths in the higher layers of the kernel that directly
 * set the page dirty bit without asking the filesystem if it is a
 * good idea.  This causes problems because we want to make sure COW
 * properly happens and the data=ordered rules are followed.
 *
C
Chris Mason 已提交
2818
 * In our case any range that doesn't have the ORDERED bit set
2819 2820 2821 2822
 * hasn't been properly setup for IO.  We kick off an async process
 * to fix it up.  The async helper will wait for ordered extents, set
 * the delalloc bit and make it safe to write the page.
 */
2823
int btrfs_writepage_cow_fixup(struct page *page)
2824 2825
{
	struct inode *inode = page->mapping->host;
2826
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2827 2828
	struct btrfs_writepage_fixup *fixup;

2829 2830
	/* This page has ordered extent covering it already */
	if (PageOrdered(page))
2831 2832
		return 0;

2833 2834 2835 2836 2837 2838 2839
	/*
	 * PageChecked is set below when we create a fixup worker for this page,
	 * don't try to create another one if we're already PageChecked()
	 *
	 * The extent_io writepage code will redirty the page if we send back
	 * EAGAIN.
	 */
2840 2841 2842 2843 2844 2845
	if (PageChecked(page))
		return -EAGAIN;

	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
	if (!fixup)
		return -EAGAIN;
2846

2847 2848 2849 2850 2851 2852 2853
	/*
	 * We are already holding a reference to this inode from
	 * write_cache_pages.  We need to hold it because the space reservation
	 * takes place outside of the page lock, and we can't trust
	 * page->mapping outside of the page lock.
	 */
	ihold(inode);
2854
	btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
2855
	get_page(page);
2856
	btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
2857
	fixup->page = page;
2858
	fixup->inode = inode;
2859
	btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2860 2861

	return -EAGAIN;
2862 2863
}

Y
Yan Zheng 已提交
2864
static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2865
				       struct btrfs_inode *inode, u64 file_pos,
2866
				       struct btrfs_file_extent_item *stack_fi,
2867
				       const bool update_inode_bytes,
2868
				       u64 qgroup_reserved)
Y
Yan Zheng 已提交
2869
{
2870
	struct btrfs_root *root = inode->root;
2871
	const u64 sectorsize = root->fs_info->sectorsize;
Y
Yan Zheng 已提交
2872 2873 2874
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key ins;
2875 2876 2877 2878
	u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
	u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
	u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
	u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
2879
	struct btrfs_drop_extents_args drop_args = { 0 };
Y
Yan Zheng 已提交
2880 2881 2882
	int ret;

	path = btrfs_alloc_path();
2883 2884
	if (!path)
		return -ENOMEM;
Y
Yan Zheng 已提交
2885

C
Chris Mason 已提交
2886 2887 2888 2889 2890 2891 2892 2893 2894
	/*
	 * we may be replacing one extent in the tree with another.
	 * The new extent is pinned in the extent map, and we don't want
	 * to drop it from the cache until it is completely in the btree.
	 *
	 * So, tell btrfs_drop_extents to leave this extent in the cache.
	 * the caller is expected to unpin it and allow it to be merged
	 * with the others.
	 */
2895 2896 2897 2898 2899 2900
	drop_args.path = path;
	drop_args.start = file_pos;
	drop_args.end = file_pos + num_bytes;
	drop_args.replace_extent = true;
	drop_args.extent_item_size = sizeof(*stack_fi);
	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2901 2902
	if (ret)
		goto out;
Y
Yan Zheng 已提交
2903

2904
	if (!drop_args.extent_inserted) {
2905
		ins.objectid = btrfs_ino(inode);
2906 2907 2908 2909
		ins.offset = file_pos;
		ins.type = BTRFS_EXTENT_DATA_KEY;

		ret = btrfs_insert_empty_item(trans, root, path, &ins,
2910
					      sizeof(*stack_fi));
2911 2912 2913
		if (ret)
			goto out;
	}
Y
Yan Zheng 已提交
2914
	leaf = path->nodes[0];
2915 2916 2917 2918
	btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
	write_extent_buffer(leaf, stack_fi,
			btrfs_item_ptr_offset(leaf, path->slots[0]),
			sizeof(struct btrfs_file_extent_item));
2919

Y
Yan Zheng 已提交
2920
	btrfs_mark_buffer_dirty(leaf);
2921
	btrfs_release_path(path);
Y
Yan Zheng 已提交
2922

2923 2924 2925
	/*
	 * If we dropped an inline extent here, we know the range where it is
	 * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
D
David Sterba 已提交
2926
	 * number of bytes only for that range containing the inline extent.
2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940
	 * The remaining of the range will be processed when clearning the
	 * EXTENT_DELALLOC_BIT bit through the ordered extent completion.
	 */
	if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
		u64 inline_size = round_down(drop_args.bytes_found, sectorsize);

		inline_size = drop_args.bytes_found - inline_size;
		btrfs_update_inode_bytes(inode, sectorsize, inline_size);
		drop_args.bytes_found -= inline_size;
		num_bytes -= sectorsize;
	}

	if (update_inode_bytes)
		btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
Y
Yan Zheng 已提交
2941 2942 2943 2944

	ins.objectid = disk_bytenr;
	ins.offset = disk_num_bytes;
	ins.type = BTRFS_EXTENT_ITEM_KEY;
2945

2946
	ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
2947 2948 2949
	if (ret)
		goto out;

2950
	ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
2951
					       file_pos, qgroup_reserved, &ins);
2952
out:
Y
Yan Zheng 已提交
2953
	btrfs_free_path(path);
2954

2955
	return ret;
Y
Yan Zheng 已提交
2956 2957
}

2958
static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
2959 2960
					 u64 start, u64 len)
{
2961
	struct btrfs_block_group *cache;
2962

2963
	cache = btrfs_lookup_block_group(fs_info, start);
2964 2965 2966 2967 2968 2969 2970 2971 2972
	ASSERT(cache);

	spin_lock(&cache->lock);
	cache->delalloc_bytes -= len;
	spin_unlock(&cache->lock);

	btrfs_put_block_group(cache);
}

2973 2974 2975 2976 2977
static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
					     struct btrfs_ordered_extent *oe)
{
	struct btrfs_file_extent_item stack_fi;
	u64 logical_len;
2978
	bool update_inode_bytes;
2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993

	memset(&stack_fi, 0, sizeof(stack_fi));
	btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
	btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
	btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
						   oe->disk_num_bytes);
	if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
		logical_len = oe->truncated_len;
	else
		logical_len = oe->num_bytes;
	btrfs_set_stack_file_extent_num_bytes(&stack_fi, logical_len);
	btrfs_set_stack_file_extent_ram_bytes(&stack_fi, logical_len);
	btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
	/* Encryption and other encoding is reserved and all 0 */

2994 2995 2996 2997 2998 2999 3000 3001 3002
	/*
	 * For delalloc, when completing an ordered extent we update the inode's
	 * bytes when clearing the range in the inode's io tree, so pass false
	 * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
	 * except if the ordered extent was truncated.
	 */
	update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
			     test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);

3003 3004
	return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
					   oe->file_offset, &stack_fi,
3005
					   update_inode_bytes, oe->qgroup_rsv);
3006 3007 3008 3009
}

/*
 * As ordered data IO finishes, this gets called so we can finish
C
Chris Mason 已提交
3010 3011 3012
 * an ordered extent if the range of bytes in the file it covers are
 * fully written.
 */
3013
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
3014
{
3015 3016 3017
	struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
3018
	struct btrfs_trans_handle *trans = NULL;
3019
	struct extent_io_tree *io_tree = &inode->io_tree;
3020
	struct extent_state *cached_state = NULL;
3021
	u64 start, end;
3022
	int compress_type = 0;
3023
	int ret = 0;
3024
	u64 logical_len = ordered_extent->num_bytes;
3025
	bool freespace_inode;
3026
	bool truncated = false;
3027
	bool clear_reserved_extent = true;
3028
	unsigned int clear_bits = EXTENT_DEFRAG;
3029

3030 3031 3032
	start = ordered_extent->file_offset;
	end = start + ordered_extent->num_bytes - 1;

3033 3034 3035
	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
	    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
	    !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
3036
		clear_bits |= EXTENT_DELALLOC_NEW;
3037

3038
	freespace_inode = btrfs_is_free_space_inode(inode);
J
Josef Bacik 已提交
3039

3040 3041 3042 3043 3044
	if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
		ret = -EIO;
		goto out;
	}

3045 3046
	/* A valid bdev implies a write on a sequential zone */
	if (ordered_extent->bdev) {
3047
		btrfs_rewrite_logical_zoned(ordered_extent);
3048 3049 3050
		btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
					ordered_extent->disk_num_bytes);
	}
3051

3052
	btrfs_free_io_failure_record(inode, start, end);
3053

3054 3055 3056 3057 3058 3059 3060 3061
	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
		truncated = true;
		logical_len = ordered_extent->truncated_len;
		/* Truncated the entire extent, don't bother adding */
		if (!logical_len)
			goto out;
	}

3062
	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
3063
		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
3064

3065
		btrfs_inode_safe_disk_i_size_write(inode, 0);
3066 3067
		if (freespace_inode)
			trans = btrfs_join_transaction_spacecache(root);
3068 3069 3070 3071 3072 3073
		else
			trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			goto out;
3074
		}
3075
		trans->block_rsv = &inode->block_rsv;
3076
		ret = btrfs_update_inode_fallback(trans, root, inode);
3077
		if (ret) /* -ENOMEM or corruption */
3078
			btrfs_abort_transaction(trans, ret);
3079 3080
		goto out;
	}
3081

3082
	clear_bits |= EXTENT_LOCKED;
3083
	lock_extent_bits(io_tree, start, end, &cached_state);
3084

3085 3086
	if (freespace_inode)
		trans = btrfs_join_transaction_spacecache(root);
J
Josef Bacik 已提交
3087
	else
3088
		trans = btrfs_join_transaction(root);
3089 3090 3091
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
3092
		goto out;
3093
	}
C
Chris Mason 已提交
3094

3095
	trans->block_rsv = &inode->block_rsv;
3096

C
Chris Mason 已提交
3097
	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3098
		compress_type = ordered_extent->compress_type;
Y
Yan Zheng 已提交
3099
	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3100
		BUG_ON(compress_type);
3101
		ret = btrfs_mark_extent_written(trans, inode,
Y
Yan Zheng 已提交
3102 3103
						ordered_extent->file_offset,
						ordered_extent->file_offset +
3104
						logical_len);
Y
Yan Zheng 已提交
3105
	} else {
3106
		BUG_ON(root == fs_info->tree_root);
3107
		ret = insert_ordered_extent_file_extent(trans, ordered_extent);
3108 3109
		if (!ret) {
			clear_reserved_extent = false;
3110
			btrfs_release_delalloc_bytes(fs_info,
3111 3112
						ordered_extent->disk_bytenr,
						ordered_extent->disk_num_bytes);
3113
		}
Y
Yan Zheng 已提交
3114
	}
3115
	unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
3116
			   ordered_extent->num_bytes, trans->transid);
3117
	if (ret < 0) {
3118
		btrfs_abort_transaction(trans, ret);
3119
		goto out;
3120
	}
3121

3122
	ret = add_pending_csums(trans, &ordered_extent->list);
3123 3124 3125 3126
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
3127

3128 3129 3130 3131 3132 3133 3134
	/*
	 * If this is a new delalloc range, clear its new delalloc flag to
	 * update the inode's number of bytes. This needs to be done first
	 * before updating the inode item.
	 */
	if ((clear_bits & EXTENT_DELALLOC_NEW) &&
	    !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
3135
		clear_extent_bit(&inode->io_tree, start, end,
3136 3137 3138
				 EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
				 0, 0, &cached_state);

3139
	btrfs_inode_safe_disk_i_size_write(inode, 0);
3140
	ret = btrfs_update_inode_fallback(trans, root, inode);
3141
	if (ret) { /* -ENOMEM or corruption */
3142
		btrfs_abort_transaction(trans, ret);
3143
		goto out;
3144 3145
	}
	ret = 0;
3146
out:
3147
	clear_extent_bit(&inode->io_tree, start, end, clear_bits,
3148
			 (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0,
3149
			 &cached_state);
3150

3151
	if (trans)
3152
		btrfs_end_transaction(trans);
J
Josef Bacik 已提交
3153

3154
	if (ret || truncated) {
3155
		u64 unwritten_start = start;
3156

3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168
		/*
		 * If we failed to finish this ordered extent for any reason we
		 * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
		 * extent, and mark the inode with the error if it wasn't
		 * already set.  Any error during writeback would have already
		 * set the mapping error, so we need to set it if we're the ones
		 * marking this ordered extent as failed.
		 */
		if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
					     &ordered_extent->flags))
			mapping_set_error(ordered_extent->inode->i_mapping, -EIO);

3169
		if (truncated)
3170 3171
			unwritten_start += logical_len;
		clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
3172 3173

		/* Drop the cache for the part of the extent we didn't write. */
3174
		btrfs_drop_extent_cache(inode, unwritten_start, end, 0);
3175

3176 3177 3178
		/*
		 * If the ordered extent had an IOERR or something else went
		 * wrong we need to return the space for this ordered extent
3179 3180
		 * back to the allocator.  We only free the extent in the
		 * truncated case if we didn't write out the extent at all.
3181 3182 3183 3184
		 *
		 * If we made it past insert_reserved_file_extent before we
		 * errored out then we don't need to do this as the accounting
		 * has already been done.
3185
		 */
3186
		if ((ret || !logical_len) &&
3187
		    clear_reserved_extent &&
3188
		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3189 3190 3191 3192 3193
		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
			/*
			 * Discard the range before returning it back to the
			 * free space pool
			 */
3194
			if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
3195
				btrfs_discard_extent(fs_info,
3196 3197 3198
						ordered_extent->disk_bytenr,
						ordered_extent->disk_num_bytes,
						NULL);
3199
			btrfs_free_reserved_extent(fs_info,
3200 3201
					ordered_extent->disk_bytenr,
					ordered_extent->disk_num_bytes, 1);
3202
		}
3203 3204
	}

3205
	/*
3206 3207
	 * This needs to be done to make sure anybody waiting knows we are done
	 * updating everything for this ordered extent.
3208
	 */
3209
	btrfs_remove_ordered_extent(inode, ordered_extent);
3210

3211 3212 3213 3214 3215
	/* once for us */
	btrfs_put_ordered_extent(ordered_extent);
	/* once for the tree */
	btrfs_put_ordered_extent(ordered_extent);

3216 3217 3218 3219 3220 3221 3222 3223
	return ret;
}

static void finish_ordered_fn(struct btrfs_work *work)
{
	struct btrfs_ordered_extent *ordered_extent;
	ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
	btrfs_finish_ordered_io(ordered_extent);
3224 3225
}

3226 3227
void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
					  struct page *page, u64 start,
3228
					  u64 end, bool uptodate)
3229
{
3230
	trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
3231

3232 3233
	btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start,
				       finish_ordered_fn, uptodate);
3234 3235
}

3236 3237
/*
 * check_data_csum - verify checksum of one sector of uncompressed data
3238
 * @inode:	inode
3239
 * @io_bio:	btrfs_io_bio which contains the csum
3240
 * @bio_offset:	offset to the beginning of the bio (in bytes)
3241 3242
 * @page:	page where is the data to be verified
 * @pgoff:	offset inside the page
3243
 * @start:	logical offset in the file
3244 3245 3246
 *
 * The length of such check is always one sector size.
 */
3247
static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
3248 3249
			   u32 bio_offset, struct page *page, u32 pgoff,
			   u64 start)
3250
{
3251 3252
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
3253
	char *kaddr;
3254
	u32 len = fs_info->sectorsize;
3255
	const u32 csum_size = fs_info->csum_size;
3256
	unsigned int offset_sectors;
3257 3258
	u8 *csum_expected;
	u8 csum[BTRFS_CSUM_SIZE];
3259

3260 3261
	ASSERT(pgoff + len <= PAGE_SIZE);

3262
	offset_sectors = bio_offset >> fs_info->sectorsize_bits;
3263
	csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size;
3264 3265

	kaddr = kmap_atomic(page);
3266 3267
	shash->tfm = fs_info->csum_shash;

3268
	crypto_shash_digest(shash, kaddr + pgoff, len, csum);
3269 3270

	if (memcmp(csum, csum_expected, csum_size))
3271 3272 3273 3274 3275
		goto zeroit;

	kunmap_atomic(kaddr);
	return 0;
zeroit:
3276
	btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
3277 3278 3279
				    bbio->mirror_num);
	if (bbio->device)
		btrfs_dev_stat_inc_and_print(bbio->device,
3280
					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
3281 3282 3283 3284 3285 3286
	memset(kaddr + pgoff, 1, len);
	flush_dcache_page(page);
	kunmap_atomic(kaddr);
	return -EIO;
}

C
Chris Mason 已提交
3287
/*
3288
 * When reads are done, we need to check csums to verify the data is correct.
3289 3290
 * if there's a match, we allow the bio to finish.  If not, the code in
 * extent_io.c will try to find good copies for us.
3291 3292 3293 3294
 *
 * @bio_offset:	offset to the beginning of the bio (in bytes)
 * @start:	file offset of the range start
 * @end:	file offset of the range end (inclusive)
3295 3296 3297
 *
 * Return a bitmap where bit set means a csum mismatch, and bit not set means
 * csum match.
C
Chris Mason 已提交
3298
 */
3299 3300 3301
unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
				    u32 bio_offset, struct page *page,
				    u64 start, u64 end)
3302 3303
{
	struct inode *inode = page->mapping->host;
3304
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3305
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3306
	struct btrfs_root *root = BTRFS_I(inode)->root;
3307 3308
	const u32 sectorsize = root->fs_info->sectorsize;
	u32 pg_off;
3309
	unsigned int result = 0;
3310

3311 3312
	if (btrfs_page_test_checked(fs_info, page, start, end + 1 - start)) {
		btrfs_page_clear_checked(fs_info, page, start, end + 1 - start);
3313
		return 0;
3314
	}
3315

3316
	/*
3317 3318 3319
	 * This only happens for NODATASUM or compressed read.
	 * Normally this should be covered by above check for compressed read
	 * or the next check for NODATASUM.  Just do a quicker exit here.
3320
	 */
3321
	if (bbio->csum == NULL)
3322
		return 0;
3323

3324
	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
3325 3326
		return 0;

3327
	if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)))
3328
		return 0;
3329

3330 3331 3332 3333 3334
	ASSERT(page_offset(page) <= start &&
	       end <= page_offset(page) + PAGE_SIZE - 1);
	for (pg_off = offset_in_page(start);
	     pg_off < offset_in_page(end);
	     pg_off += sectorsize, bio_offset += sectorsize) {
3335
		u64 file_offset = pg_off + page_offset(page);
3336 3337
		int ret;

3338
		if (btrfs_is_data_reloc_root(root) &&
3339 3340 3341 3342 3343 3344 3345 3346 3347
		    test_range_bit(io_tree, file_offset,
				   file_offset + sectorsize - 1,
				   EXTENT_NODATASUM, 1, NULL)) {
			/* Skip the range without csum for data reloc inode */
			clear_extent_bits(io_tree, file_offset,
					  file_offset + sectorsize - 1,
					  EXTENT_NODATASUM);
			continue;
		}
3348
		ret = check_data_csum(inode, bbio, bio_offset, page, pg_off,
3349
				      page_offset(page) + pg_off);
3350 3351 3352 3353 3354 3355
		if (ret < 0) {
			const int nr_bit = (pg_off - offset_in_page(start)) >>
				     root->fs_info->sectorsize_bits;

			result |= (1U << nr_bit);
		}
3356
	}
3357
	return result;
3358
}
3359

3360 3361 3362 3363 3364 3365 3366 3367 3368 3369
/*
 * btrfs_add_delayed_iput - perform a delayed iput on @inode
 *
 * @inode: The inode we want to perform iput on
 *
 * This function uses the generic vfs_inode::i_count to track whether we should
 * just decrement it (in case it's > 1) or if this is the last iput then link
 * the inode to the delayed iput machinery. Delayed iputs are processed at
 * transaction commit time/superblock commit/cleaner kthread.
 */
Y
Yan, Zheng 已提交
3370 3371
void btrfs_add_delayed_iput(struct inode *inode)
{
3372
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3373
	struct btrfs_inode *binode = BTRFS_I(inode);
Y
Yan, Zheng 已提交
3374 3375 3376 3377

	if (atomic_add_unless(&inode->i_count, -1, 1))
		return;

3378
	atomic_inc(&fs_info->nr_delayed_iputs);
Y
Yan, Zheng 已提交
3379
	spin_lock(&fs_info->delayed_iput_lock);
3380 3381
	ASSERT(list_empty(&binode->delayed_iput));
	list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
Y
Yan, Zheng 已提交
3382
	spin_unlock(&fs_info->delayed_iput_lock);
3383 3384
	if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
		wake_up_process(fs_info->cleaner_kthread);
Y
Yan, Zheng 已提交
3385 3386
}

3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408
static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
				    struct btrfs_inode *inode)
{
	list_del_init(&inode->delayed_iput);
	spin_unlock(&fs_info->delayed_iput_lock);
	iput(&inode->vfs_inode);
	if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
		wake_up(&fs_info->delayed_iputs_wait);
	spin_lock(&fs_info->delayed_iput_lock);
}

static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
				   struct btrfs_inode *inode)
{
	if (!list_empty(&inode->delayed_iput)) {
		spin_lock(&fs_info->delayed_iput_lock);
		if (!list_empty(&inode->delayed_iput))
			run_delayed_iput_locked(fs_info, inode);
		spin_unlock(&fs_info->delayed_iput_lock);
	}
}

3409
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
Y
Yan, Zheng 已提交
3410 3411 3412
{

	spin_lock(&fs_info->delayed_iput_lock);
3413 3414 3415 3416 3417
	while (!list_empty(&fs_info->delayed_iputs)) {
		struct btrfs_inode *inode;

		inode = list_first_entry(&fs_info->delayed_iputs,
				struct btrfs_inode, delayed_iput);
3418
		run_delayed_iput_locked(fs_info, inode);
3419
		cond_resched_lock(&fs_info->delayed_iput_lock);
Y
Yan, Zheng 已提交
3420
	}
3421
	spin_unlock(&fs_info->delayed_iput_lock);
Y
Yan, Zheng 已提交
3422 3423
}

3424
/**
3425 3426 3427
 * Wait for flushing all delayed iputs
 *
 * @fs_info:  the filesystem
3428 3429 3430 3431 3432
 *
 * This will wait on any delayed iputs that are currently running with KILLABLE
 * set.  Once they are all done running we will return, unless we are killed in
 * which case we return EINTR. This helps in user operations like fallocate etc
 * that might get blocked on the iputs.
3433 3434
 *
 * Return EINTR if we were killed, 0 if nothing's pending
3435 3436 3437 3438 3439 3440 3441 3442 3443 3444
 */
int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
{
	int ret = wait_event_killable(fs_info->delayed_iputs_wait,
			atomic_read(&fs_info->nr_delayed_iputs) == 0);
	if (ret)
		return -EINTR;
	return 0;
}

3445
/*
3446 3447
 * This creates an orphan entry for the given inode in case something goes wrong
 * in the middle of an unlink.
3448
 */
3449
int btrfs_orphan_add(struct btrfs_trans_handle *trans,
3450
		     struct btrfs_inode *inode)
3451
{
3452
	int ret;
3453

3454 3455 3456 3457
	ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
	if (ret && ret != -EEXIST) {
		btrfs_abort_transaction(trans, ret);
		return ret;
3458 3459 3460
	}

	return 0;
3461 3462 3463
}

/*
3464 3465
 * We have done the delete so we can go ahead and remove the orphan item for
 * this particular inode.
3466
 */
3467
static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3468
			    struct btrfs_inode *inode)
3469
{
3470
	return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
3471 3472 3473 3474 3475 3476
}

/*
 * this cleans up any orphans that may be left on the list from the last use
 * of this root.
 */
3477
int btrfs_orphan_cleanup(struct btrfs_root *root)
3478
{
3479
	struct btrfs_fs_info *fs_info = root->fs_info;
3480 3481 3482 3483 3484
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key, found_key;
	struct btrfs_trans_handle *trans;
	struct inode *inode;
3485
	u64 last_objectid = 0;
3486
	int ret = 0, nr_unlink = 0;
3487

3488
	if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
3489
		return 0;
3490 3491

	path = btrfs_alloc_path();
3492 3493 3494 3495
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}
3496
	path->reada = READA_BACK;
3497 3498

	key.objectid = BTRFS_ORPHAN_OBJECTID;
3499
	key.type = BTRFS_ORPHAN_ITEM_KEY;
3500 3501 3502 3503
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3504 3505
		if (ret < 0)
			goto out;
3506 3507 3508

		/*
		 * if ret == 0 means we found what we were searching for, which
L
Lucas De Marchi 已提交
3509
		 * is weird, but possible, so only screw with path if we didn't
3510 3511 3512
		 * find the key and see if we have stuff that matches
		 */
		if (ret > 0) {
3513
			ret = 0;
3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		/* pull out the item */
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		/* make sure the item matches what we want */
		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
			break;
3526
		if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
3527 3528 3529
			break;

		/* release the path since we're done with it */
3530
		btrfs_release_path(path);
3531 3532 3533 3534 3535 3536

		/*
		 * this is where we are basically btrfs_lookup, without the
		 * crossing root thing.  we store the inode number in the
		 * offset of the orphan item.
		 */
3537 3538

		if (found_key.offset == last_objectid) {
3539 3540
			btrfs_err(fs_info,
				  "Error removing orphan entry, stopping orphan cleanup");
3541 3542 3543 3544 3545 3546
			ret = -EINVAL;
			goto out;
		}

		last_objectid = found_key.offset;

3547 3548 3549
		found_key.objectid = found_key.offset;
		found_key.type = BTRFS_INODE_ITEM_KEY;
		found_key.offset = 0;
D
David Sterba 已提交
3550
		inode = btrfs_iget(fs_info->sb, last_objectid, root);
3551
		ret = PTR_ERR_OR_ZERO(inode);
3552
		if (ret && ret != -ENOENT)
3553
			goto out;
3554

3555
		if (ret == -ENOENT && root == fs_info->tree_root) {
3556 3557 3558 3559
			struct btrfs_root *dead_root;
			int is_dead_root = 0;

			/*
3560
			 * This is an orphan in the tree root. Currently these
3561
			 * could come from 2 sources:
3562
			 *  a) a root (snapshot/subvolume) deletion in progress
3563
			 *  b) a free space cache inode
3564 3565 3566 3567 3568 3569 3570 3571 3572
			 * We need to distinguish those two, as the orphan item
			 * for a root must not get deleted before the deletion
			 * of the snapshot/subvolume's tree completes.
			 *
			 * btrfs_find_orphan_roots() ran before us, which has
			 * found all deleted roots and loaded them into
			 * fs_info->fs_roots_radix. So here we can find if an
			 * orphan item corresponds to a deleted root by looking
			 * up the root from that radix tree.
3573
			 */
3574 3575 3576 3577 3578 3579 3580 3581

			spin_lock(&fs_info->fs_roots_radix_lock);
			dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
							 (unsigned long)found_key.objectid);
			if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
				is_dead_root = 1;
			spin_unlock(&fs_info->fs_roots_radix_lock);

3582 3583 3584 3585 3586
			if (is_dead_root) {
				/* prevent this orphan from being found again */
				key.offset = found_key.objectid - 1;
				continue;
			}
3587

3588
		}
3589

3590
		/*
3591
		 * If we have an inode with links, there are a couple of
3592 3593 3594 3595 3596 3597 3598 3599
		 * possibilities:
		 *
		 * 1. We were halfway through creating fsverity metadata for the
		 * file. In that case, the orphan item represents incomplete
		 * fsverity metadata which must be cleaned up with
		 * btrfs_drop_verity_items and deleting the orphan item.

		 * 2. Old kernels (before v3.12) used to create an
3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614
		 * orphan item for truncate indicating that there were possibly
		 * extent items past i_size that needed to be deleted. In v3.12,
		 * truncate was changed to update i_size in sync with the extent
		 * items, but the (useless) orphan item was still created. Since
		 * v4.18, we don't create the orphan item for truncate at all.
		 *
		 * So, this item could mean that we need to do a truncate, but
		 * only if this filesystem was last used on a pre-v3.12 kernel
		 * and was not cleanly unmounted. The odds of that are quite
		 * slim, and it's a pain to do the truncate now, so just delete
		 * the orphan item.
		 *
		 * It's also possible that this orphan item was supposed to be
		 * deleted but wasn't. The inode number may have been reused,
		 * but either way, we can delete the orphan item.
3615
		 */
3616
		if (ret == -ENOENT || inode->i_nlink) {
3617 3618
			if (!ret) {
				ret = btrfs_drop_verity_items(BTRFS_I(inode));
3619
				iput(inode);
3620 3621 3622
				if (ret)
					goto out;
			}
J
Josef Bacik 已提交
3623
			trans = btrfs_start_transaction(root, 1);
3624 3625 3626 3627
			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				goto out;
			}
3628 3629
			btrfs_debug(fs_info, "auto deleting %Lu",
				    found_key.objectid);
J
Josef Bacik 已提交
3630 3631
			ret = btrfs_del_orphan_item(trans, root,
						    found_key.objectid);
3632
			btrfs_end_transaction(trans);
3633 3634
			if (ret)
				goto out;
3635 3636 3637
			continue;
		}

3638
		nr_unlink++;
3639 3640 3641 3642

		/* this will do delete_inode and everything for us */
		iput(inode);
	}
3643 3644 3645
	/* release the path since we're done with it */
	btrfs_release_path(path);

3646
	if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3647
		trans = btrfs_join_transaction(root);
3648
		if (!IS_ERR(trans))
3649
			btrfs_end_transaction(trans);
3650
	}
3651 3652

	if (nr_unlink)
3653
		btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
3654 3655 3656

out:
	if (ret)
3657
		btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
3658 3659
	btrfs_free_path(path);
	return ret;
3660 3661
}

3662 3663 3664 3665 3666 3667 3668
/*
 * very simple check to peek ahead in the leaf looking for xattrs.  If we
 * don't find any xattrs, we know there can't be any acls.
 *
 * slot is the slot the inode is in, objectid is the objectid of the inode
 */
static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3669 3670
					  int slot, u64 objectid,
					  int *first_xattr_slot)
3671 3672 3673
{
	u32 nritems = btrfs_header_nritems(leaf);
	struct btrfs_key found_key;
3674 3675
	static u64 xattr_access = 0;
	static u64 xattr_default = 0;
3676 3677
	int scanned = 0;

3678
	if (!xattr_access) {
3679 3680 3681 3682
		xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
					strlen(XATTR_NAME_POSIX_ACL_ACCESS));
		xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
					strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
3683 3684
	}

3685
	slot++;
3686
	*first_xattr_slot = -1;
3687 3688 3689 3690 3691 3692 3693 3694
	while (slot < nritems) {
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		/* we found a different objectid, there must not be acls */
		if (found_key.objectid != objectid)
			return 0;

		/* we found an xattr, assume we've got an acl */
3695
		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3696 3697
			if (*first_xattr_slot == -1)
				*first_xattr_slot = slot;
3698 3699 3700 3701
			if (found_key.offset == xattr_access ||
			    found_key.offset == xattr_default)
				return 1;
		}
3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725

		/*
		 * we found a key greater than an xattr key, there can't
		 * be any acls later on
		 */
		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
			return 0;

		slot++;
		scanned++;

		/*
		 * it goes inode, inode backrefs, xattrs, extents,
		 * so if there are a ton of hard links to an inode there can
		 * be a lot of backrefs.  Don't waste time searching too hard,
		 * this is just an optimization
		 */
		if (scanned >= 8)
			break;
	}
	/* we hit the end of the leaf before we found an xattr or
	 * something larger than an xattr.  We have to assume the inode
	 * has acls
	 */
3726 3727
	if (*first_xattr_slot == -1)
		*first_xattr_slot = slot;
3728 3729 3730
	return 1;
}

C
Chris Mason 已提交
3731 3732 3733
/*
 * read an inode from the btree into the in-memory inode
 */
3734 3735
static int btrfs_read_locked_inode(struct inode *inode,
				   struct btrfs_path *in_path)
C
Chris Mason 已提交
3736
{
3737
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3738
	struct btrfs_path *path = in_path;
3739
	struct extent_buffer *leaf;
C
Chris Mason 已提交
3740 3741 3742
	struct btrfs_inode_item *inode_item;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_key location;
3743
	unsigned long ptr;
3744
	int maybe_acls;
J
Josef Bacik 已提交
3745
	u32 rdev;
C
Chris Mason 已提交
3746
	int ret;
3747
	bool filled = false;
3748
	int first_xattr_slot;
3749 3750 3751 3752

	ret = btrfs_fill_inode(inode, &rdev);
	if (!ret)
		filled = true;
C
Chris Mason 已提交
3753

3754 3755 3756 3757 3758
	if (!path) {
		path = btrfs_alloc_path();
		if (!path)
			return -ENOMEM;
	}
3759

C
Chris Mason 已提交
3760
	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
C
Chris Mason 已提交
3761

C
Chris Mason 已提交
3762
	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3763
	if (ret) {
3764 3765
		if (path != in_path)
			btrfs_free_path(path);
A
Al Viro 已提交
3766
		return ret;
3767
	}
C
Chris Mason 已提交
3768

3769
	leaf = path->nodes[0];
3770 3771

	if (filled)
3772
		goto cache_index;
3773

3774 3775 3776
	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
	inode->i_mode = btrfs_inode_mode(leaf, inode_item);
M
Miklos Szeredi 已提交
3777
	set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
3778 3779
	i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
	i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
3780
	btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
3781 3782
	btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
			round_up(i_size_read(inode), fs_info->sectorsize));
3783

3784 3785
	inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
	inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
3786

3787 3788
	inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
	inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
3789

3790 3791
	inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
3792

3793 3794 3795 3796
	BTRFS_I(inode)->i_otime.tv_sec =
		btrfs_timespec_sec(leaf, &inode_item->otime);
	BTRFS_I(inode)->i_otime.tv_nsec =
		btrfs_timespec_nsec(leaf, &inode_item->otime);
3797

3798
	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
3799
	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
J
Josef Bacik 已提交
3800 3801
	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);

3802 3803
	inode_set_iversion_queried(inode,
				   btrfs_inode_sequence(leaf, inode_item));
3804 3805 3806 3807 3808
	inode->i_generation = BTRFS_I(inode)->generation;
	inode->i_rdev = 0;
	rdev = btrfs_inode_rdev(leaf, inode_item);

	BTRFS_I(inode)->index_cnt = (u64)-1;
3809 3810
	btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
				&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
3811 3812

cache_index:
J
Josef Bacik 已提交
3813 3814 3815 3816 3817
	/*
	 * If we were modified in the current generation and evicted from memory
	 * and then re-read we need to do a full sync since we don't have any
	 * idea about which extents were modified before we were evicted from
	 * cache.
3818 3819 3820
	 *
	 * This is required for both inode re-read from disk and delayed inode
	 * in delayed_nodes_tree.
J
Josef Bacik 已提交
3821
	 */
3822
	if (BTRFS_I(inode)->last_trans == fs_info->generation)
J
Josef Bacik 已提交
3823 3824 3825
		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
			&BTRFS_I(inode)->runtime_flags);

3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848
	/*
	 * We don't persist the id of the transaction where an unlink operation
	 * against the inode was last made. So here we assume the inode might
	 * have been evicted, and therefore the exact value of last_unlink_trans
	 * lost, and set it to last_trans to avoid metadata inconsistencies
	 * between the inode and its parent if the inode is fsync'ed and the log
	 * replayed. For example, in the scenario:
	 *
	 * touch mydir/foo
	 * ln mydir/foo mydir/bar
	 * sync
	 * unlink mydir/bar
	 * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
	 * xfs_io -c fsync mydir/foo
	 * <power failure>
	 * mount fs, triggers fsync log replay
	 *
	 * We must make sure that when we fsync our inode foo we also log its
	 * parent inode, otherwise after log replay the parent still has the
	 * dentry with the "bar" name but our inode foo has a link count of 1
	 * and doesn't have an inode ref with the name "bar" anymore.
	 *
	 * Setting last_unlink_trans to last_trans is a pessimistic approach,
3849
	 * but it guarantees correctness at the expense of occasional full
3850 3851 3852 3853 3854
	 * transaction commits on fsync if our inode is a directory, or if our
	 * inode is not a directory, logging its parent unnecessarily.
	 */
	BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;

3855 3856 3857 3858 3859 3860 3861 3862
	/*
	 * Same logic as for last_unlink_trans. We don't persist the generation
	 * of the last transaction where this inode was used for a reflink
	 * operation, so after eviction and reloading the inode we must be
	 * pessimistic and assume the last transaction that modified the inode.
	 */
	BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;

3863 3864 3865 3866 3867 3868
	path->slots[0]++;
	if (inode->i_nlink != 1 ||
	    path->slots[0] >= btrfs_header_nritems(leaf))
		goto cache_acl;

	btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
3869
	if (location.objectid != btrfs_ino(BTRFS_I(inode)))
3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884
		goto cache_acl;

	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
	if (location.type == BTRFS_INODE_REF_KEY) {
		struct btrfs_inode_ref *ref;

		ref = (struct btrfs_inode_ref *)ptr;
		BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
	} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *extref;

		extref = (struct btrfs_inode_extref *)ptr;
		BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
								     extref);
	}
3885
cache_acl:
3886 3887 3888 3889
	/*
	 * try to precache a NULL acl entry for files that don't have
	 * any xattrs or acls
	 */
L
Li Zefan 已提交
3890
	maybe_acls = acls_after_inode_item(leaf, path->slots[0],
3891
			btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
3892 3893 3894 3895
	if (first_xattr_slot != -1) {
		path->slots[0] = first_xattr_slot;
		ret = btrfs_load_inode_props(inode, path);
		if (ret)
3896
			btrfs_err(fs_info,
3897
				  "error loading props for ino %llu (root %llu): %d",
3898
				  btrfs_ino(BTRFS_I(inode)),
3899 3900
				  root->root_key.objectid, ret);
	}
3901 3902
	if (path != in_path)
		btrfs_free_path(path);
3903

3904 3905
	if (!maybe_acls)
		cache_no_acl(inode);
3906

C
Chris Mason 已提交
3907 3908 3909 3910 3911 3912 3913 3914
	switch (inode->i_mode & S_IFMT) {
	case S_IFREG:
		inode->i_mapping->a_ops = &btrfs_aops;
		inode->i_fop = &btrfs_file_operations;
		inode->i_op = &btrfs_file_inode_operations;
		break;
	case S_IFDIR:
		inode->i_fop = &btrfs_dir_file_operations;
3915
		inode->i_op = &btrfs_dir_inode_operations;
C
Chris Mason 已提交
3916 3917 3918
		break;
	case S_IFLNK:
		inode->i_op = &btrfs_symlink_inode_operations;
3919
		inode_nohighmem(inode);
3920
		inode->i_mapping->a_ops = &btrfs_aops;
C
Chris Mason 已提交
3921
		break;
J
Josef Bacik 已提交
3922
	default:
J
Jim Owens 已提交
3923
		inode->i_op = &btrfs_special_inode_operations;
J
Josef Bacik 已提交
3924 3925
		init_special_inode(inode, inode->i_mode, rdev);
		break;
C
Chris Mason 已提交
3926
	}
3927

3928
	btrfs_sync_inode_flags_to_i_flags(inode);
3929
	return 0;
C
Chris Mason 已提交
3930 3931
}

C
Chris Mason 已提交
3932 3933 3934
/*
 * given a leaf and an inode, copy the inode fields into the leaf
 */
3935 3936
static void fill_inode_item(struct btrfs_trans_handle *trans,
			    struct extent_buffer *leaf,
3937
			    struct btrfs_inode_item *item,
C
Chris Mason 已提交
3938 3939
			    struct inode *inode)
{
3940
	struct btrfs_map_token token;
3941
	u64 flags;
3942

3943
	btrfs_init_map_token(&token, leaf);
3944

3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976
	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
	btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);

	btrfs_set_token_timespec_sec(&token, &item->atime,
				     inode->i_atime.tv_sec);
	btrfs_set_token_timespec_nsec(&token, &item->atime,
				      inode->i_atime.tv_nsec);

	btrfs_set_token_timespec_sec(&token, &item->mtime,
				     inode->i_mtime.tv_sec);
	btrfs_set_token_timespec_nsec(&token, &item->mtime,
				      inode->i_mtime.tv_nsec);

	btrfs_set_token_timespec_sec(&token, &item->ctime,
				     inode->i_ctime.tv_sec);
	btrfs_set_token_timespec_nsec(&token, &item->ctime,
				      inode->i_ctime.tv_nsec);

	btrfs_set_token_timespec_sec(&token, &item->otime,
				     BTRFS_I(inode)->i_otime.tv_sec);
	btrfs_set_token_timespec_nsec(&token, &item->otime,
				      BTRFS_I(inode)->i_otime.tv_nsec);

	btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
	btrfs_set_token_inode_generation(&token, item,
					 BTRFS_I(inode)->generation);
	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
	btrfs_set_token_inode_transid(&token, item, trans->transid);
	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
3977 3978 3979
	flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
					  BTRFS_I(inode)->ro_flags);
	btrfs_set_token_inode_flags(&token, item, flags);
3980
	btrfs_set_token_inode_block_group(&token, item, 0);
C
Chris Mason 已提交
3981 3982
}

C
Chris Mason 已提交
3983 3984 3985
/*
 * copy everything in the in-memory inode into the btree.
 */
3986
static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
3987 3988
				struct btrfs_root *root,
				struct btrfs_inode *inode)
C
Chris Mason 已提交
3989 3990 3991
{
	struct btrfs_inode_item *inode_item;
	struct btrfs_path *path;
3992
	struct extent_buffer *leaf;
C
Chris Mason 已提交
3993 3994 3995
	int ret;

	path = btrfs_alloc_path();
3996 3997 3998
	if (!path)
		return -ENOMEM;

3999
	ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1);
C
Chris Mason 已提交
4000 4001 4002 4003 4004 4005
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		goto failed;
	}

4006 4007
	leaf = path->nodes[0];
	inode_item = btrfs_item_ptr(leaf, path->slots[0],
4008
				    struct btrfs_inode_item);
C
Chris Mason 已提交
4009

4010
	fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
4011
	btrfs_mark_buffer_dirty(leaf);
4012
	btrfs_set_inode_last_trans(trans, inode);
C
Chris Mason 已提交
4013 4014 4015 4016 4017 4018
	ret = 0;
failed:
	btrfs_free_path(path);
	return ret;
}

4019 4020 4021 4022
/*
 * copy everything in the in-memory inode into the btree.
 */
noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
4023 4024
				struct btrfs_root *root,
				struct btrfs_inode *inode)
4025
{
4026
	struct btrfs_fs_info *fs_info = root->fs_info;
4027 4028 4029 4030 4031 4032 4033 4034 4035
	int ret;

	/*
	 * If the inode is a free space inode, we can deadlock during commit
	 * if we put it into the delayed code.
	 *
	 * The data relocation inode should also be directly updated
	 * without delay
	 */
4036
	if (!btrfs_is_free_space_inode(inode)
4037
	    && !btrfs_is_data_reloc_root(root)
4038
	    && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
4039 4040
		btrfs_update_root_times(trans, root);

4041
		ret = btrfs_delayed_update_inode(trans, root, inode);
4042
		if (!ret)
4043
			btrfs_set_inode_last_trans(trans, inode);
4044 4045 4046
		return ret;
	}

4047
	return btrfs_update_inode_item(trans, root, inode);
4048 4049
}

4050 4051
int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
				struct btrfs_root *root, struct btrfs_inode *inode)
4052 4053 4054
{
	int ret;

4055
	ret = btrfs_update_inode(trans, root, inode);
4056
	if (ret == -ENOSPC)
4057
		return btrfs_update_inode_item(trans, root, inode);
4058 4059 4060
	return ret;
}

C
Chris Mason 已提交
4061 4062 4063 4064 4065
/*
 * unlink helper that gets used here in inode.c and in the tree logging
 * recovery code.  It remove a link in a directory with a given name, and
 * also drops the back refs in the inode to the directory
 */
4066
static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4067 4068
				struct btrfs_inode *dir,
				struct btrfs_inode *inode,
4069 4070
				const char *name, int name_len,
				struct btrfs_rename_ctx *rename_ctx)
C
Chris Mason 已提交
4071
{
4072
	struct btrfs_root *root = dir->root;
4073
	struct btrfs_fs_info *fs_info = root->fs_info;
C
Chris Mason 已提交
4074 4075 4076
	struct btrfs_path *path;
	int ret = 0;
	struct btrfs_dir_item *di;
4077
	u64 index;
L
Li Zefan 已提交
4078 4079
	u64 ino = btrfs_ino(inode);
	u64 dir_ino = btrfs_ino(dir);
C
Chris Mason 已提交
4080 4081

	path = btrfs_alloc_path();
4082 4083
	if (!path) {
		ret = -ENOMEM;
4084
		goto out;
4085 4086
	}

L
Li Zefan 已提交
4087
	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
C
Chris Mason 已提交
4088
				    name, name_len, -1);
4089 4090
	if (IS_ERR_OR_NULL(di)) {
		ret = di ? PTR_ERR(di) : -ENOENT;
C
Chris Mason 已提交
4091 4092 4093
		goto err;
	}
	ret = btrfs_delete_one_dir_name(trans, root, path, di);
4094 4095
	if (ret)
		goto err;
4096
	btrfs_release_path(path);
C
Chris Mason 已提交
4097

4098 4099 4100 4101 4102 4103 4104 4105 4106 4107
	/*
	 * If we don't have dir index, we have to get it by looking up
	 * the inode ref, since we get the inode ref, remove it directly,
	 * it is unnecessary to do delayed deletion.
	 *
	 * But if we have dir index, needn't search inode ref to get it.
	 * Since the inode ref is close to the inode item, it is better
	 * that we delay to delete it, and just do this deletion when
	 * we update the inode item.
	 */
4108
	if (inode->dir_index) {
4109 4110
		ret = btrfs_delayed_delete_inode_ref(inode);
		if (!ret) {
4111
			index = inode->dir_index;
4112 4113 4114 4115
			goto skip_backref;
		}
	}

L
Li Zefan 已提交
4116 4117
	ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
				  dir_ino, &index);
4118
	if (ret) {
4119
		btrfs_info(fs_info,
4120
			"failed to delete reference to %.*s, inode %llu parent %llu",
4121
			name_len, name, ino, dir_ino);
4122
		btrfs_abort_transaction(trans, ret);
4123 4124
		goto err;
	}
4125
skip_backref:
4126 4127 4128
	if (rename_ctx)
		rename_ctx->index = index;

4129
	ret = btrfs_delete_delayed_dir_index(trans, dir, index);
4130
	if (ret) {
4131
		btrfs_abort_transaction(trans, ret);
C
Chris Mason 已提交
4132
		goto err;
4133
	}
C
Chris Mason 已提交
4134

4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146
	/*
	 * If we are in a rename context, we don't need to update anything in the
	 * log. That will be done later during the rename by btrfs_log_new_name().
	 * Besides that, doing it here would only cause extra unncessary btree
	 * operations on the log tree, increasing latency for applications.
	 */
	if (!rename_ctx) {
		btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
					   dir_ino);
		btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
					     index);
	}
4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157

	/*
	 * If we have a pending delayed iput we could end up with the final iput
	 * being run in btrfs-cleaner context.  If we have enough of these built
	 * up we can end up burning a lot of time in btrfs-cleaner without any
	 * way to throttle the unlinks.  Since we're currently holding a ref on
	 * the inode we can run the delayed iput here without any issues as the
	 * final iput won't be done until after we drop the ref we're currently
	 * holding.
	 */
	btrfs_run_delayed_iput(fs_info, inode);
C
Chris Mason 已提交
4158 4159
err:
	btrfs_free_path(path);
4160 4161 4162
	if (ret)
		goto out;

4163
	btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2);
4164 4165 4166 4167
	inode_inc_iversion(&inode->vfs_inode);
	inode_inc_iversion(&dir->vfs_inode);
	inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime =
		dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
4168
	ret = btrfs_update_inode(trans, root, dir);
4169
out:
C
Chris Mason 已提交
4170 4171 4172
	return ret;
}

4173
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4174
		       struct btrfs_inode *dir, struct btrfs_inode *inode,
4175 4176 4177
		       const char *name, int name_len)
{
	int ret;
4178
	ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len, NULL);
4179
	if (!ret) {
4180
		drop_nlink(&inode->vfs_inode);
4181
		ret = btrfs_update_inode(trans, inode->root, inode);
4182 4183 4184
	}
	return ret;
}
C
Chris Mason 已提交
4185

4186 4187 4188
/*
 * helper to start transaction for unlink and rmdir.
 *
4189 4190 4191 4192
 * unlink and rmdir are special in btrfs, they do not always free space, so
 * if we cannot make our reservations the normal way try and see if there is
 * plenty of slack room in the global reserve to migrate, otherwise we cannot
 * allow the unlink to occur.
4193
 */
4194
static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
4195
{
4196
	struct btrfs_root *root = BTRFS_I(dir)->root;
4197

4198 4199 4200 4201 4202 4203 4204
	/*
	 * 1 for the possible orphan item
	 * 1 for the dir item
	 * 1 for the dir index
	 * 1 for the inode ref
	 * 1 for the inode
	 */
4205
	return btrfs_start_transaction_fallback_global_rsv(root, 5);
4206 4207 4208 4209 4210
}

static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
{
	struct btrfs_trans_handle *trans;
4211
	struct inode *inode = d_inode(dentry);
4212 4213
	int ret;

4214
	trans = __unlink_start_trans(dir);
4215 4216
	if (IS_ERR(trans))
		return PTR_ERR(trans);
4217

4218 4219
	btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
			0);
4220

4221
	ret = btrfs_unlink_inode(trans, BTRFS_I(dir),
4222 4223
			BTRFS_I(d_inode(dentry)), dentry->d_name.name,
			dentry->d_name.len);
4224 4225
	if (ret)
		goto out;
4226

4227
	if (inode->i_nlink == 0) {
4228
		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
4229 4230
		if (ret)
			goto out;
4231
	}
4232

4233
out:
4234
	btrfs_end_transaction(trans);
4235
	btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
C
Chris Mason 已提交
4236 4237 4238
	return ret;
}

4239
static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4240
			       struct inode *dir, struct dentry *dentry)
4241
{
4242
	struct btrfs_root *root = BTRFS_I(dir)->root;
4243
	struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
4244 4245 4246 4247
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key key;
4248 4249
	const char *name = dentry->d_name.name;
	int name_len = dentry->d_name.len;
4250 4251
	u64 index;
	int ret;
4252
	u64 objectid;
4253
	u64 dir_ino = btrfs_ino(BTRFS_I(dir));
4254

4255 4256 4257 4258 4259 4260 4261 4262 4263
	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
		objectid = inode->root->root_key.objectid;
	} else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
		objectid = inode->location.objectid;
	} else {
		WARN_ON(1);
		return -EINVAL;
	}

4264 4265 4266 4267
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

L
Li Zefan 已提交
4268
	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4269
				   name, name_len, -1);
4270
	if (IS_ERR_OR_NULL(di)) {
4271
		ret = di ? PTR_ERR(di) : -ENOENT;
4272 4273
		goto out;
	}
4274 4275 4276 4277 4278

	leaf = path->nodes[0];
	btrfs_dir_item_key_to_cpu(leaf, di, &key);
	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
	ret = btrfs_delete_one_dir_name(trans, root, path, di);
4279
	if (ret) {
4280
		btrfs_abort_transaction(trans, ret);
4281 4282
		goto out;
	}
4283
	btrfs_release_path(path);
4284

4285 4286 4287 4288
	/*
	 * This is a placeholder inode for a subvolume we didn't have a
	 * reference to at the time of the snapshot creation.  In the meantime
	 * we could have renamed the real subvol link into our snapshot, so
D
David Sterba 已提交
4289
	 * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
4290 4291 4292 4293 4294
	 * Instead simply lookup the dir_index_item for this entry so we can
	 * remove it.  Otherwise we know we have a ref to the root and we can
	 * call btrfs_del_root_ref, and it _shouldn't_ fail.
	 */
	if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
L
Li Zefan 已提交
4295
		di = btrfs_search_dir_index_item(root, path, dir_ino,
4296
						 name, name_len);
4297 4298 4299 4300 4301
		if (IS_ERR_OR_NULL(di)) {
			if (!di)
				ret = -ENOENT;
			else
				ret = PTR_ERR(di);
4302
			btrfs_abort_transaction(trans, ret);
4303 4304
			goto out;
		}
4305 4306 4307 4308

		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		index = key.offset;
4309 4310 4311 4312 4313 4314 4315 4316 4317
		btrfs_release_path(path);
	} else {
		ret = btrfs_del_root_ref(trans, objectid,
					 root->root_key.objectid, dir_ino,
					 &index, name, name_len);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
4318 4319
	}

4320
	ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index);
4321
	if (ret) {
4322
		btrfs_abort_transaction(trans, ret);
4323 4324
		goto out;
	}
4325

4326
	btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2);
4327
	inode_inc_iversion(dir);
4328
	dir->i_mtime = dir->i_ctime = current_time(dir);
4329
	ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir));
4330
	if (ret)
4331
		btrfs_abort_transaction(trans, ret);
4332
out:
4333
	btrfs_free_path(path);
4334
	return ret;
4335 4336
}

4337 4338 4339 4340
/*
 * Helper to check if the subvolume references other subvolumes or if it's
 * default.
 */
4341
static noinline int may_destroy_subvol(struct btrfs_root *root)
4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct btrfs_dir_item *di;
	struct btrfs_key key;
	u64 dir_id;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Make sure this root isn't set as the default subvol */
	dir_id = btrfs_super_root_dir(fs_info->super_copy);
	di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
				   dir_id, "default", 7, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
		if (key.objectid == root->root_key.objectid) {
			ret = -EPERM;
			btrfs_err(fs_info,
				  "deleting default subvolume %llu is not allowed",
				  key.objectid);
			goto out;
		}
		btrfs_release_path(path);
	}

	key.objectid = root->root_key.objectid;
	key.type = BTRFS_ROOT_REF_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	BUG_ON(ret == 0);

	ret = 0;
	if (path->slots[0] > 0) {
		path->slots[0]--;
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid == root->root_key.objectid &&
		    key.type == BTRFS_ROOT_REF_KEY)
			ret = -ENOTEMPTY;
	}
out:
	btrfs_free_path(path);
	return ret;
}

4392 4393 4394 4395 4396 4397 4398 4399 4400 4401
/* Delete all dentries for inodes belonging to the root */
static void btrfs_prune_dentries(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *node;
	struct rb_node *prev;
	struct btrfs_inode *entry;
	struct inode *inode;
	u64 objectid = 0;

J
Josef Bacik 已提交
4402
	if (!BTRFS_FS_ERROR(fs_info))
4403 4404 4405 4406 4407 4408 4409 4410 4411 4412
		WARN_ON(btrfs_root_refs(&root->root_item) != 0);

	spin_lock(&root->inode_lock);
again:
	node = root->inode_tree.rb_node;
	prev = NULL;
	while (node) {
		prev = node;
		entry = rb_entry(node, struct btrfs_inode, rb_node);

4413
		if (objectid < btrfs_ino(entry))
4414
			node = node->rb_left;
4415
		else if (objectid > btrfs_ino(entry))
4416 4417 4418 4419 4420 4421 4422
			node = node->rb_right;
		else
			break;
	}
	if (!node) {
		while (prev) {
			entry = rb_entry(prev, struct btrfs_inode, rb_node);
4423
			if (objectid <= btrfs_ino(entry)) {
4424 4425 4426 4427 4428 4429 4430 4431
				node = prev;
				break;
			}
			prev = rb_next(prev);
		}
	}
	while (node) {
		entry = rb_entry(node, struct btrfs_inode, rb_node);
4432
		objectid = btrfs_ino(entry) + 1;
4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455
		inode = igrab(&entry->vfs_inode);
		if (inode) {
			spin_unlock(&root->inode_lock);
			if (atomic_read(&inode->i_count) > 1)
				d_prune_aliases(inode);
			/*
			 * btrfs_drop_inode will have it removed from the inode
			 * cache when its usage count hits zero.
			 */
			iput(inode);
			cond_resched();
			spin_lock(&root->inode_lock);
			goto again;
		}

		if (cond_resched_lock(&root->inode_lock))
			goto again;

		node = rb_next(node);
	}
	spin_unlock(&root->inode_lock);
}

4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472
int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct inode *inode = d_inode(dentry);
	struct btrfs_root *dest = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	struct btrfs_block_rsv block_rsv;
	u64 root_flags;
	int ret;

	/*
	 * Don't allow to delete a subvolume with send in progress. This is
	 * inside the inode lock so the error handling that has to drop the bit
	 * again is not run concurrently.
	 */
	spin_lock(&dest->root_item_lock);
4473
	if (dest->send_in_progress) {
4474 4475 4476 4477 4478 4479
		spin_unlock(&dest->root_item_lock);
		btrfs_warn(fs_info,
			   "attempt to delete subvolume %llu during send",
			   dest->root_key.objectid);
		return -EPERM;
	}
4480 4481 4482 4483
	root_flags = btrfs_root_flags(&dest->root_item);
	btrfs_set_root_flags(&dest->root_item,
			     root_flags | BTRFS_ROOT_SUBVOL_DEAD);
	spin_unlock(&dest->root_item_lock);
4484 4485 4486

	down_write(&fs_info->subvol_sem);

4487 4488
	ret = may_destroy_subvol(dest);
	if (ret)
4489 4490 4491 4492 4493 4494 4495 4496
		goto out_up_write;

	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
	/*
	 * One for dir inode,
	 * two for dir entries,
	 * two for root ref/backref.
	 */
4497 4498
	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
	if (ret)
4499 4500 4501 4502
		goto out_up_write;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
4503
		ret = PTR_ERR(trans);
4504 4505 4506 4507 4508 4509 4510
		goto out_release;
	}
	trans->block_rsv = &block_rsv;
	trans->bytes_reserved = block_rsv.size;

	btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));

4511
	ret = btrfs_unlink_subvol(trans, dir, dentry);
4512 4513 4514 4515 4516
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_end_trans;
	}

4517 4518 4519 4520 4521
	ret = btrfs_record_root_in_trans(trans, dest);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_end_trans;
	}
4522 4523 4524

	memset(&dest->root_item.drop_progress, 0,
		sizeof(dest->root_item.drop_progress));
4525
	btrfs_set_root_drop_level(&dest->root_item, 0);
4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537
	btrfs_set_root_refs(&dest->root_item, 0);

	if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
		ret = btrfs_insert_orphan_item(trans,
					fs_info->tree_root,
					dest->root_key.objectid);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out_end_trans;
		}
	}

4538
	ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
4539 4540 4541 4542 4543 4544 4545
				  BTRFS_UUID_KEY_SUBVOL,
				  dest->root_key.objectid);
	if (ret && ret != -ENOENT) {
		btrfs_abort_transaction(trans, ret);
		goto out_end_trans;
	}
	if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
4546
		ret = btrfs_uuid_tree_remove(trans,
4547 4548 4549 4550 4551 4552 4553 4554 4555
					  dest->root_item.received_uuid,
					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
					  dest->root_key.objectid);
		if (ret && ret != -ENOENT) {
			btrfs_abort_transaction(trans, ret);
			goto out_end_trans;
		}
	}

4556 4557
	free_anon_bdev(dest->anon_dev);
	dest->anon_dev = 0;
4558 4559 4560 4561 4562 4563
out_end_trans:
	trans->block_rsv = NULL;
	trans->bytes_reserved = 0;
	ret = btrfs_end_transaction(trans);
	inode->i_flags |= S_DEAD;
out_release:
4564
	btrfs_subvolume_release_metadata(root, &block_rsv);
4565 4566
out_up_write:
	up_write(&fs_info->subvol_sem);
4567
	if (ret) {
4568 4569 4570 4571 4572 4573 4574
		spin_lock(&dest->root_item_lock);
		root_flags = btrfs_root_flags(&dest->root_item);
		btrfs_set_root_flags(&dest->root_item,
				root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
		spin_unlock(&dest->root_item_lock);
	} else {
		d_invalidate(dentry);
4575
		btrfs_prune_dentries(dest);
4576 4577 4578
		ASSERT(dest->send_in_progress == 0);
	}

4579
	return ret;
4580 4581
}

C
Chris Mason 已提交
4582 4583
static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
4584
	struct inode *inode = d_inode(dentry);
4585
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
4586
	int err = 0;
C
Chris Mason 已提交
4587
	struct btrfs_trans_handle *trans;
4588
	u64 last_unlink_trans;
C
Chris Mason 已提交
4589

4590
	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
Y
Yan 已提交
4591
		return -ENOTEMPTY;
4592 4593 4594 4595 4596 4597
	if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
		if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
			btrfs_err(fs_info,
			"extent tree v2 doesn't support snapshot deletion yet");
			return -EOPNOTSUPP;
		}
4598
		return btrfs_delete_subvolume(dir, dentry);
4599
	}
Y
Yan 已提交
4600

4601
	trans = __unlink_start_trans(dir);
4602
	if (IS_ERR(trans))
4603 4604
		return PTR_ERR(trans);

4605
	if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
4606
		err = btrfs_unlink_subvol(trans, dir, dentry);
4607 4608 4609
		goto out;
	}

4610
	err = btrfs_orphan_add(trans, BTRFS_I(inode));
4611
	if (err)
4612
		goto out;
4613

4614 4615
	last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;

C
Chris Mason 已提交
4616
	/* now the directory is empty */
4617
	err = btrfs_unlink_inode(trans, BTRFS_I(dir),
4618 4619
			BTRFS_I(d_inode(dentry)), dentry->d_name.name,
			dentry->d_name.len);
4620
	if (!err) {
4621
		btrfs_i_size_write(BTRFS_I(inode), 0);
4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635
		/*
		 * Propagate the last_unlink_trans value of the deleted dir to
		 * its parent directory. This is to prevent an unrecoverable
		 * log tree in the case we do something like this:
		 * 1) create dir foo
		 * 2) create snapshot under dir foo
		 * 3) delete the snapshot
		 * 4) rmdir foo
		 * 5) mkdir foo
		 * 6) fsync foo or some file inside foo
		 */
		if (last_unlink_trans >= trans->transid)
			BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
	}
4636
out:
4637
	btrfs_end_transaction(trans);
4638
	btrfs_btree_balance_dirty(fs_info);
4639

C
Chris Mason 已提交
4640 4641 4642 4643
	return err;
}

/*
4644
 * btrfs_truncate_block - read, zero a chunk and write a block
J
Josef Bacik 已提交
4645 4646 4647 4648 4649 4650
 * @inode - inode that we're zeroing
 * @from - the offset to start zeroing
 * @len - the length to zero, 0 to zero the entire range respective to the
 *	offset
 * @front - zero up to the offset instead of from the offset on
 *
4651
 * This will find the block for the "from" offset and cow the block and zero the
J
Josef Bacik 已提交
4652
 * part we want to zero.  This is used with truncate and hole punching.
C
Chris Mason 已提交
4653
 */
4654 4655
int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
			 int front)
C
Chris Mason 已提交
4656
{
4657 4658 4659
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct address_space *mapping = inode->vfs_inode.i_mapping;
	struct extent_io_tree *io_tree = &inode->io_tree;
4660
	struct btrfs_ordered_extent *ordered;
4661
	struct extent_state *cached_state = NULL;
4662
	struct extent_changeset *data_reserved = NULL;
4663
	bool only_release_metadata = false;
4664
	u32 blocksize = fs_info->sectorsize;
4665
	pgoff_t index = from >> PAGE_SHIFT;
4666
	unsigned offset = from & (blocksize - 1);
C
Chris Mason 已提交
4667
	struct page *page;
4668
	gfp_t mask = btrfs_alloc_write_mask(mapping);
4669
	size_t write_bytes = blocksize;
C
Chris Mason 已提交
4670
	int ret = 0;
4671 4672
	u64 block_start;
	u64 block_end;
C
Chris Mason 已提交
4673

4674 4675
	if (IS_ALIGNED(offset, blocksize) &&
	    (!len || IS_ALIGNED(len, blocksize)))
C
Chris Mason 已提交
4676
		goto out;
4677

J
Josef Bacik 已提交
4678 4679 4680
	block_start = round_down(from, blocksize);
	block_end = block_start + blocksize - 1;

4681 4682
	ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
					  blocksize);
4683
	if (ret < 0) {
4684
		if (btrfs_check_nocow_lock(inode, block_start, &write_bytes) > 0) {
4685 4686 4687 4688 4689 4690
			/* For nocow case, no need to reserve data space */
			only_release_metadata = true;
		} else {
			goto out;
		}
	}
4691
	ret = btrfs_delalloc_reserve_metadata(inode, blocksize);
4692 4693
	if (ret < 0) {
		if (!only_release_metadata)
4694 4695
			btrfs_free_reserved_data_space(inode, data_reserved,
						       block_start, blocksize);
4696 4697
		goto out;
	}
4698
again:
4699
	page = find_or_create_page(mapping, index, mask);
4700
	if (!page) {
4701 4702 4703
		btrfs_delalloc_release_space(inode, data_reserved, block_start,
					     blocksize, true);
		btrfs_delalloc_release_extents(inode, blocksize);
4704
		ret = -ENOMEM;
C
Chris Mason 已提交
4705
		goto out;
4706
	}
4707 4708 4709
	ret = set_page_extent_mapped(page);
	if (ret < 0)
		goto out_unlock;
4710

C
Chris Mason 已提交
4711
	if (!PageUptodate(page)) {
C
Chris Mason 已提交
4712
		ret = btrfs_readpage(NULL, page);
C
Chris Mason 已提交
4713
		lock_page(page);
4714 4715
		if (page->mapping != mapping) {
			unlock_page(page);
4716
			put_page(page);
4717 4718
			goto again;
		}
C
Chris Mason 已提交
4719 4720
		if (!PageUptodate(page)) {
			ret = -EIO;
4721
			goto out_unlock;
C
Chris Mason 已提交
4722 4723
		}
	}
4724
	wait_on_page_writeback(page);
4725

4726
	lock_extent_bits(io_tree, block_start, block_end, &cached_state);
4727

4728
	ordered = btrfs_lookup_ordered_extent(inode, block_start);
4729
	if (ordered) {
4730
		unlock_extent_cached(io_tree, block_start, block_end,
4731
				     &cached_state);
4732
		unlock_page(page);
4733
		put_page(page);
4734
		btrfs_start_ordered_extent(ordered, 1);
4735 4736 4737 4738
		btrfs_put_ordered_extent(ordered);
		goto again;
	}

4739
	clear_extent_bit(&inode->io_tree, block_start, block_end,
4740 4741
			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
			 0, 0, &cached_state);
4742

4743
	ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
4744
					&cached_state);
J
Josef Bacik 已提交
4745
	if (ret) {
4746
		unlock_extent_cached(io_tree, block_start, block_end,
4747
				     &cached_state);
J
Josef Bacik 已提交
4748 4749 4750
		goto out_unlock;
	}

4751
	if (offset != blocksize) {
J
Josef Bacik 已提交
4752
		if (!len)
4753
			len = blocksize - offset;
J
Josef Bacik 已提交
4754
		if (front)
4755 4756
			memzero_page(page, (block_start - page_offset(page)),
				     offset);
J
Josef Bacik 已提交
4757
		else
4758 4759
			memzero_page(page, (block_start - page_offset(page)) + offset,
				     len);
4760 4761
		flush_dcache_page(page);
	}
4762 4763
	btrfs_page_clear_checked(fs_info, page, block_start,
				 block_end + 1 - block_start);
4764
	btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
4765
	unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
C
Chris Mason 已提交
4766

4767
	if (only_release_metadata)
4768
		set_extent_bit(&inode->io_tree, block_start, block_end,
4769
			       EXTENT_NORESERVE, 0, NULL, NULL, GFP_NOFS, NULL);
4770

4771
out_unlock:
4772 4773
	if (ret) {
		if (only_release_metadata)
4774
			btrfs_delalloc_release_metadata(inode, blocksize, true);
4775
		else
4776
			btrfs_delalloc_release_space(inode, data_reserved,
4777 4778
					block_start, blocksize, true);
	}
4779
	btrfs_delalloc_release_extents(inode, blocksize);
C
Chris Mason 已提交
4780
	unlock_page(page);
4781
	put_page(page);
C
Chris Mason 已提交
4782
out:
4783
	if (only_release_metadata)
4784
		btrfs_check_nocow_unlock(inode);
4785
	extent_changeset_free(data_reserved);
C
Chris Mason 已提交
4786 4787 4788
	return ret;
}

4789
static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
4790 4791
			     u64 offset, u64 len)
{
4792
	struct btrfs_fs_info *fs_info = root->fs_info;
4793
	struct btrfs_trans_handle *trans;
4794
	struct btrfs_drop_extents_args drop_args = { 0 };
4795 4796 4797
	int ret;

	/*
4798 4799 4800 4801
	 * If NO_HOLES is enabled, we don't need to do anything.
	 * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
	 * or btrfs_update_inode() will be called, which guarantee that the next
	 * fsync will know this inode was changed and needs to be logged.
4802
	 */
4803
	if (btrfs_fs_incompat(fs_info, NO_HOLES))
4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814
		return 0;

	/*
	 * 1 - for the one we're dropping
	 * 1 - for the one we're adding
	 * 1 - for updating the inode.
	 */
	trans = btrfs_start_transaction(root, 3);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

4815 4816 4817 4818
	drop_args.start = offset;
	drop_args.end = offset + len;
	drop_args.drop_cache = true;

4819
	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
4820
	if (ret) {
4821
		btrfs_abort_transaction(trans, ret);
4822
		btrfs_end_transaction(trans);
4823 4824 4825
		return ret;
	}

4826
	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
4827
			offset, 0, 0, len, 0, len, 0, 0, 0);
4828
	if (ret) {
4829
		btrfs_abort_transaction(trans, ret);
4830
	} else {
4831 4832
		btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
		btrfs_update_inode(trans, root, inode);
4833
	}
4834
	btrfs_end_transaction(trans);
4835 4836 4837
	return ret;
}

4838 4839 4840 4841 4842 4843
/*
 * This function puts in dummy file extents for the area we're creating a hole
 * for.  So if we are truncating this file to a larger size we need to insert
 * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
 * the range between oldsize and size
 */
4844
int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
C
Chris Mason 已提交
4845
{
4846 4847 4848
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_io_tree *io_tree = &inode->io_tree;
4849
	struct extent_map *em = NULL;
4850
	struct extent_state *cached_state = NULL;
4851
	struct extent_map_tree *em_tree = &inode->extent_tree;
4852 4853
	u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
	u64 block_end = ALIGN(size, fs_info->sectorsize);
Y
Yan Zheng 已提交
4854 4855 4856
	u64 last_byte;
	u64 cur_offset;
	u64 hole_size;
J
Josef Bacik 已提交
4857
	int err = 0;
C
Chris Mason 已提交
4858

4859
	/*
4860 4861
	 * If our size started in the middle of a block we need to zero out the
	 * rest of the block before we expand the i_size, otherwise we could
4862 4863
	 * expose stale data.
	 */
4864
	err = btrfs_truncate_block(inode, oldsize, 0, 0);
4865 4866 4867
	if (err)
		return err;

Y
Yan Zheng 已提交
4868 4869 4870
	if (size <= hole_start)
		return 0;

4871 4872
	btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
					   &cached_state);
Y
Yan Zheng 已提交
4873 4874
	cur_offset = hole_start;
	while (1) {
4875
		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4876
				      block_end - cur_offset);
4877 4878
		if (IS_ERR(em)) {
			err = PTR_ERR(em);
4879
			em = NULL;
4880 4881
			break;
		}
Y
Yan Zheng 已提交
4882
		last_byte = min(extent_map_end(em), block_end);
4883
		last_byte = ALIGN(last_byte, fs_info->sectorsize);
4884 4885
		hole_size = last_byte - cur_offset;

4886
		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
J
Josef Bacik 已提交
4887
			struct extent_map *hole_em;
J
Josef Bacik 已提交
4888

4889 4890
			err = maybe_insert_hole(root, inode, cur_offset,
						hole_size);
4891
			if (err)
4892
				break;
4893

4894
			err = btrfs_inode_set_file_extent_range(inode,
4895 4896 4897 4898
							cur_offset, hole_size);
			if (err)
				break;

4899
			btrfs_drop_extent_cache(inode, cur_offset,
J
Josef Bacik 已提交
4900 4901 4902 4903
						cur_offset + hole_size - 1, 0);
			hole_em = alloc_extent_map();
			if (!hole_em) {
				set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4904
					&inode->runtime_flags);
J
Josef Bacik 已提交
4905 4906 4907 4908 4909
				goto next;
			}
			hole_em->start = cur_offset;
			hole_em->len = hole_size;
			hole_em->orig_start = cur_offset;
4910

J
Josef Bacik 已提交
4911 4912
			hole_em->block_start = EXTENT_MAP_HOLE;
			hole_em->block_len = 0;
4913
			hole_em->orig_block_len = 0;
J
Josef Bacik 已提交
4914
			hole_em->ram_bytes = hole_size;
J
Josef Bacik 已提交
4915
			hole_em->compress_type = BTRFS_COMPRESS_NONE;
4916
			hole_em->generation = fs_info->generation;
4917

J
Josef Bacik 已提交
4918 4919
			while (1) {
				write_lock(&em_tree->lock);
J
Josef Bacik 已提交
4920
				err = add_extent_mapping(em_tree, hole_em, 1);
J
Josef Bacik 已提交
4921 4922 4923
				write_unlock(&em_tree->lock);
				if (err != -EEXIST)
					break;
4924
				btrfs_drop_extent_cache(inode, cur_offset,
J
Josef Bacik 已提交
4925 4926 4927 4928
							cur_offset +
							hole_size - 1, 0);
			}
			free_extent_map(hole_em);
4929
		} else {
4930
			err = btrfs_inode_set_file_extent_range(inode,
4931 4932 4933
							cur_offset, hole_size);
			if (err)
				break;
Y
Yan Zheng 已提交
4934
		}
4935
next:
Y
Yan Zheng 已提交
4936
		free_extent_map(em);
4937
		em = NULL;
Y
Yan Zheng 已提交
4938
		cur_offset = last_byte;
4939
		if (cur_offset >= block_end)
Y
Yan Zheng 已提交
4940 4941
			break;
	}
4942
	free_extent_map(em);
4943
	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state);
Y
Yan Zheng 已提交
4944 4945
	return err;
}
C
Chris Mason 已提交
4946

4947
static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4948
{
4949 4950
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
4951
	loff_t oldsize = i_size_read(inode);
4952 4953
	loff_t newsize = attr->ia_size;
	int mask = attr->ia_valid;
4954 4955
	int ret;

4956 4957 4958 4959 4960 4961
	/*
	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
	 * special case where we need to update the times despite not having
	 * these flags set.  For all other operations the VFS set these flags
	 * explicitly if it wants a timestamp update.
	 */
C
Christoph Hellwig 已提交
4962 4963 4964 4965
	if (newsize != oldsize) {
		inode_inc_iversion(inode);
		if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
			inode->i_ctime = inode->i_mtime =
4966
				current_time(inode);
C
Christoph Hellwig 已提交
4967
	}
4968

4969
	if (newsize > oldsize) {
4970
		/*
4971
		 * Don't do an expanding truncate while snapshotting is ongoing.
4972 4973 4974 4975 4976
		 * This is to ensure the snapshot captures a fully consistent
		 * state of this file - if the snapshot captures this expanding
		 * truncation, it must capture all writes that happened before
		 * this truncation.
		 */
4977
		btrfs_drew_write_lock(&root->snapshot_lock);
4978
		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
4979
		if (ret) {
4980
			btrfs_drew_write_unlock(&root->snapshot_lock);
4981
			return ret;
4982
		}
4983

4984
		trans = btrfs_start_transaction(root, 1);
4985
		if (IS_ERR(trans)) {
4986
			btrfs_drew_write_unlock(&root->snapshot_lock);
4987
			return PTR_ERR(trans);
4988
		}
4989 4990

		i_size_write(inode, newsize);
4991
		btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
4992
		pagecache_isize_extended(inode, oldsize, newsize);
4993
		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
4994
		btrfs_drew_write_unlock(&root->snapshot_lock);
4995
		btrfs_end_transaction(trans);
4996
	} else {
4997 4998 4999 5000 5001 5002 5003 5004 5005
		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

		if (btrfs_is_zoned(fs_info)) {
			ret = btrfs_wait_ordered_range(inode,
					ALIGN(newsize, fs_info->sectorsize),
					(u64)-1);
			if (ret)
				return ret;
		}
5006

5007 5008
		/*
		 * We're truncating a file that used to have good data down to
5009 5010
		 * zero. Make sure any new writes to the file get on disk
		 * on close.
5011 5012
		 */
		if (newsize == 0)
5013
			set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
5014
				&BTRFS_I(inode)->runtime_flags);
5015

5016
		truncate_setsize(inode, newsize);
5017 5018 5019

		inode_dio_wait(inode);

5020
		ret = btrfs_truncate(inode, newsize == oldsize);
5021 5022 5023 5024
		if (ret && inode->i_nlink) {
			int err;

			/*
5025 5026 5027 5028
			 * Truncate failed, so fix up the in-memory size. We
			 * adjusted disk_i_size down as we removed extents, so
			 * wait for disk_i_size to be stable and then update the
			 * in-memory size to match.
5029
			 */
5030
			err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
5031
			if (err)
5032 5033
				return err;
			i_size_write(inode, BTRFS_I(inode)->disk_i_size);
5034
		}
5035 5036
	}

5037
	return ret;
5038 5039
}

5040 5041
static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
			 struct iattr *attr)
Y
Yan Zheng 已提交
5042
{
5043
	struct inode *inode = d_inode(dentry);
L
Li Zefan 已提交
5044
	struct btrfs_root *root = BTRFS_I(inode)->root;
Y
Yan Zheng 已提交
5045
	int err;
C
Chris Mason 已提交
5046

L
Li Zefan 已提交
5047 5048 5049
	if (btrfs_root_readonly(root))
		return -EROFS;

5050
	err = setattr_prepare(mnt_userns, dentry, attr);
Y
Yan Zheng 已提交
5051 5052
	if (err)
		return err;
C
Chris Mason 已提交
5053

5054
	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
5055
		err = btrfs_setsize(inode, attr);
5056 5057
		if (err)
			return err;
C
Chris Mason 已提交
5058
	}
Y
Yan Zheng 已提交
5059

C
Christoph Hellwig 已提交
5060
	if (attr->ia_valid) {
5061
		setattr_copy(mnt_userns, inode, attr);
5062
		inode_inc_iversion(inode);
5063
		err = btrfs_dirty_inode(inode);
C
Christoph Hellwig 已提交
5064

5065
		if (!err && attr->ia_valid & ATTR_MODE)
5066
			err = posix_acl_chmod(mnt_userns, inode, inode->i_mode);
C
Christoph Hellwig 已提交
5067
	}
J
Josef Bacik 已提交
5068

C
Chris Mason 已提交
5069 5070
	return err;
}
5071

5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090
/*
 * While truncating the inode pages during eviction, we get the VFS calling
 * btrfs_invalidatepage() against each page of the inode. This is slow because
 * the calls to btrfs_invalidatepage() result in a huge amount of calls to
 * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
 * extent_state structures over and over, wasting lots of time.
 *
 * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
 * those expensive operations on a per page basis and do only the ordered io
 * finishing, while we release here the extent_map and extent_state structures,
 * without the excessive merging and splitting.
 */
static void evict_inode_truncate_pages(struct inode *inode)
{
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
	struct rb_node *node;

	ASSERT(inode->i_state & I_FREEING);
5091
	truncate_inode_pages_final(&inode->i_data);
5092 5093

	write_lock(&map_tree->lock);
L
Liu Bo 已提交
5094
	while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) {
5095 5096
		struct extent_map *em;

L
Liu Bo 已提交
5097
		node = rb_first_cached(&map_tree->map);
5098
		em = rb_entry(node, struct extent_map, rb_node);
5099 5100
		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
		clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
5101 5102
		remove_extent_mapping(map_tree, em);
		free_extent_map(em);
5103 5104 5105 5106 5107
		if (need_resched()) {
			write_unlock(&map_tree->lock);
			cond_resched();
			write_lock(&map_tree->lock);
		}
5108 5109 5110
	}
	write_unlock(&map_tree->lock);

5111 5112
	/*
	 * Keep looping until we have no more ranges in the io tree.
5113 5114
	 * We can have ongoing bios started by readahead that have
	 * their endio callback (extent_io.c:end_bio_extent_readpage)
5115 5116
	 * still in progress (unlocked the pages in the bio but did not yet
	 * unlocked the ranges in the io tree). Therefore this means some
5117 5118 5119 5120 5121 5122 5123 5124 5125 5126
	 * ranges can still be locked and eviction started because before
	 * submitting those bios, which are executed by a separate task (work
	 * queue kthread), inode references (inode->i_count) were not taken
	 * (which would be dropped in the end io callback of each bio).
	 * Therefore here we effectively end up waiting for those bios and
	 * anyone else holding locked ranges without having bumped the inode's
	 * reference count - if we don't do it, when they access the inode's
	 * io_tree to unlock a range it may be too late, leading to an
	 * use-after-free issue.
	 */
5127 5128 5129 5130
	spin_lock(&io_tree->lock);
	while (!RB_EMPTY_ROOT(&io_tree->state)) {
		struct extent_state *state;
		struct extent_state *cached_state = NULL;
5131 5132
		u64 start;
		u64 end;
5133
		unsigned state_flags;
5134 5135 5136

		node = rb_first(&io_tree->state);
		state = rb_entry(node, struct extent_state, rb_node);
5137 5138
		start = state->start;
		end = state->end;
5139
		state_flags = state->state;
5140 5141
		spin_unlock(&io_tree->lock);

5142
		lock_extent_bits(io_tree, start, end, &cached_state);
Q
Qu Wenruo 已提交
5143 5144 5145 5146 5147 5148 5149 5150 5151

		/*
		 * If still has DELALLOC flag, the extent didn't reach disk,
		 * and its reserved space won't be freed by delayed_ref.
		 * So we need to free its reserved space here.
		 * (Refer to comment in btrfs_invalidatepage, case 2)
		 *
		 * Note, end is the bytenr of last byte, so we need + 1 here.
		 */
5152
		if (state_flags & EXTENT_DELALLOC)
5153 5154
			btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
					       end - start + 1);
Q
Qu Wenruo 已提交
5155

5156
		clear_extent_bit(io_tree, start, end,
5157 5158 5159
				 EXTENT_LOCKED | EXTENT_DELALLOC |
				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
				 &cached_state);
5160

5161
		cond_resched();
5162 5163 5164 5165 5166
		spin_lock(&io_tree->lock);
	}
	spin_unlock(&io_tree->lock);
}

5167
static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
5168
							struct btrfs_block_rsv *rsv)
5169 5170
{
	struct btrfs_fs_info *fs_info = root->fs_info;
5171
	struct btrfs_trans_handle *trans;
5172
	u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1);
5173
	int ret;
5174

5175 5176 5177 5178 5179 5180 5181 5182 5183 5184
	/*
	 * Eviction should be taking place at some place safe because of our
	 * delayed iputs.  However the normal flushing code will run delayed
	 * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
	 *
	 * We reserve the delayed_refs_extra here again because we can't use
	 * btrfs_start_transaction(root, 0) for the same deadlocky reason as
	 * above.  We reserve our extra bit here because we generate a ton of
	 * delayed refs activity by truncating.
	 *
5185 5186 5187
	 * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can,
	 * if we fail to make this reservation we can re-try without the
	 * delayed_refs_extra so we can make some forward progress.
5188
	 */
5189
	ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra,
5190 5191
				     BTRFS_RESERVE_FLUSH_EVICT);
	if (ret) {
5192
		ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
5193 5194
					     BTRFS_RESERVE_FLUSH_EVICT);
		if (ret) {
5195 5196 5197 5198 5199 5200
			btrfs_warn(fs_info,
				   "could not allocate space for delete; will truncate on mount");
			return ERR_PTR(-ENOSPC);
		}
		delayed_refs_extra = 0;
	}
5201

5202 5203 5204 5205 5206 5207 5208 5209 5210
	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans))
		return trans;

	if (delayed_refs_extra) {
		trans->block_rsv = &fs_info->trans_block_rsv;
		trans->bytes_reserved = delayed_refs_extra;
		btrfs_block_rsv_migrate(rsv, trans->block_rsv,
					delayed_refs_extra, 1);
5211
	}
5212
	return trans;
5213 5214
}

A
Al Viro 已提交
5215
void btrfs_evict_inode(struct inode *inode)
C
Chris Mason 已提交
5216
{
5217
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
C
Chris Mason 已提交
5218 5219
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(inode)->root;
5220
	struct btrfs_block_rsv *rsv;
C
Chris Mason 已提交
5221 5222
	int ret;

5223 5224
	trace_btrfs_inode_evict(inode);

5225
	if (!root) {
B
Boris Burkov 已提交
5226
		fsverity_cleanup_inode(inode);
5227
		clear_inode(inode);
5228 5229 5230
		return;
	}

5231 5232
	evict_inode_truncate_pages(inode);

5233 5234 5235
	if (inode->i_nlink &&
	    ((btrfs_root_refs(&root->root_item) != 0 &&
	      root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
5236
	     btrfs_is_free_space_inode(BTRFS_I(inode))))
A
Al Viro 已提交
5237 5238
		goto no_delete;

5239
	if (is_bad_inode(inode))
C
Chris Mason 已提交
5240
		goto no_delete;
5241

5242
	btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1);
5243

5244
	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
5245 5246
		goto no_delete;

5247
	if (inode->i_nlink > 0) {
5248 5249
		BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
		       root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
5250 5251 5252
		goto no_delete;
	}

5253 5254 5255 5256
	/*
	 * This makes sure the inode item in tree is uptodate and the space for
	 * the inode update is released.
	 */
5257
	ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
5258
	if (ret)
5259 5260
		goto no_delete;

5261 5262 5263 5264 5265 5266 5267 5268
	/*
	 * This drops any pending insert or delete operations we have for this
	 * inode.  We could have a delayed dir index deletion queued up, but
	 * we're removing the inode completely so that'll be taken care of in
	 * the truncate.
	 */
	btrfs_kill_delayed_inode_items(BTRFS_I(inode));

5269
	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
5270
	if (!rsv)
5271
		goto no_delete;
5272
	rsv->size = btrfs_calc_metadata_size(fs_info, 1);
5273
	rsv->failfast = 1;
5274

5275
	btrfs_i_size_write(BTRFS_I(inode), 0);
5276

5277
	while (1) {
J
Josef Bacik 已提交
5278
		struct btrfs_truncate_control control = {
5279
			.inode = BTRFS_I(inode),
5280
			.ino = btrfs_ino(BTRFS_I(inode)),
J
Josef Bacik 已提交
5281 5282 5283 5284
			.new_size = 0,
			.min_type = 0,
		};

5285
		trans = evict_refill_and_join(root, rsv);
5286 5287
		if (IS_ERR(trans))
			goto free_rsv;
5288

5289 5290
		trans->block_rsv = rsv;

5291
		ret = btrfs_truncate_inode_items(trans, root, &control);
5292 5293 5294 5295 5296 5297
		trans->block_rsv = &fs_info->trans_block_rsv;
		btrfs_end_transaction(trans);
		btrfs_btree_balance_dirty(fs_info);
		if (ret && ret != -ENOSPC && ret != -EAGAIN)
			goto free_rsv;
		else if (!ret)
5298 5299
			break;
	}
5300

5301
	/*
5302 5303 5304 5305 5306 5307 5308
	 * Errors here aren't a big deal, it just means we leave orphan items in
	 * the tree. They will be cleaned up on the next mount. If the inode
	 * number gets reused, cleanup deletes the orphan item without doing
	 * anything, and unlink reuses the existing orphan item.
	 *
	 * If it turns out that we are dropping too many of these, we might want
	 * to add a mechanism for retrying these after a commit.
5309
	 */
5310
	trans = evict_refill_and_join(root, rsv);
5311 5312 5313 5314 5315 5316
	if (!IS_ERR(trans)) {
		trans->block_rsv = rsv;
		btrfs_orphan_del(trans, BTRFS_I(inode));
		trans->block_rsv = &fs_info->trans_block_rsv;
		btrfs_end_transaction(trans);
	}
5317

5318 5319
free_rsv:
	btrfs_free_block_rsv(fs_info, rsv);
C
Chris Mason 已提交
5320
no_delete:
5321 5322 5323 5324 5325
	/*
	 * If we didn't successfully delete, the orphan item will still be in
	 * the tree and we'll retry on the next mount. Again, we might also want
	 * to retry these periodically in the future.
	 */
5326
	btrfs_remove_delayed_node(BTRFS_I(inode));
B
Boris Burkov 已提交
5327
	fsverity_cleanup_inode(inode);
5328
	clear_inode(inode);
C
Chris Mason 已提交
5329 5330 5331
}

/*
5332 5333 5334
 * Return the key found in the dir entry in the location pointer, fill @type
 * with BTRFS_FT_*, and return 0.
 *
5335 5336
 * If no dir entries were found, returns -ENOENT.
 * If found a corrupted location in dir entry, returns -EUCLEAN.
C
Chris Mason 已提交
5337 5338
 */
static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
5339
			       struct btrfs_key *location, u8 *type)
C
Chris Mason 已提交
5340 5341 5342 5343 5344 5345
{
	const char *name = dentry->d_name.name;
	int namelen = dentry->d_name.len;
	struct btrfs_dir_item *di;
	struct btrfs_path *path;
	struct btrfs_root *root = BTRFS_I(dir)->root;
Y
Yan 已提交
5346
	int ret = 0;
C
Chris Mason 已提交
5347 5348

	path = btrfs_alloc_path();
5349 5350
	if (!path)
		return -ENOMEM;
5351

5352 5353
	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
			name, namelen, 0);
5354 5355
	if (IS_ERR_OR_NULL(di)) {
		ret = di ? PTR_ERR(di) : -ENOENT;
5356 5357
		goto out;
	}
C
Chris Mason 已提交
5358

5359
	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
5360 5361
	if (location->type != BTRFS_INODE_ITEM_KEY &&
	    location->type != BTRFS_ROOT_ITEM_KEY) {
5362
		ret = -EUCLEAN;
5363 5364 5365 5366 5367
		btrfs_warn(root->fs_info,
"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
			   __func__, name, btrfs_ino(BTRFS_I(dir)),
			   location->objectid, location->type, location->offset);
	}
5368 5369
	if (!ret)
		*type = btrfs_dir_type(path->nodes[0], di);
C
Chris Mason 已提交
5370 5371 5372 5373 5374 5375 5376 5377 5378 5379
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * when we hit a tree root in a directory, the btrfs part of the inode
 * needs to be changed to reflect the root directory of the tree root.  This
 * is kind of like crossing a mount point.
 */
5380
static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
5381 5382 5383 5384
				    struct inode *dir,
				    struct dentry *dentry,
				    struct btrfs_key *location,
				    struct btrfs_root **sub_root)
C
Chris Mason 已提交
5385
{
5386 5387 5388 5389
	struct btrfs_path *path;
	struct btrfs_root *new_root;
	struct btrfs_root_ref *ref;
	struct extent_buffer *leaf;
5390
	struct btrfs_key key;
5391 5392
	int ret;
	int err = 0;
C
Chris Mason 已提交
5393

5394 5395 5396 5397 5398
	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}
C
Chris Mason 已提交
5399

5400
	err = -ENOENT;
5401 5402 5403 5404
	key.objectid = BTRFS_I(dir)->root->root_key.objectid;
	key.type = BTRFS_ROOT_REF_KEY;
	key.offset = location->objectid;

5405
	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
5406 5407 5408 5409 5410
	if (ret) {
		if (ret < 0)
			err = ret;
		goto out;
	}
C
Chris Mason 已提交
5411

5412 5413
	leaf = path->nodes[0];
	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
5414
	if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) ||
5415 5416
	    btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
		goto out;
C
Chris Mason 已提交
5417

5418 5419 5420 5421 5422 5423
	ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
				   (unsigned long)(ref + 1),
				   dentry->d_name.len);
	if (ret)
		goto out;

5424
	btrfs_release_path(path);
5425

D
David Sterba 已提交
5426
	new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439
	if (IS_ERR(new_root)) {
		err = PTR_ERR(new_root);
		goto out;
	}

	*sub_root = new_root;
	location->objectid = btrfs_root_dirid(&new_root->root_item);
	location->type = BTRFS_INODE_ITEM_KEY;
	location->offset = 0;
	err = 0;
out:
	btrfs_free_path(path);
	return err;
C
Chris Mason 已提交
5440 5441
}

5442 5443 5444 5445
static void inode_tree_add(struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_inode *entry;
5446 5447
	struct rb_node **p;
	struct rb_node *parent;
5448
	struct rb_node *new = &BTRFS_I(inode)->rb_node;
5449
	u64 ino = btrfs_ino(BTRFS_I(inode));
5450

A
Al Viro 已提交
5451
	if (inode_unhashed(inode))
5452
		return;
5453
	parent = NULL;
5454
	spin_lock(&root->inode_lock);
5455
	p = &root->inode_tree.rb_node;
5456 5457 5458 5459
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct btrfs_inode, rb_node);

5460
		if (ino < btrfs_ino(entry))
5461
			p = &parent->rb_left;
5462
		else if (ino > btrfs_ino(entry))
5463
			p = &parent->rb_right;
5464 5465
		else {
			WARN_ON(!(entry->vfs_inode.i_state &
A
Al Viro 已提交
5466
				  (I_WILL_FREE | I_FREEING)));
5467
			rb_replace_node(parent, new, &root->inode_tree);
5468 5469
			RB_CLEAR_NODE(parent);
			spin_unlock(&root->inode_lock);
5470
			return;
5471 5472
		}
	}
5473 5474
	rb_link_node(new, parent, p);
	rb_insert_color(new, &root->inode_tree);
5475 5476 5477
	spin_unlock(&root->inode_lock);
}

5478
static void inode_tree_del(struct btrfs_inode *inode)
5479
{
5480
	struct btrfs_root *root = inode->root;
5481
	int empty = 0;
5482

5483
	spin_lock(&root->inode_lock);
5484 5485 5486
	if (!RB_EMPTY_NODE(&inode->rb_node)) {
		rb_erase(&inode->rb_node, &root->inode_tree);
		RB_CLEAR_NODE(&inode->rb_node);
5487
		empty = RB_EMPTY_ROOT(&root->inode_tree);
5488
	}
5489
	spin_unlock(&root->inode_lock);
5490

5491
	if (empty && btrfs_root_refs(&root->root_item) == 0) {
5492 5493 5494 5495 5496 5497 5498 5499
		spin_lock(&root->inode_lock);
		empty = RB_EMPTY_ROOT(&root->inode_tree);
		spin_unlock(&root->inode_lock);
		if (empty)
			btrfs_add_dead_root(root);
	}
}

5500

5501 5502 5503
static int btrfs_init_locked_inode(struct inode *inode, void *p)
{
	struct btrfs_iget_args *args = p;
D
David Sterba 已提交
5504 5505 5506 5507 5508

	inode->i_ino = args->ino;
	BTRFS_I(inode)->location.objectid = args->ino;
	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
	BTRFS_I(inode)->location.offset = 0;
5509 5510
	BTRFS_I(inode)->root = btrfs_grab_root(args->root);
	BUG_ON(args->root && !BTRFS_I(inode)->root);
C
Chris Mason 已提交
5511 5512 5513 5514 5515 5516
	return 0;
}

static int btrfs_find_actor(struct inode *inode, void *opaque)
{
	struct btrfs_iget_args *args = opaque;
D
David Sterba 已提交
5517 5518

	return args->ino == BTRFS_I(inode)->location.objectid &&
C
Chris Mason 已提交
5519
		args->root == BTRFS_I(inode)->root;
C
Chris Mason 已提交
5520 5521
}

D
David Sterba 已提交
5522
static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
5523
				       struct btrfs_root *root)
C
Chris Mason 已提交
5524 5525 5526
{
	struct inode *inode;
	struct btrfs_iget_args args;
D
David Sterba 已提交
5527
	unsigned long hashval = btrfs_inode_hash(ino, root);
5528

D
David Sterba 已提交
5529
	args.ino = ino;
C
Chris Mason 已提交
5530 5531
	args.root = root;

5532
	inode = iget5_locked(s, hashval, btrfs_find_actor,
C
Chris Mason 已提交
5533 5534 5535 5536 5537
			     btrfs_init_locked_inode,
			     (void *)&args);
	return inode;
}

5538
/*
D
David Sterba 已提交
5539
 * Get an inode object given its inode number and corresponding root.
5540 5541 5542
 * Path can be preallocated to prevent recursing back to iget through
 * allocator. NULL is also valid but may require an additional allocation
 * later.
B
Balaji Rao 已提交
5543
 */
D
David Sterba 已提交
5544
struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
5545
			      struct btrfs_root *root, struct btrfs_path *path)
B
Balaji Rao 已提交
5546 5547 5548
{
	struct inode *inode;

D
David Sterba 已提交
5549
	inode = btrfs_iget_locked(s, ino, root);
B
Balaji Rao 已提交
5550
	if (!inode)
5551
		return ERR_PTR(-ENOMEM);
B
Balaji Rao 已提交
5552 5553

	if (inode->i_state & I_NEW) {
5554 5555
		int ret;

5556
		ret = btrfs_read_locked_inode(inode, path);
5557
		if (!ret) {
5558 5559 5560
			inode_tree_add(inode);
			unlock_new_inode(inode);
		} else {
A
Al Viro 已提交
5561 5562 5563 5564 5565 5566 5567 5568 5569
			iget_failed(inode);
			/*
			 * ret > 0 can come from btrfs_search_slot called by
			 * btrfs_read_locked_inode, this means the inode item
			 * was not found.
			 */
			if (ret > 0)
				ret = -ENOENT;
			inode = ERR_PTR(ret);
5570 5571 5572
		}
	}

B
Balaji Rao 已提交
5573 5574 5575
	return inode;
}

D
David Sterba 已提交
5576
struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
5577
{
D
David Sterba 已提交
5578
	return btrfs_iget_path(s, ino, root, NULL);
5579 5580
}

5581 5582 5583 5584 5585 5586 5587 5588 5589
static struct inode *new_simple_dir(struct super_block *s,
				    struct btrfs_key *key,
				    struct btrfs_root *root)
{
	struct inode *inode = new_inode(s);

	if (!inode)
		return ERR_PTR(-ENOMEM);

5590
	BTRFS_I(inode)->root = btrfs_grab_root(root);
5591
	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
5592
	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
5593 5594

	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
5595 5596 5597 5598 5599
	/*
	 * We only need lookup, the rest is read-only and there's no inode
	 * associated with the dentry
	 */
	inode->i_op = &simple_dir_inode_operations;
5600
	inode->i_opflags &= ~IOP_XATTR;
5601 5602
	inode->i_fop = &simple_dir_operations;
	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5603
	inode->i_mtime = current_time(inode);
5604 5605
	inode->i_atime = inode->i_mtime;
	inode->i_ctime = inode->i_mtime;
5606
	BTRFS_I(inode)->i_otime = inode->i_mtime;
5607 5608 5609 5610

	return inode;
}

5611 5612 5613 5614 5615 5616 5617 5618 5619
static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN);
static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE);
static_assert(BTRFS_FT_DIR == FT_DIR);
static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV);
static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV);
static_assert(BTRFS_FT_FIFO == FT_FIFO);
static_assert(BTRFS_FT_SOCK == FT_SOCK);
static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);

5620 5621 5622 5623 5624
static inline u8 btrfs_inode_type(struct inode *inode)
{
	return fs_umode_to_ftype(inode->i_mode);
}

5625
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
C
Chris Mason 已提交
5626
{
5627
	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
C
Chris Mason 已提交
5628
	struct inode *inode;
5629
	struct btrfs_root *root = BTRFS_I(dir)->root;
C
Chris Mason 已提交
5630 5631
	struct btrfs_root *sub_root = root;
	struct btrfs_key location;
5632
	u8 di_type = 0;
5633
	int ret = 0;
C
Chris Mason 已提交
5634 5635 5636

	if (dentry->d_name.len > BTRFS_NAME_LEN)
		return ERR_PTR(-ENAMETOOLONG);
5637

5638
	ret = btrfs_inode_by_name(dir, dentry, &location, &di_type);
C
Chris Mason 已提交
5639 5640
	if (ret < 0)
		return ERR_PTR(ret);
5641

5642
	if (location.type == BTRFS_INODE_ITEM_KEY) {
D
David Sterba 已提交
5643
		inode = btrfs_iget(dir->i_sb, location.objectid, root);
5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655
		if (IS_ERR(inode))
			return inode;

		/* Do extra check against inode mode with di_type */
		if (btrfs_inode_type(inode) != di_type) {
			btrfs_crit(fs_info,
"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
				  inode->i_mode, btrfs_inode_type(inode),
				  di_type);
			iput(inode);
			return ERR_PTR(-EUCLEAN);
		}
5656 5657 5658
		return inode;
	}

5659
	ret = fixup_tree_root_location(fs_info, dir, dentry,
5660 5661 5662 5663 5664 5665 5666
				       &location, &sub_root);
	if (ret < 0) {
		if (ret != -ENOENT)
			inode = ERR_PTR(ret);
		else
			inode = new_simple_dir(dir->i_sb, &location, sub_root);
	} else {
D
David Sterba 已提交
5667
		inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
C
Chris Mason 已提交
5668
	}
5669
	if (root != sub_root)
5670
		btrfs_put_root(sub_root);
5671

5672
	if (!IS_ERR(inode) && root != sub_root) {
5673
		down_read(&fs_info->cleanup_work_sem);
5674
		if (!sb_rdonly(inode->i_sb))
5675
			ret = btrfs_orphan_cleanup(sub_root);
5676
		up_read(&fs_info->cleanup_work_sem);
5677 5678
		if (ret) {
			iput(inode);
5679
			inode = ERR_PTR(ret);
5680
		}
5681 5682
	}

5683 5684 5685
	return inode;
}

N
Nick Piggin 已提交
5686
static int btrfs_dentry_delete(const struct dentry *dentry)
5687 5688
{
	struct btrfs_root *root;
5689
	struct inode *inode = d_inode(dentry);
5690

L
Li Zefan 已提交
5691
	if (!inode && !IS_ROOT(dentry))
5692
		inode = d_inode(dentry->d_parent);
5693

L
Li Zefan 已提交
5694 5695
	if (inode) {
		root = BTRFS_I(inode)->root;
5696 5697
		if (btrfs_root_refs(&root->root_item) == 0)
			return 1;
L
Li Zefan 已提交
5698

5699
		if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
L
Li Zefan 已提交
5700
			return 1;
5701
	}
5702 5703 5704
	return 0;
}

5705
static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
A
Al Viro 已提交
5706
				   unsigned int flags)
5707
{
A
Al Viro 已提交
5708
	struct inode *inode = btrfs_lookup_dentry(dir, dentry);
5709

A
Al Viro 已提交
5710 5711
	if (inode == ERR_PTR(-ENOENT))
		inode = NULL;
5712
	return d_splice_alias(inode, dentry);
C
Chris Mason 已提交
5713 5714
}

5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752
/*
 * All this infrastructure exists because dir_emit can fault, and we are holding
 * the tree lock when doing readdir.  For now just allocate a buffer and copy
 * our information into that, and then dir_emit from the buffer.  This is
 * similar to what NFS does, only we don't keep the buffer around in pagecache
 * because I'm afraid I'll mess that up.  Long term we need to make filldir do
 * copy_to_user_inatomic so we don't have to worry about page faulting under the
 * tree lock.
 */
static int btrfs_opendir(struct inode *inode, struct file *file)
{
	struct btrfs_file_private *private;

	private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
	if (!private)
		return -ENOMEM;
	private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
	if (!private->filldir_buf) {
		kfree(private);
		return -ENOMEM;
	}
	file->private_data = private;
	return 0;
}

struct dir_entry {
	u64 ino;
	u64 offset;
	unsigned type;
	int name_len;
};

static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
{
	while (entries--) {
		struct dir_entry *entry = addr;
		char *name = (char *)(entry + 1);

5753 5754 5755 5756
		ctx->pos = get_unaligned(&entry->offset);
		if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
					 get_unaligned(&entry->ino),
					 get_unaligned(&entry->type)))
5757
			return 1;
5758 5759
		addr += sizeof(struct dir_entry) +
			get_unaligned(&entry->name_len);
5760 5761 5762 5763 5764
		ctx->pos++;
	}
	return 0;
}

A
Al Viro 已提交
5765
static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
C
Chris Mason 已提交
5766
{
A
Al Viro 已提交
5767
	struct inode *inode = file_inode(file);
C
Chris Mason 已提交
5768
	struct btrfs_root *root = BTRFS_I(inode)->root;
5769
	struct btrfs_file_private *private = file->private_data;
C
Chris Mason 已提交
5770 5771
	struct btrfs_dir_item *di;
	struct btrfs_key key;
5772
	struct btrfs_key found_key;
C
Chris Mason 已提交
5773
	struct btrfs_path *path;
5774
	void *addr;
5775 5776
	struct list_head ins_list;
	struct list_head del_list;
C
Chris Mason 已提交
5777
	int ret;
5778
	struct extent_buffer *leaf;
C
Chris Mason 已提交
5779
	int slot;
5780 5781
	char *name_ptr;
	int name_len;
5782 5783
	int entries = 0;
	int total_len = 0;
5784
	bool put = false;
5785
	struct btrfs_key location;
5786

A
Al Viro 已提交
5787 5788 5789
	if (!dir_emit_dots(file, ctx))
		return 0;

5790
	path = btrfs_alloc_path();
5791 5792
	if (!path)
		return -ENOMEM;
C
Chris Mason 已提交
5793

5794
	addr = private->filldir_buf;
5795
	path->reada = READA_FORWARD;
5796

5797 5798 5799
	INIT_LIST_HEAD(&ins_list);
	INIT_LIST_HEAD(&del_list);
	put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list);
5800

5801
again:
5802
	key.type = BTRFS_DIR_INDEX_KEY;
A
Al Viro 已提交
5803
	key.offset = ctx->pos;
5804
	key.objectid = btrfs_ino(BTRFS_I(inode));
5805

C
Chris Mason 已提交
5806 5807 5808
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto err;
5809 5810

	while (1) {
5811 5812
		struct dir_entry *entry;

5813
		leaf = path->nodes[0];
C
Chris Mason 已提交
5814
		slot = path->slots[0];
5815 5816 5817 5818 5819 5820 5821
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto err;
			else if (ret > 0)
				break;
			continue;
C
Chris Mason 已提交
5822
		}
5823

5824 5825 5826
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		if (found_key.objectid != key.objectid)
C
Chris Mason 已提交
5827
			break;
5828
		if (found_key.type != BTRFS_DIR_INDEX_KEY)
C
Chris Mason 已提交
5829
			break;
A
Al Viro 已提交
5830
		if (found_key.offset < ctx->pos)
5831
			goto next;
5832
		if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
5833
			goto next;
C
Chris Mason 已提交
5834
		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
5835
		name_len = btrfs_dir_name_len(leaf, di);
5836 5837 5838 5839 5840 5841 5842 5843 5844 5845
		if ((total_len + sizeof(struct dir_entry) + name_len) >=
		    PAGE_SIZE) {
			btrfs_release_path(path);
			ret = btrfs_filldir(private->filldir_buf, entries, ctx);
			if (ret)
				goto nopos;
			addr = private->filldir_buf;
			entries = 0;
			total_len = 0;
			goto again;
5846
		}
5847 5848

		entry = addr;
5849
		put_unaligned(name_len, &entry->name_len);
5850
		name_ptr = (char *)(entry + 1);
5851 5852
		read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1),
				   name_len);
5853
		put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)),
5854
				&entry->type);
5855
		btrfs_dir_item_key_to_cpu(leaf, di, &location);
5856 5857
		put_unaligned(location.objectid, &entry->ino);
		put_unaligned(found_key.offset, &entry->offset);
5858 5859 5860
		entries++;
		addr += sizeof(struct dir_entry) + name_len;
		total_len += sizeof(struct dir_entry) + name_len;
5861 5862
next:
		path->slots[0]++;
C
Chris Mason 已提交
5863
	}
5864 5865 5866 5867 5868
	btrfs_release_path(path);

	ret = btrfs_filldir(private->filldir_buf, entries, ctx);
	if (ret)
		goto nopos;
5869

5870
	ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
5871
	if (ret)
5872 5873
		goto nopos;

5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890
	/*
	 * Stop new entries from being returned after we return the last
	 * entry.
	 *
	 * New directory entries are assigned a strictly increasing
	 * offset.  This means that new entries created during readdir
	 * are *guaranteed* to be seen in the future by that readdir.
	 * This has broken buggy programs which operate on names as
	 * they're returned by readdir.  Until we re-use freed offsets
	 * we have this hack to stop new entries from being returned
	 * under the assumption that they'll never reach this huge
	 * offset.
	 *
	 * This is being careful not to overflow 32bit loff_t unless the
	 * last entry requires it because doing so has broken 32bit apps
	 * in the past.
	 */
5891 5892 5893 5894
	if (ctx->pos >= INT_MAX)
		ctx->pos = LLONG_MAX;
	else
		ctx->pos = INT_MAX;
C
Chris Mason 已提交
5895 5896 5897
nopos:
	ret = 0;
err:
5898 5899
	if (put)
		btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
C
Chris Mason 已提交
5900 5901 5902 5903 5904
	btrfs_free_path(path);
	return ret;
}

/*
5905
 * This is somewhat expensive, updating the tree every time the
C
Chris Mason 已提交
5906 5907 5908 5909
 * inode changes.  But, it is most likely to find the inode in cache.
 * FIXME, needs more benchmarking...there are no reasons other than performance
 * to keep or drop this code.
 */
5910
static int btrfs_dirty_inode(struct inode *inode)
C
Chris Mason 已提交
5911
{
5912
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
C
Chris Mason 已提交
5913 5914
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
5915 5916
	int ret;

5917
	if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
5918
		return 0;
C
Chris Mason 已提交
5919

5920
	trans = btrfs_join_transaction(root);
5921 5922
	if (IS_ERR(trans))
		return PTR_ERR(trans);
5923

5924
	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
5925
	if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
5926
		/* whoops, lets try again with the full transaction */
5927
		btrfs_end_transaction(trans);
5928
		trans = btrfs_start_transaction(root, 1);
5929 5930
		if (IS_ERR(trans))
			return PTR_ERR(trans);
5931

5932
		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
5933
	}
5934
	btrfs_end_transaction(trans);
5935
	if (BTRFS_I(inode)->delayed_node)
5936
		btrfs_balance_delayed_items(fs_info);
5937 5938 5939 5940 5941 5942 5943 5944

	return ret;
}

/*
 * This is a copy of file_update_time.  We need this so we can return error on
 * ENOSPC for updating the inode in the case of file write and mmap writes.
 */
5945
static int btrfs_update_time(struct inode *inode, struct timespec64 *now,
5946
			     int flags)
5947
{
5948
	struct btrfs_root *root = BTRFS_I(inode)->root;
5949
	bool dirty = flags & ~S_VERSION;
5950 5951 5952 5953

	if (btrfs_root_readonly(root))
		return -EROFS;

5954
	if (flags & S_VERSION)
5955
		dirty |= inode_maybe_inc_iversion(inode, dirty);
5956 5957 5958 5959 5960 5961
	if (flags & S_CTIME)
		inode->i_ctime = *now;
	if (flags & S_MTIME)
		inode->i_mtime = *now;
	if (flags & S_ATIME)
		inode->i_atime = *now;
5962
	return dirty ? btrfs_dirty_inode(inode) : 0;
C
Chris Mason 已提交
5963 5964
}

C
Chris Mason 已提交
5965 5966 5967 5968 5969
/*
 * find the highest existing sequence number in a directory
 * and then set the in-memory index_cnt variable to reflect
 * free sequence numbers
 */
5970
static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
5971
{
5972
	struct btrfs_root *root = inode->root;
5973 5974 5975 5976 5977
	struct btrfs_key key, found_key;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int ret;

5978
	key.objectid = btrfs_ino(inode);
5979
	key.type = BTRFS_DIR_INDEX_KEY;
5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994
	key.offset = (u64)-1;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	/* FIXME: we should be able to handle this */
	if (ret == 0)
		goto out;
	ret = 0;

	if (path->slots[0] == 0) {
5995
		inode->index_cnt = BTRFS_DIR_START_INDEX;
5996 5997 5998 5999 6000 6001 6002 6003
		goto out;
	}

	path->slots[0]--;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

6004
	if (found_key.objectid != btrfs_ino(inode) ||
6005
	    found_key.type != BTRFS_DIR_INDEX_KEY) {
6006
		inode->index_cnt = BTRFS_DIR_START_INDEX;
6007 6008 6009
		goto out;
	}

6010
	inode->index_cnt = found_key.offset + 1;
6011 6012 6013 6014 6015
out:
	btrfs_free_path(path);
	return ret;
}

C
Chris Mason 已提交
6016 6017 6018 6019
/*
 * helper to find a free sequence number in a given directory.  This current
 * code is very simple, later versions will do smarter things in the btree
 */
6020
int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
6021 6022 6023
{
	int ret = 0;

6024 6025
	if (dir->index_cnt == (u64)-1) {
		ret = btrfs_inode_delayed_dir_index_count(dir);
6026 6027 6028 6029 6030
		if (ret) {
			ret = btrfs_set_inode_index_count(dir);
			if (ret)
				return ret;
		}
6031 6032
	}

6033 6034
	*index = dir->index_cnt;
	dir->index_cnt++;
6035 6036 6037 6038

	return ret;
}

6039 6040 6041
static int btrfs_insert_inode_locked(struct inode *inode)
{
	struct btrfs_iget_args args;
D
David Sterba 已提交
6042 6043

	args.ino = BTRFS_I(inode)->location.objectid;
6044 6045 6046 6047 6048 6049 6050
	args.root = BTRFS_I(inode)->root;

	return insert_inode_locked4(inode,
		   btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
		   btrfs_find_actor, &args);
}

6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078
/*
 * Inherit flags from the parent inode.
 *
 * Currently only the compression flags and the cow flags are inherited.
 */
static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
{
	unsigned int flags;

	if (!dir)
		return;

	flags = BTRFS_I(dir)->flags;

	if (flags & BTRFS_INODE_NOCOMPRESS) {
		BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
		BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
	} else if (flags & BTRFS_INODE_COMPRESS) {
		BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
		BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
	}

	if (flags & BTRFS_INODE_NODATACOW) {
		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
		if (S_ISREG(inode->i_mode))
			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
	}

6079
	btrfs_sync_inode_flags_to_i_flags(inode);
6080 6081
}

C
Chris Mason 已提交
6082 6083
static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
6084
				     struct user_namespace *mnt_userns,
6085
				     struct inode *dir,
6086
				     const char *name, int name_len,
A
Al Viro 已提交
6087 6088
				     u64 ref_objectid, u64 objectid,
				     umode_t mode, u64 *index)
C
Chris Mason 已提交
6089
{
6090
	struct btrfs_fs_info *fs_info = root->fs_info;
C
Chris Mason 已提交
6091
	struct inode *inode;
6092
	struct btrfs_inode_item *inode_item;
C
Chris Mason 已提交
6093
	struct btrfs_key *location;
6094
	struct btrfs_path *path;
6095 6096 6097
	struct btrfs_inode_ref *ref;
	struct btrfs_key key[2];
	u32 sizes[2];
6098
	struct btrfs_item_batch batch;
6099
	unsigned long ptr;
6100
	unsigned int nofs_flag;
C
Chris Mason 已提交
6101 6102
	int ret;

6103
	path = btrfs_alloc_path();
6104 6105
	if (!path)
		return ERR_PTR(-ENOMEM);
6106

6107
	nofs_flag = memalloc_nofs_save();
6108
	inode = new_inode(fs_info->sb);
6109
	memalloc_nofs_restore(nofs_flag);
6110 6111
	if (!inode) {
		btrfs_free_path(path);
C
Chris Mason 已提交
6112
		return ERR_PTR(-ENOMEM);
6113
	}
C
Chris Mason 已提交
6114

6115 6116 6117 6118 6119 6120 6121
	/*
	 * O_TMPFILE, set link count to 0, so that after this point,
	 * we fill in an inode item with the correct link count.
	 */
	if (!name)
		set_nlink(inode, 0);

6122 6123 6124 6125 6126 6127
	/*
	 * we have to initialize this early, so we can reclaim the inode
	 * number if we fail afterwards in this function.
	 */
	inode->i_ino = objectid;

6128
	if (dir && name) {
6129 6130
		trace_btrfs_inode_request(dir);

6131
		ret = btrfs_set_inode_index(BTRFS_I(dir), index);
6132
		if (ret) {
6133
			btrfs_free_path(path);
6134
			iput(inode);
6135
			return ERR_PTR(ret);
6136
		}
6137 6138
	} else if (dir) {
		*index = 0;
6139 6140 6141
	}
	/*
	 * index_cnt is ignored for everything but a dir,
6142
	 * btrfs_set_inode_index_count has an explanation for the magic
6143 6144 6145
	 * number
	 */
	BTRFS_I(inode)->index_cnt = 2;
6146
	BTRFS_I(inode)->dir_index = *index;
6147
	BTRFS_I(inode)->root = btrfs_grab_root(root);
6148
	BTRFS_I(inode)->generation = trans->transid;
6149
	inode->i_generation = BTRFS_I(inode)->generation;
6150

J
Josef Bacik 已提交
6151 6152 6153 6154 6155 6156 6157 6158
	/*
	 * We could have gotten an inode number from somebody who was fsynced
	 * and then removed in this same transaction, so let's just set full
	 * sync since it will be a full sync anyway and this will blow away the
	 * old info in the log.
	 */
	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);

6159
	key[0].objectid = objectid;
6160
	key[0].type = BTRFS_INODE_ITEM_KEY;
6161 6162 6163
	key[0].offset = 0;

	sizes[0] = sizeof(struct btrfs_inode_item);
6164 6165 6166 6167 6168 6169 6170 6171 6172

	if (name) {
		/*
		 * Start new inodes with an inode_ref. This is slightly more
		 * efficient for small numbers of hard links since they will
		 * be packed into one item. Extended refs will kick in if we
		 * add more hard links than can fit in the ref item.
		 */
		key[1].objectid = objectid;
6173
		key[1].type = BTRFS_INODE_REF_KEY;
6174 6175 6176 6177
		key[1].offset = ref_objectid;

		sizes[1] = name_len + sizeof(*ref);
	}
6178

6179 6180 6181
	location = &BTRFS_I(inode)->location;
	location->objectid = objectid;
	location->offset = 0;
6182
	location->type = BTRFS_INODE_ITEM_KEY;
6183 6184

	ret = btrfs_insert_inode_locked(inode);
A
Al Viro 已提交
6185 6186
	if (ret < 0) {
		iput(inode);
6187
		goto fail;
A
Al Viro 已提交
6188
	}
6189

6190 6191 6192 6193 6194
	batch.keys = &key[0];
	batch.data_sizes = &sizes[0];
	batch.total_data_size = sizes[0] + (name ? sizes[1] : 0);
	batch.nr = name ? 2 : 1;
	ret = btrfs_insert_empty_items(trans, root, path, &batch);
6195
	if (ret != 0)
6196
		goto fail_unlock;
6197

6198
	inode_init_owner(mnt_userns, inode, dir, mode);
6199
	inode_set_bytes(inode, 0);
6200

6201
	inode->i_mtime = current_time(inode);
6202 6203
	inode->i_atime = inode->i_mtime;
	inode->i_ctime = inode->i_mtime;
6204
	BTRFS_I(inode)->i_otime = inode->i_mtime;
6205

6206 6207
	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				  struct btrfs_inode_item);
6208
	memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
6209
			     sizeof(*inode_item));
6210
	fill_inode_item(trans, path->nodes[0], inode_item, inode);
6211

6212 6213 6214 6215 6216 6217 6218 6219
	if (name) {
		ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
				     struct btrfs_inode_ref);
		btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
		btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
		ptr = (unsigned long)(ref + 1);
		write_extent_buffer(path->nodes[0], name, ptr, name_len);
	}
6220

6221 6222 6223
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_free_path(path);

6224 6225
	btrfs_inherit_iflags(inode, dir);

6226
	if (S_ISREG(mode)) {
6227
		if (btrfs_test_opt(fs_info, NODATASUM))
6228
			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
6229
		if (btrfs_test_opt(fs_info, NODATACOW))
6230 6231
			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
				BTRFS_INODE_NODATASUM;
6232 6233
	}

6234
	inode_tree_add(inode);
6235 6236

	trace_btrfs_inode_new(inode);
6237
	btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
6238

6239 6240
	btrfs_update_root_times(trans, root);

6241 6242
	ret = btrfs_inode_inherit_props(trans, inode, dir);
	if (ret)
6243
		btrfs_err(fs_info,
6244
			  "error inheriting props for ino %llu (root %llu): %d",
6245
			btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret);
6246

C
Chris Mason 已提交
6247
	return inode;
6248 6249

fail_unlock:
A
Al Viro 已提交
6250
	discard_new_inode(inode);
6251
fail:
6252
	if (dir && name)
6253
		BTRFS_I(dir)->index_cnt--;
6254 6255
	btrfs_free_path(path);
	return ERR_PTR(ret);
C
Chris Mason 已提交
6256 6257
}

C
Chris Mason 已提交
6258 6259 6260 6261 6262 6263
/*
 * utility function to add 'inode' into 'parent_inode' with
 * a give name and a given sequence number.
 * if 'add_backref' is true, also insert a backref from the
 * inode to the parent directory.
 */
6264
int btrfs_add_link(struct btrfs_trans_handle *trans,
6265
		   struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
6266
		   const char *name, int name_len, int add_backref, u64 index)
C
Chris Mason 已提交
6267
{
6268
	int ret = 0;
C
Chris Mason 已提交
6269
	struct btrfs_key key;
6270 6271 6272
	struct btrfs_root *root = parent_inode->root;
	u64 ino = btrfs_ino(inode);
	u64 parent_ino = btrfs_ino(parent_inode);
6273

L
Li Zefan 已提交
6274
	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6275
		memcpy(&key, &inode->root->root_key, sizeof(key));
6276
	} else {
L
Li Zefan 已提交
6277
		key.objectid = ino;
6278
		key.type = BTRFS_INODE_ITEM_KEY;
6279 6280 6281
		key.offset = 0;
	}

L
Li Zefan 已提交
6282
	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6283
		ret = btrfs_add_root_ref(trans, key.objectid,
6284 6285
					 root->root_key.objectid, parent_ino,
					 index, name, name_len);
6286
	} else if (add_backref) {
L
Li Zefan 已提交
6287 6288
		ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
					     parent_ino, index);
6289
	}
C
Chris Mason 已提交
6290

6291 6292 6293
	/* Nothing to clean up yet */
	if (ret)
		return ret;
6294

6295
	ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key,
6296
				    btrfs_inode_type(&inode->vfs_inode), index);
C
Chris Mason 已提交
6297
	if (ret == -EEXIST || ret == -EOVERFLOW)
6298 6299
		goto fail_dir_item;
	else if (ret) {
6300
		btrfs_abort_transaction(trans, ret);
6301
		return ret;
C
Chris Mason 已提交
6302
	}
6303

6304
	btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
6305
			   name_len * 2);
6306
	inode_inc_iversion(&parent_inode->vfs_inode);
6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318
	/*
	 * If we are replaying a log tree, we do not want to update the mtime
	 * and ctime of the parent directory with the current time, since the
	 * log replay procedure is responsible for setting them to their correct
	 * values (the ones it had when the fsync was done).
	 */
	if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
		struct timespec64 now = current_time(&parent_inode->vfs_inode);

		parent_inode->vfs_inode.i_mtime = now;
		parent_inode->vfs_inode.i_ctime = now;
	}
6319
	ret = btrfs_update_inode(trans, root, parent_inode);
6320
	if (ret)
6321
		btrfs_abort_transaction(trans, ret);
C
Chris Mason 已提交
6322
	return ret;
6323 6324 6325 6326 6327

fail_dir_item:
	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
		u64 local_index;
		int err;
6328
		err = btrfs_del_root_ref(trans, key.objectid,
6329 6330
					 root->root_key.objectid, parent_ino,
					 &local_index, name, name_len);
6331 6332
		if (err)
			btrfs_abort_transaction(trans, err);
6333 6334 6335 6336 6337 6338
	} else if (add_backref) {
		u64 local_index;
		int err;

		err = btrfs_del_inode_ref(trans, root, name, name_len,
					  ino, parent_ino, &local_index);
6339 6340
		if (err)
			btrfs_abort_transaction(trans, err);
6341
	}
6342 6343

	/* Return the original error code */
6344
	return ret;
C
Chris Mason 已提交
6345 6346 6347
}

static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
6348 6349
			    struct btrfs_inode *dir, struct dentry *dentry,
			    struct btrfs_inode *inode, int backref, u64 index)
C
Chris Mason 已提交
6350
{
6351 6352 6353
	int err = btrfs_add_link(trans, dir, inode,
				 dentry->d_name.name, dentry->d_name.len,
				 backref, index);
C
Chris Mason 已提交
6354 6355 6356 6357 6358
	if (err > 0)
		err = -EEXIST;
	return err;
}

6359 6360
static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
		       struct dentry *dentry, umode_t mode, dev_t rdev)
J
Josef Bacik 已提交
6361
{
6362
	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
J
Josef Bacik 已提交
6363 6364
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
6365
	struct inode *inode = NULL;
J
Josef Bacik 已提交
6366 6367
	int err;
	u64 objectid;
6368
	u64 index = 0;
J
Josef Bacik 已提交
6369

J
Josef Bacik 已提交
6370 6371 6372 6373 6374
	/*
	 * 2 for inode item and ref
	 * 2 for dir items
	 * 1 for xattr if selinux is on
	 */
6375 6376 6377
	trans = btrfs_start_transaction(root, 5);
	if (IS_ERR(trans))
		return PTR_ERR(trans);
6378

6379
	err = btrfs_get_free_objectid(root, &objectid);
6380 6381 6382
	if (err)
		goto out_unlock;

6383
	inode = btrfs_new_inode(trans, root, mnt_userns, dir,
6384 6385
			dentry->d_name.name, dentry->d_name.len,
			btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
6386 6387
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
A
Al Viro 已提交
6388
		inode = NULL;
J
Josef Bacik 已提交
6389
		goto out_unlock;
6390
	}
J
Josef Bacik 已提交
6391

6392 6393 6394 6395 6396 6397 6398
	/*
	* If the active LSM wants to access the inode during
	* d_instantiate it needs these. Smack checks to see
	* if the filesystem supports xattrs by looking at the
	* ops vector.
	*/
	inode->i_op = &btrfs_special_inode_operations;
6399 6400 6401
	init_special_inode(inode, inode->i_mode, rdev);

	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
J
Josef Bacik 已提交
6402
	if (err)
A
Al Viro 已提交
6403
		goto out_unlock;
6404

6405 6406
	err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
			0, index);
A
Al Viro 已提交
6407 6408 6409
	if (err)
		goto out_unlock;

6410
	btrfs_update_inode(trans, root, BTRFS_I(inode));
A
Al Viro 已提交
6411
	d_instantiate_new(dentry, inode);
6412

J
Josef Bacik 已提交
6413
out_unlock:
6414
	btrfs_end_transaction(trans);
6415
	btrfs_btree_balance_dirty(fs_info);
A
Al Viro 已提交
6416
	if (err && inode) {
J
Josef Bacik 已提交
6417
		inode_dec_link_count(inode);
A
Al Viro 已提交
6418
		discard_new_inode(inode);
J
Josef Bacik 已提交
6419 6420 6421 6422
	}
	return err;
}

6423 6424
static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir,
			struct dentry *dentry, umode_t mode, bool excl)
C
Chris Mason 已提交
6425
{
6426
	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
C
Chris Mason 已提交
6427 6428
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
6429
	struct inode *inode = NULL;
6430
	int err;
C
Chris Mason 已提交
6431
	u64 objectid;
6432
	u64 index = 0;
C
Chris Mason 已提交
6433

J
Josef Bacik 已提交
6434 6435 6436 6437 6438
	/*
	 * 2 for inode item and ref
	 * 2 for dir items
	 * 1 for xattr if selinux is on
	 */
6439 6440 6441
	trans = btrfs_start_transaction(root, 5);
	if (IS_ERR(trans))
		return PTR_ERR(trans);
J
Josef Bacik 已提交
6442

6443
	err = btrfs_get_free_objectid(root, &objectid);
6444 6445 6446
	if (err)
		goto out_unlock;

6447
	inode = btrfs_new_inode(trans, root, mnt_userns, dir,
6448 6449
			dentry->d_name.name, dentry->d_name.len,
			btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
6450 6451
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
A
Al Viro 已提交
6452
		inode = NULL;
C
Chris Mason 已提交
6453
		goto out_unlock;
6454
	}
6455 6456 6457 6458 6459 6460 6461 6462
	/*
	* If the active LSM wants to access the inode during
	* d_instantiate it needs these. Smack checks to see
	* if the filesystem supports xattrs by looking at the
	* ops vector.
	*/
	inode->i_fop = &btrfs_file_operations;
	inode->i_op = &btrfs_file_inode_operations;
6463 6464 6465 6466
	inode->i_mapping->a_ops = &btrfs_aops;

	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
	if (err)
A
Al Viro 已提交
6467
		goto out_unlock;
6468

6469
	err = btrfs_update_inode(trans, root, BTRFS_I(inode));
6470
	if (err)
A
Al Viro 已提交
6471
		goto out_unlock;
6472

6473 6474
	err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
			0, index);
C
Chris Mason 已提交
6475
	if (err)
A
Al Viro 已提交
6476
		goto out_unlock;
6477

6478
	d_instantiate_new(dentry, inode);
6479

C
Chris Mason 已提交
6480
out_unlock:
6481
	btrfs_end_transaction(trans);
A
Al Viro 已提交
6482
	if (err && inode) {
C
Chris Mason 已提交
6483
		inode_dec_link_count(inode);
A
Al Viro 已提交
6484
		discard_new_inode(inode);
C
Chris Mason 已提交
6485
	}
6486
	btrfs_btree_balance_dirty(fs_info);
C
Chris Mason 已提交
6487 6488 6489 6490 6491 6492
	return err;
}

static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
		      struct dentry *dentry)
{
6493
	struct btrfs_trans_handle *trans = NULL;
C
Chris Mason 已提交
6494
	struct btrfs_root *root = BTRFS_I(dir)->root;
6495
	struct inode *inode = d_inode(old_dentry);
6496
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
6497
	u64 index;
C
Chris Mason 已提交
6498 6499 6500
	int err;
	int drop_inode = 0;

6501
	/* do not allow sys_link's with other subvols of the same device */
6502
	if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
6503
		return -EXDEV;
6504

M
Mark Fasheh 已提交
6505
	if (inode->i_nlink >= BTRFS_LINK_MAX)
6506
		return -EMLINK;
6507

6508
	err = btrfs_set_inode_index(BTRFS_I(dir), &index);
6509 6510 6511
	if (err)
		goto fail;

6512
	/*
M
Miao Xie 已提交
6513
	 * 2 items for inode and inode ref
6514
	 * 2 items for dir items
M
Miao Xie 已提交
6515
	 * 1 item for parent inode
6516
	 * 1 item for orphan item deletion if O_TMPFILE
6517
	 */
6518
	trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
6519 6520
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
6521
		trans = NULL;
6522 6523
		goto fail;
	}
6524

6525 6526
	/* There are several dir indexes for this inode, clear the cache. */
	BTRFS_I(inode)->dir_index = 0ULL;
Z
Zach Brown 已提交
6527
	inc_nlink(inode);
6528
	inode_inc_iversion(inode);
6529
	inode->i_ctime = current_time(inode);
A
Al Viro 已提交
6530
	ihold(inode);
6531
	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
6532

6533 6534
	err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
			1, index);
6535

6536
	if (err) {
6537
		drop_inode = 1;
6538
	} else {
6539
		struct dentry *parent = dentry->d_parent;
6540

6541
		err = btrfs_update_inode(trans, root, BTRFS_I(inode));
6542 6543
		if (err)
			goto fail;
6544 6545 6546 6547 6548
		if (inode->i_nlink == 1) {
			/*
			 * If new hard link count is 1, it's a file created
			 * with open(2) O_TMPFILE flag.
			 */
6549
			err = btrfs_orphan_del(trans, BTRFS_I(inode));
6550 6551 6552
			if (err)
				goto fail;
		}
6553
		d_instantiate(dentry, inode);
6554
		btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
6555
	}
C
Chris Mason 已提交
6556

6557
fail:
6558
	if (trans)
6559
		btrfs_end_transaction(trans);
C
Chris Mason 已提交
6560 6561 6562 6563
	if (drop_inode) {
		inode_dec_link_count(inode);
		iput(inode);
	}
6564
	btrfs_btree_balance_dirty(fs_info);
C
Chris Mason 已提交
6565 6566 6567
	return err;
}

6568 6569
static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
		       struct dentry *dentry, umode_t mode)
C
Chris Mason 已提交
6570
{
6571
	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6572
	struct inode *inode = NULL;
C
Chris Mason 已提交
6573 6574 6575
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	int err = 0;
6576
	u64 objectid = 0;
6577
	u64 index = 0;
C
Chris Mason 已提交
6578

J
Josef Bacik 已提交
6579 6580 6581 6582 6583
	/*
	 * 2 items for inode and ref
	 * 2 items for dir items
	 * 1 for xattr if selinux is on
	 */
6584 6585 6586
	trans = btrfs_start_transaction(root, 5);
	if (IS_ERR(trans))
		return PTR_ERR(trans);
C
Chris Mason 已提交
6587

6588
	err = btrfs_get_free_objectid(root, &objectid);
6589 6590 6591
	if (err)
		goto out_fail;

6592
	inode = btrfs_new_inode(trans, root, mnt_userns, dir,
6593 6594
			dentry->d_name.name, dentry->d_name.len,
			btrfs_ino(BTRFS_I(dir)), objectid,
6595
			S_IFDIR | mode, &index);
C
Chris Mason 已提交
6596 6597
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
A
Al Viro 已提交
6598
		inode = NULL;
C
Chris Mason 已提交
6599 6600
		goto out_fail;
	}
6601

6602 6603 6604
	/* these must be set before we unlock the inode */
	inode->i_op = &btrfs_dir_inode_operations;
	inode->i_fop = &btrfs_dir_file_operations;
J
Josef Bacik 已提交
6605

6606
	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
J
Josef Bacik 已提交
6607
	if (err)
A
Al Viro 已提交
6608
		goto out_fail;
C
Chris Mason 已提交
6609

6610
	btrfs_i_size_write(BTRFS_I(inode), 0);
6611
	err = btrfs_update_inode(trans, root, BTRFS_I(inode));
C
Chris Mason 已提交
6612
	if (err)
A
Al Viro 已提交
6613
		goto out_fail;
6614

6615 6616 6617
	err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
			dentry->d_name.name,
			dentry->d_name.len, 0, index);
C
Chris Mason 已提交
6618
	if (err)
A
Al Viro 已提交
6619
		goto out_fail;
6620

6621
	d_instantiate_new(dentry, inode);
C
Chris Mason 已提交
6622 6623

out_fail:
6624
	btrfs_end_transaction(trans);
A
Al Viro 已提交
6625
	if (err && inode) {
6626
		inode_dec_link_count(inode);
A
Al Viro 已提交
6627
		discard_new_inode(inode);
6628
	}
6629
	btrfs_btree_balance_dirty(fs_info);
C
Chris Mason 已提交
6630 6631 6632
	return err;
}

C
Chris Mason 已提交
6633
static noinline int uncompress_inline(struct btrfs_path *path,
6634
				      struct page *page,
C
Chris Mason 已提交
6635 6636 6637 6638 6639 6640 6641 6642 6643
				      size_t pg_offset, u64 extent_offset,
				      struct btrfs_file_extent_item *item)
{
	int ret;
	struct extent_buffer *leaf = path->nodes[0];
	char *tmp;
	size_t max_size;
	unsigned long inline_size;
	unsigned long ptr;
6644
	int compress_type;
C
Chris Mason 已提交
6645 6646

	WARN_ON(pg_offset != 0);
6647
	compress_type = btrfs_file_extent_compression(leaf, item);
C
Chris Mason 已提交
6648
	max_size = btrfs_file_extent_ram_bytes(leaf, item);
6649
	inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
C
Chris Mason 已提交
6650
	tmp = kmalloc(inline_size, GFP_NOFS);
6651 6652
	if (!tmp)
		return -ENOMEM;
C
Chris Mason 已提交
6653 6654 6655 6656
	ptr = btrfs_file_extent_inline_start(item);

	read_extent_buffer(leaf, tmp, ptr, inline_size);

6657
	max_size = min_t(unsigned long, PAGE_SIZE, max_size);
6658 6659
	ret = btrfs_decompress(compress_type, tmp, page,
			       extent_offset, inline_size, max_size);
6660 6661 6662 6663 6664 6665 6666 6667 6668

	/*
	 * decompression code contains a memset to fill in any space between the end
	 * of the uncompressed data and the end of max_size in case the decompressed
	 * data ends up shorter than ram_bytes.  That doesn't cover the hole between
	 * the end of an inline extent and the beginning of the next block, so we
	 * cover that region here.
	 */

6669 6670 6671
	if (max_size + pg_offset < PAGE_SIZE)
		memzero_page(page,  pg_offset + max_size,
			     PAGE_SIZE - max_size - pg_offset);
C
Chris Mason 已提交
6672
	kfree(tmp);
6673
	return ret;
C
Chris Mason 已提交
6674 6675
}

6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687
/**
 * btrfs_get_extent - Lookup the first extent overlapping a range in a file.
 * @inode:	file to search in
 * @page:	page to read extent data into if the extent is inline
 * @pg_offset:	offset into @page to copy to
 * @start:	file offset
 * @len:	length of range starting at @start
 *
 * This returns the first &struct extent_map which overlaps with the given
 * range, reading it from the B-tree and caching it if necessary. Note that
 * there may be more extents which overlap the given range after the returned
 * extent_map.
C
Chris Mason 已提交
6688
 *
6689 6690 6691 6692
 * If @page is not NULL and the extent is inline, this also reads the extent
 * data directly into the page and marks the extent up to date in the io_tree.
 *
 * Return: ERR_PTR on error, non-NULL extent_map on success.
C
Chris Mason 已提交
6693
 */
6694
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
6695 6696
				    struct page *page, size_t pg_offset,
				    u64 start, u64 len)
6697
{
6698
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
6699
	int ret = 0;
6700 6701
	u64 extent_start = 0;
	u64 extent_end = 0;
6702
	u64 objectid = btrfs_ino(inode);
6703
	int extent_type = -1;
6704
	struct btrfs_path *path = NULL;
6705
	struct btrfs_root *root = inode->root;
6706
	struct btrfs_file_extent_item *item;
6707 6708
	struct extent_buffer *leaf;
	struct btrfs_key found_key;
6709
	struct extent_map *em = NULL;
6710 6711
	struct extent_map_tree *em_tree = &inode->extent_tree;
	struct extent_io_tree *io_tree = &inode->io_tree;
6712

6713
	read_lock(&em_tree->lock);
6714
	em = lookup_extent_mapping(em_tree, start, len);
6715
	read_unlock(&em_tree->lock);
6716

6717
	if (em) {
6718 6719 6720
		if (em->start > start || em->start + em->len <= start)
			free_extent_map(em);
		else if (em->block_start == EXTENT_MAP_INLINE && page)
6721 6722 6723
			free_extent_map(em);
		else
			goto out;
6724
	}
6725
	em = alloc_extent_map();
6726
	if (!em) {
6727
		ret = -ENOMEM;
6728
		goto out;
6729
	}
6730
	em->start = EXTENT_MAP_HOLE;
6731
	em->orig_start = EXTENT_MAP_HOLE;
6732
	em->len = (u64)-1;
C
Chris Mason 已提交
6733
	em->block_len = (u64)-1;
6734

6735
	path = btrfs_alloc_path();
6736
	if (!path) {
6737
		ret = -ENOMEM;
6738
		goto out;
6739 6740
	}

6741 6742
	/* Chances are we'll be called again, so go ahead and do readahead */
	path->reada = READA_FORWARD;
6743 6744 6745 6746 6747 6748 6749 6750 6751 6752

	/*
	 * The same explanation in load_free_space_cache applies here as well,
	 * we only read when we're loading the free space cache, and at that
	 * point the commit_root has everything we need.
	 */
	if (btrfs_is_free_space_inode(inode)) {
		path->search_commit_root = 1;
		path->skip_locking = 1;
	}
J
Josef Bacik 已提交
6753

6754
	ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
6755 6756
	if (ret < 0) {
		goto out;
6757
	} else if (ret > 0) {
6758 6759 6760
		if (path->slots[0] == 0)
			goto not_found;
		path->slots[0]--;
6761
		ret = 0;
6762 6763
	}

6764 6765
	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0],
6766
			      struct btrfs_file_extent_item);
6767 6768
	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
	if (found_key.objectid != objectid ||
6769
	    found_key.type != BTRFS_EXTENT_DATA_KEY) {
6770 6771 6772 6773 6774 6775 6776 6777
		/*
		 * If we backup past the first extent we want to move forward
		 * and see if there is an extent in front of us, otherwise we'll
		 * say there is a hole for our whole search range which can
		 * cause problems.
		 */
		extent_end = start;
		goto next;
6778 6779
	}

6780
	extent_type = btrfs_file_extent_type(leaf, item);
6781
	extent_start = found_key.offset;
6782
	extent_end = btrfs_file_extent_end(path);
6783 6784
	if (extent_type == BTRFS_FILE_EXTENT_REG ||
	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
6785 6786
		/* Only regular file could have regular/prealloc extent */
		if (!S_ISREG(inode->vfs_inode.i_mode)) {
6787
			ret = -EUCLEAN;
6788 6789 6790 6791 6792
			btrfs_crit(fs_info,
		"regular/prealloc extent found for non-regular inode %llu",
				   btrfs_ino(inode));
			goto out;
		}
L
Liu Bo 已提交
6793 6794
		trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
						       extent_start);
6795
	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
L
Liu Bo 已提交
6796 6797 6798
		trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
						      path->slots[0],
						      extent_start);
Y
Yan Zheng 已提交
6799
	}
6800
next:
Y
Yan Zheng 已提交
6801 6802 6803 6804
	if (start >= extent_end) {
		path->slots[0]++;
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
6805
			if (ret < 0)
Y
Yan Zheng 已提交
6806
				goto out;
6807
			else if (ret > 0)
Y
Yan Zheng 已提交
6808
				goto not_found;
6809

Y
Yan Zheng 已提交
6810
			leaf = path->nodes[0];
6811
		}
Y
Yan Zheng 已提交
6812 6813 6814 6815 6816 6817
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid != objectid ||
		    found_key.type != BTRFS_EXTENT_DATA_KEY)
			goto not_found;
		if (start + len <= found_key.offset)
			goto not_found;
6818 6819
		if (start > found_key.offset)
			goto next;
6820 6821

		/* New extent overlaps with existing one */
Y
Yan Zheng 已提交
6822
		em->start = start;
6823
		em->orig_start = start;
Y
Yan Zheng 已提交
6824
		em->len = found_key.offset - start;
6825 6826
		em->block_start = EXTENT_MAP_HOLE;
		goto insert;
Y
Yan Zheng 已提交
6827 6828
	}

6829
	btrfs_extent_item_to_extent_map(inode, path, item, !page, em);
6830

6831 6832
	if (extent_type == BTRFS_FILE_EXTENT_REG ||
	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
6833
		goto insert;
6834
	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
6835
		unsigned long ptr;
6836
		char *map;
6837 6838 6839
		size_t size;
		size_t extent_offset;
		size_t copy_size;
6840

6841
		if (!page)
6842
			goto out;
6843

6844
		size = btrfs_file_extent_ram_bytes(leaf, item);
Y
Yan Zheng 已提交
6845
		extent_offset = page_offset(page) + pg_offset - extent_start;
6846 6847
		copy_size = min_t(u64, PAGE_SIZE - pg_offset,
				  size - extent_offset);
6848
		em->start = extent_start + extent_offset;
6849
		em->len = ALIGN(copy_size, fs_info->sectorsize);
6850
		em->orig_block_len = em->len;
6851
		em->orig_start = em->start;
6852
		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
6853

6854
		if (!PageUptodate(page)) {
6855 6856
			if (btrfs_file_extent_compression(leaf, item) !=
			    BTRFS_COMPRESS_NONE) {
6857
				ret = uncompress_inline(path, page, pg_offset,
C
Chris Mason 已提交
6858
							extent_offset, item);
6859
				if (ret)
6860
					goto out;
C
Chris Mason 已提交
6861
			} else {
6862
				map = kmap_local_page(page);
C
Chris Mason 已提交
6863 6864
				read_extent_buffer(leaf, map + pg_offset, ptr,
						   copy_size);
6865
				if (pg_offset + copy_size < PAGE_SIZE) {
6866
					memset(map + pg_offset + copy_size, 0,
6867
					       PAGE_SIZE - pg_offset -
6868 6869
					       copy_size);
				}
6870
				kunmap_local(map);
C
Chris Mason 已提交
6871
			}
6872
			flush_dcache_page(page);
6873
		}
6874
		set_extent_uptodate(io_tree, em->start,
6875
				    extent_map_end(em) - 1, NULL, GFP_NOFS);
6876 6877 6878 6879
		goto insert;
	}
not_found:
	em->start = start;
6880
	em->orig_start = start;
6881
	em->len = len;
6882
	em->block_start = EXTENT_MAP_HOLE;
6883
insert:
6884
	ret = 0;
6885
	btrfs_release_path(path);
6886
	if (em->start > start || extent_map_end(em) <= start) {
6887
		btrfs_err(fs_info,
J
Jeff Mahoney 已提交
6888 6889
			  "bad extent! em: [%llu %llu] passed [%llu %llu]",
			  em->start, em->len, start, len);
6890
		ret = -EIO;
6891 6892
		goto out;
	}
6893

6894
	write_lock(&em_tree->lock);
6895
	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
6896
	write_unlock(&em_tree->lock);
6897
out:
6898
	btrfs_free_path(path);
6899

6900
	trace_btrfs_get_extent(root, inode, em);
6901

6902
	if (ret) {
6903
		free_extent_map(em);
6904
		return ERR_PTR(ret);
6905 6906 6907 6908
	}
	return em;
}

6909
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
6910
					   u64 start, u64 len)
6911 6912 6913
{
	struct extent_map *em;
	struct extent_map *hole_em = NULL;
6914
	u64 delalloc_start = start;
6915
	u64 end;
6916 6917
	u64 delalloc_len;
	u64 delalloc_end;
6918 6919
	int err = 0;

6920
	em = btrfs_get_extent(inode, NULL, 0, start, len);
6921 6922
	if (IS_ERR(em))
		return em;
6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933
	/*
	 * If our em maps to:
	 * - a hole or
	 * - a pre-alloc extent,
	 * there might actually be delalloc bytes behind it.
	 */
	if (em->block_start != EXTENT_MAP_HOLE &&
	    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
		return em;
	else
		hole_em = em;
6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944

	/* check to see if we've wrapped (len == -1 or similar) */
	end = start + len;
	if (end < start)
		end = (u64)-1;
	else
		end -= 1;

	em = NULL;

	/* ok, we didn't find anything, lets look for delalloc */
6945
	delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start,
6946
				 end, len, EXTENT_DELALLOC, 1);
6947 6948 6949
	delalloc_end = delalloc_start + delalloc_len;
	if (delalloc_end < delalloc_start)
		delalloc_end = (u64)-1;
6950 6951

	/*
6952 6953
	 * We didn't find anything useful, return the original results from
	 * get_extent()
6954
	 */
6955
	if (delalloc_start > end || delalloc_end <= start) {
6956 6957 6958 6959 6960
		em = hole_em;
		hole_em = NULL;
		goto out;
	}

6961 6962 6963
	/*
	 * Adjust the delalloc_start to make sure it doesn't go backwards from
	 * the start they passed in
6964
	 */
6965 6966
	delalloc_start = max(start, delalloc_start);
	delalloc_len = delalloc_end - delalloc_start;
6967

6968 6969
	if (delalloc_len > 0) {
		u64 hole_start;
6970
		u64 hole_len;
6971
		const u64 hole_end = extent_map_end(hole_em);
6972

6973
		em = alloc_extent_map();
6974 6975 6976 6977
		if (!em) {
			err = -ENOMEM;
			goto out;
		}
6978 6979

		ASSERT(hole_em);
6980
		/*
6981 6982
		 * When btrfs_get_extent can't find anything it returns one
		 * huge hole
6983
		 *
6984 6985
		 * Make sure what it found really fits our range, and adjust to
		 * make sure it is based on the start from the caller
6986
		 */
6987 6988 6989 6990 6991 6992
		if (hole_end <= start || hole_em->start > end) {
		       free_extent_map(hole_em);
		       hole_em = NULL;
		} else {
		       hole_start = max(hole_em->start, start);
		       hole_len = hole_end - hole_start;
6993
		}
6994 6995 6996 6997 6998 6999

		if (hole_em && delalloc_start > hole_start) {
			/*
			 * Our hole starts before our delalloc, so we have to
			 * return just the parts of the hole that go until the
			 * delalloc starts
7000
			 */
7001
			em->len = min(hole_len, delalloc_start - hole_start);
7002 7003 7004
			em->start = hole_start;
			em->orig_start = hole_start;
			/*
7005 7006
			 * Don't adjust block start at all, it is fixed at
			 * EXTENT_MAP_HOLE
7007 7008 7009
			 */
			em->block_start = hole_em->block_start;
			em->block_len = hole_len;
7010 7011
			if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
				set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
7012
		} else {
7013 7014 7015 7016 7017 7018 7019
			/*
			 * Hole is out of passed range or it starts after
			 * delalloc range
			 */
			em->start = delalloc_start;
			em->len = delalloc_len;
			em->orig_start = delalloc_start;
7020
			em->block_start = EXTENT_MAP_DELALLOC;
7021
			em->block_len = delalloc_len;
7022
		}
7023
	} else {
7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035
		return hole_em;
	}
out:

	free_extent_map(hole_em);
	if (err) {
		free_extent_map(em);
		return ERR_PTR(err);
	}
	return em;
}

7036
static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049
						  const u64 start,
						  const u64 len,
						  const u64 orig_start,
						  const u64 block_start,
						  const u64 block_len,
						  const u64 orig_block_len,
						  const u64 ram_bytes,
						  const int type)
{
	struct extent_map *em = NULL;
	int ret;

	if (type != BTRFS_ORDERED_NOCOW) {
7050 7051
		em = create_io_em(inode, start, len, orig_start, block_start,
				  block_len, orig_block_len, ram_bytes,
7052 7053
				  BTRFS_COMPRESS_NONE, /* compress_type */
				  type);
7054 7055 7056
		if (IS_ERR(em))
			goto out;
	}
7057 7058
	ret = btrfs_add_ordered_extent_dio(inode, start, block_start, len,
					   block_len, type);
7059 7060 7061
	if (ret) {
		if (em) {
			free_extent_map(em);
7062
			btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
7063 7064 7065 7066 7067 7068 7069 7070
		}
		em = ERR_PTR(ret);
	}
 out:

	return em;
}

7071
static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
7072 7073
						  u64 start, u64 len)
{
7074 7075
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
7076
	struct extent_map *em;
7077 7078 7079 7080
	struct btrfs_key ins;
	u64 alloc_hint;
	int ret;

7081
	alloc_hint = get_extent_allocation_hint(inode, start, len);
7082
	ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
7083
				   0, alloc_hint, &ins, 1, 1);
7084 7085
	if (ret)
		return ERR_PTR(ret);
7086

7087
	em = btrfs_create_dio_extent(inode, start, ins.offset, start,
7088
				     ins.objectid, ins.offset, ins.offset,
7089
				     ins.offset, BTRFS_ORDERED_REGULAR);
7090
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
7091
	if (IS_ERR(em))
7092 7093
		btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
					   1);
7094

7095 7096 7097
	return em;
}

7098
static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
7099 7100
{
	struct btrfs_block_group *block_group;
7101
	bool readonly = false;
7102 7103 7104

	block_group = btrfs_lookup_block_group(fs_info, bytenr);
	if (!block_group || block_group->ro)
7105
		readonly = true;
7106 7107 7108 7109 7110
	if (block_group)
		btrfs_put_block_group(block_group);
	return readonly;
}

7111
/*
7112 7113 7114 7115 7116 7117 7118 7119
 * Check if we can do nocow write into the range [@offset, @offset + @len)
 *
 * @offset:	File offset
 * @len:	The length to write, will be updated to the nocow writeable
 *		range
 * @orig_start:	(optional) Return the original file offset of the file extent
 * @orig_len:	(optional) Return the original on-disk length of the file extent
 * @ram_bytes:	(optional) Return the ram_bytes of the file extent
7120 7121
 * @strict:	if true, omit optimizations that might force us into unnecessary
 *		cow. e.g., don't trust generation number.
7122 7123 7124 7125 7126 7127 7128 7129
 *
 * Return:
 * >0	and update @len if we can do nocow write
 *  0	if we can't do nocow write
 * <0	if error happened
 *
 * NOTE: This only checks the file extents, caller is responsible to wait for
 *	 any ordered extents.
7130
 */
7131
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
7132
			      u64 *orig_start, u64 *orig_block_len,
7133
			      u64 *ram_bytes, bool strict)
7134
{
7135
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7136 7137 7138 7139
	struct btrfs_path *path;
	int ret;
	struct extent_buffer *leaf;
	struct btrfs_root *root = BTRFS_I(inode)->root;
7140
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7141 7142 7143 7144 7145 7146 7147 7148
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	u64 disk_bytenr;
	u64 backref_offset;
	u64 extent_end;
	u64 num_bytes;
	int slot;
	int found_type;
7149
	bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
7150

7151 7152 7153 7154
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

7155 7156
	ret = btrfs_lookup_file_extent(NULL, root, path,
			btrfs_ino(BTRFS_I(inode)), offset, 0);
7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171
	if (ret < 0)
		goto out;

	slot = path->slots[0];
	if (ret == 1) {
		if (slot == 0) {
			/* can't find the item, must cow */
			ret = 0;
			goto out;
		}
		slot--;
	}
	ret = 0;
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, slot);
7172
	if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189
	    key.type != BTRFS_EXTENT_DATA_KEY) {
		/* not our file or wrong item type, must cow */
		goto out;
	}

	if (key.offset > offset) {
		/* Wrong offset, must cow */
		goto out;
	}

	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(leaf, fi);
	if (found_type != BTRFS_FILE_EXTENT_REG &&
	    found_type != BTRFS_FILE_EXTENT_PREALLOC) {
		/* not a regular extent, must cow */
		goto out;
	}
7190 7191 7192 7193

	if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
		goto out;

7194 7195 7196 7197
	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if (extent_end <= offset)
		goto out;

7198
	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
7199 7200 7201 7202 7203 7204 7205 7206
	if (disk_bytenr == 0)
		goto out;

	if (btrfs_file_extent_compression(leaf, fi) ||
	    btrfs_file_extent_encryption(leaf, fi) ||
	    btrfs_file_extent_other_encoding(leaf, fi))
		goto out;

7207 7208 7209 7210
	/*
	 * Do the same check as in btrfs_cross_ref_exist but without the
	 * unnecessary search.
	 */
7211 7212 7213
	if (!strict &&
	    (btrfs_file_extent_generation(leaf, fi) <=
	     btrfs_root_last_snapshot(&root->root_item)))
7214 7215
		goto out;

7216 7217
	backref_offset = btrfs_file_extent_offset(leaf, fi);

7218 7219 7220 7221 7222
	if (orig_start) {
		*orig_start = key.offset - backref_offset;
		*orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
		*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
	}
7223

7224
	if (btrfs_extent_readonly(fs_info, disk_bytenr))
7225
		goto out;
7226 7227 7228 7229 7230

	num_bytes = min(offset + *len, extent_end) - offset;
	if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 range_end;

7231 7232
		range_end = round_up(offset + num_bytes,
				     root->fs_info->sectorsize) - 1;
7233 7234 7235 7236 7237 7238 7239 7240
		ret = test_range_bit(io_tree, offset, range_end,
				     EXTENT_DELALLOC, 0, NULL);
		if (ret) {
			ret = -EAGAIN;
			goto out;
		}
	}

7241
	btrfs_release_path(path);
7242 7243 7244 7245 7246

	/*
	 * look for other files referencing this extent, if we
	 * find any we must cow
	 */
7247

7248
	ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)),
7249 7250
				    key.offset - backref_offset, disk_bytenr,
				    strict);
7251 7252 7253 7254
	if (ret) {
		ret = 0;
		goto out;
	}
7255 7256 7257 7258 7259 7260 7261 7262 7263

	/*
	 * adjust disk_bytenr and num_bytes to cover just the bytes
	 * in this extent we are about to write.  If there
	 * are any csums in that range we have to cow in order
	 * to keep the csums correct
	 */
	disk_bytenr += backref_offset;
	disk_bytenr += offset - key.offset;
7264 7265
	if (csum_exist_in_range(fs_info, disk_bytenr, num_bytes))
		goto out;
7266 7267 7268 7269
	/*
	 * all of the above have passed, it is safe to overwrite this extent
	 * without cow
	 */
7270
	*len = num_bytes;
7271 7272 7273 7274 7275 7276
	ret = 1;
out:
	btrfs_free_path(path);
	return ret;
}

7277
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
7278
			      struct extent_state **cached_state, bool writing)
7279 7280 7281 7282 7283 7284
{
	struct btrfs_ordered_extent *ordered;
	int ret = 0;

	while (1) {
		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7285
				 cached_state);
7286 7287
		/*
		 * We're concerned with the entire range that we're going to be
7288
		 * doing DIO to, so we need to make sure there's no ordered
7289 7290
		 * extents in this range.
		 */
7291
		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
7292 7293 7294 7295 7296 7297 7298 7299 7300
						     lockend - lockstart + 1);

		/*
		 * We need to make sure there are no buffered pages in this
		 * range either, we could have raced between the invalidate in
		 * generic_file_direct_write and locking the extent.  The
		 * invalidate needs to happen so that reads after a write do not
		 * get stale data.
		 */
7301
		if (!ordered &&
7302 7303
		    (!writing || !filemap_range_has_page(inode->i_mapping,
							 lockstart, lockend)))
7304 7305 7306
			break;

		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7307
				     cached_state);
7308 7309

		if (ordered) {
7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326
			/*
			 * If we are doing a DIO read and the ordered extent we
			 * found is for a buffered write, we can not wait for it
			 * to complete and retry, because if we do so we can
			 * deadlock with concurrent buffered writes on page
			 * locks. This happens only if our DIO read covers more
			 * than one extent map, if at this point has already
			 * created an ordered extent for a previous extent map
			 * and locked its range in the inode's io tree, and a
			 * concurrent write against that previous extent map's
			 * range and this range started (we unlock the ranges
			 * in the io tree only when the bios complete and
			 * buffered writes always lock pages before attempting
			 * to lock range in the io tree).
			 */
			if (writing ||
			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
7327
				btrfs_start_ordered_extent(ordered, 1);
7328 7329
			else
				ret = -ENOTBLK;
7330 7331 7332
			btrfs_put_ordered_extent(ordered);
		} else {
			/*
7333 7334 7335 7336
			 * We could trigger writeback for this range (and wait
			 * for it to complete) and then invalidate the pages for
			 * this range (through invalidate_inode_pages2_range()),
			 * but that can lead us to a deadlock with a concurrent
7337
			 * call to readahead (a buffered read or a defrag call
7338 7339 7340
			 * triggered a readahead) on a page lock due to an
			 * ordered dio extent we created before but did not have
			 * yet a corresponding bio submitted (whence it can not
7341
			 * complete), which makes readahead wait for that
7342 7343
			 * ordered extent to complete while holding a lock on
			 * that page.
7344
			 */
7345
			ret = -ENOTBLK;
7346 7347
		}

7348 7349 7350
		if (ret)
			break;

7351 7352 7353 7354 7355 7356
		cond_resched();
	}

	return ret;
}

7357
/* The callers of this must take lock_extent() */
7358 7359
static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
				       u64 len, u64 orig_start, u64 block_start,
7360 7361 7362
				       u64 block_len, u64 orig_block_len,
				       u64 ram_bytes, int compress_type,
				       int type)
7363 7364 7365 7366 7367
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	int ret;

7368 7369 7370
	ASSERT(type == BTRFS_ORDERED_PREALLOC ||
	       type == BTRFS_ORDERED_COMPRESSED ||
	       type == BTRFS_ORDERED_NOCOW ||
7371
	       type == BTRFS_ORDERED_REGULAR);
7372

7373
	em_tree = &inode->extent_tree;
7374 7375 7376 7377 7378 7379 7380 7381 7382
	em = alloc_extent_map();
	if (!em)
		return ERR_PTR(-ENOMEM);

	em->start = start;
	em->orig_start = orig_start;
	em->len = len;
	em->block_len = block_len;
	em->block_start = block_start;
7383
	em->orig_block_len = orig_block_len;
J
Josef Bacik 已提交
7384
	em->ram_bytes = ram_bytes;
7385
	em->generation = -1;
7386
	set_bit(EXTENT_FLAG_PINNED, &em->flags);
7387
	if (type == BTRFS_ORDERED_PREALLOC) {
7388
		set_bit(EXTENT_FLAG_FILLING, &em->flags);
7389
	} else if (type == BTRFS_ORDERED_COMPRESSED) {
7390 7391 7392
		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
		em->compress_type = compress_type;
	}
7393 7394

	do {
7395 7396
		btrfs_drop_extent_cache(inode, em->start,
					em->start + em->len - 1, 0);
7397
		write_lock(&em_tree->lock);
J
Josef Bacik 已提交
7398
		ret = add_extent_mapping(em_tree, em, 1);
7399
		write_unlock(&em_tree->lock);
7400 7401 7402 7403
		/*
		 * The caller has taken lock_extent(), who could race with us
		 * to add em?
		 */
7404 7405 7406 7407 7408 7409 7410
	} while (ret == -EEXIST);

	if (ret) {
		free_extent_map(em);
		return ERR_PTR(ret);
	}

7411
	/* em got 2 refs now, callers needs to do free_extent_map once. */
7412 7413 7414
	return em;
}

7415

7416 7417 7418 7419 7420 7421 7422
static int btrfs_get_blocks_direct_write(struct extent_map **map,
					 struct inode *inode,
					 struct btrfs_dio_data *dio_data,
					 u64 start, u64 len)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_map *em = *map;
7423 7424 7425 7426
	int type;
	u64 block_start, orig_start, orig_block_len, ram_bytes;
	bool can_nocow = false;
	bool space_reserved = false;
7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448
	int ret = 0;

	/*
	 * We don't allocate a new extent in the following cases
	 *
	 * 1) The inode is marked as NODATACOW. In this case we'll just use the
	 * existing extent.
	 * 2) The extent is marked as PREALLOC. We're good to go here and can
	 * just use the extent.
	 *
	 */
	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
	     em->block_start != EXTENT_MAP_HOLE)) {
		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			type = BTRFS_ORDERED_PREALLOC;
		else
			type = BTRFS_ORDERED_NOCOW;
		len = min(len, em->len - (start - em->start));
		block_start = em->block_start + (start - em->start);

		if (can_nocow_extent(inode, start, &len, &orig_start,
7449
				     &orig_block_len, &ram_bytes, false) == 1 &&
7450 7451 7452
		    btrfs_inc_nocow_writers(fs_info, block_start))
			can_nocow = true;
	}
7453

7454 7455 7456 7457 7458 7459 7460 7461 7462
	if (can_nocow) {
		struct extent_map *em2;

		/* We can NOCOW, so only need to reserve metadata space. */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
		if (ret < 0) {
			/* Our caller expects us to free the input extent map. */
			free_extent_map(em);
			*map = NULL;
7463
			btrfs_dec_nocow_writers(fs_info, block_start);
7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476
			goto out;
		}
		space_reserved = true;

		em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len,
					      orig_start, block_start,
					      len, orig_block_len,
					      ram_bytes, type);
		btrfs_dec_nocow_writers(fs_info, block_start);
		if (type == BTRFS_ORDERED_PREALLOC) {
			free_extent_map(em);
			*map = em = em2;
		}
7477

7478 7479 7480
		if (IS_ERR(em2)) {
			ret = PTR_ERR(em2);
			goto out;
7481
		}
7482 7483
	} else {
		const u64 prev_len = len;
7484

7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508
		/* Our caller expects us to free the input extent map. */
		free_extent_map(em);
		*map = NULL;

		/* We have to COW, so need to reserve metadata and data space. */
		ret = btrfs_delalloc_reserve_space(BTRFS_I(inode),
						   &dio_data->data_reserved,
						   start, len);
		if (ret < 0)
			goto out;
		space_reserved = true;

		em = btrfs_new_extent_direct(BTRFS_I(inode), start, len);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}
		*map = em;
		len = min(len, em->len - (start - em->start));
		if (len < prev_len)
			btrfs_delalloc_release_space(BTRFS_I(inode),
						     dio_data->data_reserved,
						     start + len, prev_len - len,
						     true);
7509 7510
	}

7511 7512 7513 7514 7515
	/*
	 * We have created our ordered extent, so we can now release our reservation
	 * for an outstanding extent.
	 */
	btrfs_delalloc_release_extents(BTRFS_I(inode), len);
7516 7517 7518 7519 7520

	/*
	 * Need to update the i_size under the extent lock so buffered
	 * readers will get the updated i_size when we unlock.
	 */
7521
	if (start + len > i_size_read(inode))
7522 7523
		i_size_write(inode, start + len);
out:
7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535
	if (ret && space_reserved) {
		btrfs_delalloc_release_extents(BTRFS_I(inode), len);
		if (can_nocow) {
			btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
		} else {
			btrfs_delalloc_release_space(BTRFS_I(inode),
						     dio_data->data_reserved,
						     start, len, true);
			extent_changeset_free(dio_data->data_reserved);
			dio_data->data_reserved = NULL;
		}
	}
7536 7537 7538
	return ret;
}

7539 7540 7541
static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
		loff_t length, unsigned int flags, struct iomap *iomap,
		struct iomap *srcmap)
7542
{
7543
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7544
	struct extent_map *em;
7545
	struct extent_state *cached_state = NULL;
7546
	struct btrfs_dio_data *dio_data = NULL;
7547
	u64 lockstart, lockend;
7548
	const bool write = !!(flags & IOMAP_WRITE);
7549
	int ret = 0;
7550 7551
	u64 len = length;
	bool unlock_extents = false;
7552

7553
	if (!write)
7554
		len = min_t(u64, len, fs_info->sectorsize);
7555

7556 7557 7558
	lockstart = start;
	lockend = start + len - 1;

7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578
	/*
	 * The generic stuff only does filemap_write_and_wait_range, which
	 * isn't enough if we've written compressed pages to this area, so we
	 * need to flush the dirty pages again to make absolutely sure that any
	 * outstanding dirty pages are on disk.
	 */
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
		     &BTRFS_I(inode)->runtime_flags)) {
		ret = filemap_fdatawrite_range(inode->i_mapping, start,
					       start + length - 1);
		if (ret)
			return ret;
	}

	dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS);
	if (!dio_data)
		return -ENOMEM;

	iomap->private = dio_data;

7579

7580 7581 7582 7583
	/*
	 * If this errors out it's because we couldn't invalidate pagecache for
	 * this range and we need to fallback to buffered.
	 */
7584
	if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) {
7585 7586 7587
		ret = -ENOTBLK;
		goto err;
	}
7588

7589
	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
7590 7591 7592 7593
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto unlock_err;
	}
7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604

	/*
	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
	 * io.  INLINE is special, and we could probably kludge it in here, but
	 * it's still buffered so for safety lets just fall back to the generic
	 * buffered path.
	 *
	 * For COMPRESSED we _have_ to read the entire extent in so we can
	 * decompress it, so there will be buffering required no matter what we
	 * do, so go ahead and fallback to buffered.
	 *
7605
	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
7606 7607 7608 7609 7610 7611
	 * to buffered IO.  Don't blame me, this is the price we pay for using
	 * the generic code.
	 */
	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
	    em->block_start == EXTENT_MAP_INLINE) {
		free_extent_map(em);
7612 7613
		ret = -ENOTBLK;
		goto unlock_err;
7614 7615
	}

7616
	len = min(len, em->len - (start - em->start));
7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644

	/*
	 * If we have a NOWAIT request and the range contains multiple extents
	 * (or a mix of extents and holes), then we return -EAGAIN to make the
	 * caller fallback to a context where it can do a blocking (without
	 * NOWAIT) request. This way we avoid doing partial IO and returning
	 * success to the caller, which is not optimal for writes and for reads
	 * it can result in unexpected behaviour for an application.
	 *
	 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
	 * iomap_dio_rw(), we can end up returning less data then what the caller
	 * asked for, resulting in an unexpected, and incorrect, short read.
	 * That is, the caller asked to read N bytes and we return less than that,
	 * which is wrong unless we are crossing EOF. This happens if we get a
	 * page fault error when trying to fault in pages for the buffer that is
	 * associated to the struct iov_iter passed to iomap_dio_rw(), and we
	 * have previously submitted bios for other extents in the range, in
	 * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
	 * those bios have completed by the time we get the page fault error,
	 * which we return back to our caller - we should only return EIOCBQUEUED
	 * after we have submitted bios for all the extents in the range.
	 */
	if ((flags & IOMAP_NOWAIT) && len < length) {
		free_extent_map(em);
		ret = -EAGAIN;
		goto unlock_err;
	}

7645 7646 7647
	if (write) {
		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
						    start, len);
7648 7649
		if (ret < 0)
			goto unlock_err;
7650 7651 7652
		unlock_extents = true;
		/* Recalc len in case the new em is smaller than requested */
		len = min(len, em->len - (start - em->start));
7653
	} else {
7654 7655 7656 7657
		/*
		 * We need to unlock only the end area that we aren't using.
		 * The rest is going to be unlocked by the endio routine.
		 */
7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680
		lockstart = start + len;
		if (lockstart < lockend)
			unlock_extents = true;
	}

	if (unlock_extents)
		unlock_extent_cached(&BTRFS_I(inode)->io_tree,
				     lockstart, lockend, &cached_state);
	else
		free_extent_state(cached_state);

	/*
	 * Translate extent map information to iomap.
	 * We trim the extents (and move the addr) even though iomap code does
	 * that, since we have locked only the parts we are performing I/O in.
	 */
	if ((em->block_start == EXTENT_MAP_HOLE) ||
	    (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
		iomap->addr = IOMAP_NULL_ADDR;
		iomap->type = IOMAP_HOLE;
	} else {
		iomap->addr = em->block_start + (start - em->start);
		iomap->type = IOMAP_MAPPED;
7681
	}
7682
	iomap->offset = start;
7683
	iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
7684
	iomap->length = len;
7685

7686
	if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start))
7687 7688
		iomap->flags |= IOMAP_F_ZONE_APPEND;

7689 7690 7691
	free_extent_map(em);

	return 0;
7692 7693

unlock_err:
7694 7695
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			     &cached_state);
7696
err:
7697 7698
	kfree(dio_data);

7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727
	return ret;
}

static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
		ssize_t written, unsigned int flags, struct iomap *iomap)
{
	int ret = 0;
	struct btrfs_dio_data *dio_data = iomap->private;
	size_t submitted = dio_data->submitted;
	const bool write = !!(flags & IOMAP_WRITE);

	if (!write && (iomap->type == IOMAP_HOLE)) {
		/* If reading from a hole, unlock and return */
		unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1);
		goto out;
	}

	if (submitted < length) {
		pos += submitted;
		length -= submitted;
		if (write)
			__endio_write_update_ordered(BTRFS_I(inode), pos,
					length, false);
		else
			unlock_extent(&BTRFS_I(inode)->io_tree, pos,
				      pos + length - 1);
		ret = -ENOTBLK;
	}

7728
	if (write)
7729 7730 7731 7732 7733
		extent_changeset_free(dio_data->data_reserved);
out:
	kfree(dio_data);
	iomap->private = NULL;

7734 7735 7736
	return ret;
}

7737
static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
7738
{
7739 7740 7741 7742 7743 7744
	/*
	 * This implies a barrier so that stores to dio_bio->bi_status before
	 * this and loads of dio_bio->bi_status after this are fully ordered.
	 */
	if (!refcount_dec_and_test(&dip->refs))
		return;
7745

7746
	if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) {
7747
		__endio_write_update_ordered(BTRFS_I(dip->inode),
7748
					     dip->file_offset,
7749 7750 7751 7752
					     dip->bytes,
					     !dip->dio_bio->bi_status);
	} else {
		unlock_extent(&BTRFS_I(dip->inode)->io_tree,
7753 7754
			      dip->file_offset,
			      dip->file_offset + dip->bytes - 1);
7755 7756
	}

7757
	bio_endio(dip->dio_bio);
7758
	kfree(dip);
7759 7760
}

7761 7762 7763
static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio,
					  int mirror_num,
					  unsigned long bio_flags)
7764
{
7765
	struct btrfs_dio_private *dip = bio->bi_private;
7766
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7767
	blk_status_t ret;
7768

M
Mike Christie 已提交
7769
	BUG_ON(bio_op(bio) == REQ_OP_WRITE);
7770

7771
	ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
7772
	if (ret)
7773
		return ret;
7774

7775
	refcount_inc(&dip->refs);
7776
	ret = btrfs_map_bio(fs_info, bio, mirror_num);
7777
	if (ret)
7778
		refcount_dec(&dip->refs);
7779
	return ret;
7780 7781
}

7782
static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
7783
					     struct btrfs_bio *bbio,
7784
					     const bool uptodate)
7785
{
7786
	struct inode *inode = dip->inode;
7787 7788 7789 7790 7791
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	const u32 sectorsize = fs_info->sectorsize;
	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
7792 7793
	struct bio_vec bvec;
	struct bvec_iter iter;
7794 7795
	const u64 orig_file_offset = dip->file_offset;
	u64 start = orig_file_offset;
7796
	u32 bio_offset = 0;
7797
	blk_status_t err = BLK_STS_OK;
7798

7799
	__bio_for_each_segment(bvec, &bbio->bio, iter, bbio->iter) {
7800
		unsigned int i, nr_sectors, pgoff;
7801

7802 7803
		nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
		pgoff = bvec.bv_offset;
7804
		for (i = 0; i < nr_sectors; i++) {
7805
			ASSERT(pgoff < PAGE_SIZE);
7806
			if (uptodate &&
7807
			    (!csum || !check_data_csum(inode, bbio,
7808 7809
						       bio_offset, bvec.bv_page,
						       pgoff, start))) {
7810 7811 7812 7813 7814
				clean_io_failure(fs_info, failure_tree, io_tree,
						 start, bvec.bv_page,
						 btrfs_ino(BTRFS_I(inode)),
						 pgoff);
			} else {
7815
				int ret;
7816

7817
				ASSERT((start - orig_file_offset) < UINT_MAX);
7818
				ret = btrfs_repair_one_sector(inode,
7819
						&bbio->bio,
7820
						start - orig_file_offset,
7821
						bvec.bv_page, pgoff,
7822
						start, bbio->mirror_num,
7823 7824 7825
						submit_dio_repair_bio);
				if (ret)
					err = errno_to_blk_status(ret);
7826 7827
			}
			start += sectorsize;
7828 7829
			ASSERT(bio_offset + sectorsize > bio_offset);
			bio_offset += sectorsize;
7830 7831
			pgoff += sectorsize;
		}
7832
	}
7833 7834 7835
	return err;
}

7836
static void __endio_write_update_ordered(struct btrfs_inode *inode,
7837 7838
					 const u64 offset, const u64 bytes,
					 const bool uptodate)
7839
{
7840 7841
	btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes,
				       finish_ordered_fn, uptodate);
7842 7843
}

7844
static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
7845 7846
						     struct bio *bio,
						     u64 dio_file_offset)
7847
{
7848
	return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, 1);
7849 7850
}

7851
static void btrfs_end_dio_bio(struct bio *bio)
M
Miao Xie 已提交
7852 7853
{
	struct btrfs_dio_private *dip = bio->bi_private;
7854
	blk_status_t err = bio->bi_status;
M
Miao Xie 已提交
7855

7856 7857
	if (err)
		btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
7858
			   "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d",
7859
			   btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio),
D
David Sterba 已提交
7860
			   bio->bi_opf, bio->bi_iter.bi_sector,
7861 7862
			   bio->bi_iter.bi_size, err);

7863 7864
	if (bio_op(bio) == REQ_OP_READ)
		err = btrfs_check_read_dio_bio(dip, btrfs_bio(bio), !err);
M
Miao Xie 已提交
7865

7866 7867
	if (err)
		dip->dio_bio->bi_status = err;
M
Miao Xie 已提交
7868

7869
	btrfs_record_physical_zoned(dip->inode, dip->file_offset, bio);
7870

M
Miao Xie 已提交
7871
	bio_put(bio);
7872
	btrfs_dio_private_put(dip);
7873 7874
}

7875 7876
static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
		struct inode *inode, u64 file_offset, int async_submit)
M
Miao Xie 已提交
7877
{
7878
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7879
	struct btrfs_dio_private *dip = bio->bi_private;
7880
	bool write = btrfs_op(bio) == BTRFS_MAP_WRITE;
7881
	blk_status_t ret;
M
Miao Xie 已提交
7882

7883
	/* Check btrfs_submit_bio_hook() for rules about async submit. */
7884 7885 7886
	if (async_submit)
		async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);

7887
	if (!write) {
7888
		ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
7889 7890 7891
		if (ret)
			goto err;
	}
M
Miao Xie 已提交
7892

7893
	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
7894 7895 7896
		goto map;

	if (write && async_submit) {
7897
		ret = btrfs_wq_submit_bio(inode, bio, 0, 0, file_offset,
7898
					  btrfs_submit_bio_start_direct_io);
M
Miao Xie 已提交
7899
		goto err;
7900 7901 7902 7903 7904
	} else if (write) {
		/*
		 * If we aren't doing async submit, calculate the csum of the
		 * bio now.
		 */
7905
		ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, 1);
7906 7907
		if (ret)
			goto err;
7908
	} else {
7909 7910
		u64 csum_offset;

7911
		csum_offset = file_offset - dip->file_offset;
7912
		csum_offset >>= fs_info->sectorsize_bits;
7913
		csum_offset *= fs_info->csum_size;
7914
		btrfs_bio(bio)->csum = dip->csums + csum_offset;
7915
	}
7916
map:
7917
	ret = btrfs_map_bio(fs_info, bio, 0);
M
Miao Xie 已提交
7918 7919 7920 7921
err:
	return ret;
}

7922 7923 7924 7925 7926 7927 7928
/*
 * If this succeeds, the btrfs_dio_private is responsible for cleaning up locked
 * or ordered extents whether or not we submit any bios.
 */
static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
							  struct inode *inode,
							  loff_t file_offset)
M
Miao Xie 已提交
7929
{
7930
	const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
7931 7932
	const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
	size_t dip_size;
7933 7934
	struct btrfs_dio_private *dip;

7935 7936 7937 7938 7939
	dip_size = sizeof(*dip);
	if (!write && csum) {
		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
		size_t nblocks;

7940
		nblocks = dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits;
7941
		dip_size += fs_info->csum_size * nblocks;
7942 7943 7944
	}

	dip = kzalloc(dip_size, GFP_NOFS);
7945 7946 7947 7948
	if (!dip)
		return NULL;

	dip->inode = inode;
7949
	dip->file_offset = file_offset;
7950
	dip->bytes = dio_bio->bi_iter.bi_size;
D
David Sterba 已提交
7951
	dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9;
7952
	dip->dio_bio = dio_bio;
7953
	refcount_set(&dip->refs, 1);
7954 7955 7956
	return dip;
}

7957
static void btrfs_submit_direct(const struct iomap_iter *iter,
7958
		struct bio *dio_bio, loff_t file_offset)
7959
{
7960
	struct inode *inode = iter->inode;
7961
	const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE);
7962
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7963 7964
	const bool raid56 = (btrfs_data_alloc_profile(fs_info) &
			     BTRFS_BLOCK_GROUP_RAID56_MASK);
7965
	struct btrfs_dio_private *dip;
M
Miao Xie 已提交
7966
	struct bio *bio;
7967
	u64 start_sector;
7968
	int async_submit = 0;
7969
	u64 submit_len;
7970 7971
	u64 clone_offset = 0;
	u64 clone_len;
7972
	u64 logical;
7973
	int ret;
7974
	blk_status_t status;
7975
	struct btrfs_io_geometry geom;
7976
	struct btrfs_dio_data *dio_data = iter->iomap.private;
7977
	struct extent_map *em = NULL;
M
Miao Xie 已提交
7978

7979 7980 7981 7982 7983 7984 7985
	dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
	if (!dip) {
		if (!write) {
			unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
				file_offset + dio_bio->bi_iter.bi_size - 1);
		}
		dio_bio->bi_status = BLK_STS_RESOURCE;
7986
		bio_endio(dio_bio);
7987
		return;
7988
	}
7989

7990
	if (!write) {
7991 7992 7993
		/*
		 * Load the csums up front to reduce csum tree searches and
		 * contention when submitting bios.
7994 7995
		 *
		 * If we have csums disabled this will do nothing.
7996
		 */
7997
		status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums);
7998 7999
		if (status != BLK_STS_OK)
			goto out_err;
8000 8001
	}

8002 8003
	start_sector = dio_bio->bi_iter.bi_sector;
	submit_len = dio_bio->bi_iter.bi_size;
D
David Woodhouse 已提交
8004

8005
	do {
8006 8007 8008 8009 8010 8011 8012 8013
		logical = start_sector << 9;
		em = btrfs_get_chunk_map(fs_info, logical, submit_len);
		if (IS_ERR(em)) {
			status = errno_to_blk_status(PTR_ERR(em));
			em = NULL;
			goto out_err_em;
		}
		ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(dio_bio),
8014
					    logical, &geom);
8015 8016
		if (ret) {
			status = errno_to_blk_status(ret);
8017
			goto out_err_em;
8018 8019
		}

8020 8021
		clone_len = min(submit_len, geom.len);
		ASSERT(clone_len <= UINT_MAX);
8022

8023 8024 8025 8026
		/*
		 * This will never fail as it's passing GPF_NOFS and
		 * the allocation is backed by btrfs_bioset.
		 */
8027
		bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len);
8028 8029 8030
		bio->bi_private = dip;
		bio->bi_end_io = btrfs_end_dio_bio;

8031 8032 8033 8034 8035 8036 8037 8038 8039
		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
			status = extract_ordered_extent(BTRFS_I(inode), bio,
							file_offset);
			if (status) {
				bio_put(bio);
				goto out_err;
			}
		}

8040 8041
		ASSERT(submit_len >= clone_len);
		submit_len -= clone_len;
M
Miao Xie 已提交
8042

8043 8044 8045 8046 8047
		/*
		 * Increase the count before we submit the bio so we know
		 * the end IO handler won't happen before we increase the
		 * count. Otherwise, the dip might get freed before we're
		 * done setting it up.
8048 8049 8050
		 *
		 * We transfer the initial reference to the last bio, so we
		 * don't need to increment the reference count for the last one.
8051
		 */
8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062
		if (submit_len > 0) {
			refcount_inc(&dip->refs);
			/*
			 * If we are submitting more than one bio, submit them
			 * all asynchronously. The exception is RAID 5 or 6, as
			 * asynchronous checksums make it difficult to collect
			 * full stripe writes.
			 */
			if (!raid56)
				async_submit = 1;
		}
M
Miao Xie 已提交
8063

8064
		status = btrfs_submit_dio_bio(bio, inode, file_offset,
8065 8066
						async_submit);
		if (status) {
8067
			bio_put(bio);
8068 8069
			if (submit_len > 0)
				refcount_dec(&dip->refs);
8070
			goto out_err_em;
8071
		}
M
Miao Xie 已提交
8072

8073
		dio_data->submitted += clone_len;
8074 8075 8076
		clone_offset += clone_len;
		start_sector += clone_len >> 9;
		file_offset += clone_len;
8077 8078

		free_extent_map(em);
8079
	} while (submit_len > 0);
8080
	return;
M
Miao Xie 已提交
8081

8082 8083
out_err_em:
	free_extent_map(em);
M
Miao Xie 已提交
8084
out_err:
8085 8086
	dip->dio_bio->bi_status = status;
	btrfs_dio_private_put(dip);
8087 8088
}

8089
const struct iomap_ops btrfs_dio_iomap_ops = {
8090 8091 8092 8093
	.iomap_begin            = btrfs_dio_iomap_begin,
	.iomap_end              = btrfs_dio_iomap_end,
};

8094
const struct iomap_dio_ops btrfs_dio_ops = {
8095 8096 8097
	.submit_io		= btrfs_submit_direct,
};

Y
Yehuda Sadeh 已提交
8098
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
8099
			u64 start, u64 len)
Y
Yehuda Sadeh 已提交
8100
{
T
Tsutomu Itoh 已提交
8101 8102
	int	ret;

8103
	ret = fiemap_prep(inode, fieinfo, start, &len, 0);
T
Tsutomu Itoh 已提交
8104 8105 8106
	if (ret)
		return ret;

8107
	return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
Y
Yehuda Sadeh 已提交
8108 8109
}

8110
int btrfs_readpage(struct file *file, struct page *page)
C
Chris Mason 已提交
8111
{
8112 8113 8114
	struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
	u64 start = page_offset(page);
	u64 end = start + PAGE_SIZE - 1;
8115
	struct btrfs_bio_ctrl bio_ctrl = { 0 };
8116 8117
	int ret;

8118 8119
	btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);

8120
	ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
8121 8122 8123 8124 8125 8126 8127
	if (bio_ctrl.bio) {
		int ret2;

		ret2 = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags);
		if (ret == 0)
			ret = ret2;
	}
8128
	return ret;
C
Chris Mason 已提交
8129
}
8130

8131
static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
C
Chris Mason 已提交
8132
{
J
Josef Bacik 已提交
8133 8134
	struct inode *inode = page->mapping->host;
	int ret;
8135 8136 8137 8138 8139 8140

	if (current->flags & PF_MEMALLOC) {
		redirty_page_for_writepage(wbc, page);
		unlock_page(page);
		return 0;
	}
J
Josef Bacik 已提交
8141 8142 8143 8144 8145 8146 8147 8148 8149 8150

	/*
	 * If we are under memory pressure we will call this directly from the
	 * VM, we need to make sure we have the inode referenced for the ordered
	 * extent.  If not just return like we didn't do anything.
	 */
	if (!igrab(inode)) {
		redirty_page_for_writepage(wbc, page);
		return AOP_WRITEPAGE_ACTIVATE;
	}
8151
	ret = extent_write_full_page(page, wbc);
J
Josef Bacik 已提交
8152 8153
	btrfs_add_delayed_iput(inode);
	return ret;
C
Chris Mason 已提交
8154 8155
}

8156 8157
static int btrfs_writepages(struct address_space *mapping,
			    struct writeback_control *wbc)
C
Chris Mason 已提交
8158
{
8159
	return extent_writepages(mapping, wbc);
C
Chris Mason 已提交
8160 8161
}

8162
static void btrfs_readahead(struct readahead_control *rac)
C
Chris Mason 已提交
8163
{
8164
	extent_readahead(rac);
C
Chris Mason 已提交
8165
}
8166

8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199
/*
 * For releasepage() and invalidatepage() we have a race window where
 * end_page_writeback() is called but the subpage spinlock is not yet released.
 * If we continue to release/invalidate the page, we could cause use-after-free
 * for subpage spinlock.  So this function is to spin and wait for subpage
 * spinlock.
 */
static void wait_subpage_spinlock(struct page *page)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
	struct btrfs_subpage *subpage;

	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->private);
	subpage = (struct btrfs_subpage *)page->private;

	/*
	 * This may look insane as we just acquire the spinlock and release it,
	 * without doing anything.  But we just want to make sure no one is
	 * still holding the subpage spinlock.
	 * And since the page is not dirty nor writeback, and we have page
	 * locked, the only possible way to hold a spinlock is from the endio
	 * function to clear page writeback.
	 *
	 * Here we just acquire the spinlock so that all existing callers
	 * should exit and we're safe to release/invalidate the page.
	 */
	spin_lock_irq(&subpage->lock);
	spin_unlock_irq(&subpage->lock);
}

8200
static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
C
Chris Mason 已提交
8201
{
8202
	int ret = try_release_extent_mapping(page, gfp_flags);
8203 8204 8205

	if (ret == 1) {
		wait_subpage_spinlock(page);
8206
		clear_page_extent_mapped(page);
8207
	}
8208
	return ret;
C
Chris Mason 已提交
8209 8210
}

8211 8212
static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
8213 8214
	if (PageWriteback(page) || PageDirty(page))
		return 0;
8215
	return __btrfs_releasepage(page, gfp_flags);
8216 8217
}

8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228
#ifdef CONFIG_MIGRATION
static int btrfs_migratepage(struct address_space *mapping,
			     struct page *newpage, struct page *page,
			     enum migrate_mode mode)
{
	int ret;

	ret = migrate_page_move_mapping(mapping, newpage, page, 0);
	if (ret != MIGRATEPAGE_SUCCESS)
		return ret;

8229 8230
	if (page_has_private(page))
		attach_page_private(newpage, detach_page_private(page));
8231

8232 8233 8234
	if (PageOrdered(page)) {
		ClearPageOrdered(page);
		SetPageOrdered(newpage);
8235 8236 8237 8238 8239 8240 8241 8242 8243 8244
	}

	if (mode != MIGRATE_SYNC_NO_COPY)
		migrate_page_copy(newpage, page);
	else
		migrate_page_states(newpage, page);
	return MIGRATEPAGE_SUCCESS;
}
#endif

8245 8246
static void btrfs_invalidatepage(struct page *page, unsigned int offset,
				 unsigned int length)
C
Chris Mason 已提交
8247
{
8248
	struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
8249
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
8250
	struct extent_io_tree *tree = &inode->io_tree;
8251
	struct extent_state *cached_state = NULL;
8252
	u64 page_start = page_offset(page);
8253
	u64 page_end = page_start + PAGE_SIZE - 1;
8254
	u64 cur;
8255
	int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
C
Chris Mason 已提交
8256

8257
	/*
8258 8259
	 * We have page locked so no new ordered extent can be created on this
	 * page, nor bio can be submitted for this page.
8260
	 *
8261
	 * But already submitted bio can still be finished on this page.
8262 8263 8264 8265
	 * Furthermore, endio function won't skip page which has Ordered
	 * (Private2) already cleared, so it's possible for endio and
	 * invalidatepage to do the same ordered extent accounting twice
	 * on one page.
8266 8267 8268
	 *
	 * So here we wait for any submitted bios to finish, so that we won't
	 * do double ordered extent accounting on the same page.
8269
	 */
8270
	wait_on_page_writeback(page);
8271
	wait_subpage_spinlock(page);
8272

8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285
	/*
	 * For subpage case, we have call sites like
	 * btrfs_punch_hole_lock_range() which passes range not aligned to
	 * sectorsize.
	 * If the range doesn't cover the full page, we don't need to and
	 * shouldn't clear page extent mapped, as page->private can still
	 * record subpage dirty bits for other part of the range.
	 *
	 * For cases that can invalidate the full even the range doesn't
	 * cover the full page, like invalidating the last page, we're
	 * still safe to wait for ordered extent to finish.
	 */
	if (!(offset == 0 && length == PAGE_SIZE)) {
8286 8287 8288
		btrfs_releasepage(page, GFP_NOFS);
		return;
	}
8289 8290

	if (!inode_evicting)
8291
		lock_extent_bits(tree, page_start, page_end, &cached_state);
8292

8293 8294 8295 8296 8297
	cur = page_start;
	while (cur < page_end) {
		struct btrfs_ordered_extent *ordered;
		bool delete_states;
		u64 range_end;
8298
		u32 range_len;
8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324

		ordered = btrfs_lookup_first_ordered_range(inode, cur,
							   page_end + 1 - cur);
		if (!ordered) {
			range_end = page_end;
			/*
			 * No ordered extent covering this range, we are safe
			 * to delete all extent states in the range.
			 */
			delete_states = true;
			goto next;
		}
		if (ordered->file_offset > cur) {
			/*
			 * There is a range between [cur, oe->file_offset) not
			 * covered by any ordered extent.
			 * We are safe to delete all extent states, and handle
			 * the ordered extent in the next iteration.
			 */
			range_end = ordered->file_offset - 1;
			delete_states = true;
			goto next;
		}

		range_end = min(ordered->file_offset + ordered->num_bytes - 1,
				page_end);
8325 8326 8327
		ASSERT(range_end + 1 - cur < U32_MAX);
		range_len = range_end + 1 - cur;
		if (!btrfs_page_test_ordered(fs_info, page, cur, range_len)) {
8328
			/*
8329 8330
			 * If Ordered (Private2) is cleared, it means endio has
			 * already been executed for the range.
8331 8332 8333 8334 8335 8336
			 * We can't delete the extent states as
			 * btrfs_finish_ordered_io() may still use some of them.
			 */
			delete_states = false;
			goto next;
		}
8337
		btrfs_page_clear_ordered(fs_info, page, cur, range_len);
8338

8339
		/*
8340 8341 8342
		 * IO on this page will never be started, so we need to account
		 * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
		 * here, must leave that up for the ordered extent completion.
8343 8344 8345
		 *
		 * This will also unlock the range for incoming
		 * btrfs_finish_ordered_io().
8346
		 */
8347
		if (!inode_evicting)
8348
			clear_extent_bit(tree, cur, range_end,
8349
					 EXTENT_DELALLOC |
8350
					 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
8351
					 EXTENT_DEFRAG, 1, 0, &cached_state);
8352 8353 8354 8355 8356 8357 8358 8359

		spin_lock_irq(&inode->ordered_tree.lock);
		set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
		ordered->truncated_len = min(ordered->truncated_len,
					     cur - ordered->file_offset);
		spin_unlock_irq(&inode->ordered_tree.lock);

		if (btrfs_dec_test_ordered_pending(inode, &ordered,
8360
						   cur, range_end + 1 - cur)) {
8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377
			btrfs_finish_ordered_io(ordered);
			/*
			 * The ordered extent has finished, now we're again
			 * safe to delete all extent states of the range.
			 */
			delete_states = true;
		} else {
			/*
			 * btrfs_finish_ordered_io() will get executed by endio
			 * of other pages, thus we can't delete extent states
			 * anymore
			 */
			delete_states = false;
		}
next:
		if (ordered)
			btrfs_put_ordered_extent(ordered);
8378
		/*
8379 8380
		 * Qgroup reserved space handler
		 * Sector(s) here will be either:
8381
		 *
8382 8383 8384 8385 8386 8387 8388 8389 8390 8391
		 * 1) Already written to disk or bio already finished
		 *    Then its QGROUP_RESERVED bit in io_tree is already cleared.
		 *    Qgroup will be handled by its qgroup_record then.
		 *    btrfs_qgroup_free_data() call will do nothing here.
		 *
		 * 2) Not written to disk yet
		 *    Then btrfs_qgroup_free_data() call will clear the
		 *    QGROUP_RESERVED bit of its io_tree, and free the qgroup
		 *    reserved data space.
		 *    Since the IO will never happen for this page.
8392
		 */
8393
		btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur);
8394
		if (!inode_evicting) {
8395 8396 8397 8398
			clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
				 EXTENT_DELALLOC | EXTENT_UPTODATE |
				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1,
				 delete_states, &cached_state);
8399
		}
8400
		cur = range_end + 1;
8401
	}
Q
Qu Wenruo 已提交
8402
	/*
8403
	 * We have iterated through all ordered extents of the page, the page
8404 8405
	 * should not have Ordered (Private2) anymore, or the above iteration
	 * did something wrong.
Q
Qu Wenruo 已提交
8406
	 */
8407
	ASSERT(!PageOrdered(page));
8408
	btrfs_page_clear_checked(fs_info, page, page_offset(page), PAGE_SIZE);
8409
	if (!inode_evicting)
8410
		__btrfs_releasepage(page, GFP_NOFS);
8411
	clear_page_extent_mapped(page);
C
Chris Mason 已提交
8412 8413
}

C
Chris Mason 已提交
8414 8415 8416 8417 8418 8419 8420 8421 8422 8423
/*
 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
 * called from a page fault handler when a page is first dirtied. Hence we must
 * be careful to check for EOF conditions here. We set the page up correctly
 * for a written page which means we get ENOSPC checking when writing into
 * holes and correct delalloc and unwritten extent mapping on filesystems that
 * support these features.
 *
 * We are not allowed to take the i_mutex here so we have to play games to
 * protect against truncate races as the page could now be beyond EOF.  Because
8424 8425
 * truncate_setsize() writes the inode size before removing pages, once we have
 * the page lock we can determine safely if the page is beyond EOF. If it is not
C
Chris Mason 已提交
8426 8427 8428
 * beyond EOF, then the page is guaranteed safe against truncation until we
 * unlock the page.
 */
8429
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
C
Chris Mason 已提交
8430
{
8431
	struct page *page = vmf->page;
8432
	struct inode *inode = file_inode(vmf->vma->vm_file);
8433
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8434 8435
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_ordered_extent *ordered;
8436
	struct extent_state *cached_state = NULL;
8437
	struct extent_changeset *data_reserved = NULL;
8438
	unsigned long zero_start;
C
Chris Mason 已提交
8439
	loff_t size;
8440 8441
	vm_fault_t ret;
	int ret2;
8442
	int reserved = 0;
8443
	u64 reserved_space;
8444
	u64 page_start;
8445
	u64 page_end;
8446 8447
	u64 end;

8448
	reserved_space = PAGE_SIZE;
C
Chris Mason 已提交
8449

8450
	sb_start_pagefault(inode->i_sb);
8451
	page_start = page_offset(page);
8452
	page_end = page_start + PAGE_SIZE - 1;
8453
	end = page_end;
8454

8455 8456 8457 8458 8459 8460 8461 8462
	/*
	 * Reserving delalloc space after obtaining the page lock can lead to
	 * deadlock. For example, if a dirty page is locked by this function
	 * and the call to btrfs_delalloc_reserve_space() ends up triggering
	 * dirty page write out, then the btrfs_writepage() function could
	 * end up waiting indefinitely to get a lock on the page currently
	 * being processed by btrfs_page_mkwrite() function.
	 */
8463 8464
	ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
					    page_start, reserved_space);
8465 8466
	if (!ret2) {
		ret2 = file_update_time(vmf->vma->vm_file);
8467 8468
		reserved = 1;
	}
8469 8470
	if (ret2) {
		ret = vmf_error(ret2);
8471 8472 8473
		if (reserved)
			goto out;
		goto out_noreserve;
8474
	}
8475

8476
	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
8477
again:
8478
	down_read(&BTRFS_I(inode)->i_mmap_lock);
C
Chris Mason 已提交
8479 8480
	lock_page(page);
	size = i_size_read(inode);
8481

C
Chris Mason 已提交
8482
	if ((page->mapping != inode->i_mapping) ||
8483
	    (page_start >= size)) {
C
Chris Mason 已提交
8484 8485 8486
		/* page got truncated out from underneath us */
		goto out_unlock;
	}
8487 8488
	wait_on_page_writeback(page);

8489
	lock_extent_bits(io_tree, page_start, page_end, &cached_state);
8490 8491 8492 8493 8494 8495
	ret2 = set_page_extent_mapped(page);
	if (ret2 < 0) {
		ret = vmf_error(ret2);
		unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
		goto out_unlock;
	}
8496

8497 8498 8499 8500
	/*
	 * we can't set the delalloc bits if there are pending ordered
	 * extents.  Drop our locks and wait for them to finish
	 */
8501 8502
	ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
			PAGE_SIZE);
8503
	if (ordered) {
8504
		unlock_extent_cached(io_tree, page_start, page_end,
8505
				     &cached_state);
8506
		unlock_page(page);
8507
		up_read(&BTRFS_I(inode)->i_mmap_lock);
8508
		btrfs_start_ordered_extent(ordered, 1);
8509 8510 8511 8512
		btrfs_put_ordered_extent(ordered);
		goto again;
	}

8513
	if (page->index == ((size - 1) >> PAGE_SHIFT)) {
8514
		reserved_space = round_up(size - page_start,
8515
					  fs_info->sectorsize);
8516
		if (reserved_space < PAGE_SIZE) {
8517
			end = page_start + reserved_space - 1;
8518 8519 8520
			btrfs_delalloc_release_space(BTRFS_I(inode),
					data_reserved, page_start,
					PAGE_SIZE - reserved_space, true);
8521 8522 8523
		}
	}

J
Josef Bacik 已提交
8524
	/*
8525 8526 8527 8528 8529
	 * page_mkwrite gets called when the page is firstly dirtied after it's
	 * faulted in, but write(2) could also dirty a page and set delalloc
	 * bits, thus in this case for space account reason, we still need to
	 * clear any delalloc bits within this page range since we have to
	 * reserve data&meta space before lock_page() (see above comments).
J
Josef Bacik 已提交
8530
	 */
8531
	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
8532 8533
			  EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
			  EXTENT_DEFRAG, 0, 0, &cached_state);
J
Josef Bacik 已提交
8534

8535
	ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
8536
					&cached_state);
8537
	if (ret2) {
8538
		unlock_extent_cached(io_tree, page_start, page_end,
8539
				     &cached_state);
J
Josef Bacik 已提交
8540 8541 8542
		ret = VM_FAULT_SIGBUS;
		goto out_unlock;
	}
C
Chris Mason 已提交
8543 8544

	/* page is wholly or partially inside EOF */
8545
	if (page_start + PAGE_SIZE > size)
8546
		zero_start = offset_in_page(size);
C
Chris Mason 已提交
8547
	else
8548
		zero_start = PAGE_SIZE;
C
Chris Mason 已提交
8549

8550
	if (zero_start != PAGE_SIZE) {
8551
		memzero_page(page, zero_start, PAGE_SIZE - zero_start);
8552 8553
		flush_dcache_page(page);
	}
8554
	btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
8555 8556
	btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
	btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
8557

8558
	btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
8559

8560
	unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
8561
	up_read(&BTRFS_I(inode)->i_mmap_lock);
C
Chris Mason 已提交
8562

8563 8564 8565 8566
	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
	sb_end_pagefault(inode->i_sb);
	extent_changeset_free(data_reserved);
	return VM_FAULT_LOCKED;
8567 8568

out_unlock:
C
Chris Mason 已提交
8569
	unlock_page(page);
8570
	up_read(&BTRFS_I(inode)->i_mmap_lock);
8571
out:
8572
	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
8573
	btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
8574
				     reserved_space, (ret != 0));
8575
out_noreserve:
8576
	sb_end_pagefault(inode->i_sb);
8577
	extent_changeset_free(data_reserved);
C
Chris Mason 已提交
8578 8579 8580
	return ret;
}

8581
static int btrfs_truncate(struct inode *inode, bool skip_writeback)
C
Chris Mason 已提交
8582
{
J
Josef Bacik 已提交
8583
	struct btrfs_truncate_control control = {
8584
		.inode = BTRFS_I(inode),
8585
		.ino = btrfs_ino(BTRFS_I(inode)),
J
Josef Bacik 已提交
8586
		.min_type = BTRFS_EXTENT_DATA_KEY,
8587
		.clear_extent_range = true,
J
Josef Bacik 已提交
8588
	};
8589
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
C
Chris Mason 已提交
8590
	struct btrfs_root *root = BTRFS_I(inode)->root;
8591
	struct btrfs_block_rsv *rsv;
8592
	int ret;
C
Chris Mason 已提交
8593
	struct btrfs_trans_handle *trans;
8594
	u64 mask = fs_info->sectorsize - 1;
8595
	u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
C
Chris Mason 已提交
8596

8597 8598 8599 8600 8601 8602
	if (!skip_writeback) {
		ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
					       (u64)-1);
		if (ret)
			return ret;
	}
C
Chris Mason 已提交
8603

8604
	/*
8605 8606
	 * Yes ladies and gentlemen, this is indeed ugly.  We have a couple of
	 * things going on here:
8607
	 *
8608
	 * 1) We need to reserve space to update our inode.
8609
	 *
8610
	 * 2) We need to have something to cache all the space that is going to
8611 8612 8613 8614
	 * be free'd up by the truncate operation, but also have some slack
	 * space reserved in case it uses space during the truncate (thank you
	 * very much snapshotting).
	 *
8615
	 * And we need these to be separate.  The fact is we can use a lot of
8616
	 * space doing the truncate, and we have no earthly idea how much space
8617
	 * we will use, so we need the truncate reservation to be separate so it
8618 8619 8620 8621 8622 8623
	 * doesn't end up using space reserved for updating the inode.  We also
	 * need to be able to stop the transaction and start a new one, which
	 * means we need to be able to update the inode several times, and we
	 * have no idea of knowing how many times that will be, so we can't just
	 * reserve 1 item for the entirety of the operation, so that has to be
	 * done separately as well.
8624 8625 8626
	 *
	 * So that leaves us with
	 *
8627
	 * 1) rsv - for the truncate reservation, which we will steal from the
8628
	 * transaction reservation.
8629
	 * 2) fs_info->trans_block_rsv - this will have 1 items worth left for
8630 8631
	 * updating the inode.
	 */
8632
	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
8633 8634
	if (!rsv)
		return -ENOMEM;
J
Josef Bacik 已提交
8635
	rsv->size = min_size;
8636
	rsv->failfast = 1;
8637

8638
	/*
8639
	 * 1 for the truncate slack space
8640 8641
	 * 1 for updating the inode.
	 */
8642
	trans = btrfs_start_transaction(root, 2);
8643
	if (IS_ERR(trans)) {
8644
		ret = PTR_ERR(trans);
8645 8646
		goto out;
	}
8647

8648
	/* Migrate the slack space for the truncate to our reserve */
8649
	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
8650
				      min_size, false);
8651
	BUG_ON(ret);
8652

8653
	trans->block_rsv = rsv;
8654

8655
	while (1) {
8656 8657 8658 8659
		struct extent_state *cached_state = NULL;
		const u64 new_size = inode->i_size;
		const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);

J
Josef Bacik 已提交
8660
		control.new_size = new_size;
8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671
		lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
				 &cached_state);
		/*
		 * We want to drop from the next block forward in case this new
		 * size is not block aligned since we will be keeping the last
		 * block of the extent just the way it is.
		 */
		btrfs_drop_extent_cache(BTRFS_I(inode),
					ALIGN(new_size, fs_info->sectorsize),
					(u64)-1, 0);

8672
		ret = btrfs_truncate_inode_items(trans, root, &control);
8673

8674
		inode_sub_bytes(inode, control.sub_bytes);
8675 8676
		btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), control.last_size);

8677 8678 8679
		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
				     (u64)-1, &cached_state);

8680
		trans->block_rsv = &fs_info->trans_block_rsv;
8681
		if (ret != -ENOSPC && ret != -EAGAIN)
8682
			break;
C
Chris Mason 已提交
8683

8684
		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
8685
		if (ret)
8686
			break;
8687

8688
		btrfs_end_transaction(trans);
8689
		btrfs_btree_balance_dirty(fs_info);
8690 8691 8692

		trans = btrfs_start_transaction(root, 2);
		if (IS_ERR(trans)) {
8693
			ret = PTR_ERR(trans);
8694 8695 8696 8697
			trans = NULL;
			break;
		}

8698
		btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
8699
		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
8700
					      rsv, min_size, false);
8701 8702
		BUG_ON(ret);	/* shouldn't happen */
		trans->block_rsv = rsv;
8703 8704
	}

8705 8706
	/*
	 * We can't call btrfs_truncate_block inside a trans handle as we could
8707 8708 8709
	 * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we
	 * know we've truncated everything except the last little bit, and can
	 * do btrfs_truncate_block and then update the disk_i_size.
8710
	 */
8711
	if (ret == BTRFS_NEED_TRUNCATE_BLOCK) {
8712 8713 8714
		btrfs_end_transaction(trans);
		btrfs_btree_balance_dirty(fs_info);

8715
		ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
8716 8717 8718 8719 8720 8721 8722
		if (ret)
			goto out;
		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			goto out;
		}
8723
		btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
8724 8725
	}

8726
	if (trans) {
8727 8728
		int ret2;

8729
		trans->block_rsv = &fs_info->trans_block_rsv;
8730
		ret2 = btrfs_update_inode(trans, root, BTRFS_I(inode));
8731 8732
		if (ret2 && !ret)
			ret = ret2;
8733

8734 8735 8736
		ret2 = btrfs_end_transaction(trans);
		if (ret2 && !ret)
			ret = ret2;
8737
		btrfs_btree_balance_dirty(fs_info);
8738
	}
8739
out:
8740
	btrfs_free_block_rsv(fs_info, rsv);
8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754
	/*
	 * So if we truncate and then write and fsync we normally would just
	 * write the extents that changed, which is a problem if we need to
	 * first truncate that entire inode.  So set this flag so we write out
	 * all of the extents in the inode to the sync log so we're completely
	 * safe.
	 *
	 * If no extents were dropped or trimmed we don't need to force the next
	 * fsync to truncate all the inode's items from the log and re-log them
	 * all. This means the truncate operation did not change the file size,
	 * or changed it to a smaller size but there was only an implicit hole
	 * between the old i_size and the new i_size, and there were no prealloc
	 * extents beyond i_size to drop.
	 */
J
Josef Bacik 已提交
8755
	if (control.extents_found > 0)
8756
		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
8757

8758
	return ret;
C
Chris Mason 已提交
8759 8760
}

C
Chris Mason 已提交
8761 8762 8763
/*
 * create a new subvolume directory/inode (helper for the ioctl).
 */
8764
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
8765
			     struct btrfs_root *new_root,
8766 8767
			     struct btrfs_root *parent_root,
			     struct user_namespace *mnt_userns)
C
Chris Mason 已提交
8768 8769
{
	struct inode *inode;
8770
	int err;
8771
	u64 index = 0;
8772 8773 8774 8775 8776
	u64 ino;

	err = btrfs_get_free_objectid(new_root, &ino);
	if (err < 0)
		return err;
C
Chris Mason 已提交
8777

8778
	inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2,
8779
				ino, ino,
8780 8781
				S_IFDIR | (~current_umask() & S_IRWXUGO),
				&index);
8782
	if (IS_ERR(inode))
C
Christoph Hellwig 已提交
8783
		return PTR_ERR(inode);
C
Chris Mason 已提交
8784 8785 8786
	inode->i_op = &btrfs_dir_inode_operations;
	inode->i_fop = &btrfs_dir_file_operations;

M
Miklos Szeredi 已提交
8787
	set_nlink(inode, 1);
8788
	btrfs_i_size_write(BTRFS_I(inode), 0);
8789
	unlock_new_inode(inode);
8790

8791 8792 8793
	err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
	if (err)
		btrfs_err(new_root->fs_info,
8794
			  "error inheriting subvolume %llu properties: %d",
8795 8796
			  new_root->root_key.objectid, err);

8797
	err = btrfs_update_inode(trans, new_root, BTRFS_I(inode));
8798

8799
	iput(inode);
8800
	return err;
C
Chris Mason 已提交
8801 8802 8803 8804
}

struct inode *btrfs_alloc_inode(struct super_block *sb)
{
8805
	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
C
Chris Mason 已提交
8806
	struct btrfs_inode *ei;
Y
Yan, Zheng 已提交
8807
	struct inode *inode;
C
Chris Mason 已提交
8808

8809
	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL);
C
Chris Mason 已提交
8810 8811
	if (!ei)
		return NULL;
Y
Yan, Zheng 已提交
8812 8813 8814

	ei->root = NULL;
	ei->generation = 0;
8815
	ei->last_trans = 0;
8816
	ei->last_sub_trans = 0;
8817
	ei->logged_trans = 0;
Y
Yan, Zheng 已提交
8818
	ei->delalloc_bytes = 0;
8819
	ei->new_delalloc_bytes = 0;
8820
	ei->defrag_bytes = 0;
Y
Yan, Zheng 已提交
8821 8822
	ei->disk_i_size = 0;
	ei->flags = 0;
8823
	ei->ro_flags = 0;
8824
	ei->csum_bytes = 0;
Y
Yan, Zheng 已提交
8825
	ei->index_cnt = (u64)-1;
8826
	ei->dir_index = 0;
Y
Yan, Zheng 已提交
8827
	ei->last_unlink_trans = 0;
8828
	ei->last_reflink_trans = 0;
8829
	ei->last_log_commit = 0;
Y
Yan, Zheng 已提交
8830

8831 8832
	spin_lock_init(&ei->lock);
	ei->outstanding_extents = 0;
8833 8834 8835
	if (sb->s_magic != BTRFS_TEST_MAGIC)
		btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
					      BTRFS_BLOCK_RSV_DELALLOC);
8836
	ei->runtime_flags = 0;
8837
	ei->prop_compress = BTRFS_COMPRESS_NONE;
8838
	ei->defrag_compress = BTRFS_COMPRESS_NONE;
Y
Yan, Zheng 已提交
8839

8840 8841
	ei->delayed_node = NULL;

8842 8843 8844
	ei->i_otime.tv_sec = 0;
	ei->i_otime.tv_nsec = 0;

Y
Yan, Zheng 已提交
8845
	inode = &ei->vfs_inode;
8846
	extent_map_tree_init(&ei->extent_tree);
8847 8848 8849
	extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode);
	extent_io_tree_init(fs_info, &ei->io_failure_tree,
			    IO_TREE_INODE_IO_FAILURE, inode);
8850 8851
	extent_io_tree_init(fs_info, &ei->file_extent_tree,
			    IO_TREE_INODE_FILE_EXTENT, inode);
8852 8853
	ei->io_tree.track_uptodate = true;
	ei->io_failure_tree.track_uptodate = true;
8854
	atomic_set(&ei->sync_writers, 0);
Y
Yan, Zheng 已提交
8855
	mutex_init(&ei->log_mutex);
8856
	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
Y
Yan, Zheng 已提交
8857
	INIT_LIST_HEAD(&ei->delalloc_inodes);
8858
	INIT_LIST_HEAD(&ei->delayed_iput);
Y
Yan, Zheng 已提交
8859
	RB_CLEAR_NODE(&ei->rb_node);
8860
	init_rwsem(&ei->i_mmap_lock);
Y
Yan, Zheng 已提交
8861 8862

	return inode;
C
Chris Mason 已提交
8863 8864
}

8865 8866 8867
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_destroy_inode(struct inode *inode)
{
8868
	btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
8869 8870 8871 8872
	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
#endif

A
Al Viro 已提交
8873
void btrfs_free_inode(struct inode *inode)
N
Nick Piggin 已提交
8874 8875 8876 8877
{
	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}

8878
void btrfs_destroy_inode(struct inode *vfs_inode)
C
Chris Mason 已提交
8879
{
8880
	struct btrfs_ordered_extent *ordered;
8881 8882
	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
	struct btrfs_root *root = inode->root;
8883

8884 8885 8886 8887 8888
	WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
	WARN_ON(vfs_inode->i_data.nrpages);
	WARN_ON(inode->block_rsv.reserved);
	WARN_ON(inode->block_rsv.size);
	WARN_ON(inode->outstanding_extents);
8889 8890 8891 8892
	if (!S_ISDIR(vfs_inode->i_mode)) {
		WARN_ON(inode->delalloc_bytes);
		WARN_ON(inode->new_delalloc_bytes);
	}
8893 8894
	WARN_ON(inode->csum_bytes);
	WARN_ON(inode->defrag_bytes);
C
Chris Mason 已提交
8895

8896 8897 8898 8899 8900 8901
	/*
	 * This can happen where we create an inode, but somebody else also
	 * created the same inode and we need to destroy the one we already
	 * created.
	 */
	if (!root)
A
Al Viro 已提交
8902
		return;
8903

C
Chris Mason 已提交
8904
	while (1) {
8905
		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
8906 8907 8908
		if (!ordered)
			break;
		else {
8909
			btrfs_err(root->fs_info,
J
Jeff Mahoney 已提交
8910
				  "found ordered extent %llu %llu on inode cleanup",
8911
				  ordered->file_offset, ordered->num_bytes);
8912
			btrfs_remove_ordered_extent(inode, ordered);
8913 8914 8915 8916
			btrfs_put_ordered_extent(ordered);
			btrfs_put_ordered_extent(ordered);
		}
	}
8917 8918 8919 8920 8921
	btrfs_qgroup_check_reserved_leak(inode);
	inode_tree_del(inode);
	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
	btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
	btrfs_put_root(inode->root);
C
Chris Mason 已提交
8922 8923
}

8924
int btrfs_drop_inode(struct inode *inode)
8925 8926
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
8927

8928 8929 8930
	if (root == NULL)
		return 1;

8931
	/* the snap/subvol tree is on deleting */
8932
	if (btrfs_root_refs(&root->root_item) == 0)
8933
		return 1;
8934
	else
8935
		return generic_drop_inode(inode);
8936 8937
}

8938
static void init_once(void *foo)
C
Chris Mason 已提交
8939 8940 8941 8942 8943 8944
{
	struct btrfs_inode *ei = (struct btrfs_inode *) foo;

	inode_init_once(&ei->vfs_inode);
}

8945
void __cold btrfs_destroy_cachep(void)
C
Chris Mason 已提交
8946
{
8947 8948 8949 8950 8951
	/*
	 * Make sure all delayed rcu free inodes are flushed before we
	 * destroy cache.
	 */
	rcu_barrier();
8952 8953 8954 8955
	kmem_cache_destroy(btrfs_inode_cachep);
	kmem_cache_destroy(btrfs_trans_handle_cachep);
	kmem_cache_destroy(btrfs_path_cachep);
	kmem_cache_destroy(btrfs_free_space_cachep);
8956
	kmem_cache_destroy(btrfs_free_space_bitmap_cachep);
C
Chris Mason 已提交
8957 8958
}

8959
int __init btrfs_init_cachep(void)
C
Chris Mason 已提交
8960
{
D
David Sterba 已提交
8961
	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
8962
			sizeof(struct btrfs_inode), 0,
8963 8964
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
			init_once);
C
Chris Mason 已提交
8965 8966
	if (!btrfs_inode_cachep)
		goto fail;
8967

D
David Sterba 已提交
8968
	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
8969
			sizeof(struct btrfs_trans_handle), 0,
8970
			SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
C
Chris Mason 已提交
8971 8972
	if (!btrfs_trans_handle_cachep)
		goto fail;
8973

D
David Sterba 已提交
8974
	btrfs_path_cachep = kmem_cache_create("btrfs_path",
8975
			sizeof(struct btrfs_path), 0,
8976
			SLAB_MEM_SPREAD, NULL);
C
Chris Mason 已提交
8977 8978
	if (!btrfs_path_cachep)
		goto fail;
8979

D
David Sterba 已提交
8980
	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
8981
			sizeof(struct btrfs_free_space), 0,
8982
			SLAB_MEM_SPREAD, NULL);
8983 8984 8985
	if (!btrfs_free_space_cachep)
		goto fail;

8986 8987
	btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
							PAGE_SIZE, PAGE_SIZE,
8988
							SLAB_MEM_SPREAD, NULL);
8989 8990 8991
	if (!btrfs_free_space_bitmap_cachep)
		goto fail;

C
Chris Mason 已提交
8992 8993 8994 8995 8996 8997
	return 0;
fail:
	btrfs_destroy_cachep();
	return -ENOMEM;
}

8998 8999
static int btrfs_getattr(struct user_namespace *mnt_userns,
			 const struct path *path, struct kstat *stat,
9000
			 u32 request_mask, unsigned int flags)
C
Chris Mason 已提交
9001
{
9002
	u64 delalloc_bytes;
9003
	u64 inode_bytes;
9004
	struct inode *inode = d_inode(path->dentry);
D
David Sterba 已提交
9005
	u32 blocksize = inode->i_sb->s_blocksize;
Y
Yonghong Song 已提交
9006
	u32 bi_flags = BTRFS_I(inode)->flags;
B
Boris Burkov 已提交
9007
	u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
Y
Yonghong Song 已提交
9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019

	stat->result_mask |= STATX_BTIME;
	stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
	stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec;
	if (bi_flags & BTRFS_INODE_APPEND)
		stat->attributes |= STATX_ATTR_APPEND;
	if (bi_flags & BTRFS_INODE_COMPRESS)
		stat->attributes |= STATX_ATTR_COMPRESSED;
	if (bi_flags & BTRFS_INODE_IMMUTABLE)
		stat->attributes |= STATX_ATTR_IMMUTABLE;
	if (bi_flags & BTRFS_INODE_NODUMP)
		stat->attributes |= STATX_ATTR_NODUMP;
B
Boris Burkov 已提交
9020 9021
	if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
		stat->attributes |= STATX_ATTR_VERITY;
Y
Yonghong Song 已提交
9022 9023 9024 9025 9026

	stat->attributes_mask |= (STATX_ATTR_APPEND |
				  STATX_ATTR_COMPRESSED |
				  STATX_ATTR_IMMUTABLE |
				  STATX_ATTR_NODUMP);
D
David Sterba 已提交
9027

9028
	generic_fillattr(mnt_userns, inode, stat);
9029
	stat->dev = BTRFS_I(inode)->root->anon_dev;
9030 9031

	spin_lock(&BTRFS_I(inode)->lock);
9032
	delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
9033
	inode_bytes = inode_get_bytes(inode);
9034
	spin_unlock(&BTRFS_I(inode)->lock);
9035
	stat->blocks = (ALIGN(inode_bytes, blocksize) +
9036
			ALIGN(delalloc_bytes, blocksize)) >> 9;
C
Chris Mason 已提交
9037 9038 9039
	return 0;
}

9040 9041 9042 9043 9044
static int btrfs_rename_exchange(struct inode *old_dir,
			      struct dentry *old_dentry,
			      struct inode *new_dir,
			      struct dentry *new_dentry)
{
9045
	struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
9046 9047 9048 9049 9050
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(old_dir)->root;
	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
	struct inode *new_inode = new_dentry->d_inode;
	struct inode *old_inode = old_dentry->d_inode;
9051
	struct timespec64 ctime = current_time(old_inode);
9052 9053
	struct btrfs_rename_ctx old_rename_ctx;
	struct btrfs_rename_ctx new_rename_ctx;
9054 9055
	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
	u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
9056 9057 9058
	u64 old_idx = 0;
	u64 new_idx = 0;
	int ret;
9059
	int ret2;
9060
	bool need_abort = false;
9061

9062 9063 9064 9065 9066 9067 9068 9069
	/*
	 * For non-subvolumes allow exchange only within one subvolume, in the
	 * same inode namespace. Two subvolumes (represented as directory) can
	 * be exchanged as they're a logical link and have a fixed inode number.
	 */
	if (root != dest &&
	    (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
	     new_ino != BTRFS_FIRST_FREE_OBJECTID))
9070 9071 9072
		return -EXDEV;

	/* close the race window with snapshot create/destroy ioctl */
9073 9074
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
	    new_ino == BTRFS_FIRST_FREE_OBJECTID)
9075
		down_read(&fs_info->subvol_sem);
9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090

	/*
	 * We want to reserve the absolute worst case amount of items.  So if
	 * both inodes are subvols and we need to unlink them then that would
	 * require 4 item modifications, but if they are both normal inodes it
	 * would require 5 item modifications, so we'll assume their normal
	 * inodes.  So 5 * 2 is 10, plus 2 for the new links, so 12 total items
	 * should cover the worst case number of items we'll modify.
	 */
	trans = btrfs_start_transaction(root, 12);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_notrans;
	}

9091 9092 9093 9094 9095
	if (dest != root) {
		ret = btrfs_record_root_in_trans(trans, dest);
		if (ret)
			goto out_fail;
	}
9096

9097 9098 9099 9100
	/*
	 * We need to find a free sequence number both in the source and
	 * in the destination directory for the exchange.
	 */
9101
	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
9102 9103
	if (ret)
		goto out_fail;
9104
	ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
9105 9106 9107 9108 9109 9110 9111 9112 9113
	if (ret)
		goto out_fail;

	BTRFS_I(old_inode)->dir_index = 0ULL;
	BTRFS_I(new_inode)->dir_index = 0ULL;

	/* Reference for the source. */
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
		/* force full log commit if subvolume involved. */
9114
		btrfs_set_log_full_commit(trans);
9115 9116 9117 9118 9119
	} else {
		ret = btrfs_insert_inode_ref(trans, dest,
					     new_dentry->d_name.name,
					     new_dentry->d_name.len,
					     old_ino,
9120 9121
					     btrfs_ino(BTRFS_I(new_dir)),
					     old_idx);
9122 9123
		if (ret)
			goto out_fail;
9124
		need_abort = true;
9125 9126 9127 9128 9129
	}

	/* And now for the dest. */
	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
		/* force full log commit if subvolume involved. */
9130
		btrfs_set_log_full_commit(trans);
9131 9132 9133 9134 9135
	} else {
		ret = btrfs_insert_inode_ref(trans, root,
					     old_dentry->d_name.name,
					     old_dentry->d_name.len,
					     new_ino,
9136 9137
					     btrfs_ino(BTRFS_I(old_dir)),
					     new_idx);
9138 9139 9140
		if (ret) {
			if (need_abort)
				btrfs_abort_transaction(trans, ret);
9141
			goto out_fail;
9142
		}
9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155
	}

	/* Update inode version and ctime/mtime. */
	inode_inc_iversion(old_dir);
	inode_inc_iversion(new_dir);
	inode_inc_iversion(old_inode);
	inode_inc_iversion(new_inode);
	old_dir->i_ctime = old_dir->i_mtime = ctime;
	new_dir->i_ctime = new_dir->i_mtime = ctime;
	old_inode->i_ctime = ctime;
	new_inode->i_ctime = ctime;

	if (old_dentry->d_parent != new_dentry->d_parent) {
9156 9157 9158 9159
		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
				BTRFS_I(old_inode), 1);
		btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
				BTRFS_I(new_inode), 1);
9160 9161 9162 9163
	}

	/* src is a subvolume */
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
9164
		ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
9165
	} else { /* src is an inode */
9166
		ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
9167
					   BTRFS_I(old_dentry->d_inode),
9168
					   old_dentry->d_name.name,
9169 9170
					   old_dentry->d_name.len,
					   &old_rename_ctx);
9171
		if (!ret)
9172
			ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
9173 9174
	}
	if (ret) {
9175
		btrfs_abort_transaction(trans, ret);
9176 9177 9178 9179 9180
		goto out_fail;
	}

	/* dest is a subvolume */
	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
9181
		ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
9182
	} else { /* dest is an inode */
9183
		ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
9184
					   BTRFS_I(new_dentry->d_inode),
9185
					   new_dentry->d_name.name,
9186 9187
					   new_dentry->d_name.len,
					   &new_rename_ctx);
9188
		if (!ret)
9189
			ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
9190 9191
	}
	if (ret) {
9192
		btrfs_abort_transaction(trans, ret);
9193 9194 9195
		goto out_fail;
	}

9196
	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
9197 9198 9199
			     new_dentry->d_name.name,
			     new_dentry->d_name.len, 0, old_idx);
	if (ret) {
9200
		btrfs_abort_transaction(trans, ret);
9201 9202 9203
		goto out_fail;
	}

9204
	ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
9205 9206 9207
			     old_dentry->d_name.name,
			     old_dentry->d_name.len, 0, new_idx);
	if (ret) {
9208
		btrfs_abort_transaction(trans, ret);
9209 9210 9211 9212 9213 9214 9215 9216
		goto out_fail;
	}

	if (old_inode->i_nlink == 1)
		BTRFS_I(old_inode)->dir_index = old_idx;
	if (new_inode->i_nlink == 1)
		BTRFS_I(new_inode)->dir_index = new_idx;

9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229
	/*
	 * Now pin the logs of the roots. We do it to ensure that no other task
	 * can sync the logs while we are in progress with the rename, because
	 * that could result in an inconsistency in case any of the inodes that
	 * are part of this rename operation were logged before.
	 */
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_pin_log_trans(root);
	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_pin_log_trans(dest);

	/* Do the log updates for all inodes. */
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
9230
		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
9231
				   old_rename_ctx.index, new_dentry->d_parent);
9232
	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
9233
		btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
9234
				   new_rename_ctx.index, old_dentry->d_parent);
9235 9236 9237 9238 9239

	/* Now unpin the logs. */
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
		btrfs_end_log_trans(root);
	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
9240 9241
		btrfs_end_log_trans(dest);
out_fail:
9242 9243
	ret2 = btrfs_end_transaction(trans);
	ret = ret ? ret : ret2;
9244
out_notrans:
9245 9246
	if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
	    old_ino == BTRFS_FIRST_FREE_OBJECTID)
9247
		up_read(&fs_info->subvol_sem);
9248 9249 9250 9251 9252 9253

	return ret;
}

static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
9254
				     struct user_namespace *mnt_userns,
9255 9256 9257 9258 9259 9260 9261 9262
				     struct inode *dir,
				     struct dentry *dentry)
{
	int ret;
	struct inode *inode;
	u64 objectid;
	u64 index;

9263
	ret = btrfs_get_free_objectid(root, &objectid);
9264 9265 9266
	if (ret)
		return ret;

9267
	inode = btrfs_new_inode(trans, root, mnt_userns, dir,
9268 9269
				dentry->d_name.name,
				dentry->d_name.len,
9270
				btrfs_ino(BTRFS_I(dir)),
9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286
				objectid,
				S_IFCHR | WHITEOUT_MODE,
				&index);

	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		return ret;
	}

	inode->i_op = &btrfs_special_inode_operations;
	init_special_inode(inode, inode->i_mode,
		WHITEOUT_DEV);

	ret = btrfs_init_inode_security(trans, inode, dir,
				&dentry->d_name);
	if (ret)
9287
		goto out;
9288

9289 9290
	ret = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
				BTRFS_I(inode), 0, index);
9291
	if (ret)
9292
		goto out;
9293

9294
	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
9295
out:
9296
	unlock_new_inode(inode);
9297 9298
	if (ret)
		inode_dec_link_count(inode);
9299 9300
	iput(inode);

9301
	return ret;
9302 9303
}

9304 9305 9306 9307
static int btrfs_rename(struct user_namespace *mnt_userns,
			struct inode *old_dir, struct dentry *old_dentry,
			struct inode *new_dir, struct dentry *new_dentry,
			unsigned int flags)
C
Chris Mason 已提交
9308
{
9309
	struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
C
Chris Mason 已提交
9310
	struct btrfs_trans_handle *trans;
9311
	unsigned int trans_num_items;
C
Chris Mason 已提交
9312
	struct btrfs_root *root = BTRFS_I(old_dir)->root;
9313
	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
9314 9315
	struct inode *new_inode = d_inode(new_dentry);
	struct inode *old_inode = d_inode(old_dentry);
9316
	struct btrfs_rename_ctx rename_ctx;
9317
	u64 index = 0;
C
Chris Mason 已提交
9318
	int ret;
9319
	int ret2;
9320
	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
C
Chris Mason 已提交
9321

9322
	if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
9323 9324
		return -EPERM;

9325
	/* we only allow rename subvolume link between subvolumes */
L
Li Zefan 已提交
9326
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
9327 9328
		return -EXDEV;

L
Li Zefan 已提交
9329
	if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
9330
	    (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
C
Chris Mason 已提交
9331
		return -ENOTEMPTY;
9332

9333 9334 9335
	if (S_ISDIR(old_inode->i_mode) && new_inode &&
	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
		return -ENOTEMPTY;
C
Chris Mason 已提交
9336 9337 9338


	/* check for collisions, even if the  name isn't there */
9339
	ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
C
Chris Mason 已提交
9340 9341 9342 9343 9344 9345 9346
			     new_dentry->d_name.name,
			     new_dentry->d_name.len);

	if (ret) {
		if (ret == -EEXIST) {
			/* we shouldn't get
			 * eexist without a new_inode */
9347
			if (WARN_ON(!new_inode)) {
C
Chris Mason 已提交
9348 9349 9350 9351 9352 9353 9354 9355 9356
				return ret;
			}
		} else {
			/* maybe -EOVERFLOW */
			return ret;
		}
	}
	ret = 0;

9357
	/*
9358 9359
	 * we're using rename to replace one file with another.  Start IO on it
	 * now so  we don't add too much work to the end of the transaction
9360
	 */
9361
	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
9362 9363
		filemap_flush(old_inode->i_mapping);

9364
	/* close the racy window with snapshot create/destroy ioctl */
L
Li Zefan 已提交
9365
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9366
		down_read(&fs_info->subvol_sem);
9367 9368 9369 9370
	/*
	 * We want to reserve the absolute worst case amount of items.  So if
	 * both inodes are subvols and we need to unlink them then that would
	 * require 4 item modifications, but if they are both normal inodes it
9371
	 * would require 5 item modifications, so we'll assume they are normal
9372 9373
	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
	 * should cover the worst case number of items we'll modify.
9374 9375 9376
	 * If our rename has the whiteout flag, we need more 5 units for the
	 * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
	 * when selinux is enabled).
9377
	 */
9378 9379 9380 9381
	trans_num_items = 11;
	if (flags & RENAME_WHITEOUT)
		trans_num_items += 5;
	trans = btrfs_start_transaction(root, trans_num_items);
9382
	if (IS_ERR(trans)) {
9383 9384 9385
		ret = PTR_ERR(trans);
		goto out_notrans;
	}
9386

9387 9388 9389 9390 9391
	if (dest != root) {
		ret = btrfs_record_root_in_trans(trans, dest);
		if (ret)
			goto out_fail;
	}
9392

9393
	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
9394 9395
	if (ret)
		goto out_fail;
9396

9397
	BTRFS_I(old_inode)->dir_index = 0ULL;
L
Li Zefan 已提交
9398
	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9399
		/* force full log commit if subvolume involved. */
9400
		btrfs_set_log_full_commit(trans);
9401
	} else {
9402 9403 9404
		ret = btrfs_insert_inode_ref(trans, dest,
					     new_dentry->d_name.name,
					     new_dentry->d_name.len,
L
Li Zefan 已提交
9405
					     old_ino,
9406
					     btrfs_ino(BTRFS_I(new_dir)), index);
9407 9408
		if (ret)
			goto out_fail;
9409
	}
9410

9411 9412 9413
	inode_inc_iversion(old_dir);
	inode_inc_iversion(new_dir);
	inode_inc_iversion(old_inode);
9414 9415
	old_dir->i_ctime = old_dir->i_mtime =
	new_dir->i_ctime = new_dir->i_mtime =
9416
	old_inode->i_ctime = current_time(old_dir);
9417

9418
	if (old_dentry->d_parent != new_dentry->d_parent)
9419 9420
		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
				BTRFS_I(old_inode), 1);
9421

L
Li Zefan 已提交
9422
	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9423
		ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
9424
	} else {
9425
		ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
9426
					BTRFS_I(d_inode(old_dentry)),
9427
					old_dentry->d_name.name,
9428 9429
					old_dentry->d_name.len,
					&rename_ctx);
9430
		if (!ret)
9431
			ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
9432
	}
9433
	if (ret) {
9434
		btrfs_abort_transaction(trans, ret);
9435 9436
		goto out_fail;
	}
C
Chris Mason 已提交
9437 9438

	if (new_inode) {
9439
		inode_inc_iversion(new_inode);
9440
		new_inode->i_ctime = current_time(new_inode);
9441
		if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
9442
			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
9443
			ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
9444 9445
			BUG_ON(new_inode->i_nlink == 0);
		} else {
9446
			ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
9447
						 BTRFS_I(d_inode(new_dentry)),
9448 9449 9450
						 new_dentry->d_name.name,
						 new_dentry->d_name.len);
		}
9451
		if (!ret && new_inode->i_nlink == 0)
9452 9453
			ret = btrfs_orphan_add(trans,
					BTRFS_I(d_inode(new_dentry)));
9454
		if (ret) {
9455
			btrfs_abort_transaction(trans, ret);
9456 9457
			goto out_fail;
		}
C
Chris Mason 已提交
9458
	}
9459

9460
	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
9461
			     new_dentry->d_name.name,
9462
			     new_dentry->d_name.len, 0, index);
9463
	if (ret) {
9464
		btrfs_abort_transaction(trans, ret);
9465 9466
		goto out_fail;
	}
C
Chris Mason 已提交
9467

9468 9469 9470
	if (old_inode->i_nlink == 1)
		BTRFS_I(old_inode)->dir_index = index;

9471
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
9472
		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
9473
				   rename_ctx.index, new_dentry->d_parent);
9474 9475

	if (flags & RENAME_WHITEOUT) {
9476 9477
		ret = btrfs_whiteout_for_rename(trans, root, mnt_userns,
						old_dir, old_dentry);
9478 9479

		if (ret) {
9480
			btrfs_abort_transaction(trans, ret);
9481 9482
			goto out_fail;
		}
9483
	}
C
Chris Mason 已提交
9484
out_fail:
9485 9486
	ret2 = btrfs_end_transaction(trans);
	ret = ret ? ret : ret2;
9487
out_notrans:
L
Li Zefan 已提交
9488
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9489
		up_read(&fs_info->subvol_sem);
J
Josef Bacik 已提交
9490

C
Chris Mason 已提交
9491 9492 9493
	return ret;
}

9494 9495 9496
static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir,
			 struct dentry *old_dentry, struct inode *new_dir,
			 struct dentry *new_dentry, unsigned int flags)
M
Miklos Szeredi 已提交
9497
{
9498
	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
M
Miklos Szeredi 已提交
9499 9500
		return -EINVAL;

9501 9502 9503 9504
	if (flags & RENAME_EXCHANGE)
		return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
					  new_dentry);

9505 9506
	return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir,
			    new_dentry, flags);
M
Miklos Szeredi 已提交
9507 9508
}

9509 9510 9511 9512 9513 9514 9515
struct btrfs_delalloc_work {
	struct inode *inode;
	struct completion completion;
	struct list_head list;
	struct btrfs_work work;
};

9516 9517 9518
static void btrfs_run_delalloc_work(struct btrfs_work *work)
{
	struct btrfs_delalloc_work *delalloc_work;
9519
	struct inode *inode;
9520 9521 9522

	delalloc_work = container_of(work, struct btrfs_delalloc_work,
				     work);
9523
	inode = delalloc_work->inode;
9524 9525 9526
	filemap_flush(inode->i_mapping);
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
				&BTRFS_I(inode)->runtime_flags))
9527
		filemap_flush(inode->i_mapping);
9528

9529
	iput(inode);
9530 9531 9532
	complete(&delalloc_work->completion);
}

9533
static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
9534 9535 9536
{
	struct btrfs_delalloc_work *work;

9537
	work = kmalloc(sizeof(*work), GFP_NOFS);
9538 9539 9540 9541 9542 9543
	if (!work)
		return NULL;

	init_completion(&work->completion);
	INIT_LIST_HEAD(&work->list);
	work->inode = inode;
9544
	btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
9545 9546 9547 9548

	return work;
}

C
Chris Mason 已提交
9549 9550 9551 9552
/*
 * some fairly slow code that needs optimization. This walks the list
 * of all the inodes with pending delalloc and forces them to disk.
 */
9553 9554
static int start_delalloc_inodes(struct btrfs_root *root,
				 struct writeback_control *wbc, bool snapshot,
9555
				 bool in_reclaim_context)
9556 9557
{
	struct btrfs_inode *binode;
9558
	struct inode *inode;
9559 9560
	struct btrfs_delalloc_work *work, *next;
	struct list_head works;
9561
	struct list_head splice;
9562
	int ret = 0;
9563
	bool full_flush = wbc->nr_to_write == LONG_MAX;
9564

9565
	INIT_LIST_HEAD(&works);
9566
	INIT_LIST_HEAD(&splice);
9567

9568
	mutex_lock(&root->delalloc_mutex);
9569 9570
	spin_lock(&root->delalloc_lock);
	list_splice_init(&root->delalloc_inodes, &splice);
9571 9572
	while (!list_empty(&splice)) {
		binode = list_entry(splice.next, struct btrfs_inode,
9573
				    delalloc_inodes);
9574

9575 9576
		list_move_tail(&binode->delalloc_inodes,
			       &root->delalloc_inodes);
9577 9578 9579 9580 9581

		if (in_reclaim_context &&
		    test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
			continue;

9582
		inode = igrab(&binode->vfs_inode);
9583
		if (!inode) {
9584
			cond_resched_lock(&root->delalloc_lock);
9585
			continue;
9586
		}
9587
		spin_unlock(&root->delalloc_lock);
9588

9589 9590 9591
		if (snapshot)
			set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
				&binode->runtime_flags);
9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602
		if (full_flush) {
			work = btrfs_alloc_delalloc_work(inode);
			if (!work) {
				iput(inode);
				ret = -ENOMEM;
				goto out;
			}
			list_add_tail(&work->list, &works);
			btrfs_queue_work(root->fs_info->flush_workers,
					 &work->work);
		} else {
9603
			ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
9604 9605
			btrfs_add_delayed_iput(inode);
			if (ret || wbc->nr_to_write <= 0)
9606 9607
				goto out;
		}
9608
		cond_resched();
9609
		spin_lock(&root->delalloc_lock);
9610
	}
9611
	spin_unlock(&root->delalloc_lock);
9612

9613
out:
9614 9615
	list_for_each_entry_safe(work, next, &works, list) {
		list_del_init(&work->list);
9616 9617
		wait_for_completion(&work->completion);
		kfree(work);
9618 9619
	}

9620
	if (!list_empty(&splice)) {
9621 9622 9623 9624
		spin_lock(&root->delalloc_lock);
		list_splice_tail(&splice, &root->delalloc_inodes);
		spin_unlock(&root->delalloc_lock);
	}
9625
	mutex_unlock(&root->delalloc_mutex);
9626 9627
	return ret;
}
9628

9629
int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
9630
{
9631 9632 9633 9634 9635 9636
	struct writeback_control wbc = {
		.nr_to_write = LONG_MAX,
		.sync_mode = WB_SYNC_NONE,
		.range_start = 0,
		.range_end = LLONG_MAX,
	};
9637
	struct btrfs_fs_info *fs_info = root->fs_info;
9638

J
Josef Bacik 已提交
9639
	if (BTRFS_FS_ERROR(fs_info))
9640 9641
		return -EROFS;

9642
	return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
9643 9644
}

9645
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
9646
			       bool in_reclaim_context)
9647
{
9648
	struct writeback_control wbc = {
9649
		.nr_to_write = nr,
9650 9651 9652 9653
		.sync_mode = WB_SYNC_NONE,
		.range_start = 0,
		.range_end = LLONG_MAX,
	};
9654 9655 9656 9657
	struct btrfs_root *root;
	struct list_head splice;
	int ret;

J
Josef Bacik 已提交
9658
	if (BTRFS_FS_ERROR(fs_info))
9659 9660 9661 9662
		return -EROFS;

	INIT_LIST_HEAD(&splice);

9663
	mutex_lock(&fs_info->delalloc_root_mutex);
9664 9665
	spin_lock(&fs_info->delalloc_root_lock);
	list_splice_init(&fs_info->delalloc_roots, &splice);
9666
	while (!list_empty(&splice)) {
9667 9668 9669 9670
		/*
		 * Reset nr_to_write here so we know that we're doing a full
		 * flush.
		 */
9671
		if (nr == LONG_MAX)
9672 9673
			wbc.nr_to_write = LONG_MAX;

9674 9675
		root = list_first_entry(&splice, struct btrfs_root,
					delalloc_root);
9676
		root = btrfs_grab_root(root);
9677 9678 9679 9680 9681
		BUG_ON(!root);
		list_move_tail(&root->delalloc_root,
			       &fs_info->delalloc_roots);
		spin_unlock(&fs_info->delalloc_root_lock);

9682
		ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
9683
		btrfs_put_root(root);
9684
		if (ret < 0 || wbc.nr_to_write <= 0)
9685 9686
			goto out;
		spin_lock(&fs_info->delalloc_root_lock);
9687
	}
9688
	spin_unlock(&fs_info->delalloc_root_lock);
9689

9690
	ret = 0;
9691
out:
9692
	if (!list_empty(&splice)) {
9693 9694 9695
		spin_lock(&fs_info->delalloc_root_lock);
		list_splice_tail(&splice, &fs_info->delalloc_roots);
		spin_unlock(&fs_info->delalloc_root_lock);
9696
	}
9697
	mutex_unlock(&fs_info->delalloc_root_mutex);
9698
	return ret;
9699 9700
}

9701 9702
static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir,
			 struct dentry *dentry, const char *symname)
C
Chris Mason 已提交
9703
{
9704
	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
C
Chris Mason 已提交
9705 9706 9707 9708
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_path *path;
	struct btrfs_key key;
9709
	struct inode *inode = NULL;
C
Chris Mason 已提交
9710 9711
	int err;
	u64 objectid;
9712
	u64 index = 0;
C
Chris Mason 已提交
9713 9714
	int name_len;
	int datasize;
9715
	unsigned long ptr;
C
Chris Mason 已提交
9716
	struct btrfs_file_extent_item *ei;
9717
	struct extent_buffer *leaf;
C
Chris Mason 已提交
9718

9719
	name_len = strlen(symname);
9720
	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
C
Chris Mason 已提交
9721
		return -ENAMETOOLONG;
9722

J
Josef Bacik 已提交
9723 9724 9725
	/*
	 * 2 items for inode item and ref
	 * 2 items for dir items
9726 9727
	 * 1 item for updating parent inode item
	 * 1 item for the inline extent item
J
Josef Bacik 已提交
9728 9729
	 * 1 item for xattr if selinux is on
	 */
9730
	trans = btrfs_start_transaction(root, 7);
9731 9732
	if (IS_ERR(trans))
		return PTR_ERR(trans);
9733

9734
	err = btrfs_get_free_objectid(root, &objectid);
9735 9736 9737
	if (err)
		goto out_unlock;

9738
	inode = btrfs_new_inode(trans, root, mnt_userns, dir,
9739 9740 9741
				dentry->d_name.name, dentry->d_name.len,
				btrfs_ino(BTRFS_I(dir)), objectid,
				S_IFLNK | S_IRWXUGO, &index);
9742 9743
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
A
Al Viro 已提交
9744
		inode = NULL;
C
Chris Mason 已提交
9745
		goto out_unlock;
9746
	}
C
Chris Mason 已提交
9747

9748 9749 9750 9751 9752 9753 9754 9755
	/*
	* If the active LSM wants to access the inode during
	* d_instantiate it needs these. Smack checks to see
	* if the filesystem supports xattrs by looking at the
	* ops vector.
	*/
	inode->i_fop = &btrfs_file_operations;
	inode->i_op = &btrfs_file_inode_operations;
9756 9757 9758 9759
	inode->i_mapping->a_ops = &btrfs_aops;

	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
	if (err)
A
Al Viro 已提交
9760
		goto out_unlock;
9761

C
Chris Mason 已提交
9762
	path = btrfs_alloc_path();
9763 9764
	if (!path) {
		err = -ENOMEM;
A
Al Viro 已提交
9765
		goto out_unlock;
9766
	}
9767
	key.objectid = btrfs_ino(BTRFS_I(inode));
C
Chris Mason 已提交
9768
	key.offset = 0;
9769
	key.type = BTRFS_EXTENT_DATA_KEY;
C
Chris Mason 已提交
9770 9771 9772
	datasize = btrfs_file_extent_calc_inline_size(name_len);
	err = btrfs_insert_empty_item(trans, root, path, &key,
				      datasize);
9773
	if (err) {
9774
		btrfs_free_path(path);
A
Al Viro 已提交
9775
		goto out_unlock;
9776
	}
9777 9778 9779 9780 9781
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei,
C
Chris Mason 已提交
9782
				   BTRFS_FILE_EXTENT_INLINE);
C
Chris Mason 已提交
9783 9784 9785 9786 9787
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_compression(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);

C
Chris Mason 已提交
9788
	ptr = btrfs_file_extent_inline_start(ei);
9789 9790
	write_extent_buffer(leaf, symname, ptr, name_len);
	btrfs_mark_buffer_dirty(leaf);
C
Chris Mason 已提交
9791
	btrfs_free_path(path);
9792

C
Chris Mason 已提交
9793
	inode->i_op = &btrfs_symlink_inode_operations;
9794
	inode_nohighmem(inode);
Y
Yan Zheng 已提交
9795
	inode_set_bytes(inode, name_len);
9796
	btrfs_i_size_write(BTRFS_I(inode), name_len);
9797
	err = btrfs_update_inode(trans, root, BTRFS_I(inode));
9798 9799 9800 9801 9802 9803
	/*
	 * Last step, add directory indexes for our symlink inode. This is the
	 * last step to avoid extra cleanup of these indexes if an error happens
	 * elsewhere above.
	 */
	if (!err)
9804 9805
		err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
				BTRFS_I(inode), 0, index);
A
Al Viro 已提交
9806 9807
	if (err)
		goto out_unlock;
9808

9809
	d_instantiate_new(dentry, inode);
C
Chris Mason 已提交
9810 9811

out_unlock:
9812
	btrfs_end_transaction(trans);
A
Al Viro 已提交
9813
	if (err && inode) {
C
Chris Mason 已提交
9814
		inode_dec_link_count(inode);
A
Al Viro 已提交
9815
		discard_new_inode(inode);
C
Chris Mason 已提交
9816
	}
9817
	btrfs_btree_balance_dirty(fs_info);
C
Chris Mason 已提交
9818 9819
	return err;
}
9820

9821 9822
static struct btrfs_trans_handle *insert_prealloc_file_extent(
				       struct btrfs_trans_handle *trans_in,
9823 9824
				       struct btrfs_inode *inode,
				       struct btrfs_key *ins,
9825 9826 9827
				       u64 file_offset)
{
	struct btrfs_file_extent_item stack_fi;
9828
	struct btrfs_replace_extent_info extent_info;
9829 9830
	struct btrfs_trans_handle *trans = trans_in;
	struct btrfs_path *path;
9831 9832
	u64 start = ins->objectid;
	u64 len = ins->offset;
9833
	int qgroup_released;
9834
	int ret;
9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845

	memset(&stack_fi, 0, sizeof(stack_fi));

	btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
	btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
	btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
	btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
	btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
	btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
	/* Encryption and other encoding is reserved and all 0 */

9846 9847 9848
	qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len);
	if (qgroup_released < 0)
		return ERR_PTR(qgroup_released);
9849 9850

	if (trans) {
9851
		ret = insert_reserved_file_extent(trans, inode,
9852
						  file_offset, &stack_fi,
9853
						  true, qgroup_released);
9854
		if (ret)
9855
			goto free_qgroup;
9856 9857 9858 9859 9860 9861 9862 9863 9864 9865
		return trans;
	}

	extent_info.disk_offset = start;
	extent_info.disk_len = len;
	extent_info.data_offset = 0;
	extent_info.data_len = len;
	extent_info.file_offset = file_offset;
	extent_info.extent_buf = (char *)&stack_fi;
	extent_info.is_new_extent = true;
9866
	extent_info.qgroup_reserved = qgroup_released;
9867 9868 9869
	extent_info.insertions = 0;

	path = btrfs_alloc_path();
9870 9871 9872 9873
	if (!path) {
		ret = -ENOMEM;
		goto free_qgroup;
	}
9874

9875
	ret = btrfs_replace_file_extents(inode, path, file_offset,
9876 9877 9878 9879
				     file_offset + len - 1, &extent_info,
				     &trans);
	btrfs_free_path(path);
	if (ret)
9880
		goto free_qgroup;
9881
	return trans;
9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894

free_qgroup:
	/*
	 * We have released qgroup data range at the beginning of the function,
	 * and normally qgroup_released bytes will be freed when committing
	 * transaction.
	 * But if we error out early, we have to free what we have released
	 * or we leak qgroup data reservation.
	 */
	btrfs_qgroup_free_refroot(inode->root->fs_info,
			inode->root->root_key.objectid, qgroup_released,
			BTRFS_QGROUP_RSV_DATA);
	return ERR_PTR(ret);
9895
}
9896

9897 9898 9899 9900
static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
				       u64 start, u64 num_bytes, u64 min_size,
				       loff_t actual_len, u64 *alloc_hint,
				       struct btrfs_trans_handle *trans)
Y
Yan Zheng 已提交
9901
{
9902
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
J
Josef Bacik 已提交
9903 9904
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *em;
Y
Yan Zheng 已提交
9905 9906 9907
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_key ins;
	u64 cur_offset = start;
9908
	u64 clear_offset = start;
9909
	u64 i_size;
9910
	u64 cur_bytes;
9911
	u64 last_alloc = (u64)-1;
Y
Yan Zheng 已提交
9912
	int ret = 0;
9913
	bool own_trans = true;
9914
	u64 end = start + num_bytes - 1;
Y
Yan Zheng 已提交
9915

9916 9917
	if (trans)
		own_trans = false;
Y
Yan Zheng 已提交
9918
	while (num_bytes > 0) {
9919
		cur_bytes = min_t(u64, num_bytes, SZ_256M);
9920
		cur_bytes = max(cur_bytes, min_size);
9921 9922 9923 9924 9925 9926 9927
		/*
		 * If we are severely fragmented we could end up with really
		 * small allocations, so if the allocator is returning small
		 * chunks lets make its job easier by only searching for those
		 * sized chunks.
		 */
		cur_bytes = min(cur_bytes, last_alloc);
9928 9929
		ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
				min_size, 0, *alloc_hint, &ins, 1, 0);
9930
		if (ret)
9931
			break;
9932 9933 9934 9935 9936 9937 9938 9939 9940

		/*
		 * We've reserved this space, and thus converted it from
		 * ->bytes_may_use to ->bytes_reserved.  Any error that happens
		 * from here on out we will only need to clear our reservation
		 * for the remaining unreserved area, so advance our
		 * clear_offset by our extent size.
		 */
		clear_offset += ins.offset;
9941

9942
		last_alloc = ins.offset;
9943 9944
		trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
						    &ins, cur_offset);
9945 9946 9947 9948 9949 9950 9951
		/*
		 * Now that we inserted the prealloc extent we can finally
		 * decrement the number of reservations in the block group.
		 * If we did it before, we could race with relocation and have
		 * relocation miss the reserved extent, making it fail later.
		 */
		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
9952 9953
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
9954
			btrfs_free_reserved_extent(fs_info, ins.objectid,
9955
						   ins.offset, 0);
9956 9957
			break;
		}
9958

9959
		btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
C
Chris Mason 已提交
9960
					cur_offset + ins.offset -1, 0);
9961

J
Josef Bacik 已提交
9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973
		em = alloc_extent_map();
		if (!em) {
			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
				&BTRFS_I(inode)->runtime_flags);
			goto next;
		}

		em->start = cur_offset;
		em->orig_start = cur_offset;
		em->len = ins.offset;
		em->block_start = ins.objectid;
		em->block_len = ins.offset;
9974
		em->orig_block_len = ins.offset;
J
Josef Bacik 已提交
9975
		em->ram_bytes = ins.offset;
J
Josef Bacik 已提交
9976 9977 9978 9979 9980
		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
		em->generation = trans->transid;

		while (1) {
			write_lock(&em_tree->lock);
J
Josef Bacik 已提交
9981
			ret = add_extent_mapping(em_tree, em, 1);
J
Josef Bacik 已提交
9982 9983 9984
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST)
				break;
9985
			btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
J
Josef Bacik 已提交
9986 9987 9988 9989 9990
						cur_offset + ins.offset - 1,
						0);
		}
		free_extent_map(em);
next:
Y
Yan Zheng 已提交
9991 9992
		num_bytes -= ins.offset;
		cur_offset += ins.offset;
9993
		*alloc_hint = ins.objectid + ins.offset;
9994

9995
		inode_inc_iversion(inode);
9996
		inode->i_ctime = current_time(inode);
9997
		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
Y
Yan Zheng 已提交
9998
		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
9999 10000
		    (actual_len > inode->i_size) &&
		    (cur_offset > inode->i_size)) {
10001
			if (cur_offset > actual_len)
10002
				i_size = actual_len;
10003
			else
10004 10005
				i_size = cur_offset;
			i_size_write(inode, i_size);
10006
			btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
10007 10008
		}

10009
		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
10010 10011

		if (ret) {
10012
			btrfs_abort_transaction(trans, ret);
10013
			if (own_trans)
10014
				btrfs_end_transaction(trans);
10015 10016
			break;
		}
Y
Yan Zheng 已提交
10017

10018
		if (own_trans) {
10019
			btrfs_end_transaction(trans);
10020 10021
			trans = NULL;
		}
10022
	}
10023
	if (clear_offset < end)
10024
		btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
10025
			end - clear_offset + 1);
Y
Yan Zheng 已提交
10026 10027 10028
	return ret;
}

10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046
int btrfs_prealloc_file_range(struct inode *inode, int mode,
			      u64 start, u64 num_bytes, u64 min_size,
			      loff_t actual_len, u64 *alloc_hint)
{
	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
					   min_size, actual_len, alloc_hint,
					   NULL);
}

int btrfs_prealloc_file_range_trans(struct inode *inode,
				    struct btrfs_trans_handle *trans, int mode,
				    u64 start, u64 num_bytes, u64 min_size,
				    loff_t actual_len, u64 *alloc_hint)
{
	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
					   min_size, actual_len, alloc_hint, trans);
}

10047 10048 10049 10050 10051
static int btrfs_set_page_dirty(struct page *page)
{
	return __set_page_dirty_nobuffers(page);
}

10052 10053
static int btrfs_permission(struct user_namespace *mnt_userns,
			    struct inode *inode, int mask)
Y
Yan 已提交
10054
{
L
Li Zefan 已提交
10055
	struct btrfs_root *root = BTRFS_I(inode)->root;
10056
	umode_t mode = inode->i_mode;
L
Li Zefan 已提交
10057

10058 10059 10060 10061 10062 10063 10064
	if (mask & MAY_WRITE &&
	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
		if (btrfs_root_readonly(root))
			return -EROFS;
		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
			return -EACCES;
	}
10065
	return generic_permission(mnt_userns, inode, mask);
Y
Yan 已提交
10066
}
C
Chris Mason 已提交
10067

10068 10069
static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
			 struct dentry *dentry, umode_t mode)
10070
{
10071
	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct inode *inode = NULL;
	u64 objectid;
	u64 index;
	int ret = 0;

	/*
	 * 5 units required for adding orphan entry
	 */
	trans = btrfs_start_transaction(root, 5);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

10086
	ret = btrfs_get_free_objectid(root, &objectid);
10087 10088 10089
	if (ret)
		goto out;

10090
	inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0,
10091
			btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		inode = NULL;
		goto out;
	}

	inode->i_fop = &btrfs_file_operations;
	inode->i_op = &btrfs_file_inode_operations;

	inode->i_mapping->a_ops = &btrfs_aops;

10103 10104
	ret = btrfs_init_inode_security(trans, inode, dir, NULL);
	if (ret)
A
Al Viro 已提交
10105
		goto out;
10106

10107
	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
10108
	if (ret)
A
Al Viro 已提交
10109
		goto out;
10110
	ret = btrfs_orphan_add(trans, BTRFS_I(inode));
10111
	if (ret)
A
Al Viro 已提交
10112
		goto out;
10113

10114 10115 10116 10117 10118 10119 10120 10121
	/*
	 * We set number of links to 0 in btrfs_new_inode(), and here we set
	 * it to 1 because d_tmpfile() will issue a warning if the count is 0,
	 * through:
	 *
	 *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
	 */
	set_nlink(inode, 1);
10122
	d_tmpfile(dentry, inode);
A
Al Viro 已提交
10123
	unlock_new_inode(inode);
10124 10125
	mark_inode_dirty(inode);
out:
10126
	btrfs_end_transaction(trans);
A
Al Viro 已提交
10127 10128
	if (ret && inode)
		discard_new_inode(inode);
10129
	btrfs_btree_balance_dirty(fs_info);
10130 10131 10132
	return ret;
}

10133
void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
10134
{
10135
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
10136 10137 10138
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;
	struct page *page;
10139
	u32 len;
10140

10141 10142
	ASSERT(end + 1 - start <= U32_MAX);
	len = end + 1 - start;
10143
	while (index <= end_index) {
10144
		page = find_get_page(inode->vfs_inode.i_mapping, index);
10145
		ASSERT(page); /* Pages should be in the extent_io_tree */
10146 10147

		btrfs_page_set_writeback(fs_info, page, start, len);
10148 10149 10150 10151 10152
		put_page(page);
		index++;
	}
}

O
Omar Sandoval 已提交
10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172
#ifdef CONFIG_SWAP
/*
 * Add an entry indicating a block group or device which is pinned by a
 * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
 * negative errno on failure.
 */
static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
				  bool is_block_group)
{
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	struct btrfs_swapfile_pin *sp, *entry;
	struct rb_node **p;
	struct rb_node *parent = NULL;

	sp = kmalloc(sizeof(*sp), GFP_NOFS);
	if (!sp)
		return -ENOMEM;
	sp->ptr = ptr;
	sp->inode = inode;
	sp->is_block_group = is_block_group;
10173
	sp->bg_extent_count = 1;
O
Omar Sandoval 已提交
10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186

	spin_lock(&fs_info->swapfile_pins_lock);
	p = &fs_info->swapfile_pins.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
		if (sp->ptr < entry->ptr ||
		    (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
			p = &(*p)->rb_left;
		} else if (sp->ptr > entry->ptr ||
			   (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
			p = &(*p)->rb_right;
		} else {
10187 10188
			if (is_block_group)
				entry->bg_extent_count++;
O
Omar Sandoval 已提交
10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213
			spin_unlock(&fs_info->swapfile_pins_lock);
			kfree(sp);
			return 1;
		}
	}
	rb_link_node(&sp->node, parent, p);
	rb_insert_color(&sp->node, &fs_info->swapfile_pins);
	spin_unlock(&fs_info->swapfile_pins_lock);
	return 0;
}

/* Free all of the entries pinned by this swapfile. */
static void btrfs_free_swapfile_pins(struct inode *inode)
{
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	struct btrfs_swapfile_pin *sp;
	struct rb_node *node, *next;

	spin_lock(&fs_info->swapfile_pins_lock);
	node = rb_first(&fs_info->swapfile_pins);
	while (node) {
		next = rb_next(node);
		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
		if (sp->inode == inode) {
			rb_erase(&sp->node, &fs_info->swapfile_pins);
10214 10215 10216
			if (sp->is_block_group) {
				btrfs_dec_block_group_swap_extents(sp->ptr,
							   sp->bg_extent_count);
O
Omar Sandoval 已提交
10217
				btrfs_put_block_group(sp->ptr);
10218
			}
O
Omar Sandoval 已提交
10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239
			kfree(sp);
		}
		node = next;
	}
	spin_unlock(&fs_info->swapfile_pins_lock);
}

struct btrfs_swap_info {
	u64 start;
	u64 block_start;
	u64 block_len;
	u64 lowest_ppage;
	u64 highest_ppage;
	unsigned long nr_pages;
	int nr_extents;
};

static int btrfs_add_swap_extent(struct swap_info_struct *sis,
				 struct btrfs_swap_info *bsi)
{
	unsigned long nr_pages;
10240
	unsigned long max_pages;
O
Omar Sandoval 已提交
10241 10242 10243
	u64 first_ppage, first_ppage_reported, next_ppage;
	int ret;

10244 10245 10246 10247 10248 10249 10250 10251 10252
	/*
	 * Our swapfile may have had its size extended after the swap header was
	 * written. In that case activating the swapfile should not go beyond
	 * the max size set in the swap header.
	 */
	if (bsi->nr_pages >= sis->max)
		return 0;

	max_pages = sis->max - bsi->nr_pages;
O
Omar Sandoval 已提交
10253 10254 10255 10256 10257 10258 10259
	first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT;
	next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len,
				PAGE_SIZE) >> PAGE_SHIFT;

	if (first_ppage >= next_ppage)
		return 0;
	nr_pages = next_ppage - first_ppage;
10260
	nr_pages = min(nr_pages, max_pages);
O
Omar Sandoval 已提交
10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289

	first_ppage_reported = first_ppage;
	if (bsi->start == 0)
		first_ppage_reported++;
	if (bsi->lowest_ppage > first_ppage_reported)
		bsi->lowest_ppage = first_ppage_reported;
	if (bsi->highest_ppage < (next_ppage - 1))
		bsi->highest_ppage = next_ppage - 1;

	ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
	if (ret < 0)
		return ret;
	bsi->nr_extents += ret;
	bsi->nr_pages += nr_pages;
	return 0;
}

static void btrfs_swap_deactivate(struct file *file)
{
	struct inode *inode = file_inode(file);

	btrfs_free_swapfile_pins(inode);
	atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
}

static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
			       sector_t *span)
{
	struct inode *inode = file_inode(file);
10290 10291
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
O
Omar Sandoval 已提交
10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_state *cached_state = NULL;
	struct extent_map *em = NULL;
	struct btrfs_device *device = NULL;
	struct btrfs_swap_info bsi = {
		.lowest_ppage = (sector_t)-1ULL,
	};
	int ret = 0;
	u64 isize;
	u64 start;

	/*
	 * If the swap file was just created, make sure delalloc is done. If the
	 * file changes again after this, the user is doing something stupid and
	 * we don't really care.
	 */
	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
	if (ret)
		return ret;

	/*
	 * The inode is locked, so these flags won't change after we check them.
	 */
	if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
		btrfs_warn(fs_info, "swapfile must not be compressed");
		return -EINVAL;
	}
	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
		btrfs_warn(fs_info, "swapfile must not be copy-on-write");
		return -EINVAL;
	}
	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
		btrfs_warn(fs_info, "swapfile must not be checksummed");
		return -EINVAL;
	}

	/*
	 * Balance or device remove/replace/resize can move stuff around from
10330 10331 10332 10333 10334
	 * under us. The exclop protection makes sure they aren't running/won't
	 * run concurrently while we are mapping the swap extents, and
	 * fs_info->swapfile_pins prevents them from running while the swap
	 * file is active and moving the extents. Note that this also prevents
	 * a concurrent device add which isn't actually necessary, but it's not
O
Omar Sandoval 已提交
10335 10336
	 * really worth the trouble to allow it.
	 */
10337
	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
O
Omar Sandoval 已提交
10338 10339 10340 10341
		btrfs_warn(fs_info,
	   "cannot activate swapfile while exclusive operation is running");
		return -EBUSY;
	}
10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355

	/*
	 * Prevent snapshot creation while we are activating the swap file.
	 * We do not want to race with snapshot creation. If snapshot creation
	 * already started before we bumped nr_swapfiles from 0 to 1 and
	 * completes before the first write into the swap file after it is
	 * activated, than that write would fallback to COW.
	 */
	if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
		btrfs_exclop_finish(fs_info);
		btrfs_warn(fs_info,
	   "cannot activate swapfile because snapshot creation is in progress");
		return -EINVAL;
	}
O
Omar Sandoval 已提交
10356 10357 10358 10359 10360 10361
	/*
	 * Snapshots can create extents which require COW even if NODATACOW is
	 * set. We use this counter to prevent snapshots. We must increment it
	 * before walking the extents because we don't want a concurrent
	 * snapshot to run after we've already checked the extents.
	 */
10362
	atomic_inc(&root->nr_swapfiles);
O
Omar Sandoval 已提交
10363 10364 10365 10366 10367 10368 10369

	isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);

	lock_extent_bits(io_tree, 0, isize - 1, &cached_state);
	start = 0;
	while (start < isize) {
		u64 logical_block_start, physical_block_start;
10370
		struct btrfs_block_group *bg;
O
Omar Sandoval 已提交
10371 10372
		u64 len = isize - start;

10373
		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
O
Omar Sandoval 已提交
10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}

		if (em->block_start == EXTENT_MAP_HOLE) {
			btrfs_warn(fs_info, "swapfile must not have holes");
			ret = -EINVAL;
			goto out;
		}
		if (em->block_start == EXTENT_MAP_INLINE) {
			/*
			 * It's unlikely we'll ever actually find ourselves
			 * here, as a file small enough to fit inline won't be
			 * big enough to store more than the swap header, but in
			 * case something changes in the future, let's catch it
			 * here rather than later.
			 */
			btrfs_warn(fs_info, "swapfile must not be inline");
			ret = -EINVAL;
			goto out;
		}
		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
			btrfs_warn(fs_info, "swapfile must not be compressed");
			ret = -EINVAL;
			goto out;
		}

		logical_block_start = em->block_start + (start - em->start);
		len = min(len, em->len - (start - em->start));
		free_extent_map(em);
		em = NULL;

10407
		ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true);
O
Omar Sandoval 已提交
10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458
		if (ret < 0) {
			goto out;
		} else if (ret) {
			ret = 0;
		} else {
			btrfs_warn(fs_info,
				   "swapfile must not be copy-on-write");
			ret = -EINVAL;
			goto out;
		}

		em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}

		if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
			btrfs_warn(fs_info,
				   "swapfile must have single data profile");
			ret = -EINVAL;
			goto out;
		}

		if (device == NULL) {
			device = em->map_lookup->stripes[0].dev;
			ret = btrfs_add_swapfile_pin(inode, device, false);
			if (ret == 1)
				ret = 0;
			else if (ret)
				goto out;
		} else if (device != em->map_lookup->stripes[0].dev) {
			btrfs_warn(fs_info, "swapfile must be on one device");
			ret = -EINVAL;
			goto out;
		}

		physical_block_start = (em->map_lookup->stripes[0].physical +
					(logical_block_start - em->start));
		len = min(len, em->len - (logical_block_start - em->start));
		free_extent_map(em);
		em = NULL;

		bg = btrfs_lookup_block_group(fs_info, logical_block_start);
		if (!bg) {
			btrfs_warn(fs_info,
			   "could not find block group containing swapfile");
			ret = -EINVAL;
			goto out;
		}

10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469
		if (!btrfs_inc_block_group_swap_extents(bg)) {
			btrfs_warn(fs_info,
			   "block group for swapfile at %llu is read-only%s",
			   bg->start,
			   atomic_read(&fs_info->scrubs_running) ?
				       " (scrub running)" : "");
			btrfs_put_block_group(bg);
			ret = -EINVAL;
			goto out;
		}

O
Omar Sandoval 已提交
10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507
		ret = btrfs_add_swapfile_pin(inode, bg, true);
		if (ret) {
			btrfs_put_block_group(bg);
			if (ret == 1)
				ret = 0;
			else
				goto out;
		}

		if (bsi.block_len &&
		    bsi.block_start + bsi.block_len == physical_block_start) {
			bsi.block_len += len;
		} else {
			if (bsi.block_len) {
				ret = btrfs_add_swap_extent(sis, &bsi);
				if (ret)
					goto out;
			}
			bsi.start = start;
			bsi.block_start = physical_block_start;
			bsi.block_len = len;
		}

		start += len;
	}

	if (bsi.block_len)
		ret = btrfs_add_swap_extent(sis, &bsi);

out:
	if (!IS_ERR_OR_NULL(em))
		free_extent_map(em);

	unlock_extent_cached(io_tree, 0, isize - 1, &cached_state);

	if (ret)
		btrfs_swap_deactivate(file);

10508 10509
	btrfs_drew_write_unlock(&root->snapshot_lock);

10510
	btrfs_exclop_finish(fs_info);
O
Omar Sandoval 已提交
10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534

	if (ret)
		return ret;

	if (device)
		sis->bdev = device->bdev;
	*span = bsi.highest_ppage - bsi.lowest_ppage + 1;
	sis->max = bsi.nr_pages;
	sis->pages = bsi.nr_pages - 1;
	sis->highest_bit = bsi.nr_pages - 1;
	return bsi.nr_extents;
}
#else
static void btrfs_swap_deactivate(struct file *file)
{
}

static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
			       sector_t *span)
{
	return -EOPNOTSUPP;
}
#endif

10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555
/*
 * Update the number of bytes used in the VFS' inode. When we replace extents in
 * a range (clone, dedupe, fallocate's zero range), we must update the number of
 * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls
 * always get a correct value.
 */
void btrfs_update_inode_bytes(struct btrfs_inode *inode,
			      const u64 add_bytes,
			      const u64 del_bytes)
{
	if (add_bytes == del_bytes)
		return;

	spin_lock(&inode->lock);
	if (del_bytes > 0)
		inode_sub_bytes(&inode->vfs_inode, del_bytes);
	if (add_bytes > 0)
		inode_add_bytes(&inode->vfs_inode, add_bytes);
	spin_unlock(&inode->lock);
}

10556
static const struct inode_operations btrfs_dir_inode_operations = {
10557
	.getattr	= btrfs_getattr,
C
Chris Mason 已提交
10558 10559 10560 10561 10562 10563
	.lookup		= btrfs_lookup,
	.create		= btrfs_create,
	.unlink		= btrfs_unlink,
	.link		= btrfs_link,
	.mkdir		= btrfs_mkdir,
	.rmdir		= btrfs_rmdir,
10564
	.rename		= btrfs_rename2,
C
Chris Mason 已提交
10565 10566
	.symlink	= btrfs_symlink,
	.setattr	= btrfs_setattr,
J
Josef Bacik 已提交
10567
	.mknod		= btrfs_mknod,
J
Josef Bacik 已提交
10568
	.listxattr	= btrfs_listxattr,
Y
Yan 已提交
10569
	.permission	= btrfs_permission,
10570
	.get_acl	= btrfs_get_acl,
10571
	.set_acl	= btrfs_set_acl,
10572
	.update_time	= btrfs_update_time,
10573
	.tmpfile        = btrfs_tmpfile,
M
Miklos Szeredi 已提交
10574 10575
	.fileattr_get	= btrfs_fileattr_get,
	.fileattr_set	= btrfs_fileattr_set,
C
Chris Mason 已提交
10576
};
10577

10578
static const struct file_operations btrfs_dir_file_operations = {
C
Chris Mason 已提交
10579 10580
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
10581
	.iterate_shared	= btrfs_real_readdir,
10582
	.open		= btrfs_opendir,
C
Christoph Hellwig 已提交
10583
	.unlocked_ioctl	= btrfs_ioctl,
C
Chris Mason 已提交
10584
#ifdef CONFIG_COMPAT
10585
	.compat_ioctl	= btrfs_compat_ioctl,
C
Chris Mason 已提交
10586
#endif
S
Sage Weil 已提交
10587
	.release        = btrfs_release_file,
10588
	.fsync		= btrfs_sync_file,
C
Chris Mason 已提交
10589 10590
};

10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602
/*
 * btrfs doesn't support the bmap operation because swapfiles
 * use bmap to make a mapping of extents in the file.  They assume
 * these extents won't change over the life of the file and they
 * use the bmap result to do IO directly to the drive.
 *
 * the btrfs bmap call would return logical addresses that aren't
 * suitable for IO and they also will change frequently as COW
 * operations happen.  So, swapfile + btrfs == corruption.
 *
 * For now we're avoiding this by dropping bmap.
 */
10603
static const struct address_space_operations btrfs_aops = {
C
Chris Mason 已提交
10604 10605
	.readpage	= btrfs_readpage,
	.writepage	= btrfs_writepage,
C
Chris Mason 已提交
10606
	.writepages	= btrfs_writepages,
10607
	.readahead	= btrfs_readahead,
10608
	.direct_IO	= noop_direct_IO,
10609 10610
	.invalidatepage = btrfs_invalidatepage,
	.releasepage	= btrfs_releasepage,
10611 10612 10613
#ifdef CONFIG_MIGRATION
	.migratepage	= btrfs_migratepage,
#endif
10614
	.set_page_dirty	= btrfs_set_page_dirty,
10615
	.error_remove_page = generic_error_remove_page,
O
Omar Sandoval 已提交
10616 10617
	.swap_activate	= btrfs_swap_activate,
	.swap_deactivate = btrfs_swap_deactivate,
C
Chris Mason 已提交
10618 10619
};

10620
static const struct inode_operations btrfs_file_inode_operations = {
C
Chris Mason 已提交
10621 10622
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
J
Josef Bacik 已提交
10623
	.listxattr      = btrfs_listxattr,
Y
Yan 已提交
10624
	.permission	= btrfs_permission,
Y
Yehuda Sadeh 已提交
10625
	.fiemap		= btrfs_fiemap,
10626
	.get_acl	= btrfs_get_acl,
10627
	.set_acl	= btrfs_set_acl,
10628
	.update_time	= btrfs_update_time,
M
Miklos Szeredi 已提交
10629 10630
	.fileattr_get	= btrfs_fileattr_get,
	.fileattr_set	= btrfs_fileattr_set,
C
Chris Mason 已提交
10631
};
10632
static const struct inode_operations btrfs_special_inode_operations = {
J
Josef Bacik 已提交
10633 10634
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
Y
Yan 已提交
10635
	.permission	= btrfs_permission,
J
Josef Bacik 已提交
10636
	.listxattr	= btrfs_listxattr,
10637
	.get_acl	= btrfs_get_acl,
10638
	.set_acl	= btrfs_set_acl,
10639
	.update_time	= btrfs_update_time,
J
Josef Bacik 已提交
10640
};
10641
static const struct inode_operations btrfs_symlink_inode_operations = {
10642
	.get_link	= page_get_link,
10643
	.getattr	= btrfs_getattr,
10644
	.setattr	= btrfs_setattr,
Y
Yan 已提交
10645
	.permission	= btrfs_permission,
J
Jim Owens 已提交
10646
	.listxattr	= btrfs_listxattr,
10647
	.update_time	= btrfs_update_time,
C
Chris Mason 已提交
10648
};
10649

10650
const struct dentry_operations btrfs_dentry_operations = {
10651 10652
	.d_delete	= btrfs_dentry_delete,
};