// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include <linux/magic.h>
#include <linux/iversion.h>
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/sched/mm.h>
#include <asm/unaligned.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "props.h"
#include "qgroup.h"
#include "delalloc-space.h"
#include "block-group.h"

struct btrfs_iget_args {
	struct btrfs_key *location;
	struct btrfs_root *root;
};

struct btrfs_dio_data {
	u64 reserve;
	u64 unsubmitted_oe_range_start;
	u64 unsubmitted_oe_range_end;
	int overwrite;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct file_operations btrfs_dir_file_operations;
static const struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;
struct kmem_cache *btrfs_free_space_bitmap_cachep;

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode, bool skip_writeback);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written, int unlock);
static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
				       u64 orig_start, u64 block_start,
				       u64 block_len, u64 orig_block_len,
				       u64 ram_bytes, int compress_type,
				       int type);

static void __endio_write_update_ordered(struct inode *inode,
					 const u64 offset, const u64 bytes,
					 const bool uptodate);

/*
 * Cleanup all submitted ordered extents in specified range to handle errors
 * from the btrfs_run_delalloc_range() callback.
 *
 * NOTE: caller must ensure that when an error happens, it can not call
 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
 * to be released, which we want to happen only when finishing the ordered
 * extent (btrfs_finish_ordered_io()).
 */
static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
						 struct page *locked_page,
						 u64 offset, u64 bytes)
{
	unsigned long index = offset >> PAGE_SHIFT;
	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
	u64 page_start = page_offset(locked_page);
	u64 page_end = page_start + PAGE_SIZE - 1;

	struct page *page;

	while (index <= end_index) {
		page = find_get_page(inode->i_mapping, index);
		index++;
		if (!page)
			continue;
		ClearPagePrivate2(page);
		put_page(page);
	}

	/*
	 * In case this page belongs to the delalloc range being instantiated
	 * then skip it, since the first page of a range is going to be
	 * properly cleaned up by the caller of run_delalloc_range
	 */
	if (page_start >= offset && page_end <= (offset + bytes - 1)) {
		offset += PAGE_SIZE;
		bytes -= PAGE_SIZE;
	}

	return __endio_write_update_ordered(inode, offset, bytes, false);
}
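
/*
 * Illustrative call-site sketch (not from this file): the helper above is
 * meant to run in the error path of the delalloc callback named in its
 * comment, roughly like:
 *
 *	ret = <chosen delalloc strategy>(inode, locked_page, start, end, ...);
 *	if (ret < 0)
 *		btrfs_cleanup_ordered_extents(inode, locked_page, start,
 *					      end - start + 1);
 */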

static int btrfs_dirty_inode(struct inode *inode);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_inode_set_ops(struct inode *inode)
{
	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
}
#endif

static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct inode *inode,  struct inode *dir,
				     const struct qstr *qstr)
{
	int err;

	err = btrfs_init_acl(trans, inode, dir);
	if (!err)
		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
	return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
				struct btrfs_path *path, int extent_inserted,
				struct btrfs_root *root, struct inode *inode,
				u64 start, size_t size, size_t compressed_size,
				int compress_type,
				struct page **compressed_pages)
{
	struct extent_buffer *leaf;
	struct page *page = NULL;
	char *kaddr;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	int ret;
	size_t cur_size = size;
	unsigned long offset;

	ASSERT((compressed_size > 0 && compressed_pages) ||
	       (compressed_size == 0 && !compressed_pages));

	if (compressed_size && compressed_pages)
		cur_size = compressed_size;

	inode_add_bytes(inode, size);

	if (!extent_inserted) {
		struct btrfs_key key;
		size_t datasize;

		key.objectid = btrfs_ino(BTRFS_I(inode));
		key.offset = start;
		key.type = BTRFS_EXTENT_DATA_KEY;

		datasize = btrfs_file_extent_calc_inline_size(cur_size);
		path->leave_spinning = 1;
		ret = btrfs_insert_empty_item(trans, root, path, &key,
					      datasize);
		if (ret)
			goto fail;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
	ptr = btrfs_file_extent_inline_start(ei);

	if (compress_type != BTRFS_COMPRESS_NONE) {
		struct page *cpage;
		int i = 0;
		while (compressed_size > 0) {
			cpage = compressed_pages[i];
			cur_size = min_t(unsigned long, compressed_size,
				       PAGE_SIZE);

			kaddr = kmap_atomic(cpage);
			write_extent_buffer(leaf, kaddr, ptr, cur_size);
			kunmap_atomic(kaddr);

			i++;
			ptr += cur_size;
			compressed_size -= cur_size;
		}
		btrfs_set_file_extent_compression(leaf, ei,
						  compress_type);
	} else {
		page = find_get_page(inode->i_mapping,
				     start >> PAGE_SHIFT);
		btrfs_set_file_extent_compression(leaf, ei, 0);
		kaddr = kmap_atomic(page);
		offset = offset_in_page(start);
		write_extent_buffer(leaf, kaddr + offset, ptr, size);
		kunmap_atomic(kaddr);
		put_page(page);
	}
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	/*
	 * We align size to sectorsize for inline extents just for simplicity
	 * sake.
	 */
	size = ALIGN(size, root->fs_info->sectorsize);
	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, size);
	if (ret)
		goto fail;

	/*
	 * we're an inline extent, so nobody can
	 * extend the file past i_size without locking
	 * a page we already have locked.
	 *
	 * We must do any isize and inode updates
	 * before we unlock the pages.  Otherwise we
	 * could end up racing with unlink.
	 */
	BTRFS_I(inode)->disk_i_size = inode->i_size;
	ret = btrfs_update_inode(trans, root, inode);

fail:
	return ret;
}


/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct inode *inode, u64 start,
					  u64 end, size_t compressed_size,
					  int compress_type,
					  struct page **compressed_pages)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	u64 isize = i_size_read(inode);
	u64 actual_end = min(end + 1, isize);
	u64 inline_len = actual_end - start;
	u64 aligned_end = ALIGN(end, fs_info->sectorsize);
	u64 data_len = inline_len;
	int ret;
	struct btrfs_path *path;
	int extent_inserted = 0;
	u32 extent_item_size;

	if (compressed_size)
		data_len = compressed_size;

	if (start > 0 ||
	    actual_end > fs_info->sectorsize ||
	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
	    (!compressed_size &&
	    (actual_end & (fs_info->sectorsize - 1)) == 0) ||
	    end + 1 < isize ||
	    data_len > fs_info->max_inline) {
		return 1;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	trans->block_rsv = &BTRFS_I(inode)->block_rsv;

	if (compressed_size && compressed_pages)
		extent_item_size = btrfs_file_extent_calc_inline_size(
		   compressed_size);
	else
		extent_item_size = btrfs_file_extent_calc_inline_size(
		    inline_len);

	ret = __btrfs_drop_extents(trans, root, inode, path,
				   start, aligned_end, NULL,
				   1, 1, extent_item_size, &extent_inserted);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	if (isize > actual_end)
		inline_len = min_t(u64, isize, actual_end);
	ret = insert_inline_extent(trans, path, extent_inserted,
				   root, inode, start,
				   inline_len, compressed_size,
				   compress_type, compressed_pages);
	if (ret && ret != -ENOSPC) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	} else if (ret == -ENOSPC) {
		ret = 1;
		goto out;
	}

	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
	btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
out:
	/*
	 * Don't forget to free the reserved space, as for inlined extent
	 * it won't count as data extent, free them directly here.
	 * And at reserve time, it's always aligned to page size, so
	 * just free one page here.
	 */
	btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
	btrfs_free_path(path);
	btrfs_end_transaction(trans);
	return ret;
}
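
/*
 * A minimal sketch (not part of the original file) of the eligibility test
 * above folded into a standalone predicate; the helper name is hypothetical
 * and exists purely to restate the conditions cow_file_range_inline()
 * checks before committing to an inline extent.
 */
static inline bool inline_extent_possible(struct btrfs_fs_info *fs_info,
					  u64 start, u64 end, u64 isize,
					  u64 actual_end, u64 data_len,
					  size_t compressed_size)
{
	if (start > 0)			/* inline data must start at offset 0 */
		return false;
	if (actual_end > fs_info->sectorsize)	/* must fit in the first sector */
		return false;
	if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
		return false;
	if (!compressed_size &&
	    (actual_end & (fs_info->sectorsize - 1)) == 0)
		return false;		/* uncompressed and already sector aligned */
	if (end + 1 < isize)		/* must cover the tail of the file */
		return false;
	if (data_len > fs_info->max_inline)	/* respect the max_inline mount option */
		return false;
	return true;
}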

struct async_extent {
	u64 start;
	u64 ram_size;
	u64 compressed_size;
	struct page **pages;
	unsigned long nr_pages;
	int compress_type;
	struct list_head list;
};

struct async_chunk {
	struct inode *inode;
	struct page *locked_page;
	u64 start;
	u64 end;
	unsigned int write_flags;
	struct list_head extents;
	struct cgroup_subsys_state *blkcg_css;
	struct btrfs_work work;
	atomic_t *pending;
};

struct async_cow {
	/* Number of chunks in flight; must be first in the structure */
	atomic_t num_chunks;
	struct async_chunk chunks[];
};
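
/*
 * Layout note (added for clarity): one async_cow allocation carries the
 * whole chunks[] array.  cow_file_range_async() below slices the delalloc
 * range into SZ_512K chunks, one async_chunk and one work item each, and
 * every chunk's ->pending points at num_chunks so the final
 * atomic_dec_and_test() can free the array as a whole.
 */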

static noinline int add_async_extent(struct async_chunk *cow,
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct page **pages,
				     unsigned long nr_pages,
				     int compress_type)
{
	struct async_extent *async_extent;

	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
	BUG_ON(!async_extent); /* -ENOMEM */
	async_extent->start = start;
	async_extent->ram_size = ram_size;
	async_extent->compressed_size = compressed_size;
	async_extent->pages = pages;
	async_extent->nr_pages = nr_pages;
	async_extent->compress_type = compress_type;
	list_add_tail(&async_extent->list, &cow->extents);
	return 0;
}

/*
 * Check if the inode has flags compatible with compression
 */
static inline bool inode_can_compress(struct inode *inode)
{
	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW ||
	    BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
		return false;
	return true;
}

/*
 * Check if the inode needs to be submitted to compression, based on mount
 * options, defragmentation, properties or heuristics.
 */
static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

	if (!inode_can_compress(inode)) {
		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
			KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
			btrfs_ino(BTRFS_I(inode)));
		return 0;
	}
	/* force compress */
	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
		return 1;
	/* defrag ioctl */
	if (BTRFS_I(inode)->defrag_compress)
		return 1;
	/* bad compression ratios */
	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
		return 0;
	if (btrfs_test_opt(fs_info, COMPRESS) ||
	    BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
	    BTRFS_I(inode)->prop_compress)
		return btrfs_compress_heuristic(inode, start, end);
	return 0;
}
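
/*
 * Decision-order summary (added for clarity, mirroring the checks above):
 * NODATACOW/NODATASUM always wins (no compression), then compress-force,
 * then a defrag request, then the sticky NOCOMPRESS flag set after bad
 * ratios, and only then do the mount option, inode flag or property get
 * to consult the heuristic.
 */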

static inline void inode_should_defrag(struct btrfs_inode *inode,
		u64 start, u64 end, u64 num_bytes, u64 small_write)
{
	/* If this is a small write inside eof, kick off a defrag */
	if (num_bytes < small_write &&
	    (start > 0 || end + 1 < inode->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode);
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
 */
static noinline int compress_file_range(struct async_chunk *async_chunk)
{
	struct inode *inode = async_chunk->inode;
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	u64 blocksize = fs_info->sectorsize;
	u64 start = async_chunk->start;
	u64 end = async_chunk->end;
	u64 actual_end;
	u64 i_size;
	int ret = 0;
	struct page **pages = NULL;
	unsigned long nr_pages;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	int i;
	int will_compress;
	int compress_type = fs_info->compress_type;
	int compressed_extents = 0;
	int redirty = 0;

	inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
			SZ_16K);

	/*
	 * We need to save i_size before now because it could change in between
	 * us evaluating the size and assigning it.  This is because we lock and
	 * unlock the page in truncate and fallocate, and then modify the i_size
	 * later on.
	 *
	 * The barriers are to emulate READ_ONCE, remove that once i_size_read
	 * does that for us.
	 */
	barrier();
	i_size = i_size_read(inode);
	barrier();
	actual_end = min_t(u64, i_size, end + 1);
again:
	will_compress = 0;
	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
	BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
	nr_pages = min_t(unsigned long, nr_pages,
			BTRFS_MAX_COMPRESSED / PAGE_SIZE);

	/*
	 * we don't want to send crud past the end of i_size through
	 * compression, that's just a waste of CPU time.  So, if the
	 * end of the file is before the start of our current
	 * requested range of bytes, we bail out to the uncompressed
	 * cleanup code that can deal with all of this.
	 *
	 * It isn't really the fastest way to fix things, but this is a
	 * very uncommon corner.
	 */
	if (actual_end <= start)
		goto cleanup_and_bail_uncompressed;

	total_compressed = actual_end - start;

	/*
	 * skip compression for a small file range(<=blocksize) that
	 * isn't an inline extent, since it doesn't save disk space at all.
	 */
	if (total_compressed <= blocksize &&
	   (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		goto cleanup_and_bail_uncompressed;

	total_compressed = min_t(unsigned long, total_compressed,
			BTRFS_MAX_UNCOMPRESSED);
	total_in = 0;
	ret = 0;

	/*
	 * we do compression for mount -o compress and when the
	 * inode has not been flagged as nocompress.  This flag can
	 * change at any time if we discover bad compression ratios.
	 */
	if (inode_need_compress(inode, start, end)) {
		WARN_ON(pages);
		pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
		if (!pages) {
			/* just bail out to the uncompressed code */
			nr_pages = 0;
			goto cont;
		}

		if (BTRFS_I(inode)->defrag_compress)
			compress_type = BTRFS_I(inode)->defrag_compress;
		else if (BTRFS_I(inode)->prop_compress)
			compress_type = BTRFS_I(inode)->prop_compress;

		/*
		 * we need to call clear_page_dirty_for_io on each
		 * page in the range.  Otherwise applications with the file
		 * mmap'd can wander in and change the page contents while
		 * we are compressing them.
		 *
		 * If the compression fails for any reason, we set the pages
		 * dirty again later on.
		 *
		 * Note that the remaining part is redirtied, the start pointer
		 * has moved, the end is the original one.
		 */
		if (!redirty) {
			extent_range_clear_dirty_for_io(inode, start, end);
			redirty = 1;
		}

		/* Compression level is applied here and only here */
		ret = btrfs_compress_pages(
			compress_type | (fs_info->compress_level << 4),
					   inode->i_mapping, start,
					   pages,
					   &nr_pages,
					   &total_in,
					   &total_compressed);

		if (!ret) {
			unsigned long offset = offset_in_page(total_compressed);
			struct page *page = pages[nr_pages - 1];
			char *kaddr;

			/* zero the tail end of the last page, we might be
			 * sending it down to disk
			 */
			if (offset) {
				kaddr = kmap_atomic(page);
				memset(kaddr + offset, 0,
				       PAGE_SIZE - offset);
				kunmap_atomic(kaddr);
			}
			will_compress = 1;
		}
	}
cont:
	if (start == 0) {
		/* lets try to make an inline extent */
		if (ret || total_in < actual_end) {
			/* we didn't compress the entire range, try
			 * to make an uncompressed inline extent.
			 */
			ret = cow_file_range_inline(inode, start, end, 0,
						    BTRFS_COMPRESS_NONE, NULL);
		} else {
			/* try making a compressed inline extent */
			ret = cow_file_range_inline(inode, start, end,
						    total_compressed,
						    compress_type, pages);
		}
		if (ret <= 0) {
			unsigned long clear_flags = EXTENT_DELALLOC |
				EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
				EXTENT_DO_ACCOUNTING;
			unsigned long page_error_op;

			page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;

			/*
			 * inline extent creation worked or returned error,
			 * we don't need to create any more async work items.
			 * Unlock and free up our temp pages.
			 *
			 * We use DO_ACCOUNTING here because we need the
			 * delalloc_release_metadata to be done _after_ we drop
			 * our outstanding extent for clearing delalloc for this
			 * range.
			 */
			extent_clear_unlock_delalloc(inode, start, end, NULL,
						     clear_flags,
						     PAGE_UNLOCK |
						     PAGE_CLEAR_DIRTY |
						     PAGE_SET_WRITEBACK |
						     page_error_op |
						     PAGE_END_WRITEBACK);

			for (i = 0; i < nr_pages; i++) {
				WARN_ON(pages[i]->mapping);
				put_page(pages[i]);
			}
			kfree(pages);

			return 0;
		}
	}

	if (will_compress) {
		/*
		 * we aren't doing an inline extent round the compressed size
		 * up to a block size boundary so the allocator does sane
		 * things
		 */
		total_compressed = ALIGN(total_compressed, blocksize);

		/*
		 * one last check to make sure the compression is really a
		 * win, compare the page count read with the blocks on disk,
		 * compression must free at least one sector size
		 */
		total_in = ALIGN(total_in, PAGE_SIZE);
		if (total_compressed + blocksize <= total_in) {
			compressed_extents++;

			/*
			 * The async work queues will take care of doing actual
			 * allocation on disk for these compressed pages, and
			 * will submit them to the elevator.
			 */
			add_async_extent(async_chunk, start, total_in,
					total_compressed, pages, nr_pages,
					compress_type);

			if (start + total_in < end) {
				start += total_in;
				pages = NULL;
				cond_resched();
				goto again;
			}
			return compressed_extents;
		}
	}
	if (pages) {
		/*
		 * the compression code ran but failed to make things smaller,
		 * free any pages it allocated and our page pointer array
		 */
		for (i = 0; i < nr_pages; i++) {
			WARN_ON(pages[i]->mapping);
			put_page(pages[i]);
		}
		kfree(pages);
		pages = NULL;
		total_compressed = 0;
		nr_pages = 0;

		/* flag the file so we don't compress in the future */
		if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
		    !(BTRFS_I(inode)->prop_compress)) {
			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
		}
	}
cleanup_and_bail_uncompressed:
	/*
	 * No compression, but we still need to write the pages in the file
	 * we've been given so far.  redirty the locked page if it corresponds
	 * to our extent and set things up for the async work queue to run
	 * cow_file_range to do the normal delalloc dance.
	 */
	if (async_chunk->locked_page &&
	    (page_offset(async_chunk->locked_page) >= start &&
	     page_offset(async_chunk->locked_page) <= end)) {
		__set_page_dirty_nobuffers(async_chunk->locked_page);
		/* unlocked later on in the async handlers */
	}

	if (redirty)
		extent_range_redirty_for_io(inode, start, end);
	add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
			 BTRFS_COMPRESS_NONE);
	compressed_extents++;

	return compressed_extents;
}
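
/*
 * Pipeline sketch (added for clarity): phase one runs as async_cow_start()
 * -> compress_file_range() above, queueing struct async_extent entries on
 * the chunk; phase two runs as async_cow_submit() ->
 * submit_compressed_extents() below, which allocates disk space and issues
 * the compressed bios in the order the work was queued.
 */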

static void free_async_extent_pages(struct async_extent *async_extent)
{
	int i;

	if (!async_extent->pages)
		return;

	for (i = 0; i < async_extent->nr_pages; i++) {
		WARN_ON(async_extent->pages[i]->mapping);
		put_page(async_extent->pages[i]);
	}
	kfree(async_extent->pages);
	async_extent->nr_pages = 0;
	async_extent->pages = NULL;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline void submit_compressed_extents(struct async_chunk *async_chunk)
{
	struct inode *inode = async_chunk->inode;
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct async_extent *async_extent;
	u64 alloc_hint = 0;
	struct btrfs_key ins;
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	int ret = 0;

again:
	while (!list_empty(&async_chunk->extents)) {
		async_extent = list_entry(async_chunk->extents.next,
					  struct async_extent, list);
		list_del(&async_extent->list);

retry:
		lock_extent(io_tree, async_extent->start,
			    async_extent->start + async_extent->ram_size - 1);
		/* did the compression code fall back to uncompressed IO? */
		if (!async_extent->pages) {
			int page_started = 0;
			unsigned long nr_written = 0;

			/* allocate blocks */
			ret = cow_file_range(inode, async_chunk->locked_page,
					     async_extent->start,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     &page_started, &nr_written, 0);

			/* JDM XXX */

			/*
			 * if page_started, cow_file_range inserted an
			 * inline extent and took care of all the unlocking
			 * and IO for us.  Otherwise, we need to submit
			 * all those pages down to the drive.
			 */
			if (!page_started && !ret)
				extent_write_locked_range(inode,
						  async_extent->start,
						  async_extent->start +
						  async_extent->ram_size - 1,
						  WB_SYNC_ALL);
			else if (ret && async_chunk->locked_page)
				unlock_page(async_chunk->locked_page);
			kfree(async_extent);
			cond_resched();
			continue;
		}

		ret = btrfs_reserve_extent(root, async_extent->ram_size,
					   async_extent->compressed_size,
					   async_extent->compressed_size,
					   0, alloc_hint, &ins, 1, 1);
		if (ret) {
			free_async_extent_pages(async_extent);

			if (ret == -ENOSPC) {
				unlock_extent(io_tree, async_extent->start,
					      async_extent->start +
					      async_extent->ram_size - 1);

				/*
				 * we need to redirty the pages if we decide to
				 * fallback to uncompressed IO, otherwise we
				 * will not submit these pages down to lower
				 * layers.
				 */
				extent_range_redirty_for_io(inode,
						async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1);

				goto retry;
			}
			goto out_free;
		}
		/*
		 * here we're doing allocation and writeback of the
		 * compressed pages
		 */
		em = create_io_em(inode, async_extent->start,
				  async_extent->ram_size, /* len */
				  async_extent->start, /* orig_start */
				  ins.objectid, /* block_start */
				  ins.offset, /* block_len */
				  ins.offset, /* orig_block_len */
				  async_extent->ram_size, /* ram_bytes */
				  async_extent->compress_type,
				  BTRFS_ORDERED_COMPRESSED);
		if (IS_ERR(em))
			/* ret value is not necessary due to void function */
			goto out_free_reserve;
		free_extent_map(em);

		ret = btrfs_add_ordered_extent_compress(inode,
						async_extent->start,
						ins.objectid,
						async_extent->ram_size,
						ins.offset,
						BTRFS_ORDERED_COMPRESSED,
						async_extent->compress_type);
		if (ret) {
			btrfs_drop_extent_cache(BTRFS_I(inode),
						async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1, 0);
			goto out_free_reserve;
		}
		btrfs_dec_block_group_reservations(fs_info, ins.objectid);

		/*
		 * clear dirty, set writeback and unlock the pages.
		 */
		extent_clear_unlock_delalloc(inode, async_extent->start,
				async_extent->start +
				async_extent->ram_size - 1,
				NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
				PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				PAGE_SET_WRITEBACK);
		if (btrfs_submit_compressed_write(inode,
				    async_extent->start,
				    async_extent->ram_size,
				    ins.objectid,
				    ins.offset, async_extent->pages,
				    async_extent->nr_pages,
				    async_chunk->write_flags,
				    async_chunk->blkcg_css)) {
			struct page *p = async_extent->pages[0];
			const u64 start = async_extent->start;
			const u64 end = start + async_extent->ram_size - 1;

			p->mapping = inode->i_mapping;
			btrfs_writepage_endio_finish_ordered(p, start, end, 0);

			p->mapping = NULL;
			extent_clear_unlock_delalloc(inode, start, end,
						     NULL, 0,
						     PAGE_END_WRITEBACK |
						     PAGE_SET_ERROR);
			free_async_extent_pages(async_extent);
		}
		alloc_hint = ins.objectid + ins.offset;
		kfree(async_extent);
		cond_resched();
	}
	return;
out_free_reserve:
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
	extent_clear_unlock_delalloc(inode, async_extent->start,
				     async_extent->start +
				     async_extent->ram_size - 1,
				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DELALLOC_NEW |
				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
				     PAGE_SET_ERROR);
	free_async_extent_pages(async_extent);
	kfree(async_extent);
	goto again;
}
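
/*
 * Error-path note (added for clarity): on -ENOSPC the reservation failure
 * above has already freed ->pages, so the "goto retry" re-enters the loop
 * through the !async_extent->pages branch and falls back to uncompressed
 * cow_file_range() for that extent.
 */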

static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
				      u64 num_bytes)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *em;
	u64 alloc_hint = 0;

	read_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, start, num_bytes);
	if (em) {
		/*
		 * if block start isn't an actual block number then find the
		 * first block in this inode and use that as a hint.  If that
		 * block is also bogus then just don't worry about it.
		 */
		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
			free_extent_map(em);
			em = search_extent_mapping(em_tree, 0, 0);
			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
				alloc_hint = em->block_start;
			if (em)
				free_extent_map(em);
		} else {
			alloc_hint = em->block_start;
			free_extent_map(em);
		}
	}
	read_unlock(&em_tree->lock);

	return alloc_hint;
}

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the call backs end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
 */
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written, int unlock)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 alloc_hint = 0;
	u64 num_bytes;
	unsigned long ram_size;
	u64 cur_alloc_size = 0;
	u64 blocksize = fs_info->sectorsize;
	struct btrfs_key ins;
	struct extent_map *em;
	unsigned clear_bits;
	unsigned long page_ops;
	bool extent_reserved = false;
	int ret = 0;

	if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
		WARN_ON_ONCE(1);
		ret = -EINVAL;
		goto out_unlock;
	}

	num_bytes = ALIGN(end - start + 1, blocksize);
	num_bytes = max(blocksize,  num_bytes);
	ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));

	inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);

	if (start == 0) {
		/* lets try to make an inline extent */
		ret = cow_file_range_inline(inode, start, end, 0,
					    BTRFS_COMPRESS_NONE, NULL);
		if (ret == 0) {
			/*
			 * We use DO_ACCOUNTING here because we need the
			 * delalloc_release_metadata to be run _after_ we drop
			 * our outstanding extent for clearing delalloc for this
			 * range.
			 */
			extent_clear_unlock_delalloc(inode, start, end, NULL,
				     EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
				     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
				     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
				     PAGE_END_WRITEBACK);
			*nr_written = *nr_written +
			     (end - start + PAGE_SIZE) / PAGE_SIZE;
			*page_started = 1;
			goto out;
		} else if (ret < 0) {
			goto out_unlock;
		}
	}

	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
	btrfs_drop_extent_cache(BTRFS_I(inode), start,
			start + num_bytes - 1, 0);

	while (num_bytes > 0) {
		cur_alloc_size = num_bytes;
		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
					   fs_info->sectorsize, 0, alloc_hint,
					   &ins, 1, 1);
		if (ret < 0)
			goto out_unlock;
		cur_alloc_size = ins.offset;
		extent_reserved = true;

		ram_size = ins.offset;
		em = create_io_em(inode, start, ins.offset, /* len */
				  start, /* orig_start */
				  ins.objectid, /* block_start */
				  ins.offset, /* block_len */
				  ins.offset, /* orig_block_len */
				  ram_size, /* ram_bytes */
				  BTRFS_COMPRESS_NONE, /* compress_type */
				  BTRFS_ORDERED_REGULAR /* type */);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out_reserve;
		}
		free_extent_map(em);

		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
					       ram_size, cur_alloc_size, 0);
		if (ret)
			goto out_drop_extent_cache;

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			ret = btrfs_reloc_clone_csums(inode, start,
						      cur_alloc_size);
			/*
			 * Only drop cache here, and process as normal.
			 *
			 * We must not allow extent_clear_unlock_delalloc()
			 * at out_unlock label to free meta of this ordered
			 * extent, as its meta should be freed by
			 * btrfs_finish_ordered_io().
			 *
			 * So we must continue until @start is increased to
			 * skip current ordered extent.
			 */
			if (ret)
				btrfs_drop_extent_cache(BTRFS_I(inode), start,
						start + ram_size - 1, 0);
		}

		btrfs_dec_block_group_reservations(fs_info, ins.objectid);

		/* we're not doing compressed IO, don't unlock the first
		 * page (which the caller expects to stay locked), don't
		 * clear any dirty bits and don't set any writeback bits
		 *
		 * Do set the Private2 bit so we know this page was properly
		 * setup for writepage
		 */
		page_ops = unlock ? PAGE_UNLOCK : 0;
		page_ops |= PAGE_SET_PRIVATE2;

		extent_clear_unlock_delalloc(inode, start,
					     start + ram_size - 1,
					     locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC,
					     page_ops);
		if (num_bytes < cur_alloc_size)
			num_bytes = 0;
		else
			num_bytes -= cur_alloc_size;
		alloc_hint = ins.objectid + ins.offset;
		start += cur_alloc_size;
		extent_reserved = false;

		/*
		 * btrfs_reloc_clone_csums() error, since start is increased
		 * extent_clear_unlock_delalloc() at out_unlock label won't
		 * free metadata of current ordered extent, we're OK to exit.
		 */
		if (ret)
			goto out_unlock;
	}
out:
	return ret;

out_drop_extent_cache:
	btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0);
out_reserve:
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
	clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
		EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
	page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
		PAGE_END_WRITEBACK;
	/*
	 * If we reserved an extent for our delalloc range (or a subrange) and
	 * failed to create the respective ordered extent, then it means that
	 * when we reserved the extent we decremented the extent's size from
	 * the data space_info's bytes_may_use counter and incremented the
	 * space_info's bytes_reserved counter by the same amount. We must make
	 * sure extent_clear_unlock_delalloc() does not try to decrement again
	 * the data space_info's bytes_may_use counter, therefore we do not pass
	 * it the flag EXTENT_CLEAR_DATA_RESV.
	 */
	if (extent_reserved) {
		extent_clear_unlock_delalloc(inode, start,
					     start + cur_alloc_size,
					     locked_page,
					     clear_bits,
					     page_ops);
		start += cur_alloc_size;
		if (start >= end)
			goto out;
	}
	extent_clear_unlock_delalloc(inode, start, end, locked_page,
				     clear_bits | EXTENT_CLEAR_DATA_RESV,
				     page_ops);
	goto out;
}
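
/*
 * Per-iteration flow sketch (added for clarity): reserve an extent, insert
 * the extent map via create_io_em(), record the ordered extent, then unlock
 * the covered pages and advance @start; once the ordered extent exists, any
 * later failure is deliberately left for btrfs_finish_ordered_io() to
 * release the metadata reservation.
 */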

/*
 * work queue call back to start compression on a file and pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
	struct async_chunk *async_chunk;
	int compressed_extents;

	async_chunk = container_of(work, struct async_chunk, work);

	compressed_extents = compress_file_range(async_chunk);
	if (compressed_extents == 0) {
		btrfs_add_delayed_iput(async_chunk->inode);
		async_chunk->inode = NULL;
	}
}

/*
 * work queue call back to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
	struct async_chunk *async_chunk = container_of(work, struct async_chunk,
						     work);
	struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
	unsigned long nr_pages;

	nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
		PAGE_SHIFT;

	/* atomic_sub_return implies a barrier */
	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
	    5 * SZ_1M)
		cond_wake_up_nomb(&fs_info->async_submit_wait);

	/*
	 * ->inode could be NULL if async_cow_start has failed to compress,
	 * in which case we don't have anything to submit, yet we need to
	 * always adjust ->async_delalloc_pages as its paired with the init
	 * happening in cow_file_range_async
	 */
	if (async_chunk->inode)
		submit_compressed_extents(async_chunk);
}

static noinline void async_cow_free(struct btrfs_work *work)
{
	struct async_chunk *async_chunk;

	async_chunk = container_of(work, struct async_chunk, work);
	if (async_chunk->inode)
		btrfs_add_delayed_iput(async_chunk->inode);
	if (async_chunk->blkcg_css)
		css_put(async_chunk->blkcg_css);
	/*
	 * Since the pointer to 'pending' is at the beginning of the array of
	 * async_chunk's, freeing it ensures the whole array has been freed.
	 */
	if (atomic_dec_and_test(async_chunk->pending))
		kvfree(async_chunk->pending);
}

static int cow_file_range_async(struct inode *inode,
				struct writeback_control *wbc,
				struct page *locked_page,
				u64 start, u64 end, int *page_started,
				unsigned long *nr_written)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
	struct async_cow *ctx;
	struct async_chunk *async_chunk;
	unsigned long nr_pages;
	u64 cur_end;
	u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
	int i;
	bool should_compress;
	unsigned nofs_flag;
	const unsigned int write_flags = wbc_to_write_flags(wbc);

	unlock_extent(&BTRFS_I(inode)->io_tree, start, end);

	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
	    !btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
		num_chunks = 1;
		should_compress = false;
	} else {
		should_compress = true;
	}

	nofs_flag = memalloc_nofs_save();
	ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);

	if (!ctx) {
		unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC |
			EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
			EXTENT_DO_ACCOUNTING;
		unsigned long page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
			PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
			PAGE_SET_ERROR;

		extent_clear_unlock_delalloc(inode, start, end, locked_page,
					     clear_bits, page_ops);
		return -ENOMEM;
	}

	async_chunk = ctx->chunks;
	atomic_set(&ctx->num_chunks, num_chunks);

	for (i = 0; i < num_chunks; i++) {
		if (should_compress)
			cur_end = min(end, start + SZ_512K - 1);
		else
			cur_end = end;

		/*
		 * igrab is called higher up in the call chain, take only the
		 * lightweight reference for the callback lifetime
		 */
		ihold(inode);
		async_chunk[i].pending = &ctx->num_chunks;
		async_chunk[i].inode = inode;
		async_chunk[i].start = start;
		async_chunk[i].end = cur_end;
		async_chunk[i].write_flags = write_flags;
		INIT_LIST_HEAD(&async_chunk[i].extents);

		/*
		 * The locked_page comes all the way from writepage and its
		 * the original page we were actually given.  As we spread
		 * this large delalloc region across multiple async_chunk
		 * structs, only the first struct needs a pointer to locked_page
		 *
		 * This way we don't need racey decisions about who is supposed
		 * to unlock it.
		 */
		if (locked_page) {
			/*
			 * Depending on the compressibility, the pages might or
			 * might not go through async.  We want all of them to
			 * be accounted against wbc once.  Let's do it here
			 * before the paths diverge.  wbc accounting is used
			 * only for foreign writeback detection and doesn't
			 * need full accuracy.  Just account the whole thing
			 * against the first page.
			 */
			wbc_account_cgroup_owner(wbc, locked_page,
						 cur_end - start);
			async_chunk[i].locked_page = locked_page;
			locked_page = NULL;
		} else {
			async_chunk[i].locked_page = NULL;
		}

		if (blkcg_css != blkcg_root_css) {
			css_get(blkcg_css);
			async_chunk[i].blkcg_css = blkcg_css;
		} else {
			async_chunk[i].blkcg_css = NULL;
		}

		btrfs_init_work(&async_chunk[i].work, async_cow_start,
				async_cow_submit, async_cow_free);

		nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
		atomic_add(nr_pages, &fs_info->async_delalloc_pages);

		btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);

		*nr_written += nr_pages;
		start = cur_end + 1;
	}
	*page_started = 1;
	return 0;
}
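
/*
 * Worked example (added for clarity): for a 1 MiB delalloc range with
 * start == 0 and end == 0x100000 - 1, num_chunks = DIV_ROUND_UP(end - start,
 * SZ_512K) = 2 when compression is possible, so two async_chunk work items
 * cover [0, 512K - 1] and [512K, 1M - 1]; with NOCOMPRESS set and no
 * compress-force, the whole range goes into a single chunk instead.
 */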

static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
					u64 bytenr, u64 num_bytes)
{
	int ret;
	struct btrfs_ordered_sum *sums;
	LIST_HEAD(list);

	ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
				       bytenr + num_bytes - 1, &list, 0);
	if (ret == 0 && list_empty(&list))
		return 0;

	while (!list_empty(&list)) {
		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
		list_del(&sums->list);
		kfree(sums);
	}
	if (ret < 0)
		return ret;
	return 1;
}
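
/*
 * Return convention note (added for clarity): 0 means no csums exist in the
 * range, 1 means at least one csum was found (forcing COW in the caller),
 * and a negative value reports a lookup failure after the list has been
 * drained.
 */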

/*
 * when nocow writeback is called back.  This checks for snapshots or COW copies
 * of the extents that exist in the file, and COWs the file as required.
 *
 * If no cow copies or snapshots exist, we write directly to the existing
 * blocks on disk
 */
static noinline int run_delalloc_nocow(struct inode *inode,
				       struct page *locked_page,
				       const u64 start, const u64 end,
				       int *page_started, int force,
				       unsigned long *nr_written)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_path *path;
	u64 cow_start = (u64)-1;
	u64 cur_offset = start;
	int ret;
	bool check_prev = true;
	const bool freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode));
	u64 ino = btrfs_ino(BTRFS_I(inode));
	bool nocow = false;
	u64 disk_bytenr = 0;

	path = btrfs_alloc_path();
	if (!path) {
		extent_clear_unlock_delalloc(inode, start, end, locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC |
					     EXTENT_DO_ACCOUNTING |
					     EXTENT_DEFRAG, PAGE_UNLOCK |
					     PAGE_CLEAR_DIRTY |
					     PAGE_SET_WRITEBACK |
					     PAGE_END_WRITEBACK);
		return -ENOMEM;
	}

	while (1) {
		struct btrfs_key found_key;
		struct btrfs_file_extent_item *fi;
		struct extent_buffer *leaf;
		u64 extent_end;
		u64 extent_offset;
		u64 num_bytes = 0;
		u64 disk_num_bytes;
		u64 ram_bytes;
		int extent_type;

		nocow = false;

		ret = btrfs_lookup_file_extent(NULL, root, path, ino,
					       cur_offset, 0);
		if (ret < 0)
			goto error;

		/*
		 * If there is no extent for our range when doing the initial
		 * search, then go back to the previous slot as it will be the
		 * one containing the search offset
		 */
		if (ret > 0 && path->slots[0] > 0 && check_prev) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0] - 1);
			if (found_key.objectid == ino &&
			    found_key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		check_prev = false;
next_slot:
		/* Go to next leaf if we have exhausted the current one */
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0) {
				if (cow_start != (u64)-1)
					cur_offset = cow_start;
				goto error;
			}
			if (ret > 0)
				break;
			leaf = path->nodes[0];
		}

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		/* Didn't find anything for our INO */
		if (found_key.objectid > ino)
			break;
		/*
		 * Keep searching until we find an EXTENT_ITEM or there are no
		 * more extents for this inode
		 */
		if (WARN_ON_ONCE(found_key.objectid < ino) ||
		    found_key.type < BTRFS_EXTENT_DATA_KEY) {
			path->slots[0]++;
			goto next_slot;
		}

		/* Found key is not EXTENT_DATA_KEY or starts after req range */
		if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
		    found_key.offset > end)
			break;

		/*
		 * If the found extent starts after requested offset, then
		 * adjust extent_end to be right before this extent begins
		 */
		if (found_key.offset > cur_offset) {
			extent_end = found_key.offset;
			extent_type = 0;
			goto out_check;
		}

		/*
		 * Found extent which begins before our range and potentially
		 * intersect it
		 */
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = found_key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
			disk_num_bytes =
				btrfs_file_extent_disk_num_bytes(leaf, fi);
			/*
			 * If the extent we got ends before our current offset,
			 * skip to the next extent.
			 */
			if (extent_end <= cur_offset) {
				path->slots[0]++;
				goto next_slot;
			}
			/* Skip holes */
			if (disk_bytenr == 0)
				goto out_check;
			/* Skip compressed/encrypted/encoded extents */
			if (btrfs_file_extent_compression(leaf, fi) ||
			    btrfs_file_extent_encryption(leaf, fi) ||
			    btrfs_file_extent_other_encoding(leaf, fi))
				goto out_check;
			/*
			 * If extent is created before the last volume's snapshot
			 * this implies the extent is shared, hence we can't do
			 * nocow. This is the same check as in
			 * btrfs_cross_ref_exist but without calling
			 * btrfs_search_slot.
			 */
			if (!freespace_inode &&
			    btrfs_file_extent_generation(leaf, fi) <=
			    btrfs_root_last_snapshot(&root->root_item))
				goto out_check;
			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
				goto out_check;
			/* If extent is RO, we must COW it */
			if (btrfs_extent_readonly(fs_info, disk_bytenr))
				goto out_check;
			ret = btrfs_cross_ref_exist(root, ino,
						    found_key.offset -
						    extent_offset, disk_bytenr);
			if (ret) {
				/*
				 * ret could be -EIO if the above fails to read
				 * metadata.
				 */
				if (ret < 0) {
					if (cow_start != (u64)-1)
						cur_offset = cow_start;
					goto error;
				}

				WARN_ON_ONCE(freespace_inode);
				goto out_check;
			}
			disk_bytenr += extent_offset;
			disk_bytenr += cur_offset - found_key.offset;
			num_bytes = min(end + 1, extent_end) - cur_offset;
			/*
			 * If there are pending snapshots for this root, we
			 * fall into common COW way
			 */
			if (!freespace_inode && atomic_read(&root->snapshot_force_cow))
				goto out_check;
			/*
			 * force cow if csum exists in the range.
			 * this ensure that csum for a given extent are
			 * either valid or do not exist.
			 */
			ret = csum_exist_in_range(fs_info, disk_bytenr,
						  num_bytes);
			if (ret) {
				/*
				 * ret could be -EIO if the above fails to read
				 * metadata.
				 */
				if (ret < 0) {
					if (cow_start != (u64)-1)
						cur_offset = cow_start;
					goto error;
				}
				WARN_ON_ONCE(freespace_inode);
				goto out_check;
			}
			if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
				goto out_check;
			nocow = true;
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = found_key.offset + ram_bytes;
			extent_end = ALIGN(extent_end, fs_info->sectorsize);
			/* Skip extents outside of our requested range */
			if (extent_end <= start) {
				path->slots[0]++;
				goto next_slot;
			}
		} else {
			/* If this triggers then we have a memory corruption */
			BUG();
		}
out_check:
		/*
		 * If nocow is false then record the beginning of the range
		 * that needs to be COWed
		 */
		if (!nocow) {
			if (cow_start == (u64)-1)
				cow_start = cur_offset;
			cur_offset = extent_end;
			if (cur_offset > end)
				break;
			path->slots[0]++;
			goto next_slot;
		}

		btrfs_release_path(path);

		/*
		 * COW range from cow_start to found_key.offset - 1. As the key
		 * will contain the beginning of the first extent that can be
		 * NOCOW, following one which needs to be COW'ed
		 */
		if (cow_start != (u64)-1) {
			ret = cow_file_range(inode, locked_page,
					     cow_start, found_key.offset - 1,
					     page_started, nr_written, 1);
			if (ret) {
				if (nocow)
					btrfs_dec_nocow_writers(fs_info,
								disk_bytenr);
				goto error;
			}
			cow_start = (u64)-1;
		}

		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			u64 orig_start = found_key.offset - extent_offset;
			struct extent_map *em;

			em = create_io_em(inode, cur_offset, num_bytes,
					  orig_start,
					  disk_bytenr, /* block_start */
					  num_bytes, /* block_len */
					  disk_num_bytes, /* orig_block_len */
					  ram_bytes, BTRFS_COMPRESS_NONE,
					  BTRFS_ORDERED_PREALLOC);
			if (IS_ERR(em)) {
				if (nocow)
					btrfs_dec_nocow_writers(fs_info,
								disk_bytenr);
				ret = PTR_ERR(em);
				goto error;
Y
Yan Zheng 已提交
1634
			}
1635
			free_extent_map(em);
1636 1637 1638 1639
			ret = btrfs_add_ordered_extent(inode, cur_offset,
						       disk_bytenr, num_bytes,
						       num_bytes,
						       BTRFS_ORDERED_PREALLOC);
1640 1641 1642 1643 1644 1645 1646
			if (ret) {
				btrfs_drop_extent_cache(BTRFS_I(inode),
							cur_offset,
							cur_offset + num_bytes - 1,
							0);
				goto error;
			}
Y
Yan Zheng 已提交
1647
		} else {
1648 1649 1650 1651
			ret = btrfs_add_ordered_extent(inode, cur_offset,
						       disk_bytenr, num_bytes,
						       num_bytes,
						       BTRFS_ORDERED_NOCOW);
1652 1653
			if (ret)
				goto error;
Y
Yan Zheng 已提交
1654
		}
Y
Yan Zheng 已提交
1655

1656
		if (nocow)
1657
			btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1658
		nocow = false;
1659

1660
		if (root->root_key.objectid ==
1661 1662 1663 1664 1665 1666
		    BTRFS_DATA_RELOC_TREE_OBJECTID)
			/*
			 * Error handled later, as we must prevent
			 * extent_clear_unlock_delalloc() in error handler
			 * from freeing metadata of created ordered extent.
			 */
1667 1668 1669
			ret = btrfs_reloc_clone_csums(inode, cur_offset,
						      num_bytes);

1670
		extent_clear_unlock_delalloc(inode, cur_offset,
1671
					     cur_offset + num_bytes - 1,
1672
					     locked_page, EXTENT_LOCKED |
1673 1674 1675 1676
					     EXTENT_DELALLOC |
					     EXTENT_CLEAR_DATA_RESV,
					     PAGE_UNLOCK | PAGE_SET_PRIVATE2);

Y
Yan Zheng 已提交
1677
		cur_offset = extent_end;
1678 1679 1680 1681 1682 1683 1684 1685

		/*
		 * btrfs_reloc_clone_csums() error, now we're OK to call error
		 * handler, as metadata for created ordered extent will only
		 * be freed by btrfs_finish_ordered_io().
		 */
		if (ret)
			goto error;
Y
Yan Zheng 已提交
1686 1687
		if (cur_offset > end)
			break;
1688
	}
1689
	btrfs_release_path(path);
Y
Yan Zheng 已提交
1690

1691
	if (cur_offset <= end && cow_start == (u64)-1)
Y
Yan Zheng 已提交
1692
		cow_start = cur_offset;
1693

Y
Yan Zheng 已提交
1694
	if (cow_start != (u64)-1) {
1695
		cur_offset = end;
1696
		ret = cow_file_range(inode, locked_page, cow_start, end,
1697
				     page_started, nr_written, 1);
1698
		if (ret)
1699
			goto error;
Y
Yan Zheng 已提交
1700 1701
	}

1702
error:
1703 1704 1705
	if (nocow)
		btrfs_dec_nocow_writers(fs_info, disk_bytenr);

1706
	if (ret && cur_offset < end)
1707
		extent_clear_unlock_delalloc(inode, cur_offset, end,
1708
					     locked_page, EXTENT_LOCKED |
1709 1710 1711
					     EXTENT_DELALLOC | EXTENT_DEFRAG |
					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
					     PAGE_CLEAR_DIRTY |
1712 1713
					     PAGE_SET_WRITEBACK |
					     PAGE_END_WRITEBACK);
1714
	btrfs_free_path(path);
1715
	return ret;
1716 1717
}

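/*
 * Decide whether a NODATACOW/PREALLOC inode still has to go through the COW
 * path for this range: return 1 when the range carries the EXTENT_DEFRAG bit
 * (the file is being defragmented), 0 otherwise.
 */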
static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
{
	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
	    !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
		return 0;

	/*
	 * @defrag_bytes is a hint value with no spinlock held here: if it is
	 * not zero, the file is being defragmented. Force COW if the given
	 * extent needs to be defragged.
	 */
	if (BTRFS_I(inode)->defrag_bytes &&
	    test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
			   EXTENT_DEFRAG, 0, NULL))
		return 1;

	return 0;
}

/*
 * Function to process delayed allocation (create CoW) for ranges which are
 * being touched for the first time.
 */
int btrfs_run_delalloc_range(struct inode *inode, struct page *locked_page,
		u64 start, u64 end, int *page_started, unsigned long *nr_written,
		struct writeback_control *wbc)
{
	int ret;
	int force_cow = need_force_cow(inode, start, end);

	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 1, nr_written);
	} else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 0, nr_written);
	} else if (!inode_can_compress(inode) ||
		   !inode_need_compress(inode, start, end)) {
		ret = cow_file_range(inode, locked_page, start, end,
				      page_started, nr_written, 1);
	} else {
		set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
			&BTRFS_I(inode)->runtime_flags);
		ret = cow_file_range_async(inode, wbc, locked_page, start, end,
					   page_started, nr_written);
	}
	if (ret)
		btrfs_cleanup_ordered_extents(inode, locked_page, start,
					      end - start + 1);
	return ret;
}

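/*
 * Callback run when a delalloc extent state is split in two.  A split can
 * require an extra outstanding extent: a single BTRFS_MAX_EXTENT_SIZE state
 * accounts for one extent, but split in half it needs two.  For states larger
 * than BTRFS_MAX_EXTENT_SIZE we only bump the count when the per-side totals
 * really exceed the original accounting.
 */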
void btrfs_split_delalloc_extent(struct inode *inode,
				 struct extent_state *orig, u64 split)
{
	u64 size;

	/* not delalloc, ignore it */
	if (!(orig->state & EXTENT_DELALLOC))
		return;

	size = orig->end - orig->start + 1;
	if (size > BTRFS_MAX_EXTENT_SIZE) {
		u32 num_extents;
		u64 new_size;

		/*
		 * See the explanation in btrfs_merge_delalloc_extent, the same
		 * applies here, just in reverse.
		 */
		new_size = orig->end - split + 1;
		num_extents = count_max_extents(new_size);
		new_size = split - orig->start;
		num_extents += count_max_extents(new_size);
		if (count_max_extents(size) >= num_extents)
			return;
	}

	spin_lock(&BTRFS_I(inode)->lock);
	btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
	spin_unlock(&BTRFS_I(inode)->lock);
}

/*
 * Handle merged delayed allocation extents so we can keep track of new extents
 * that are just merged onto old extents, such as when we are doing sequential
 * writes, so we can properly account for the metadata space we'll need.
 */
void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
				 struct extent_state *other)
{
	u64 new_size, old_size;
	u32 num_extents;

	/* not delalloc, ignore it */
	if (!(other->state & EXTENT_DELALLOC))
		return;

	if (new->start > other->start)
		new_size = new->end - other->start + 1;
	else
		new_size = other->end - new->start + 1;

	/* we're not bigger than the max, unreserve the space and go */
	if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
		spin_lock(&BTRFS_I(inode)->lock);
		btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
		spin_unlock(&BTRFS_I(inode)->lock);
		return;
	}

	/*
	 * We have to add up either side to figure out how many extents were
	 * accounted for before we merged into one big extent.  If the number of
	 * extents we accounted for is <= the amount we need for the new range
	 * then we can return, otherwise drop.  Think of it like this
	 *
	 * [ 4k][MAX_SIZE]
	 *
	 * So we've grown the extent by a MAX_SIZE extent, this would mean we
	 * need 2 outstanding extents, on one side we have 1 and the other side
	 * we have 1 so they are == and we can return.  But in this case
	 *
	 * [MAX_SIZE+4k][MAX_SIZE+4k]
	 *
	 * Each range on their own accounts for 2 extents, but merged together
	 * they are only 3 extents worth of accounting, so we need to drop in
	 * this case.
	 */
	old_size = other->end - other->start + 1;
	num_extents = count_max_extents(old_size);
	old_size = new->end - new->start + 1;
	num_extents += count_max_extents(old_size);
	if (count_max_extents(new_size) >= num_extents)
		return;

	spin_lock(&BTRFS_I(inode)->lock);
	btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
	spin_unlock(&BTRFS_I(inode)->lock);
}

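/*
 * Add an inode to its root's list of inodes with pending delalloc and, when
 * the root gains its first such inode, put the root on the fs-wide list of
 * delalloc roots.
 */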
static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
				      struct inode *inode)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

	spin_lock(&root->delalloc_lock);
	if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
		list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
			      &root->delalloc_inodes);
		set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
			&BTRFS_I(inode)->runtime_flags);
		root->nr_delalloc_inodes++;
		if (root->nr_delalloc_inodes == 1) {
			spin_lock(&fs_info->delalloc_root_lock);
			BUG_ON(!list_empty(&root->delalloc_root));
			list_add_tail(&root->delalloc_root,
				      &fs_info->delalloc_roots);
			spin_unlock(&fs_info->delalloc_root_lock);
		}
	}
	spin_unlock(&root->delalloc_lock);
}


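/*
 * Remove an inode from its root's delalloc list and, when the root runs out
 * of delalloc inodes, drop the root from the fs-wide delalloc roots list.
 * Callers are expected to hold root->delalloc_lock;
 * btrfs_del_delalloc_inode() below takes it for you.
 */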
void __btrfs_del_delalloc_inode(struct btrfs_root *root,
				struct btrfs_inode *inode)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (!list_empty(&inode->delalloc_inodes)) {
		list_del_init(&inode->delalloc_inodes);
		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
			  &inode->runtime_flags);
		root->nr_delalloc_inodes--;
		if (!root->nr_delalloc_inodes) {
			ASSERT(list_empty(&root->delalloc_inodes));
			spin_lock(&fs_info->delalloc_root_lock);
			BUG_ON(list_empty(&root->delalloc_root));
			list_del_init(&root->delalloc_root);
			spin_unlock(&fs_info->delalloc_root_lock);
		}
	}
}

static void btrfs_del_delalloc_inode(struct btrfs_root *root,
				     struct btrfs_inode *inode)
{
	spin_lock(&root->delalloc_lock);
	__btrfs_del_delalloc_inode(root, inode);
	spin_unlock(&root->delalloc_lock);
}

/*
 * Properly track delayed allocation bytes in the inode and maintain the
 * list of inodes that have pending delalloc work to be done.
 */
void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
			       unsigned *bits)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

	if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
		WARN_ON(1);
	/*
	 * set_bit and clear_bit hooks normally require _irqsave/restore
	 * but in this case, we are only testing for the DELALLOC
	 * bit, which is only set or cleared with irqs on
	 */
	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
		struct btrfs_root *root = BTRFS_I(inode)->root;
		u64 len = state->end + 1 - state->start;
		u32 num_extents = count_max_extents(len);
		bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));

		spin_lock(&BTRFS_I(inode)->lock);
		btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents);
		spin_unlock(&BTRFS_I(inode)->lock);

		/* For sanity tests */
		if (btrfs_is_testing(fs_info))
			return;

		percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
					 fs_info->delalloc_batch);
		spin_lock(&BTRFS_I(inode)->lock);
		BTRFS_I(inode)->delalloc_bytes += len;
		if (*bits & EXTENT_DEFRAG)
			BTRFS_I(inode)->defrag_bytes += len;
		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
					 &BTRFS_I(inode)->runtime_flags))
			btrfs_add_delalloc_inodes(root, inode);
		spin_unlock(&BTRFS_I(inode)->lock);
	}

	if (!(state->state & EXTENT_DELALLOC_NEW) &&
	    (*bits & EXTENT_DELALLOC_NEW)) {
		spin_lock(&BTRFS_I(inode)->lock);
		BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
			state->start;
		spin_unlock(&BTRFS_I(inode)->lock);
	}
}

/*
 * Once a range is no longer delalloc this function ensures that proper
 * accounting happens.
 */
void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
				 struct extent_state *state, unsigned *bits)
{
	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
	struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);
	u64 len = state->end + 1 - state->start;
	u32 num_extents = count_max_extents(len);

	if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
		spin_lock(&inode->lock);
		inode->defrag_bytes -= len;
		spin_unlock(&inode->lock);
	}

	/*
	 * set_bit and clear_bit hooks normally require _irqsave/restore
	 * but in this case, we are only testing for the DELALLOC
	 * bit, which is only set or cleared with irqs on
	 */
	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
		struct btrfs_root *root = inode->root;
		bool do_list = !btrfs_is_free_space_inode(inode);

		spin_lock(&inode->lock);
		btrfs_mod_outstanding_extents(inode, -num_extents);
		spin_unlock(&inode->lock);

		/*
		 * We don't reserve metadata space for space cache inodes so we
		 * don't need to call delalloc_release_metadata if there is an
		 * error.
		 */
		if (*bits & EXTENT_CLEAR_META_RESV &&
		    root != fs_info->tree_root)
			btrfs_delalloc_release_metadata(inode, len, false);

		/* For sanity tests. */
		if (btrfs_is_testing(fs_info))
			return;

		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
		    do_list && !(state->state & EXTENT_NORESERVE) &&
		    (*bits & EXTENT_CLEAR_DATA_RESV))
			btrfs_free_reserved_data_space_noquota(
					&inode->vfs_inode,
					state->start, len);

		percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
					 fs_info->delalloc_batch);
		spin_lock(&inode->lock);
		inode->delalloc_bytes -= len;
		if (do_list && inode->delalloc_bytes == 0 &&
		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
					&inode->runtime_flags))
			btrfs_del_delalloc_inode(root, inode);
		spin_unlock(&inode->lock);
	}

	if ((state->state & EXTENT_DELALLOC_NEW) &&
	    (*bits & EXTENT_DELALLOC_NEW)) {
		spin_lock(&inode->lock);
		ASSERT(inode->new_delalloc_bytes >= len);
		inode->new_delalloc_bytes -= len;
		spin_unlock(&inode->lock);
	}
}

/*
 * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit
 * in a chunk's stripe. This function ensures that bios do not span a
 * stripe/chunk
 *
 * @page - The page we are about to add to the bio
 * @size - size we want to add to the bio
 * @bio - bio we want to ensure is smaller than a stripe
 * @bio_flags - flags of the bio
 *
 * return 1 if page cannot be added to the bio
 * return 0 if page can be added to the bio
 * return error otherwise
 */
int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio,
			     unsigned long bio_flags)
{
	struct inode *inode = page->mapping->host;
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
	u64 length = 0;
	u64 map_length;
	int ret;
	struct btrfs_io_geometry geom;

	if (bio_flags & EXTENT_BIO_COMPRESSED)
		return 0;

	length = bio->bi_iter.bi_size;
	map_length = length;
	ret = btrfs_get_io_geometry(fs_info, btrfs_op(bio), logical, map_length,
				    &geom);
	if (ret < 0)
		return ret;

	if (geom.len < length + size)
		return 1;
	return 0;
}

/*
 * in order to insert checksums into the metadata in large chunks,
 * we wait until bio submission time.   All the pages in the bio are
 * checksummed and sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record
 * are inserted into the btree
 */
static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio,
				    u64 bio_offset)
{
	struct inode *inode = private_data;
	blk_status_t ret = 0;

	ret = btrfs_csum_one_bio(inode, bio, 0, 0);
	BUG_ON(ret); /* -ENOMEM */
	return 0;
}

/*
 * extent_io.c submission hook. This does the right thing for csum calculation
 * on write, or reading the csums from the tree before a read.
 *
 * Rules about async/sync submit,
 * a) read:				sync submit
 *
 * b) write without checksum:		sync submit
 *
 * c) write with checksum:
 *    c-1) if bio is issued by fsync:	sync submit
 *         (sync_writers != 0)
 *
 *    c-2) if root is reloc root:	sync submit
 *         (only in case of buffered IO)
 *
 *    c-3) otherwise:			async submit
 */
static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
					  int mirror_num,
					  unsigned long bio_flags)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
	blk_status_t ret = 0;
	int skip_sum;
	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);

	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;

	if (btrfs_is_free_space_inode(BTRFS_I(inode)))
		metadata = BTRFS_WQ_ENDIO_FREE_SPACE;

	if (bio_op(bio) != REQ_OP_WRITE) {
		ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
		if (ret)
			goto out;

		if (bio_flags & EXTENT_BIO_COMPRESSED) {
			ret = btrfs_submit_compressed_read(inode, bio,
							   mirror_num,
							   bio_flags);
			goto out;
		} else if (!skip_sum) {
			ret = btrfs_lookup_bio_sums(inode, bio, (u64)-1, NULL);
			if (ret)
				goto out;
		}
		goto mapit;
	} else if (async && !skip_sum) {
		/* csum items have already been cloned */
		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
			goto mapit;
		/* we're doing a write, do the async checksumming */
		ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
					  0, inode, btrfs_submit_bio_start);
		goto out;
	} else if (!skip_sum) {
		ret = btrfs_csum_one_bio(inode, bio, 0, 0);
		if (ret)
			goto out;
	}

mapit:
	ret = btrfs_map_bio(fs_info, bio, mirror_num);

out:
	if (ret) {
		bio->bi_status = ret;
		bio_endio(bio);
	}
	return ret;
}

/*
 * given a list of ordered sums record them in the inode.  This happens
 * at IO completion time based on sums calculated at bio submission time.
 */
static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
			     struct inode *inode, struct list_head *list)
{
	struct btrfs_ordered_sum *sum;
	int ret;

	list_for_each_entry(sum, list, list) {
		trans->adding_csums = true;
		ret = btrfs_csum_file_blocks(trans,
		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
		trans->adding_csums = false;
		if (ret)
			return ret;
	}
	return 0;
}

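/*
 * Mark the range [start, end] as delalloc in the inode's io_tree.  The end
 * offset is inclusive (the last byte of the range), hence the WARN_ON when
 * it is page aligned.
 */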
int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
			      unsigned int extra_bits,
			      struct extent_state **cached_state)
{
	WARN_ON(PAGE_ALIGNED(end));
	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
				   extra_bits, cached_state);
}

/* see btrfs_writepage_cow_fixup for details on why this is required */
struct btrfs_writepage_fixup {
	struct page *page;
	struct inode *inode;
	struct btrfs_work work;
};

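/*
 * Worker that fixes up a page which was dirtied behind our back: reserve data
 * space, wait out any ordered extent covering the page and mark the range
 * delalloc again, so writeback goes through the normal COW machinery.
 */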
static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
{
	struct btrfs_writepage_fixup *fixup;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	struct extent_changeset *data_reserved = NULL;
	struct page *page;
	struct inode *inode;
	u64 page_start;
	u64 page_end;
	int ret = 0;
	bool free_delalloc_space = true;

	fixup = container_of(work, struct btrfs_writepage_fixup, work);
	page = fixup->page;
	inode = fixup->inode;
	page_start = page_offset(page);
	page_end = page_offset(page) + PAGE_SIZE - 1;

	/*
	 * This is similar to page_mkwrite, we need to reserve the space before
	 * we take the page lock.
	 */
	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
					   PAGE_SIZE);
again:
	lock_page(page);

	/*
	 * Before we queued this fixup, we took a reference on the page.
	 * page->mapping may go NULL, but it shouldn't be moved to a different
	 * address space.
	 */
	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
		/*
		 * Unfortunately this is a little tricky, either
		 *
		 * 1) We got here and our page had already been dealt with and
		 *    we reserved our space, thus ret == 0, so we need to just
		 *    drop our space reservation and bail.  This can happen the
		 *    first time we come into the fixup worker, or could happen
		 *    while waiting for the ordered extent.
		 * 2) Our page was already dealt with, but we happened to get an
		 *    ENOSPC above from the btrfs_delalloc_reserve_space.  In
		 *    this case we obviously don't have anything to release, but
		 *    because the page was already dealt with we don't want to
		 *    mark the page with an error, so make sure we're resetting
		 *    ret to 0.  This is why we have this check _before_ the ret
		 *    check, because we do not want to have a surprise ENOSPC
		 *    when the page was already properly dealt with.
		 */
		if (!ret) {
			btrfs_delalloc_release_extents(BTRFS_I(inode),
						       PAGE_SIZE);
			btrfs_delalloc_release_space(inode, data_reserved,
						     page_start, PAGE_SIZE,
						     true);
		}
		ret = 0;
		goto out_page;
	}

	/*
	 * We can't mess with the page state unless it is locked, so now that
	 * it is locked bail if we failed to make our space reservation.
	 */
	if (ret)
		goto out_page;

	lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
			 &cached_state);

	/* already ordered? We're done */
	if (PagePrivate2(page))
		goto out_reserved;

	ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
					PAGE_SIZE);
	if (ordered) {
		unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
				     page_end, &cached_state);
		unlock_page(page);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
		goto again;
	}

	ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
					&cached_state);
	if (ret)
		goto out_reserved;

	/*
	 * Everything went as planned, we're now the owner of a dirty page with
	 * delayed allocation bits set and space reserved for our COW
	 * destination.
	 *
	 * The page was dirty when we started, nothing should have cleaned it.
	 */
	BUG_ON(!PageDirty(page));
	free_delalloc_space = false;
out_reserved:
	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
	if (free_delalloc_space)
		btrfs_delalloc_release_space(inode, data_reserved, page_start,
					     PAGE_SIZE, true);
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
			     &cached_state);
out_page:
	if (ret) {
		/*
		 * We hit ENOSPC or other errors.  Update the mapping and page
		 * to reflect the errors and clean the page.
		 */
		mapping_set_error(page->mapping, ret);
		end_extent_writepage(page, ret, page_start, page_end);
		clear_page_dirty_for_io(page);
		SetPageError(page);
	}
	ClearPageChecked(page);
	unlock_page(page);
	put_page(page);
	kfree(fixup);
	extent_changeset_free(data_reserved);
	/*
	 * As a precaution, do a delayed iput in case it would be the last iput
	 * that could need flushing space. Recursing back to fixup worker would
	 * deadlock.
	 */
	btrfs_add_delayed_iput(inode);
}

/*
 * There are a few paths in the higher layers of the kernel that directly
 * set the page dirty bit without asking the filesystem if it is a
 * good idea.  This causes problems because we want to make sure COW
 * properly happens and the data=ordered rules are followed.
 *
 * In our case any range that doesn't have the ORDERED bit set
 * hasn't been properly set up for IO.  We kick off an async process
 * to fix it up.  The async helper will wait for ordered extents, set
 * the delalloc bit and make it safe to write the page.
 */
int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
{
	struct inode *inode = page->mapping->host;
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_writepage_fixup *fixup;

	/* this page is properly in the ordered list */
	if (TestClearPagePrivate2(page))
		return 0;

	/*
	 * PageChecked is set below when we create a fixup worker for this page,
	 * don't try to create another one if we're already PageChecked()
	 *
	 * The extent_io writepage code will redirty the page if we send back
	 * EAGAIN.
	 */
	if (PageChecked(page))
		return -EAGAIN;

	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
	if (!fixup)
		return -EAGAIN;

	/*
	 * We are already holding a reference to this inode from
	 * write_cache_pages.  We need to hold it because the space reservation
	 * takes place outside of the page lock, and we can't trust
	 * page->mapping outside of the page lock.
	 */
	ihold(inode);
	SetPageChecked(page);
	get_page(page);
	btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
	fixup->page = page;
	fixup->inode = inode;
	btrfs_queue_work(fs_info->fixup_workers, &fixup->work);

	return -EAGAIN;
}

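/*
 * Insert the file extent item for a completed ordered write, converting the
 * range's reserved space into a real, referenced extent in the extent tree.
 */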
static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
				       struct inode *inode, u64 file_pos,
				       u64 disk_bytenr, u64 disk_num_bytes,
				       u64 num_bytes, u64 ram_bytes,
				       u8 compression, u8 encryption,
				       u16 other_encoding, int extent_type)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key ins;
	u64 qg_released;
	int extent_inserted = 0;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * we may be replacing one extent in the tree with another.
	 * The new extent is pinned in the extent map, and we don't want
	 * to drop it from the cache until it is completely in the btree.
	 *
	 * So, tell btrfs_drop_extents to leave this extent in the cache.
	 * the caller is expected to unpin it and allow it to be merged
	 * with the others.
	 */
	ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
				   file_pos + num_bytes, NULL, 0,
				   1, sizeof(*fi), &extent_inserted);
	if (ret)
		goto out;

	if (!extent_inserted) {
		ins.objectid = btrfs_ino(BTRFS_I(inode));
		ins.offset = file_pos;
		ins.type = BTRFS_EXTENT_DATA_KEY;

		path->leave_spinning = 1;
		ret = btrfs_insert_empty_item(trans, root, path, &ins,
					      sizeof(*fi));
		if (ret)
			goto out;
	}
	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
	btrfs_set_file_extent_type(leaf, fi, extent_type);
	btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
	btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
	btrfs_set_file_extent_offset(leaf, fi, 0);
	btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
	btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
	btrfs_set_file_extent_compression(leaf, fi, compression);
	btrfs_set_file_extent_encryption(leaf, fi, encryption);
	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);

	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	inode_add_bytes(inode, num_bytes);

	ins.objectid = disk_bytenr;
	ins.offset = disk_num_bytes;
	ins.type = BTRFS_EXTENT_ITEM_KEY;

	ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), file_pos,
						ram_bytes);
	if (ret)
		goto out;

	/*
	 * Release the reserved range from inode dirty range map, as it is
	 * already moved into delayed_ref_head
	 */
	ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
	if (ret < 0)
		goto out;
	qg_released = ret;
	ret = btrfs_alloc_reserved_file_extent(trans, root,
					       btrfs_ino(BTRFS_I(inode)),
					       file_pos, qg_released, &ins);
out:
	btrfs_free_path(path);

	return ret;
}

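/*
 * A delalloc write into this block group has finished: subtract its length
 * from the block group's delalloc_bytes counter.
 */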
static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
					 u64 start, u64 len)
{
	struct btrfs_block_group *cache;

	cache = btrfs_lookup_block_group(fs_info, start);
	ASSERT(cache);

	spin_lock(&cache->lock);
	cache->delalloc_bytes -= len;
	spin_unlock(&cache->lock);

	btrfs_put_block_group(cache);
}

/* as ordered data IO finishes, this gets called so we can finish
 * an ordered extent if the range of bytes in the file it covers is
 * fully written.
 */
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
{
	struct inode *inode = ordered_extent->inode;
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans = NULL;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_state *cached_state = NULL;
	u64 start, end;
	int compress_type = 0;
	int ret = 0;
	u64 logical_len = ordered_extent->num_bytes;
	bool freespace_inode;
	bool truncated = false;
	bool range_locked = false;
	bool clear_new_delalloc_bytes = false;
	bool clear_reserved_extent = true;
	unsigned int clear_bits;

	start = ordered_extent->file_offset;
	end = start + ordered_extent->num_bytes - 1;

	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
	    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
	    !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
		clear_new_delalloc_bytes = true;

	freespace_inode = btrfs_is_free_space_inode(BTRFS_I(inode));

	if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
		ret = -EIO;
		goto out;
	}

	btrfs_free_io_failure_record(BTRFS_I(inode), start, end);

	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
		truncated = true;
		logical_len = ordered_extent->truncated_len;
		/* Truncated the entire extent, don't bother adding */
		if (!logical_len)
			goto out;
	}

	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */

		/*
		 * For mwrite(mmap + memset to write) case, we still reserve
		 * space for NOCOW range.
		 * As NOCOW won't cause a new delayed ref, just free the space
		 */
		btrfs_qgroup_free_data(inode, NULL, start,
				       ordered_extent->num_bytes);
		btrfs_inode_safe_disk_i_size_write(inode, 0);
		if (freespace_inode)
			trans = btrfs_join_transaction_spacecache(root);
		else
			trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			goto out;
		}
		trans->block_rsv = &BTRFS_I(inode)->block_rsv;
		ret = btrfs_update_inode_fallback(trans, root, inode);
		if (ret) /* -ENOMEM or corruption */
			btrfs_abort_transaction(trans, ret);
		goto out;
	}

	range_locked = true;
	lock_extent_bits(io_tree, start, end, &cached_state);

	if (freespace_inode)
		trans = btrfs_join_transaction_spacecache(root);
	else
		trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		goto out;
	}

	trans->block_rsv = &BTRFS_I(inode)->block_rsv;

	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
		compress_type = ordered_extent->compress_type;
	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
		BUG_ON(compress_type);
		btrfs_qgroup_free_data(inode, NULL, start,
				       ordered_extent->num_bytes);
		ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
						ordered_extent->file_offset,
						ordered_extent->file_offset +
						logical_len);
	} else {
		BUG_ON(root == fs_info->tree_root);
		ret = insert_reserved_file_extent(trans, inode, start,
						ordered_extent->disk_bytenr,
						ordered_extent->disk_num_bytes,
						logical_len, logical_len,
						compress_type, 0, 0,
						BTRFS_FILE_EXTENT_REG);
		if (!ret) {
			clear_reserved_extent = false;
			btrfs_release_delalloc_bytes(fs_info,
						ordered_extent->disk_bytenr,
						ordered_extent->disk_num_bytes);
		}
	}
	unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
			   ordered_extent->file_offset,
			   ordered_extent->num_bytes, trans->transid);
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	ret = add_pending_csums(trans, inode, &ordered_extent->list);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	btrfs_inode_safe_disk_i_size_write(inode, 0);
	ret = btrfs_update_inode_fallback(trans, root, inode);
	if (ret) { /* -ENOMEM or corruption */
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	ret = 0;
out:
	clear_bits = EXTENT_DEFRAG;
	if (range_locked)
		clear_bits |= EXTENT_LOCKED;
	if (clear_new_delalloc_bytes)
		clear_bits |= EXTENT_DELALLOC_NEW;
	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits,
			 (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0,
			 &cached_state);

	if (trans)
		btrfs_end_transaction(trans);

	if (ret || truncated) {
		u64 unwritten_start = start;

		if (truncated)
			unwritten_start += logical_len;
		clear_extent_uptodate(io_tree, unwritten_start, end, NULL);

		/* Drop the cache for the part of the extent we didn't write. */
		btrfs_drop_extent_cache(BTRFS_I(inode), unwritten_start, end, 0);

		/*
		 * If the ordered extent had an IOERR or something else went
		 * wrong we need to return the space for this ordered extent
		 * back to the allocator.  We only free the extent in the
		 * truncated case if we didn't write out the extent at all.
		 *
		 * If we made it past insert_reserved_file_extent before we
		 * errored out then we don't need to do this as the accounting
		 * has already been done.
		 */
		if ((ret || !logical_len) &&
		    clear_reserved_extent &&
		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
			/*
			 * Discard the range before returning it back to the
			 * free space pool
			 */
			if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
				btrfs_discard_extent(fs_info,
						ordered_extent->disk_bytenr,
						ordered_extent->disk_num_bytes,
						NULL);
			btrfs_free_reserved_extent(fs_info,
					ordered_extent->disk_bytenr,
					ordered_extent->disk_num_bytes, 1);
		}
	}

	/*
	 * This needs to be done to make sure anybody waiting knows we are done
	 * updating everything for this ordered extent.
	 */
	btrfs_remove_ordered_extent(inode, ordered_extent);

	/* once for us */
	btrfs_put_ordered_extent(ordered_extent);
	/* once for the tree */
	btrfs_put_ordered_extent(ordered_extent);

	return ret;
}

static void finish_ordered_fn(struct btrfs_work *work)
{
	struct btrfs_ordered_extent *ordered_extent;
	ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
	btrfs_finish_ordered_io(ordered_extent);
}

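/*
 * Writeback endio hook: once the last pending byte in the range is written,
 * hand the ordered extent to finish_ordered_fn on the appropriate endio
 * workqueue so btrfs_finish_ordered_io() completes it.
 */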
void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
					  u64 end, int uptodate)
{
	struct inode *inode = page->mapping->host;
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_ordered_extent *ordered_extent = NULL;
	struct btrfs_workqueue *wq;

	trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);

	ClearPagePrivate2(page);
	if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
					    end - start + 1, uptodate))
		return;

	if (btrfs_is_free_space_inode(BTRFS_I(inode)))
		wq = fs_info->endio_freespace_worker;
	else
		wq = fs_info->endio_write_workers;

	btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
	btrfs_queue_work(wq, &ordered_extent->work);
}

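/*
 * Verify the checksum of one page region of a read bio against the csums
 * carried in io_bio.  On mismatch the region is overwritten so stale data is
 * not exposed, the error is reported and -EIO is returned.
 */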
static int __readpage_endio_check(struct inode *inode,
				  struct btrfs_io_bio *io_bio,
				  int icsum, struct page *page,
				  int pgoff, u64 start, size_t len)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	char *kaddr;
	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
	u8 *csum_expected;
	u8 csum[BTRFS_CSUM_SIZE];

	csum_expected = ((u8 *)io_bio->csum) + icsum * csum_size;

	kaddr = kmap_atomic(page);
	shash->tfm = fs_info->csum_shash;

	crypto_shash_init(shash);
	crypto_shash_update(shash, kaddr + pgoff, len);
	crypto_shash_final(shash, csum);

	if (memcmp(csum, csum_expected, csum_size))
		goto zeroit;

	kunmap_atomic(kaddr);
	return 0;
zeroit:
	btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
				    io_bio->mirror_num);
	memset(kaddr + pgoff, 1, len);
	flush_dcache_page(page);
	kunmap_atomic(kaddr);
	return -EIO;
}

/*
 * when reads are done, we need to check csums to verify the data is correct
 * if there's a match, we allow the bio to finish.  If not, the code in
 * extent_io.c will try to find good copies for us.
 */
static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
				      u64 phy_offset, struct page *page,
				      u64 start, u64 end, int mirror)
{
	size_t offset = start - page_offset(page);
	struct inode *inode = page->mapping->host;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_root *root = BTRFS_I(inode)->root;

	if (PageChecked(page)) {
		ClearPageChecked(page);
		return 0;
	}

	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
		return 0;

	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
	    test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
		clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
		return 0;
	}

	phy_offset >>= inode->i_sb->s_blocksize_bits;
	return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
				      start, (size_t)(end - start + 1));
}

/*
 * btrfs_add_delayed_iput - perform a delayed iput on @inode
 *
 * @inode: The inode we want to perform iput on
 *
 * This function uses the generic vfs_inode::i_count to track whether we should
 * just decrement it (in case it's > 1) or if this is the last iput then link
 * the inode to the delayed iput machinery. Delayed iputs are processed at
 * transaction commit time/superblock commit/cleaner kthread.
 */
void btrfs_add_delayed_iput(struct inode *inode)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_inode *binode = BTRFS_I(inode);

	if (atomic_add_unless(&inode->i_count, -1, 1))
		return;

	atomic_inc(&fs_info->nr_delayed_iputs);
	spin_lock(&fs_info->delayed_iput_lock);
	ASSERT(list_empty(&binode->delayed_iput));
	list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
	spin_unlock(&fs_info->delayed_iput_lock);
	if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
		wake_up_process(fs_info->cleaner_kthread);
}

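/*
 * Run a single delayed iput.  Called with fs_info->delayed_iput_lock held;
 * the lock is dropped around the iput() itself and re-taken before returning.
 */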
static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
				    struct btrfs_inode *inode)
{
	list_del_init(&inode->delayed_iput);
	spin_unlock(&fs_info->delayed_iput_lock);
	iput(&inode->vfs_inode);
	if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
		wake_up(&fs_info->delayed_iputs_wait);
	spin_lock(&fs_info->delayed_iput_lock);
}

static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
				   struct btrfs_inode *inode)
{
	if (!list_empty(&inode->delayed_iput)) {
		spin_lock(&fs_info->delayed_iput_lock);
		if (!list_empty(&inode->delayed_iput))
			run_delayed_iput_locked(fs_info, inode);
		spin_unlock(&fs_info->delayed_iput_lock);
	}
}

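/*
 * Process every inode currently queued for a delayed iput.  As noted above,
 * delayed iputs are run at transaction commit time, superblock commit and by
 * the cleaner kthread.
 */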
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
{
	spin_lock(&fs_info->delayed_iput_lock);
	while (!list_empty(&fs_info->delayed_iputs)) {
		struct btrfs_inode *inode;

		inode = list_first_entry(&fs_info->delayed_iputs,
				struct btrfs_inode, delayed_iput);
		run_delayed_iput_locked(fs_info, inode);
	}
	spin_unlock(&fs_info->delayed_iput_lock);
}

/**
 * btrfs_wait_on_delayed_iputs - wait on the delayed iputs to be done running
 * @fs_info - the fs_info for this fs
 * @return - EINTR if we were killed, 0 if nothing's pending
 *
 * This will wait on any delayed iputs that are currently running with KILLABLE
 * set.  Once they are all done running we will return, unless we are killed in
 * which case we return EINTR. This helps in user operations like fallocate etc
 * that might get blocked on the iputs.
 */
int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
{
	int ret = wait_event_killable(fs_info->delayed_iputs_wait,
			atomic_read(&fs_info->nr_delayed_iputs) == 0);
	if (ret)
		return -EINTR;
	return 0;
}

/*
 * This creates an orphan entry for the given inode in case something goes wrong
 * in the middle of an unlink.
 */
int btrfs_orphan_add(struct btrfs_trans_handle *trans,
		     struct btrfs_inode *inode)
{
	int ret;

	ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
	if (ret && ret != -EEXIST) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	return 0;
}
}

/*
2898 2899
 * We have done the delete so we can go ahead and remove the orphan item for
 * this particular inode.
2900
 */
2901
static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
2902
			    struct btrfs_inode *inode)
2903
{
2904
	return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
2905 2906 2907 2908 2909 2910
}

/*
 * this cleans up any orphans that may be left on the list from the last use
 * of this root.
 */
2911
int btrfs_orphan_cleanup(struct btrfs_root *root)
2912
{
2913
	struct btrfs_fs_info *fs_info = root->fs_info;
2914 2915 2916 2917 2918
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key, found_key;
	struct btrfs_trans_handle *trans;
	struct inode *inode;
2919
	u64 last_objectid = 0;
2920
	int ret = 0, nr_unlink = 0;
2921

2922
	if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
2923
		return 0;
2924 2925

	path = btrfs_alloc_path();
2926 2927 2928 2929
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}
2930
	path->reada = READA_BACK;
2931 2932

	key.objectid = BTRFS_ORPHAN_OBJECTID;
2933
	key.type = BTRFS_ORPHAN_ITEM_KEY;
2934 2935 2936 2937
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2938 2939
		if (ret < 0)
			goto out;
2940 2941 2942

		/*
		 * if ret == 0 means we found what we were searching for, which
L
Lucas De Marchi 已提交
2943
		 * is weird, but possible, so only screw with path if we didn't
2944 2945 2946
		 * find the key and see if we have stuff that matches
		 */
		if (ret > 0) {
2947
			ret = 0;
2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}

		/* pull out the item */
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		/* make sure the item matches what we want */
		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
			break;
2960
		if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
2961 2962 2963
			break;

		/* release the path since we're done with it */
2964
		btrfs_release_path(path);
2965 2966 2967 2968 2969 2970

		/*
		 * this is where we are basically btrfs_lookup, without the
		 * crossing root thing.  we store the inode number in the
		 * offset of the orphan item.
		 */
2971 2972

		if (found_key.offset == last_objectid) {
2973 2974
			btrfs_err(fs_info,
				  "Error removing orphan entry, stopping orphan cleanup");
2975 2976 2977 2978 2979 2980
			ret = -EINVAL;
			goto out;
		}

		last_objectid = found_key.offset;

2981 2982 2983
		found_key.objectid = found_key.offset;
		found_key.type = BTRFS_INODE_ITEM_KEY;
		found_key.offset = 0;
2984
		inode = btrfs_iget(fs_info->sb, &found_key, root);
2985
		ret = PTR_ERR_OR_ZERO(inode);
2986
		if (ret && ret != -ENOENT)
2987
			goto out;
2988

2989
		if (ret == -ENOENT && root == fs_info->tree_root) {
2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019
			struct btrfs_root *dead_root;
			struct btrfs_fs_info *fs_info = root->fs_info;
			int is_dead_root = 0;

			/*
			 * this is an orphan in the tree root. Currently these
			 * could come from 2 sources:
			 *  a) a snapshot deletion in progress
			 *  b) a free space cache inode
			 * We need to distinguish those two, as the snapshot
			 * orphan must not get deleted.
			 * find_dead_roots already ran before us, so if this
			 * is a snapshot deletion, we should find the root
			 * in the dead_roots list
			 */
			spin_lock(&fs_info->trans_lock);
			list_for_each_entry(dead_root, &fs_info->dead_roots,
					    root_list) {
				if (dead_root->root_key.objectid ==
				    found_key.objectid) {
					is_dead_root = 1;
					break;
				}
			}
			spin_unlock(&fs_info->trans_lock);
			if (is_dead_root) {
				/* prevent this orphan from being found again */
				key.offset = found_key.objectid - 1;
				continue;
			}
3020

3021
		}
3022

3023
		/*
3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040
		 * If we have an inode with links, there are a couple of
		 * possibilities. Old kernels (before v3.12) used to create an
		 * orphan item for truncate indicating that there were possibly
		 * extent items past i_size that needed to be deleted. In v3.12,
		 * truncate was changed to update i_size in sync with the extent
		 * items, but the (useless) orphan item was still created. Since
		 * v4.18, we don't create the orphan item for truncate at all.
		 *
		 * So, this item could mean that we need to do a truncate, but
		 * only if this filesystem was last used on a pre-v3.12 kernel
		 * and was not cleanly unmounted. The odds of that are quite
		 * slim, and it's a pain to do the truncate now, so just delete
		 * the orphan item.
		 *
		 * It's also possible that this orphan item was supposed to be
		 * deleted but wasn't. The inode number may have been reused,
		 * but either way, we can delete the orphan item.
3041
		 */
3042 3043 3044
		if (ret == -ENOENT || inode->i_nlink) {
			if (!ret)
				iput(inode);
J
Josef Bacik 已提交
3045
			trans = btrfs_start_transaction(root, 1);
3046 3047 3048 3049
			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				goto out;
			}
3050 3051
			btrfs_debug(fs_info, "auto deleting %Lu",
				    found_key.objectid);
J
Josef Bacik 已提交
3052 3053
			ret = btrfs_del_orphan_item(trans, root,
						    found_key.objectid);
3054
			btrfs_end_transaction(trans);
3055 3056
			if (ret)
				goto out;
3057 3058 3059
			continue;
		}

3060
		nr_unlink++;
3061 3062 3063 3064

		/* this will do delete_inode and everything for us */
		iput(inode);
	}
3065 3066 3067
	/* release the path since we're done with it */
	btrfs_release_path(path);

3068 3069
	root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;

3070
	if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3071
		trans = btrfs_join_transaction(root);
3072
		if (!IS_ERR(trans))
3073
			btrfs_end_transaction(trans);
3074
	}
3075 3076

	if (nr_unlink)
3077
		btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
3078 3079 3080

out:
	if (ret)
3081
		btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
3082 3083
	btrfs_free_path(path);
	return ret;
3084 3085
}

/*
 * very simple check to peek ahead in the leaf looking for xattrs.  If we
 * don't find any xattrs, we know there can't be any acls.
 *
 * slot is the slot the inode is in, objectid is the objectid of the inode
 */
static noinline int acls_after_inode_item(struct extent_buffer *leaf,
					  int slot, u64 objectid,
					  int *first_xattr_slot)
{
	u32 nritems = btrfs_header_nritems(leaf);
	struct btrfs_key found_key;
	static u64 xattr_access = 0;
	static u64 xattr_default = 0;
	int scanned = 0;

	if (!xattr_access) {
		xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
					strlen(XATTR_NAME_POSIX_ACL_ACCESS));
		xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
					strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
	}

	slot++;
	*first_xattr_slot = -1;
	while (slot < nritems) {
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		/* we found a different objectid, there must not be acls */
		if (found_key.objectid != objectid)
			return 0;

		/* we found an xattr, assume we've got an acl */
		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
			if (*first_xattr_slot == -1)
				*first_xattr_slot = slot;
			if (found_key.offset == xattr_access ||
			    found_key.offset == xattr_default)
				return 1;
		}

		/*
		 * we found a key greater than an xattr key, there can't
		 * be any acls later on
		 */
		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
			return 0;

		slot++;
		scanned++;

		/*
		 * it goes inode, inode backrefs, xattrs, extents,
		 * so if there are a ton of hard links to an inode there can
		 * be a lot of backrefs.  Don't waste time searching too hard,
		 * this is just an optimization
		 */
		if (scanned >= 8)
			break;
	}
	/* we hit the end of the leaf before we found an xattr or
	 * something larger than an xattr.  We have to assume the inode
	 * has acls
	 */
	if (*first_xattr_slot == -1)
		*first_xattr_slot = slot;
	return 1;
}

/*
 * read an inode from the btree into the in-memory inode
 */
static int btrfs_read_locked_inode(struct inode *inode,
				   struct btrfs_path *in_path)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_path *path = in_path;
	struct extent_buffer *leaf;
	struct btrfs_inode_item *inode_item;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_key location;
	unsigned long ptr;
	int maybe_acls;
	u32 rdev;
	int ret;
	bool filled = false;
	int first_xattr_slot;

	ret = btrfs_fill_inode(inode, &rdev);
	if (!ret)
		filled = true;

	if (!path) {
		path = btrfs_alloc_path();
		if (!path)
			return -ENOMEM;
	}

	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));

	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
	if (ret) {
		if (path != in_path)
			btrfs_free_path(path);
		return ret;
	}

	leaf = path->nodes[0];

	if (filled)
		goto cache_index;

	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);
	inode->i_mode = btrfs_inode_mode(leaf, inode_item);
	set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
	i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
	i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
	btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
	btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
			round_up(i_size_read(inode), fs_info->sectorsize));

	inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
	inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);

	inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
	inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);

	inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);

	BTRFS_I(inode)->i_otime.tv_sec =
		btrfs_timespec_sec(leaf, &inode_item->otime);
	BTRFS_I(inode)->i_otime.tv_nsec =
		btrfs_timespec_nsec(leaf, &inode_item->otime);

	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);

	inode_set_iversion_queried(inode,
				   btrfs_inode_sequence(leaf, inode_item));
	inode->i_generation = BTRFS_I(inode)->generation;
	inode->i_rdev = 0;
	rdev = btrfs_inode_rdev(leaf, inode_item);

	BTRFS_I(inode)->index_cnt = (u64)-1;
	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
cache_index:
	/*
	 * If we were modified in the current generation and evicted from memory
	 * and then re-read we need to do a full sync since we don't have any
	 * idea about which extents were modified before we were evicted from
	 * cache.
	 *
	 * This is required for both inode re-read from disk and delayed inode
	 * in delayed_nodes_tree.
	 */
	if (BTRFS_I(inode)->last_trans == fs_info->generation)
		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
			&BTRFS_I(inode)->runtime_flags);

	/*
	 * We don't persist the id of the transaction where an unlink operation
	 * against the inode was last made. So here we assume the inode might
	 * have been evicted, and therefore the exact value of last_unlink_trans
	 * lost, and set it to last_trans to avoid metadata inconsistencies
	 * between the inode and its parent if the inode is fsync'ed and the log
	 * replayed. For example, in the scenario:
	 *
	 * touch mydir/foo
	 * ln mydir/foo mydir/bar
	 * sync
	 * unlink mydir/bar
	 * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
	 * xfs_io -c fsync mydir/foo
	 * <power failure>
	 * mount fs, triggers fsync log replay
	 *
	 * We must make sure that when we fsync our inode foo we also log its
	 * parent inode, otherwise after log replay the parent still has the
	 * dentry with the "bar" name but our inode foo has a link count of 1
	 * and doesn't have an inode ref with the name "bar" anymore.
	 *
	 * Setting last_unlink_trans to last_trans is a pessimistic approach,
	 * but it guarantees correctness at the expense of occasional full
	 * transaction commits on fsync if our inode is a directory, or if our
	 * inode is not a directory, logging its parent unnecessarily.
	 */
	BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;

	path->slots[0]++;
	if (inode->i_nlink != 1 ||
	    path->slots[0] >= btrfs_header_nritems(leaf))
		goto cache_acl;

	btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
	if (location.objectid != btrfs_ino(BTRFS_I(inode)))
		goto cache_acl;

	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
	if (location.type == BTRFS_INODE_REF_KEY) {
		struct btrfs_inode_ref *ref;

		ref = (struct btrfs_inode_ref *)ptr;
		BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
	} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
		struct btrfs_inode_extref *extref;

		extref = (struct btrfs_inode_extref *)ptr;
		BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
								     extref);
	}
cache_acl:
	/*
	 * try to precache a NULL acl entry for files that don't have
	 * any xattrs or acls
	 */
	maybe_acls = acls_after_inode_item(leaf, path->slots[0],
			btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
	if (first_xattr_slot != -1) {
		path->slots[0] = first_xattr_slot;
		ret = btrfs_load_inode_props(inode, path);
		if (ret)
			btrfs_err(fs_info,
				  "error loading props for ino %llu (root %llu): %d",
				  btrfs_ino(BTRFS_I(inode)),
				  root->root_key.objectid, ret);
	}
	if (path != in_path)
		btrfs_free_path(path);

	if (!maybe_acls)
		cache_no_acl(inode);

	switch (inode->i_mode & S_IFMT) {
	case S_IFREG:
		inode->i_mapping->a_ops = &btrfs_aops;
		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
		inode->i_fop = &btrfs_file_operations;
		inode->i_op = &btrfs_file_inode_operations;
		break;
	case S_IFDIR:
		inode->i_fop = &btrfs_dir_file_operations;
		inode->i_op = &btrfs_dir_inode_operations;
		break;
	case S_IFLNK:
		inode->i_op = &btrfs_symlink_inode_operations;
		inode_nohighmem(inode);
		inode->i_mapping->a_ops = &btrfs_aops;
		break;
	default:
		inode->i_op = &btrfs_special_inode_operations;
		init_special_inode(inode, inode->i_mode, rdev);
		break;
	}

	btrfs_sync_inode_flags_to_i_flags(inode);
	return 0;
}

/*
 * given a leaf and an inode, copy the inode fields into the leaf
 */
static void fill_inode_item(struct btrfs_trans_handle *trans,
			    struct extent_buffer *leaf,
			    struct btrfs_inode_item *item,
			    struct inode *inode)
{
	struct btrfs_map_token token;

	btrfs_init_map_token(&token, leaf);

	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
	btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
				   &token);
	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);

	btrfs_set_token_timespec_sec(leaf, &item->atime,
				     inode->i_atime.tv_sec, &token);
	btrfs_set_token_timespec_nsec(leaf, &item->atime,
				      inode->i_atime.tv_nsec, &token);

	btrfs_set_token_timespec_sec(leaf, &item->mtime,
				     inode->i_mtime.tv_sec, &token);
	btrfs_set_token_timespec_nsec(leaf, &item->mtime,
				      inode->i_mtime.tv_nsec, &token);

	btrfs_set_token_timespec_sec(leaf, &item->ctime,
				     inode->i_ctime.tv_sec, &token);
	btrfs_set_token_timespec_nsec(leaf, &item->ctime,
				      inode->i_ctime.tv_nsec, &token);

	btrfs_set_token_timespec_sec(leaf, &item->otime,
				     BTRFS_I(inode)->i_otime.tv_sec, &token);
	btrfs_set_token_timespec_nsec(leaf, &item->otime,
				      BTRFS_I(inode)->i_otime.tv_nsec, &token);

	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
				     &token);
	btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
					 &token);
	btrfs_set_token_inode_sequence(leaf, item, inode_peek_iversion(inode),
				       &token);
	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
}

/*
 * copy everything in the in-memory inode into the btree.
 */
static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
				struct btrfs_root *root, struct inode *inode)
{
	struct btrfs_inode_item *inode_item;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->leave_spinning = 1;
	ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
				 1);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		goto failed;
	}

	leaf = path->nodes[0];
	inode_item = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_inode_item);

	fill_inode_item(trans, leaf, inode_item, inode);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_set_inode_last_trans(trans, inode);
	ret = 0;
failed:
	btrfs_free_path(path);
	return ret;
}

/*
 * copy everything in the in-memory inode into the btree.
 */
noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
				struct btrfs_root *root, struct inode *inode)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	/*
	 * If the inode is a free space inode, we can deadlock during commit
	 * if we put it into the delayed code.
	 *
	 * The data relocation inode should also be directly updated
	 * without delay
	 */
	if (!btrfs_is_free_space_inode(BTRFS_I(inode))
	    && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
	    && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
		btrfs_update_root_times(trans, root);

		ret = btrfs_delayed_update_inode(trans, root, inode);
		if (!ret)
			btrfs_set_inode_last_trans(trans, inode);
		return ret;
	}

	return btrfs_update_inode_item(trans, root, inode);
}

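/*
 * Variant of btrfs_update_inode() for callers that cannot tolerate -ENOSPC
 * from the delayed-inode path: fall back to updating the inode item in the
 * tree directly.
 */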
noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct inode *inode)
{
	int ret;

	ret = btrfs_update_inode(trans, root, inode);
	if (ret == -ENOSPC)
		return btrfs_update_inode_item(trans, root, inode);
	return ret;
}

C
Chris Mason 已提交
3478 3479 3480 3481 3482
/*
 * unlink helper that gets used here in inode.c and in the tree logging
 * recovery code.  It remove a link in a directory with a given name, and
 * also drops the back refs in the inode to the directory
 */
3483 3484
static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
3485 3486
				struct btrfs_inode *dir,
				struct btrfs_inode *inode,
3487
				const char *name, int name_len)
C
Chris Mason 已提交
3488
{
3489
	struct btrfs_fs_info *fs_info = root->fs_info;
C
Chris Mason 已提交
3490 3491 3492
	struct btrfs_path *path;
	int ret = 0;
	struct btrfs_dir_item *di;
3493
	u64 index;
L
Li Zefan 已提交
3494 3495
	u64 ino = btrfs_ino(inode);
	u64 dir_ino = btrfs_ino(dir);
C
Chris Mason 已提交
3496 3497

	path = btrfs_alloc_path();
3498 3499
	if (!path) {
		ret = -ENOMEM;
3500
		goto out;
3501 3502
	}

3503
	path->leave_spinning = 1;
L
Li Zefan 已提交
3504
	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
C
Chris Mason 已提交
3505
				    name, name_len, -1);
3506 3507
	if (IS_ERR_OR_NULL(di)) {
		ret = di ? PTR_ERR(di) : -ENOENT;
C
Chris Mason 已提交
3508 3509 3510
		goto err;
	}
	ret = btrfs_delete_one_dir_name(trans, root, path, di);
3511 3512
	if (ret)
		goto err;
3513
	btrfs_release_path(path);
C
Chris Mason 已提交
3514

3515 3516 3517 3518 3519 3520 3521 3522 3523 3524
	/*
	 * If we don't have dir index, we have to get it by looking up
	 * the inode ref, since we get the inode ref, remove it directly,
	 * it is unnecessary to do delayed deletion.
	 *
	 * But if we have dir index, needn't search inode ref to get it.
	 * Since the inode ref is close to the inode item, it is better
	 * that we delay to delete it, and just do this deletion when
	 * we update the inode item.
	 */
3525
	if (inode->dir_index) {
3526 3527
		ret = btrfs_delayed_delete_inode_ref(inode);
		if (!ret) {
3528
			index = inode->dir_index;
3529 3530 3531 3532
			goto skip_backref;
		}
	}

L
Li Zefan 已提交
3533 3534
	ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
				  dir_ino, &index);
3535
	if (ret) {
3536
		btrfs_info(fs_info,
3537
			"failed to delete reference to %.*s, inode %llu parent %llu",
3538
			name_len, name, ino, dir_ino);
3539
		btrfs_abort_transaction(trans, ret);
3540 3541
		goto err;
	}
3542
skip_backref:
3543
	ret = btrfs_delete_delayed_dir_index(trans, dir, index);
3544
	if (ret) {
3545
		btrfs_abort_transaction(trans, ret);
C
Chris Mason 已提交
3546
		goto err;
3547
	}
C
Chris Mason 已提交
3548

3549 3550
	ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
			dir_ino);
3551
	if (ret != 0 && ret != -ENOENT) {
3552
		btrfs_abort_transaction(trans, ret);
3553 3554
		goto err;
	}
3555

3556 3557
	ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
			index);
3558 3559
	if (ret == -ENOENT)
		ret = 0;
3560
	else if (ret)
3561
		btrfs_abort_transaction(trans, ret);
3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572

	/*
	 * If we have a pending delayed iput we could end up with the final iput
	 * being run in btrfs-cleaner context.  If we have enough of these built
	 * up we can end up burning a lot of time in btrfs-cleaner without any
	 * way to throttle the unlinks.  Since we're currently holding a ref on
	 * the inode we can run the delayed iput here without any issues as the
	 * final iput won't be done until after we drop the ref we're currently
	 * holding.
	 */
	btrfs_run_delayed_iput(fs_info, inode);
C
Chris Mason 已提交
3573 3574
err:
	btrfs_free_path(path);
3575 3576 3577
	if (ret)
		goto out;

3578
	btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2);
3579 3580 3581 3582 3583
	inode_inc_iversion(&inode->vfs_inode);
	inode_inc_iversion(&dir->vfs_inode);
	inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime =
		dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
	ret = btrfs_update_inode(trans, root, &dir->vfs_inode);
3584
out:
C
Chris Mason 已提交
3585 3586 3587
	return ret;
}

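/*
 * Wrapper around __btrfs_unlink_inode() that also drops the victim inode's
 * link count and writes back its inode item.
 */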
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root,
		       struct btrfs_inode *dir, struct btrfs_inode *inode,
		       const char *name, int name_len)
{
	int ret;
	ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
	if (!ret) {
		drop_nlink(&inode->vfs_inode);
		ret = btrfs_update_inode(trans, root, &inode->vfs_inode);
	}
	return ret;
}

/*
 * helper to start transaction for unlink and rmdir.
 *
 * unlink and rmdir are special in btrfs, they do not always free space, so
 * if we cannot make our reservations the normal way try and see if there is
 * plenty of slack room in the global reserve to migrate, otherwise we cannot
 * allow the unlink to occur.
 */
static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
{
	struct btrfs_root *root = BTRFS_I(dir)->root;

	/*
	 * 1 for the possible orphan item
	 * 1 for the dir item
	 * 1 for the dir index
	 * 1 for the inode ref
	 * 1 for the inode
	 */
	return btrfs_start_transaction_fallback_global_rsv(root, 5);
}

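/*
 * Unlink a regular file or symlink: record the operation for the tree log,
 * remove the directory entry and, once the link count hits zero, put the
 * inode on the orphan list so its space can be reclaimed.
 */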
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
{
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_trans_handle *trans;
	struct inode *inode = d_inode(dentry);
	int ret;

	trans = __unlink_start_trans(dir);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
			0);

	ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
			BTRFS_I(d_inode(dentry)), dentry->d_name.name,
			dentry->d_name.len);
	if (ret)
		goto out;

	if (inode->i_nlink == 0) {
		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
		if (ret)
			goto out;
	}

out:
	btrfs_end_transaction(trans);
	btrfs_btree_balance_dirty(root->fs_info);
	return ret;
}

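/*
 * Remove the directory entry for a subvolume: delete the dir item, the root
 * ref (or the dangling dir index item for a placeholder snapshot entry) and
 * the dir index, then update the parent directory's size and timestamps.
 */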
static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
			       struct inode *dir, struct dentry *dentry)
{
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key key;
	const char *name = dentry->d_name.name;
	int name_len = dentry->d_name.len;
	u64 index;
	int ret;
	u64 objectid;
	u64 dir_ino = btrfs_ino(BTRFS_I(dir));

	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
		objectid = inode->root->root_key.objectid;
	} else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
		objectid = inode->location.objectid;
	} else {
		WARN_ON(1);
		return -EINVAL;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
				   name, name_len, -1);
	if (IS_ERR_OR_NULL(di)) {
		ret = di ? PTR_ERR(di) : -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	btrfs_dir_item_key_to_cpu(leaf, di, &key);
	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
	ret = btrfs_delete_one_dir_name(trans, root, path, di);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	btrfs_release_path(path);

	/*
	 * This is a placeholder inode for a subvolume we didn't have a
	 * reference to at the time of the snapshot creation.  In the meantime
	 * we could have renamed the real subvol link into our snapshot, so
	 * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
	 * Instead simply lookup the dir_index_item for this entry so we can
	 * remove it.  Otherwise we know we have a ref to the root and we can
	 * call btrfs_del_root_ref, and it _shouldn't_ fail.
	 */
	if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
		di = btrfs_search_dir_index_item(root, path, dir_ino,
						 name, name_len);
		if (IS_ERR_OR_NULL(di)) {
			if (!di)
				ret = -ENOENT;
			else
				ret = PTR_ERR(di);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		index = key.offset;
		btrfs_release_path(path);
	} else {
		ret = btrfs_del_root_ref(trans, objectid,
					 root->root_key.objectid, dir_ino,
					 &index, name, name_len);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	}

	ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2);
	inode_inc_iversion(dir);
	dir->i_mtime = dir->i_ctime = current_time(dir);
	ret = btrfs_update_inode_fallback(trans, root, dir);
	if (ret)
		btrfs_abort_transaction(trans, ret);
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Helper to check if the subvolume references other subvolumes or if it's
 * default.
 */
static noinline int may_destroy_subvol(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct btrfs_dir_item *di;
	struct btrfs_key key;
	u64 dir_id;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Make sure this root isn't set as the default subvol */
	dir_id = btrfs_super_root_dir(fs_info->super_copy);
	di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
				   dir_id, "default", 7, 0);
	if (di && !IS_ERR(di)) {
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
		if (key.objectid == root->root_key.objectid) {
			ret = -EPERM;
			btrfs_err(fs_info,
				  "deleting default subvolume %llu is not allowed",
				  key.objectid);
			goto out;
		}
		btrfs_release_path(path);
	}

	key.objectid = root->root_key.objectid;
	key.type = BTRFS_ROOT_REF_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	BUG_ON(ret == 0);

	ret = 0;
	if (path->slots[0] > 0) {
		path->slots[0]--;
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid == root->root_key.objectid &&
		    key.type == BTRFS_ROOT_REF_KEY)
			ret = -ENOTEMPTY;
	}
out:
	btrfs_free_path(path);
	return ret;
}

/* Delete all dentries for inodes belonging to the root */
static void btrfs_prune_dentries(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *node;
	struct rb_node *prev;
	struct btrfs_inode *entry;
	struct inode *inode;
	u64 objectid = 0;

	if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
		WARN_ON(btrfs_root_refs(&root->root_item) != 0);

	spin_lock(&root->inode_lock);
again:
	node = root->inode_tree.rb_node;
	prev = NULL;
	while (node) {
		prev = node;
		entry = rb_entry(node, struct btrfs_inode, rb_node);

		if (objectid < btrfs_ino(entry))
			node = node->rb_left;
		else if (objectid > btrfs_ino(entry))
			node = node->rb_right;
		else
			break;
	}
	if (!node) {
		while (prev) {
			entry = rb_entry(prev, struct btrfs_inode, rb_node);
			if (objectid <= btrfs_ino(entry)) {
				node = prev;
				break;
			}
			prev = rb_next(prev);
		}
	}
	while (node) {
		entry = rb_entry(node, struct btrfs_inode, rb_node);
		objectid = btrfs_ino(entry) + 1;
		inode = igrab(&entry->vfs_inode);
		if (inode) {
			spin_unlock(&root->inode_lock);
			if (atomic_read(&inode->i_count) > 1)
				d_prune_aliases(inode);
			/*
			 * btrfs_drop_inode will have it removed from the inode
			 * cache when its usage count hits zero.
			 */
			iput(inode);
			cond_resched();
			spin_lock(&root->inode_lock);
			goto again;
		}

		if (cond_resched_lock(&root->inode_lock))
			goto again;

		node = rb_next(node);
	}
	spin_unlock(&root->inode_lock);
}

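/*
 * Delete a whole subvolume: mark the root dead, unlink it from its parent
 * directory, drop its root refs and insert an orphan item so the cleaner
 * thread can drop the actual tree in the background.
 */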
int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct inode *inode = d_inode(dentry);
	struct btrfs_root *dest = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	struct btrfs_block_rsv block_rsv;
	u64 root_flags;
	int ret;
	int err;

	/*
	 * Don't allow to delete a subvolume with send in progress. This is
	 * inside the inode lock so the error handling that has to drop the bit
	 * again is not run concurrently.
	 */
	spin_lock(&dest->root_item_lock);
	if (dest->send_in_progress) {
		spin_unlock(&dest->root_item_lock);
		btrfs_warn(fs_info,
			   "attempt to delete subvolume %llu during send",
			   dest->root_key.objectid);
		return -EPERM;
	}
	root_flags = btrfs_root_flags(&dest->root_item);
	btrfs_set_root_flags(&dest->root_item,
			     root_flags | BTRFS_ROOT_SUBVOL_DEAD);
	spin_unlock(&dest->root_item_lock);

	down_write(&fs_info->subvol_sem);

	err = may_destroy_subvol(dest);
	if (err)
		goto out_up_write;

	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
	/*
	 * One for dir inode,
	 * two for dir entries,
	 * two for root ref/backref.
	 */
	err = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
	if (err)
		goto out_up_write;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_release;
	}
	trans->block_rsv = &block_rsv;
	trans->bytes_reserved = block_rsv.size;

	btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));

	ret = btrfs_unlink_subvol(trans, dir, dentry);
	if (ret) {
		err = ret;
		btrfs_abort_transaction(trans, ret);
		goto out_end_trans;
	}

	btrfs_record_root_in_trans(trans, dest);

	memset(&dest->root_item.drop_progress, 0,
		sizeof(dest->root_item.drop_progress));
	dest->root_item.drop_level = 0;
	btrfs_set_root_refs(&dest->root_item, 0);

	if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
		ret = btrfs_insert_orphan_item(trans,
					fs_info->tree_root,
					dest->root_key.objectid);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			err = ret;
			goto out_end_trans;
		}
	}

	ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
				  BTRFS_UUID_KEY_SUBVOL,
				  dest->root_key.objectid);
	if (ret && ret != -ENOENT) {
		btrfs_abort_transaction(trans, ret);
		err = ret;
		goto out_end_trans;
	}
	if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
		ret = btrfs_uuid_tree_remove(trans,
					  dest->root_item.received_uuid,
					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
					  dest->root_key.objectid);
		if (ret && ret != -ENOENT) {
			btrfs_abort_transaction(trans, ret);
			err = ret;
			goto out_end_trans;
		}
	}

out_end_trans:
	trans->block_rsv = NULL;
	trans->bytes_reserved = 0;
	ret = btrfs_end_transaction(trans);
	if (ret && !err)
		err = ret;
	inode->i_flags |= S_DEAD;
out_release:
	btrfs_subvolume_release_metadata(fs_info, &block_rsv);
out_up_write:
	up_write(&fs_info->subvol_sem);
	if (err) {
		spin_lock(&dest->root_item_lock);
		root_flags = btrfs_root_flags(&dest->root_item);
		btrfs_set_root_flags(&dest->root_item,
				root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
		spin_unlock(&dest->root_item_lock);
	} else {
		d_invalidate(dentry);
		btrfs_prune_dentries(dest);
		ASSERT(dest->send_in_progress == 0);

		/* the last ref */
		if (dest->ino_cache_inode) {
			iput(dest->ino_cache_inode);
			dest->ino_cache_inode = NULL;
		}
	}

	return err;
}

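/*
 * rmdir: only empty directories may be removed; subvolumes and snapshot
 * placeholder directories are routed to the subvolume deletion helpers.
 */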
static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);
	int err = 0;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_trans_handle *trans;
	u64 last_unlink_trans;

	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
		return -ENOTEMPTY;
	if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID)
		return btrfs_delete_subvolume(dir, dentry);

	trans = __unlink_start_trans(dir);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
		err = btrfs_unlink_subvol(trans, dir, dentry);
		goto out;
	}

	err = btrfs_orphan_add(trans, BTRFS_I(inode));
	if (err)
		goto out;

	last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;

	/* now the directory is empty */
	err = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
			BTRFS_I(d_inode(dentry)), dentry->d_name.name,
			dentry->d_name.len);
	if (!err) {
		btrfs_i_size_write(BTRFS_I(inode), 0);
		/*
		 * Propagate the last_unlink_trans value of the deleted dir to
		 * its parent directory. This is to prevent an unrecoverable
		 * log tree in the case we do something like this:
		 * 1) create dir foo
		 * 2) create snapshot under dir foo
		 * 3) delete the snapshot
		 * 4) rmdir foo
		 * 5) mkdir foo
		 * 6) fsync foo or some file inside foo
		 */
		if (last_unlink_trans >= trans->transid)
			BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
	}
out:
	btrfs_end_transaction(trans);
	btrfs_btree_balance_dirty(root->fs_info);

	return err;
}

/*
 * Return this if we need to call truncate_block for the last bit of the
 * truncate.
 */
#define NEED_TRUNCATE_BLOCK 1

/*
 * this can truncate away extent items, csum items and directory items.
 * It starts at a high offset and removes keys until it can't find
 * any higher than new_size
 *
 * csum items that cross the new i_size are truncated to the new size
 * as well.
 *
 * min_type is the minimum key type to truncate down to.  If set to 0, this
 * will kill all the items on this inode, including the INODE_ITEM_KEY.
 */
int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct inode *inode,
			       u64 new_size, u32 min_type)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u64 extent_start = 0;
	u64 extent_num_bytes = 0;
	u64 extent_offset = 0;
	u64 item_end = 0;
	u64 last_size = new_size;
	u32 found_type = (u8)-1;
	int found_extent;
	int del_item;
	int pending_del_nr = 0;
	int pending_del_slot = 0;
	int extent_type = -1;
	int ret;
	u64 ino = btrfs_ino(BTRFS_I(inode));
	u64 bytes_deleted = 0;
	bool be_nice = false;
	bool should_throttle = false;
	const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
	struct extent_state *cached_state = NULL;

	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);

	/*
	 * for non-free space inodes and ref cows, we want to back off from
	 * time to time
	 */
	if (!btrfs_is_free_space_inode(BTRFS_I(inode)) &&
	    test_bit(BTRFS_ROOT_REF_COWS, &root->state))
		be_nice = true;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_BACK;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
		lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
				 &cached_state);

	/*
	 * We want to drop from the next block forward in case this new size is
	 * not block aligned since we will be keeping the last block of the
	 * extent just the way it is.
	 */
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
	    root == fs_info->tree_root)
		btrfs_drop_extent_cache(BTRFS_I(inode), ALIGN(new_size,
					fs_info->sectorsize),
					(u64)-1, 0);

	/*
	 * This function is also used to drop the items in the log tree before
	 * we relog the inode, so if root != BTRFS_I(inode)->root, it means
	 * it is used to drop the logged items. So we shouldn't kill the delayed
	 * items.
	 */
	if (min_type == 0 && root == BTRFS_I(inode)->root)
		btrfs_kill_delayed_inode_items(BTRFS_I(inode));

	key.objectid = ino;
	key.offset = (u64)-1;
	key.type = (u8)-1;

search_again:
search_again:
	/*
	 * with a 16K leaf size and 128MB extents, you can actually queue
	 * up a huge file in a single leaf.  Most of the time that
	 * bytes_deleted is > 0, it will be huge by the time we get here
	 */
	if (be_nice && bytes_deleted > SZ_32M &&
	    btrfs_should_end_transaction(trans)) {
		ret = -EAGAIN;
		goto out;
	}

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = 0;
		/* there are no items in the tree for us to truncate, we're
		 * done
		 */
		if (path->slots[0] == 0)
			goto out;
		path->slots[0]--;
	}

	while (1) {
		u64 clear_start = 0, clear_len = 0;

		fi = NULL;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		found_type = found_key.type;

		if (found_key.objectid != ino)
			break;

		if (found_type < min_type)
			break;

		item_end = found_key.offset;
		if (found_type == BTRFS_EXTENT_DATA_KEY) {
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			extent_type = btrfs_file_extent_type(leaf, fi);
			if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
				item_end +=
				    btrfs_file_extent_num_bytes(leaf, fi);

				trace_btrfs_truncate_show_fi_regular(
					BTRFS_I(inode), leaf, fi,
					found_key.offset);
			} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				item_end += btrfs_file_extent_ram_bytes(leaf,
									fi);

				trace_btrfs_truncate_show_fi_inline(
					BTRFS_I(inode), leaf, fi, path->slots[0],
					found_key.offset);
			}
			item_end--;
		}
		if (found_type > min_type) {
			del_item = 1;
		} else {
			if (item_end < new_size)
				break;
			if (found_key.offset >= new_size)
				del_item = 1;
			else
				del_item = 0;
		}
		found_extent = 0;
		/* FIXME, shrink the extent if the ref count is only 1 */
		if (found_type != BTRFS_EXTENT_DATA_KEY)
			goto delete;

		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
			u64 num_dec;

			clear_start = found_key.offset;
			extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
			if (!del_item) {
				u64 orig_num_bytes =
					btrfs_file_extent_num_bytes(leaf, fi);
				extent_num_bytes = ALIGN(new_size -
						found_key.offset,
						fs_info->sectorsize);
				clear_start = ALIGN(new_size, fs_info->sectorsize);
				btrfs_set_file_extent_num_bytes(leaf, fi,
							 extent_num_bytes);
				num_dec = (orig_num_bytes -
					   extent_num_bytes);
				if (test_bit(BTRFS_ROOT_REF_COWS,
					     &root->state) &&
				    extent_start != 0)
					inode_sub_bytes(inode, num_dec);
				btrfs_mark_buffer_dirty(leaf);
			} else {
				extent_num_bytes =
					btrfs_file_extent_disk_num_bytes(leaf,
									 fi);
				extent_offset = found_key.offset -
					btrfs_file_extent_offset(leaf, fi);

				/* FIXME blocksize != 4096 */
				num_dec = btrfs_file_extent_num_bytes(leaf, fi);
				if (extent_start != 0) {
					found_extent = 1;
					if (test_bit(BTRFS_ROOT_REF_COWS,
						     &root->state))
						inode_sub_bytes(inode, num_dec);
				}
			}
			clear_len = num_dec;
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			/*
			 * we can't truncate inline items that have had
			 * special encodings
			 */
			if (!del_item &&
			    btrfs_file_extent_encryption(leaf, fi) == 0 &&
			    btrfs_file_extent_other_encoding(leaf, fi) == 0 &&
			    btrfs_file_extent_compression(leaf, fi) == 0) {
				u32 size = (u32)(new_size - found_key.offset);

				btrfs_set_file_extent_ram_bytes(leaf, fi, size);
				size = btrfs_file_extent_calc_inline_size(size);
				btrfs_truncate_item(path, size, 1);
			} else if (!del_item) {
				/*
				 * We have to bail so the last_size is set to
				 * just before this extent.
				 */
				ret = NEED_TRUNCATE_BLOCK;
				break;
			} else {
				/*
				 * Inline extents are special, we just treat
				 * them as a full sector worth in the file
				 * extent tree just for simplicity sake.
				 */
				clear_len = fs_info->sectorsize;
			}

			if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
				inode_sub_bytes(inode, item_end + 1 - new_size);
		}
delete:
		/*
		 * We use btrfs_truncate_inode_items() to clean up log trees for
		 * multiple fsyncs, and in this case we don't want to clear the
		 * file extent range because it's just the log.
		 */
		if (root == BTRFS_I(inode)->root) {
			ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
						  clear_start, clear_len);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				break;
			}
		}

		if (del_item)
			last_size = found_key.offset;
		else
			last_size = new_size;
		if (del_item) {
			if (!pending_del_nr) {
				/* no pending yet, add ourselves */
				pending_del_slot = path->slots[0];
				pending_del_nr = 1;
			} else if (pending_del_nr &&
				   path->slots[0] + 1 == pending_del_slot) {
				/* hop on the pending chunk */
				pending_del_nr++;
				pending_del_slot = path->slots[0];
			} else {
				BUG();
			}
		} else {
			break;
		}
		should_throttle = false;

		if (found_extent &&
		    (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
		     root == fs_info->tree_root)) {
			struct btrfs_ref ref = { 0 };

			bytes_deleted += extent_num_bytes;

			btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
					extent_start, extent_num_bytes, 0);
			ref.real_root = root->root_key.objectid;
			btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
					ino, extent_offset);
			ret = btrfs_free_extent(trans, &ref);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				break;
			}
			if (be_nice) {
				if (btrfs_should_throttle_delayed_refs(trans))
					should_throttle = true;
			}
		}

		if (found_type == BTRFS_INODE_ITEM_KEY)
			break;

		if (path->slots[0] == 0 ||
		    path->slots[0] != pending_del_slot ||
		    should_throttle) {
			if (pending_del_nr) {
				ret = btrfs_del_items(trans, root, path,
						pending_del_slot,
						pending_del_nr);
				if (ret) {
					btrfs_abort_transaction(trans, ret);
					break;
				}
				pending_del_nr = 0;
			}
			btrfs_release_path(path);

			/*
			 * We can generate a lot of delayed refs, so we need to
			 * throttle every once in a while and make sure we're
			 * adding enough space to keep up with the work we are
			 * generating.  Since we hold a transaction here we
			 * can't flush, and we don't want to FLUSH_LIMIT because
			 * we could have generated too many delayed refs to
			 * actually allocate, so just bail if we're short and
			 * let the normal reservation dance happen higher up.
			 */
			if (should_throttle) {
				ret = btrfs_delayed_refs_rsv_refill(fs_info,
							BTRFS_RESERVE_NO_FLUSH);
				if (ret) {
					ret = -EAGAIN;
					break;
				}
			}
			goto search_again;
		} else {
			path->slots[0]--;
		}
	}
out:
	if (ret >= 0 && pending_del_nr) {
		int err;

		err = btrfs_del_items(trans, root, path, pending_del_slot,
				      pending_del_nr);
		if (err) {
			btrfs_abort_transaction(trans, err);
			ret = err;
		}
	}
	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
		ASSERT(last_size >= new_size);
		if (!ret && last_size > new_size)
			last_size = new_size;
		btrfs_inode_safe_disk_i_size_write(inode, last_size);
		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
				     (u64)-1, &cached_state);
	}

	btrfs_free_path(path);
	return ret;
}

/*
 * btrfs_truncate_block - read, zero a chunk and write a block
 *
 * @inode - inode that we're zeroing
 * @from - the offset to start zeroing
 * @len - the length to zero, 0 to zero the entire range respective to the
 *	offset
 * @front - zero up to the offset instead of from the offset on
 *
 * This will find the block for the "from" offset and cow the block and zero the
 * part we want to zero.  This is used with truncate and hole punching.
 */
int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
			int front)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct address_space *mapping = inode->i_mapping;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	struct extent_changeset *data_reserved = NULL;
	char *kaddr;
	u32 blocksize = fs_info->sectorsize;
	pgoff_t index = from >> PAGE_SHIFT;
	unsigned offset = from & (blocksize - 1);
	struct page *page;
	gfp_t mask = btrfs_alloc_write_mask(mapping);
	int ret = 0;
	u64 block_start;
	u64 block_end;

	if (IS_ALIGNED(offset, blocksize) &&
	    (!len || IS_ALIGNED(len, blocksize)))
		goto out;

	block_start = round_down(from, blocksize);
	block_end = block_start + blocksize - 1;

	ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
					   block_start, blocksize);
	if (ret)
		goto out;

again:
	page = find_or_create_page(mapping, index, mask);
	if (!page) {
		btrfs_delalloc_release_space(inode, data_reserved,
					     block_start, blocksize, true);
		btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
		ret = -ENOMEM;
		goto out;
	}

	if (!PageUptodate(page)) {
		ret = btrfs_readpage(NULL, page);
		lock_page(page);
		if (page->mapping != mapping) {
			unlock_page(page);
			put_page(page);
			goto again;
		}
		if (!PageUptodate(page)) {
			ret = -EIO;
			goto out_unlock;
		}
	}
	wait_on_page_writeback(page);

	lock_extent_bits(io_tree, block_start, block_end, &cached_state);
	set_page_extent_mapped(page);

	ordered = btrfs_lookup_ordered_extent(inode, block_start);
	if (ordered) {
		unlock_extent_cached(io_tree, block_start, block_end,
				     &cached_state);
		unlock_page(page);
		put_page(page);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
		goto again;
	}

	clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end,
			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
			 0, 0, &cached_state);

	ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
					&cached_state);
	if (ret) {
		unlock_extent_cached(io_tree, block_start, block_end,
				     &cached_state);
		goto out_unlock;
	}

	if (offset != blocksize) {
		if (!len)
			len = blocksize - offset;
		kaddr = kmap(page);
		if (front)
			memset(kaddr + (block_start - page_offset(page)),
				0, offset);
		else
			memset(kaddr + (block_start - page_offset(page)) + offset,
				0, len);
		flush_dcache_page(page);
		kunmap(page);
	}
	ClearPageChecked(page);
	set_page_dirty(page);
	unlock_extent_cached(io_tree, block_start, block_end, &cached_state);

out_unlock:
	if (ret)
		btrfs_delalloc_release_space(inode, data_reserved, block_start,
					     blocksize, true);
	btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
	unlock_page(page);
	put_page(page);
out:
	extent_changeset_free(data_reserved);
	return ret;
}

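/*
 * Insert an explicit hole extent for the given range.  With the NO_HOLES
 * feature there is nothing to insert, but the inode's transaction markers
 * are still bumped so that a later fsync logs the hole.
 */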
static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
			     u64 offset, u64 len)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_trans_handle *trans;
	int ret;

	/*
	 * Still need to make sure the inode looks like it's been updated so
	 * that any holes get logged if we fsync.
	 */
	if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
		BTRFS_I(inode)->last_trans = fs_info->generation;
		BTRFS_I(inode)->last_sub_trans = root->log_transid;
		BTRFS_I(inode)->last_log_commit = root->last_log_commit;
		return 0;
	}

	/*
	 * 1 - for the one we're dropping
	 * 1 - for the one we're adding
	 * 1 - for updating the inode.
	 */
	trans = btrfs_start_transaction(root, 3);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		return ret;
	}

	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(BTRFS_I(inode)),
			offset, 0, 0, len, 0, len, 0, 0, 0);
	if (ret)
		btrfs_abort_transaction(trans, ret);
	else
		btrfs_update_inode(trans, root, inode);
	btrfs_end_transaction(trans);
	return ret;
}

/*
 * This function puts in dummy file extents for the area we're creating a hole
 * for.  So if we are truncating this file to a larger size we need to insert
 * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
 * the range between oldsize and size
 */
int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_map *em = NULL;
	struct extent_state *cached_state = NULL;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
	u64 block_end = ALIGN(size, fs_info->sectorsize);
	u64 last_byte;
	u64 cur_offset;
	u64 hole_size;
	int err = 0;

	/*
	 * If our size started in the middle of a block we need to zero out the
	 * rest of the block before we expand the i_size, otherwise we could
	 * expose stale data.
	 */
	err = btrfs_truncate_block(inode, oldsize, 0, 0);
	if (err)
		return err;

	if (size <= hole_start)
		return 0;

	btrfs_lock_and_flush_ordered_range(BTRFS_I(inode), hole_start,
					   block_end - 1, &cached_state);
	cur_offset = hole_start;
	while (1) {
		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
				      block_end - cur_offset);
		if (IS_ERR(em)) {
			err = PTR_ERR(em);
			em = NULL;
			break;
		}
		last_byte = min(extent_map_end(em), block_end);
		last_byte = ALIGN(last_byte, fs_info->sectorsize);
		hole_size = last_byte - cur_offset;

		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
			struct extent_map *hole_em;

			err = maybe_insert_hole(root, inode, cur_offset,
						hole_size);
			if (err)
				break;

			err = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
							cur_offset, hole_size);
			if (err)
				break;

			btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
						cur_offset + hole_size - 1, 0);
			hole_em = alloc_extent_map();
			if (!hole_em) {
				set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
					&BTRFS_I(inode)->runtime_flags);
				goto next;
			}
			hole_em->start = cur_offset;
			hole_em->len = hole_size;
			hole_em->orig_start = cur_offset;

			hole_em->block_start = EXTENT_MAP_HOLE;
			hole_em->block_len = 0;
			hole_em->orig_block_len = 0;
			hole_em->ram_bytes = hole_size;
			hole_em->compress_type = BTRFS_COMPRESS_NONE;
			hole_em->generation = fs_info->generation;

			while (1) {
				write_lock(&em_tree->lock);
				err = add_extent_mapping(em_tree, hole_em, 1);
				write_unlock(&em_tree->lock);
				if (err != -EEXIST)
					break;
				btrfs_drop_extent_cache(BTRFS_I(inode),
							cur_offset,
							cur_offset +
							hole_size - 1, 0);
			}
			free_extent_map(hole_em);
		} else {
			err = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
							cur_offset, hole_size);
			if (err)
				break;
		}
next:
		free_extent_map(em);
		em = NULL;
		cur_offset = last_byte;
		if (cur_offset >= block_end)
			break;
	}
	free_extent_map(em);
	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state);
	return err;
}

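/*
 * Apply a size change from setattr: growing the file fills the new range
 * with hole extents via btrfs_cont_expand(), shrinking it goes through
 * btrfs_truncate().
 */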
static int btrfs_setsize(struct inode *inode, struct iattr *attr)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	loff_t oldsize = i_size_read(inode);
	loff_t newsize = attr->ia_size;
	int mask = attr->ia_valid;
	int ret;

	/*
	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
	 * special case where we need to update the times despite not having
	 * these flags set.  For all other operations the VFS sets these flags
	 * explicitly if it wants a timestamp update.
	 */
	if (newsize != oldsize) {
		inode_inc_iversion(inode);
		if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
			inode->i_ctime = inode->i_mtime =
				current_time(inode);
	}

	if (newsize > oldsize) {
		/*
		 * Don't do an expanding truncate while snapshotting is ongoing.
		 * This is to ensure the snapshot captures a fully consistent
		 * state of this file - if the snapshot captures this expanding
		 * truncation, it must capture all writes that happened before
		 * this truncation.
		 */
		btrfs_drew_write_lock(&root->snapshot_lock);
		ret = btrfs_cont_expand(inode, oldsize, newsize);
		if (ret) {
			btrfs_drew_write_unlock(&root->snapshot_lock);
			return ret;
		}

		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans)) {
			btrfs_drew_write_unlock(&root->snapshot_lock);
			return PTR_ERR(trans);
		}

		i_size_write(inode, newsize);
		btrfs_inode_safe_disk_i_size_write(inode, 0);
		pagecache_isize_extended(inode, oldsize, newsize);
		ret = btrfs_update_inode(trans, root, inode);
		btrfs_drew_write_unlock(&root->snapshot_lock);
		btrfs_end_transaction(trans);
	} else {

		/*
		 * We're truncating a file that used to have good data down to
		 * zero. Make sure it gets into the ordered flush list so that
		 * any new writes get down to disk quickly.
		 */
		if (newsize == 0)
			set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
				&BTRFS_I(inode)->runtime_flags);

		truncate_setsize(inode, newsize);

		/* Disable nonlocked read DIO to avoid the endless truncate */
		btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
		inode_dio_wait(inode);
		btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));

		ret = btrfs_truncate(inode, newsize == oldsize);
		if (ret && inode->i_nlink) {
			int err;

			/*
			 * Truncate failed, so fix up the in-memory size. We
			 * adjusted disk_i_size down as we removed extents, so
			 * wait for disk_i_size to be stable and then update the
			 * in-memory size to match.
			 */
			err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
			if (err)
				return err;
			i_size_write(inode, BTRFS_I(inode)->disk_i_size);
		}
	}

	return ret;
}

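/*
 * setattr entry point: handle a size change first, then copy the remaining
 * attributes into the inode and rebuild the ACL xattr on mode changes.
 */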
static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = d_inode(dentry);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int err;

	if (btrfs_root_readonly(root))
		return -EROFS;

	err = setattr_prepare(dentry, attr);
	if (err)
		return err;

	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
		err = btrfs_setsize(inode, attr);
		if (err)
			return err;
	}

	if (attr->ia_valid) {
		setattr_copy(inode, attr);
		inode_inc_iversion(inode);
		err = btrfs_dirty_inode(inode);

		if (!err && attr->ia_valid & ATTR_MODE)
			err = posix_acl_chmod(inode, inode->i_mode);
	}

	return err;
}

/*
 * While truncating the inode pages during eviction, we get the VFS calling
 * btrfs_invalidatepage() against each page of the inode. This is slow because
 * the calls to btrfs_invalidatepage() result in a huge amount of calls to
 * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
 * extent_state structures over and over, wasting lots of time.
 *
 * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
 * those expensive operations on a per page basis and do only the ordered io
 * finishing, while we release here the extent_map and extent_state structures,
 * without the excessive merging and splitting.
 */
static void evict_inode_truncate_pages(struct inode *inode)
{
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
	struct rb_node *node;

	ASSERT(inode->i_state & I_FREEING);
	truncate_inode_pages_final(&inode->i_data);

	write_lock(&map_tree->lock);
	while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) {
		struct extent_map *em;

		node = rb_first_cached(&map_tree->map);
		em = rb_entry(node, struct extent_map, rb_node);
		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
		clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
		remove_extent_mapping(map_tree, em);
		free_extent_map(em);
		if (need_resched()) {
			write_unlock(&map_tree->lock);
			cond_resched();
			write_lock(&map_tree->lock);
		}
	}
	write_unlock(&map_tree->lock);

4857 4858 4859
	/*
	 * Keep looping until we have no more ranges in the io tree.
	 * We can have ongoing bios started by readpages (called from readahead)
4860 4861 4862
	 * that have their endio callback (extent_io.c:end_bio_extent_readpage)
	 * still in progress (unlocked the pages in the bio but did not yet
	 * unlocked the ranges in the io tree). Therefore this means some
4863 4864 4865 4866 4867 4868 4869 4870 4871 4872
	 * ranges can still be locked and eviction started because before
	 * submitting those bios, which are executed by a separate task (work
	 * queue kthread), inode references (inode->i_count) were not taken
	 * (which would be dropped in the end io callback of each bio).
	 * Therefore here we effectively end up waiting for those bios and
	 * anyone else holding locked ranges without having bumped the inode's
	 * reference count - if we don't do it, when they access the inode's
	 * io_tree to unlock a range it may be too late, leading to an
	 * use-after-free issue.
	 */
4873 4874 4875 4876
	spin_lock(&io_tree->lock);
	while (!RB_EMPTY_ROOT(&io_tree->state)) {
		struct extent_state *state;
		struct extent_state *cached_state = NULL;
4877 4878
		u64 start;
		u64 end;
4879
		unsigned state_flags;
4880 4881 4882

		node = rb_first(&io_tree->state);
		state = rb_entry(node, struct extent_state, rb_node);
4883 4884
		start = state->start;
		end = state->end;
4885
		state_flags = state->state;
4886 4887
		spin_unlock(&io_tree->lock);

4888
		lock_extent_bits(io_tree, start, end, &cached_state);
Q
Qu Wenruo 已提交
4889 4890 4891 4892 4893 4894 4895 4896 4897

		/*
		 * If still has DELALLOC flag, the extent didn't reach disk,
		 * and its reserved space won't be freed by delayed_ref.
		 * So we need to free its reserved space here.
		 * (Refer to comment in btrfs_invalidatepage, case 2)
		 *
		 * Note, end is the bytenr of last byte, so we need + 1 here.
		 */
4898
		if (state_flags & EXTENT_DELALLOC)
4899
			btrfs_qgroup_free_data(inode, NULL, start, end - start + 1);
Q
Qu Wenruo 已提交
4900

4901
		clear_extent_bit(io_tree, start, end,
4902 4903 4904
				 EXTENT_LOCKED | EXTENT_DELALLOC |
				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
				 &cached_state);
4905

4906
		cond_resched();
4907 4908 4909 4910 4911
		spin_lock(&io_tree->lock);
	}
	spin_unlock(&io_tree->lock);
}

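/*
 * Reserve enough metadata space to make progress on inode eviction: refill
 * @rsv (stealing from the global reserve as a last resort) and join a
 * transaction, migrating any extra delayed-refs reservation into it.
 */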
static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
							struct btrfs_block_rsv *rsv)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	struct btrfs_trans_handle *trans;
	u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1);
	int ret;

	/*
	 * Eviction should be taking place at some place safe because of our
	 * delayed iputs.  However the normal flushing code will run delayed
	 * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
	 *
	 * We reserve the delayed_refs_extra here again because we can't use
	 * btrfs_start_transaction(root, 0) for the same deadlocky reason as
	 * above.  We reserve our extra bit here because we generate a ton of
	 * delayed refs activity by truncating.
	 *
	 * If we cannot make our reservation we'll attempt to steal from the
	 * global reserve, because we really want to be able to free up space.
	 */
	ret = btrfs_block_rsv_refill(root, rsv, rsv->size + delayed_refs_extra,
				     BTRFS_RESERVE_FLUSH_EVICT);
	if (ret) {
		/*
		 * Try to steal from the global reserve if there is space for
		 * it.
		 */
		if (btrfs_check_space_for_delayed_refs(fs_info) ||
		    btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, 0)) {
			btrfs_warn(fs_info,
				   "could not allocate space for delete; will truncate on mount");
			return ERR_PTR(-ENOSPC);
		}
		delayed_refs_extra = 0;
	}

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans))
		return trans;

	if (delayed_refs_extra) {
		trans->block_rsv = &fs_info->trans_block_rsv;
		trans->bytes_reserved = delayed_refs_extra;
		btrfs_block_rsv_migrate(rsv, trans->block_rsv,
					delayed_refs_extra, 1);
	}
	return trans;
}

A
Al Viro 已提交
4963
void btrfs_evict_inode(struct inode *inode)
C
Chris Mason 已提交
4964
{
4965
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
C
Chris Mason 已提交
4966 4967
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(inode)->root;
4968
	struct btrfs_block_rsv *rsv;
C
Chris Mason 已提交
4969 4970
	int ret;

4971 4972
	trace_btrfs_inode_evict(inode);

4973
	if (!root) {
4974
		clear_inode(inode);
4975 4976 4977
		return;
	}

4978 4979
	evict_inode_truncate_pages(inode);

4980 4981 4982
	if (inode->i_nlink &&
	    ((btrfs_root_refs(&root->root_item) != 0 &&
	      root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
4983
	     btrfs_is_free_space_inode(BTRFS_I(inode))))
A
Al Viro 已提交
4984 4985
		goto no_delete;

4986
	if (is_bad_inode(inode))
C
Chris Mason 已提交
4987
		goto no_delete;
4988

4989
	btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1);
4990

4991
	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
4992 4993
		goto no_delete;

4994
	if (inode->i_nlink > 0) {
4995 4996
		BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
		       root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
4997 4998 4999
		goto no_delete;
	}

5000
	ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
5001
	if (ret)
5002 5003
		goto no_delete;

5004
	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
5005
	if (!rsv)
5006
		goto no_delete;
5007
	rsv->size = btrfs_calc_metadata_size(fs_info, 1);
5008
	rsv->failfast = 1;
5009

5010
	btrfs_i_size_write(BTRFS_I(inode), 0);
5011

5012
	while (1) {
5013
		trans = evict_refill_and_join(root, rsv);
5014 5015
		if (IS_ERR(trans))
			goto free_rsv;
5016

5017 5018
		trans->block_rsv = rsv;

5019
		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
5020 5021 5022 5023 5024 5025
		trans->block_rsv = &fs_info->trans_block_rsv;
		btrfs_end_transaction(trans);
		btrfs_btree_balance_dirty(fs_info);
		if (ret && ret != -ENOSPC && ret != -EAGAIN)
			goto free_rsv;
		else if (!ret)
5026 5027
			break;
	}
5028

5029
	/*
5030 5031 5032 5033 5034 5035 5036
	 * Errors here aren't a big deal, it just means we leave orphan items in
	 * the tree. They will be cleaned up on the next mount. If the inode
	 * number gets reused, cleanup deletes the orphan item without doing
	 * anything, and unlink reuses the existing orphan item.
	 *
	 * If it turns out that we are dropping too many of these, we might want
	 * to add a mechanism for retrying these after a commit.
5037
	 */
5038
	trans = evict_refill_and_join(root, rsv);
5039 5040 5041 5042 5043 5044
	if (!IS_ERR(trans)) {
		trans->block_rsv = rsv;
		btrfs_orphan_del(trans, BTRFS_I(inode));
		trans->block_rsv = &fs_info->trans_block_rsv;
		btrfs_end_transaction(trans);
	}
5045

5046
	if (!(root == fs_info->tree_root ||
5047
	      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
5048
		btrfs_return_ino(root, btrfs_ino(BTRFS_I(inode)));
5049

5050 5051
free_rsv:
	btrfs_free_block_rsv(fs_info, rsv);
C
Chris Mason 已提交
5052
no_delete:
5053 5054 5055 5056 5057
	/*
	 * If we didn't successfully delete, the orphan item will still be in
	 * the tree and we'll retry on the next mount. Again, we might also want
	 * to retry these periodically in the future.
	 */
5058
	btrfs_remove_delayed_node(BTRFS_I(inode));
5059
	clear_inode(inode);
C
Chris Mason 已提交
5060 5061 5062
}

/*
 * Return the key found in the dir entry in the location pointer, fill @type
 * with BTRFS_FT_*, and return 0.
 *
 * If no dir entries were found, returns -ENOENT.
 * If a corrupted location is found in the dir entry, returns -EUCLEAN.
 */
static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
			       struct btrfs_key *location, u8 *type)
{
	const char *name = dentry->d_name.name;
	int namelen = dentry->d_name.len;
	struct btrfs_dir_item *di;
	struct btrfs_path *path;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
			name, namelen, 0);
	if (IS_ERR_OR_NULL(di)) {
		ret = di ? PTR_ERR(di) : -ENOENT;
		goto out;
	}

	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
	if (location->type != BTRFS_INODE_ITEM_KEY &&
	    location->type != BTRFS_ROOT_ITEM_KEY) {
		ret = -EUCLEAN;
		btrfs_warn(root->fs_info,
"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
			   __func__, name, btrfs_ino(BTRFS_I(dir)),
			   location->objectid, location->type, location->offset);
	}
	if (!ret)
		*type = btrfs_dir_type(path->nodes[0], di);
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * when we hit a tree root in a directory, the btrfs part of the inode
 * needs to be changed to reflect the root directory of the tree root.  This
 * is kind of like crossing a mount point.
 */
static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
				    struct inode *dir,
				    struct dentry *dentry,
				    struct btrfs_key *location,
				    struct btrfs_root **sub_root)
{
	struct btrfs_path *path;
	struct btrfs_root *new_root;
	struct btrfs_root_ref *ref;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret;
	int err = 0;

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	err = -ENOENT;
	key.objectid = BTRFS_I(dir)->root->root_key.objectid;
	key.type = BTRFS_ROOT_REF_KEY;
	key.offset = location->objectid;

	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
	if (ret) {
		if (ret < 0)
			err = ret;
		goto out;
	}

	leaf = path->nodes[0];
	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
	if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) ||
	    btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
		goto out;

	ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
				   (unsigned long)(ref + 1),
				   dentry->d_name.len);
	if (ret)
		goto out;

	btrfs_release_path(path);

	new_root = btrfs_get_fs_root(fs_info, location, true);
	if (IS_ERR(new_root)) {
		err = PTR_ERR(new_root);
		goto out;
	}

	*sub_root = new_root;
	location->objectid = btrfs_root_dirid(&new_root->root_item);
	location->type = BTRFS_INODE_ITEM_KEY;
	location->offset = 0;
	err = 0;
out:
	btrfs_free_path(path);
	return err;
}

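/*
 * Link the inode into the root's rb-tree of in-memory inodes, keyed by inode
 * number.  An existing node is only replaced when its inode is being freed.
 */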
static void inode_tree_add(struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_inode *entry;
	struct rb_node **p;
	struct rb_node *parent;
	struct rb_node *new = &BTRFS_I(inode)->rb_node;
	u64 ino = btrfs_ino(BTRFS_I(inode));

	if (inode_unhashed(inode))
		return;
	parent = NULL;
	spin_lock(&root->inode_lock);
	p = &root->inode_tree.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct btrfs_inode, rb_node);

		if (ino < btrfs_ino(entry))
			p = &parent->rb_left;
		else if (ino > btrfs_ino(entry))
			p = &parent->rb_right;
		else {
			WARN_ON(!(entry->vfs_inode.i_state &
				  (I_WILL_FREE | I_FREEING)));
			rb_replace_node(parent, new, &root->inode_tree);
			RB_CLEAR_NODE(parent);
			spin_unlock(&root->inode_lock);
			return;
		}
	}
	rb_link_node(new, parent, p);
	rb_insert_color(new, &root->inode_tree);
	spin_unlock(&root->inode_lock);
}

static void inode_tree_del(struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int empty = 0;

	spin_lock(&root->inode_lock);
	if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
		rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
		RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
		empty = RB_EMPTY_ROOT(&root->inode_tree);
	}
	spin_unlock(&root->inode_lock);

	if (empty && btrfs_root_refs(&root->root_item) == 0) {
		spin_lock(&root->inode_lock);
		empty = RB_EMPTY_ROOT(&root->inode_tree);
		spin_unlock(&root->inode_lock);
		if (empty)
			btrfs_add_dead_root(root);
	}
}

static int btrfs_init_locked_inode(struct inode *inode, void *p)
{
	struct btrfs_iget_args *args = p;
	inode->i_ino = args->location->objectid;
	memcpy(&BTRFS_I(inode)->location, args->location,
	       sizeof(*args->location));
	BTRFS_I(inode)->root = btrfs_grab_root(args->root);
	BUG_ON(args->root && !BTRFS_I(inode)->root);
	return 0;
}

static int btrfs_find_actor(struct inode *inode, void *opaque)
{
	struct btrfs_iget_args *args = opaque;
	return args->location->objectid == BTRFS_I(inode)->location.objectid &&
		args->root == BTRFS_I(inode)->root;
}

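/* Find or allocate the VFS inode identified by a location key and root. */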
static struct inode *btrfs_iget_locked(struct super_block *s,
				       struct btrfs_key *location,
				       struct btrfs_root *root)
{
	struct inode *inode;
	struct btrfs_iget_args args;
	unsigned long hashval = btrfs_inode_hash(location->objectid, root);

	args.location = location;
	args.root = root;

	inode = iget5_locked(s, hashval, btrfs_find_actor,
			     btrfs_init_locked_inode,
			     (void *)&args);
	return inode;
}

/*
 * Get an inode object given its location and corresponding root.
 * Path can be preallocated to prevent recursing back to iget through
 * allocator. NULL is also valid but may require an additional allocation
 * later.
 */
struct inode *btrfs_iget_path(struct super_block *s, struct btrfs_key *location,
			      struct btrfs_root *root, struct btrfs_path *path)
{
	struct inode *inode;

	inode = btrfs_iget_locked(s, location, root);
	if (!inode)
		return ERR_PTR(-ENOMEM);

	if (inode->i_state & I_NEW) {
		int ret;

		ret = btrfs_read_locked_inode(inode, path);
		if (!ret) {
			inode_tree_add(inode);
			unlock_new_inode(inode);
		} else {
			iget_failed(inode);
			/*
			 * ret > 0 can come from btrfs_search_slot called by
			 * btrfs_read_locked_inode, this means the inode item
			 * was not found.
			 */
			if (ret > 0)
				ret = -ENOENT;
			inode = ERR_PTR(ret);
		}
	}

	return inode;
}

struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
			 struct btrfs_root *root)
{
	return btrfs_iget_path(s, location, root, NULL);
}

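/*
 * Build a dummy directory inode to stand in for a subvolume root that could
 * not be resolved; it supports lookups only and has no on-disk inode item.
 */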
static struct inode *new_simple_dir(struct super_block *s,
				    struct btrfs_key *key,
				    struct btrfs_root *root)
{
	struct inode *inode = new_inode(s);

	if (!inode)
		return ERR_PTR(-ENOMEM);

	BTRFS_I(inode)->root = btrfs_grab_root(root);
	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);

	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
	/*
	 * We only need lookup, the rest is read-only and there's no inode
	 * associated with the dentry
	 */
	inode->i_op = &simple_dir_inode_operations;
	inode->i_opflags &= ~IOP_XATTR;
	inode->i_fop = &simple_dir_operations;
	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
	inode->i_mtime = current_time(inode);
	inode->i_atime = inode->i_mtime;
	inode->i_ctime = inode->i_mtime;
	BTRFS_I(inode)->i_otime = inode->i_mtime;

	return inode;
}

static inline u8 btrfs_inode_type(struct inode *inode)
{
	/*
	 * Compile-time asserts that generic FT_* types still match
	 * BTRFS_FT_* types
	 */
	BUILD_BUG_ON(BTRFS_FT_UNKNOWN != FT_UNKNOWN);
	BUILD_BUG_ON(BTRFS_FT_REG_FILE != FT_REG_FILE);
	BUILD_BUG_ON(BTRFS_FT_DIR != FT_DIR);
	BUILD_BUG_ON(BTRFS_FT_CHRDEV != FT_CHRDEV);
	BUILD_BUG_ON(BTRFS_FT_BLKDEV != FT_BLKDEV);
	BUILD_BUG_ON(BTRFS_FT_FIFO != FT_FIFO);
	BUILD_BUG_ON(BTRFS_FT_SOCK != FT_SOCK);
	BUILD_BUG_ON(BTRFS_FT_SYMLINK != FT_SYMLINK);

	return fs_umode_to_ftype(inode->i_mode);
}

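/*
 * Resolve a directory entry to an inode, crossing into a subvolume when the
 * entry points at a tree root, and double-checking the inode mode against
 * the type stored in the directory entry.
 */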
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
	struct inode *inode;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_root *sub_root = root;
	struct btrfs_key location;
	u8 di_type = 0;
	int ret = 0;

	if (dentry->d_name.len > BTRFS_NAME_LEN)
		return ERR_PTR(-ENAMETOOLONG);

	ret = btrfs_inode_by_name(dir, dentry, &location, &di_type);
	if (ret < 0)
		return ERR_PTR(ret);

	if (location.type == BTRFS_INODE_ITEM_KEY) {
		inode = btrfs_iget(dir->i_sb, &location, root);
		if (IS_ERR(inode))
			return inode;

		/* Do extra check against inode mode with di_type */
		if (btrfs_inode_type(inode) != di_type) {
			btrfs_crit(fs_info,
"inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
				  inode->i_mode, btrfs_inode_type(inode),
				  di_type);
			iput(inode);
			return ERR_PTR(-EUCLEAN);
		}
		return inode;
	}

	ret = fixup_tree_root_location(fs_info, dir, dentry,
				       &location, &sub_root);
	if (ret < 0) {
		if (ret != -ENOENT)
			inode = ERR_PTR(ret);
		else
			inode = new_simple_dir(dir->i_sb, &location, sub_root);
	} else {
		inode = btrfs_iget(dir->i_sb, &location, sub_root);
	}
	if (root != sub_root)
		btrfs_put_root(sub_root);

	if (!IS_ERR(inode) && root != sub_root) {
		down_read(&fs_info->cleanup_work_sem);
		if (!sb_rdonly(inode->i_sb))
			ret = btrfs_orphan_cleanup(sub_root);
		up_read(&fs_info->cleanup_work_sem);
		if (ret) {
			iput(inode);
			inode = ERR_PTR(ret);
		}
	}

	return inode;
}

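/*
 * Have the dcache drop dentries that belong to a deleted subvolume root or
 * to the dummy empty-subvolume directory inode.
 */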
static int btrfs_dentry_delete(const struct dentry *dentry)
{
	struct btrfs_root *root;
	struct inode *inode = d_inode(dentry);

	if (!inode && !IS_ROOT(dentry))
		inode = d_inode(dentry->d_parent);

	if (inode) {
		root = BTRFS_I(inode)->root;
		if (btrfs_root_refs(&root->root_item) == 0)
			return 1;

		if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
			return 1;
	}
	return 0;
}

5439
static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
A
Al Viro 已提交
5440
				   unsigned int flags)
5441
{
A
Al Viro 已提交
5442
	struct inode *inode = btrfs_lookup_dentry(dir, dentry);
5443

A
Al Viro 已提交
5444 5445
	if (inode == ERR_PTR(-ENOENT))
		inode = NULL;
5446
	return d_splice_alias(inode, dentry);
C
Chris Mason 已提交
5447 5448
}

/*
 * All this infrastructure exists because dir_emit can fault, and we are holding
 * the tree lock when doing readdir.  For now just allocate a buffer and copy
 * our information into that, and then dir_emit from the buffer.  This is
 * similar to what NFS does, only we don't keep the buffer around in pagecache
 * because I'm afraid I'll mess that up.  Long term we need to make filldir do
 * copy_to_user_inatomic so we don't have to worry about page faulting under the
 * tree lock.
 */
static int btrfs_opendir(struct inode *inode, struct file *file)
{
	struct btrfs_file_private *private;

	private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
	if (!private)
		return -ENOMEM;
	private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
	if (!private->filldir_buf) {
		kfree(private);
		return -ENOMEM;
	}
	file->private_data = private;
	return 0;
}

struct dir_entry {
	u64 ino;
	u64 offset;
	unsigned type;
	int name_len;
};

static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
{
	while (entries--) {
		struct dir_entry *entry = addr;
		char *name = (char *)(entry + 1);

		ctx->pos = get_unaligned(&entry->offset);
		if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
					 get_unaligned(&entry->ino),
					 get_unaligned(&entry->type)))
			return 1;
		addr += sizeof(struct dir_entry) +
			get_unaligned(&entry->name_len);
		ctx->pos++;
	}
	return 0;
}

static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
{
	struct inode *inode = file_inode(file);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_file_private *private = file->private_data;
	struct btrfs_dir_item *di;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;
	void *addr;
	struct list_head ins_list;
	struct list_head del_list;
	int ret;
	struct extent_buffer *leaf;
	int slot;
	char *name_ptr;
	int name_len;
	int entries = 0;
	int total_len = 0;
	bool put = false;
	struct btrfs_key location;

	if (!dir_emit_dots(file, ctx))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	addr = private->filldir_buf;
	path->reada = READA_FORWARD;

	INIT_LIST_HEAD(&ins_list);
	INIT_LIST_HEAD(&del_list);
	put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list);

again:
	key.type = BTRFS_DIR_INDEX_KEY;
	key.offset = ctx->pos;
	key.objectid = btrfs_ino(BTRFS_I(inode));

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto err;

	while (1) {
		struct dir_entry *entry;

		leaf = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto err;
			else if (ret > 0)
				break;
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		if (found_key.objectid != key.objectid)
			break;
		if (found_key.type != BTRFS_DIR_INDEX_KEY)
			break;
		if (found_key.offset < ctx->pos)
			goto next;
		if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
			goto next;
		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
		name_len = btrfs_dir_name_len(leaf, di);
		if ((total_len + sizeof(struct dir_entry) + name_len) >=
		    PAGE_SIZE) {
			btrfs_release_path(path);
			ret = btrfs_filldir(private->filldir_buf, entries, ctx);
			if (ret)
				goto nopos;
			addr = private->filldir_buf;
			entries = 0;
			total_len = 0;
			goto again;
		}

		entry = addr;
		put_unaligned(name_len, &entry->name_len);
		name_ptr = (char *)(entry + 1);
		read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1),
				   name_len);
		put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)),
				&entry->type);
		btrfs_dir_item_key_to_cpu(leaf, di, &location);
		put_unaligned(location.objectid, &entry->ino);
		put_unaligned(found_key.offset, &entry->offset);
		entries++;
		addr += sizeof(struct dir_entry) + name_len;
		total_len += sizeof(struct dir_entry) + name_len;
next:
		path->slots[0]++;
	}
	btrfs_release_path(path);

	ret = btrfs_filldir(private->filldir_buf, entries, ctx);
	if (ret)
		goto nopos;

	ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
	if (ret)
		goto nopos;

	/*
	 * Stop new entries from being returned after we return the last
	 * entry.
	 *
	 * New directory entries are assigned a strictly increasing
	 * offset.  This means that new entries created during readdir
	 * are *guaranteed* to be seen in the future by that readdir.
	 * This has broken buggy programs which operate on names as
	 * they're returned by readdir.  Until we re-use freed offsets
	 * we have this hack to stop new entries from being returned
	 * under the assumption that they'll never reach this huge
	 * offset.
	 *
	 * This is being careful not to overflow 32bit loff_t unless the
	 * last entry requires it because doing so has broken 32bit apps
	 * in the past.
	 */
	if (ctx->pos >= INT_MAX)
		ctx->pos = LLONG_MAX;
	else
		ctx->pos = INT_MAX;
nopos:
	ret = 0;
err:
	if (put)
		btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
	btrfs_free_path(path);
	return ret;
}

/*
 * This is somewhat expensive, updating the tree every time the
 * inode changes.  But, it is most likely to find the inode in cache.
 * FIXME, needs more benchmarking...there are no reasons other than performance
 * to keep or drop this code.
 */
static int btrfs_dirty_inode(struct inode *inode)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	int ret;

	if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
		return 0;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	ret = btrfs_update_inode(trans, root, inode);
	if (ret && ret == -ENOSPC) {
		/* whoops, let's try again with the full transaction */
		btrfs_end_transaction(trans);
		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

		ret = btrfs_update_inode(trans, root, inode);
	}
	btrfs_end_transaction(trans);
	if (BTRFS_I(inode)->delayed_node)
		btrfs_balance_delayed_items(fs_info);

	return ret;
}

/*
 * This is a copy of file_update_time.  We need this so we can return error on
 * ENOSPC for updating the inode in the case of file write and mmap writes.
 */
static int btrfs_update_time(struct inode *inode, struct timespec64 *now,
			     int flags)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	bool dirty = flags & ~S_VERSION;

	if (btrfs_root_readonly(root))
		return -EROFS;

	if (flags & S_VERSION)
		dirty |= inode_maybe_inc_iversion(inode, dirty);
	if (flags & S_CTIME)
		inode->i_ctime = *now;
	if (flags & S_MTIME)
		inode->i_mtime = *now;
	if (flags & S_ATIME)
		inode->i_atime = *now;
	return dirty ? btrfs_dirty_inode(inode) : 0;
}

/*
 * find the highest existing sequence number in a directory
 * and then set the in-memory index_cnt variable to reflect
 * free sequence numbers
 */
static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_key key, found_key;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int ret;

	key.objectid = btrfs_ino(inode);
	key.type = BTRFS_DIR_INDEX_KEY;
	key.offset = (u64)-1;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	/* FIXME: we should be able to handle this */
	if (ret == 0)
		goto out;
	ret = 0;

	/*
	 * MAGIC NUMBER EXPLANATION:
	 * since we search a directory based on f_pos we have to start at 2
	 * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
	 * else has to start at 2
	 */
	if (path->slots[0] == 0) {
		inode->index_cnt = 2;
		goto out;
	}

	path->slots[0]--;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

	if (found_key.objectid != btrfs_ino(inode) ||
	    found_key.type != BTRFS_DIR_INDEX_KEY) {
		inode->index_cnt = 2;
		goto out;
	}

	inode->index_cnt = found_key.offset + 1;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * helper to find a free sequence number in a given directory.  This current
 * code is very simple, later versions will do smarter things in the btree
 */
int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
{
	int ret = 0;

	if (dir->index_cnt == (u64)-1) {
		ret = btrfs_inode_delayed_dir_index_count(dir);
		if (ret) {
			ret = btrfs_set_inode_index_count(dir);
			if (ret)
				return ret;
		}
	}

	*index = dir->index_cnt;
	dir->index_cnt++;

	return ret;
}

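/*
 * Hash the new inode into the inode cache, using the same identity
 * (location objectid + root) that btrfs_find_actor() compares against.
 */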
static int btrfs_insert_inode_locked(struct inode *inode)
{
	struct btrfs_iget_args args;
	args.location = &BTRFS_I(inode)->location;
	args.root = BTRFS_I(inode)->root;

	return insert_inode_locked4(inode,
		   btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
		   btrfs_find_actor, &args);
}

/*
 * Inherit flags from the parent inode.
 *
 * Currently only the compression flags and the cow flags are inherited.
 */
static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
{
	unsigned int flags;

	if (!dir)
		return;

	flags = BTRFS_I(dir)->flags;

	if (flags & BTRFS_INODE_NOCOMPRESS) {
		BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
		BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
	} else if (flags & BTRFS_INODE_COMPRESS) {
		BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
		BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
	}

	if (flags & BTRFS_INODE_NODATACOW) {
		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
		if (S_ISREG(inode->i_mode))
			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
	}

	btrfs_sync_inode_flags_to_i_flags(inode);
}

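/*
 * Create a new in-core inode and insert its inode item, and when a name is
 * given also the first inode ref, into the fs tree.  A NULL name means an
 * O_TMPFILE style inode whose link count starts at zero.
 */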
static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct inode *dir,
				     const char *name, int name_len,
				     u64 ref_objectid, u64 objectid,
				     umode_t mode, u64 *index)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode *inode;
	struct btrfs_inode_item *inode_item;
	struct btrfs_key *location;
	struct btrfs_path *path;
	struct btrfs_inode_ref *ref;
	struct btrfs_key key[2];
	u32 sizes[2];
	int nitems = name ? 2 : 1;
	unsigned long ptr;
	unsigned int nofs_flag;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return ERR_PTR(-ENOMEM);

	nofs_flag = memalloc_nofs_save();
	inode = new_inode(fs_info->sb);
	memalloc_nofs_restore(nofs_flag);
	if (!inode) {
		btrfs_free_path(path);
		return ERR_PTR(-ENOMEM);
	}

	/*
	 * O_TMPFILE, set link count to 0, so that after this point,
	 * we fill in an inode item with the correct link count.
	 */
	if (!name)
		set_nlink(inode, 0);

	/*
	 * we have to initialize this early, so we can reclaim the inode
	 * number if we fail afterwards in this function.
	 */
	inode->i_ino = objectid;

	if (dir && name) {
		trace_btrfs_inode_request(dir);

		ret = btrfs_set_inode_index(BTRFS_I(dir), index);
		if (ret) {
			btrfs_free_path(path);
			iput(inode);
			return ERR_PTR(ret);
		}
	} else if (dir) {
		*index = 0;
	}
	/*
	 * index_cnt is ignored for everything but a dir,
	 * btrfs_set_inode_index_count has an explanation for the magic
	 * number
	 */
	BTRFS_I(inode)->index_cnt = 2;
	BTRFS_I(inode)->dir_index = *index;
	BTRFS_I(inode)->root = btrfs_grab_root(root);
	BTRFS_I(inode)->generation = trans->transid;
	inode->i_generation = BTRFS_I(inode)->generation;

	/*
	 * We could have gotten an inode number from somebody who was fsynced
	 * and then removed in this same transaction, so let's just set full
	 * sync since it will be a full sync anyway and this will blow away the
	 * old info in the log.
	 */
	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);

	key[0].objectid = objectid;
	key[0].type = BTRFS_INODE_ITEM_KEY;
	key[0].offset = 0;

	sizes[0] = sizeof(struct btrfs_inode_item);

	if (name) {
		/*
		 * Start new inodes with an inode_ref. This is slightly more
		 * efficient for small numbers of hard links since they will
		 * be packed into one item. Extended refs will kick in if we
		 * add more hard links than can fit in the ref item.
		 */
		key[1].objectid = objectid;
		key[1].type = BTRFS_INODE_REF_KEY;
		key[1].offset = ref_objectid;

		sizes[1] = name_len + sizeof(*ref);
	}

	location = &BTRFS_I(inode)->location;
	location->objectid = objectid;
	location->offset = 0;
	location->type = BTRFS_INODE_ITEM_KEY;

	ret = btrfs_insert_inode_locked(inode);
	if (ret < 0) {
		iput(inode);
		goto fail;
	}

	path->leave_spinning = 1;
	ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
	if (ret != 0)
		goto fail_unlock;

	inode_init_owner(inode, dir, mode);
	inode_set_bytes(inode, 0);

	inode->i_mtime = current_time(inode);
	inode->i_atime = inode->i_mtime;
	inode->i_ctime = inode->i_mtime;
	BTRFS_I(inode)->i_otime = inode->i_mtime;

	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				  struct btrfs_inode_item);
	memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
			     sizeof(*inode_item));
	fill_inode_item(trans, path->nodes[0], inode_item, inode);

	if (name) {
		ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
				     struct btrfs_inode_ref);
		btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
		btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
		ptr = (unsigned long)(ref + 1);
		write_extent_buffer(path->nodes[0], name, ptr, name_len);
	}

	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_free_path(path);

	btrfs_inherit_iflags(inode, dir);

	if (S_ISREG(mode)) {
		if (btrfs_test_opt(fs_info, NODATASUM))
			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
		if (btrfs_test_opt(fs_info, NODATACOW))
			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
				BTRFS_INODE_NODATASUM;
	}

	inode_tree_add(inode);

	trace_btrfs_inode_new(inode);
	btrfs_set_inode_last_trans(trans, inode);

	btrfs_update_root_times(trans, root);

	ret = btrfs_inode_inherit_props(trans, inode, dir);
	if (ret)
		btrfs_err(fs_info,
			  "error inheriting props for ino %llu (root %llu): %d",
			btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret);

	return inode;

fail_unlock:
	discard_new_inode(inode);
fail:
	if (dir && name)
		BTRFS_I(dir)->index_cnt--;
	btrfs_free_path(path);
	return ERR_PTR(ret);
}

/*
 * utility function to add 'inode' into 'parent_inode' with
 * a given name and a given sequence number.
 * if 'add_backref' is true, also insert a backref from the
 * inode to the parent directory.
 */
int btrfs_add_link(struct btrfs_trans_handle *trans,
		   struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
		   const char *name, int name_len, int add_backref, u64 index)
{
	int ret = 0;
	struct btrfs_key key;
	struct btrfs_root *root = parent_inode->root;
	u64 ino = btrfs_ino(inode);
	u64 parent_ino = btrfs_ino(parent_inode);

	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
		memcpy(&key, &inode->root->root_key, sizeof(key));
	} else {
		key.objectid = ino;
		key.type = BTRFS_INODE_ITEM_KEY;
		key.offset = 0;
	}

	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
		ret = btrfs_add_root_ref(trans, key.objectid,
					 root->root_key.objectid, parent_ino,
					 index, name, name_len);
	} else if (add_backref) {
		ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
					     parent_ino, index);
	}

	/* Nothing to clean up yet */
	if (ret)
		return ret;

	ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key,
				    btrfs_inode_type(&inode->vfs_inode), index);
	if (ret == -EEXIST || ret == -EOVERFLOW)
		goto fail_dir_item;
	else if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
			   name_len * 2);
	inode_inc_iversion(&parent_inode->vfs_inode);
	/*
	 * If we are replaying a log tree, we do not want to update the mtime
	 * and ctime of the parent directory with the current time, since the
	 * log replay procedure is responsible for setting them to their correct
	 * values (the ones it had when the fsync was done).
	 */
	if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
		struct timespec64 now = current_time(&parent_inode->vfs_inode);

		parent_inode->vfs_inode.i_mtime = now;
		parent_inode->vfs_inode.i_ctime = now;
	}
	ret = btrfs_update_inode(trans, root, &parent_inode->vfs_inode);
	if (ret)
		btrfs_abort_transaction(trans, ret);
	return ret;

fail_dir_item:
	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
		u64 local_index;
		int err;

		err = btrfs_del_root_ref(trans, key.objectid,
					 root->root_key.objectid, parent_ino,
					 &local_index, name, name_len);
		if (err)
			btrfs_abort_transaction(trans, err);
	} else if (add_backref) {
		u64 local_index;
		int err;

		err = btrfs_del_inode_ref(trans, root, name, name_len,
					  ino, parent_ino, &local_index);
		if (err)
			btrfs_abort_transaction(trans, err);
	}

	/* Return the original error code */
	return ret;
}

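/*
 * Dentry-based wrapper around btrfs_add_link() that maps an existing entry
 * to -EEXIST.
 */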
static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
			    struct btrfs_inode *dir, struct dentry *dentry,
			    struct btrfs_inode *inode, int backref, u64 index)
{
	int err = btrfs_add_link(trans, dir, inode,
				 dentry->d_name.name, dentry->d_name.len,
				 backref, index);
	if (err > 0)
		err = -EEXIST;
	return err;
}

static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
			umode_t mode, dev_t rdev)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct inode *inode = NULL;
	int err;
	u64 objectid;
	u64 index = 0;

	/*
	 * 2 for inode item and ref
	 * 2 for dir items
	 * 1 for xattr if selinux is on
	 */
	trans = btrfs_start_transaction(root, 5);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	err = btrfs_find_free_ino(root, &objectid);
	if (err)
		goto out_unlock;

	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
			dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
			mode, &index);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		inode = NULL;
		goto out_unlock;
	}

	/*
	* If the active LSM wants to access the inode during
	* d_instantiate it needs these. Smack checks to see
	* if the filesystem supports xattrs by looking at the
	* ops vector.
	*/
	inode->i_op = &btrfs_special_inode_operations;
	init_special_inode(inode, inode->i_mode, rdev);

	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
	if (err)
		goto out_unlock;

	err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
			0, index);
	if (err)
		goto out_unlock;

	btrfs_update_inode(trans, root, inode);
	d_instantiate_new(dentry, inode);

out_unlock:
	btrfs_end_transaction(trans);
	btrfs_btree_balance_dirty(fs_info);
	if (err && inode) {
		inode_dec_link_count(inode);
		discard_new_inode(inode);
	}
	return err;
}

static int btrfs_create(struct inode *dir, struct dentry *dentry,
			umode_t mode, bool excl)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct inode *inode = NULL;
	int err;
	u64 objectid;
	u64 index = 0;

	/*
	 * 2 for inode item and ref
	 * 2 for dir items
	 * 1 for xattr if selinux is on
	 */
	trans = btrfs_start_transaction(root, 5);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	err = btrfs_find_free_ino(root, &objectid);
	if (err)
		goto out_unlock;

	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
			dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
			mode, &index);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		inode = NULL;
		goto out_unlock;
	}
	/*
	* If the active LSM wants to access the inode during
	* d_instantiate it needs these. Smack checks to see
	* if the filesystem supports xattrs by looking at the
	* ops vector.
	*/
	inode->i_fop = &btrfs_file_operations;
	inode->i_op = &btrfs_file_inode_operations;
	inode->i_mapping->a_ops = &btrfs_aops;

	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
	if (err)
		goto out_unlock;

	err = btrfs_update_inode(trans, root, inode);
	if (err)
		goto out_unlock;

	err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
			0, index);
	if (err)
		goto out_unlock;

	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
	d_instantiate_new(dentry, inode);

out_unlock:
	btrfs_end_transaction(trans);
	if (err && inode) {
		inode_dec_link_count(inode);
		discard_new_inode(inode);
	}
	btrfs_btree_balance_dirty(fs_info);
	return err;
}

static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
		      struct dentry *dentry)
{
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct inode *inode = d_inode(old_dentry);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	u64 index;
	int err;
	int drop_inode = 0;

	/* do not allow sys_link's with other subvols of the same device */
	if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
		return -EXDEV;

	if (inode->i_nlink >= BTRFS_LINK_MAX)
		return -EMLINK;

	err = btrfs_set_inode_index(BTRFS_I(dir), &index);
	if (err)
		goto fail;

	/*
	 * 2 items for inode and inode ref
	 * 2 items for dir items
	 * 1 item for parent inode
	 * 1 item for orphan item deletion if O_TMPFILE
	 */
	trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		trans = NULL;
		goto fail;
	}

	/* There are several dir indexes for this inode, clear the cache. */
	BTRFS_I(inode)->dir_index = 0ULL;
	inc_nlink(inode);
	inode_inc_iversion(inode);
	inode->i_ctime = current_time(inode);
	ihold(inode);
	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);

	err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
			1, index);

	if (err) {
		drop_inode = 1;
	} else {
		struct dentry *parent = dentry->d_parent;
		int ret;

		err = btrfs_update_inode(trans, root, inode);
		if (err)
			goto fail;
		if (inode->i_nlink == 1) {
			/*
			 * If new hard link count is 1, it's a file created
			 * with open(2) O_TMPFILE flag.
			 */
			err = btrfs_orphan_del(trans, BTRFS_I(inode));
			if (err)
				goto fail;
		}
		d_instantiate(dentry, inode);
		ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
					 true, NULL);
		if (ret == BTRFS_NEED_TRANS_COMMIT) {
			err = btrfs_commit_transaction(trans);
			trans = NULL;
		}
	}

fail:
	if (trans)
		btrfs_end_transaction(trans);
	if (drop_inode) {
		inode_dec_link_count(inode);
		iput(inode);
	}
	btrfs_btree_balance_dirty(fs_info);
	return err;
}

static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
	struct inode *inode = NULL;
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	int err = 0;
	u64 objectid = 0;
	u64 index = 0;

	/*
	 * 2 items for inode and ref
	 * 2 items for dir items
	 * 1 for xattr if selinux is on
	 */
	trans = btrfs_start_transaction(root, 5);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	err = btrfs_find_free_ino(root, &objectid);
	if (err)
		goto out_fail;

	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
			dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
			S_IFDIR | mode, &index);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		inode = NULL;
		goto out_fail;
	}

	/* these must be set before we unlock the inode */
	inode->i_op = &btrfs_dir_inode_operations;
	inode->i_fop = &btrfs_dir_file_operations;

	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
	if (err)
		goto out_fail;

	btrfs_i_size_write(BTRFS_I(inode), 0);
	err = btrfs_update_inode(trans, root, inode);
	if (err)
		goto out_fail;

	err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
			dentry->d_name.name,
			dentry->d_name.len, 0, index);
	if (err)
		goto out_fail;

	d_instantiate_new(dentry, inode);

out_fail:
	btrfs_end_transaction(trans);
	if (err && inode) {
		inode_dec_link_count(inode);
		discard_new_inode(inode);
	}
	btrfs_btree_balance_dirty(fs_info);
	return err;
}

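/*
 * Decompress an inline extent from the leaf into @page, zeroing the part of
 * the page past the end of the decompressed data.
 */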
static noinline int uncompress_inline(struct btrfs_path *path,
				      struct page *page,
				      size_t pg_offset, u64 extent_offset,
				      struct btrfs_file_extent_item *item)
{
	int ret;
	struct extent_buffer *leaf = path->nodes[0];
	char *tmp;
	size_t max_size;
	unsigned long inline_size;
	unsigned long ptr;
	int compress_type;

	WARN_ON(pg_offset != 0);
	compress_type = btrfs_file_extent_compression(leaf, item);
	max_size = btrfs_file_extent_ram_bytes(leaf, item);
	inline_size = btrfs_file_extent_inline_item_len(leaf,
					btrfs_item_nr(path->slots[0]));
	tmp = kmalloc(inline_size, GFP_NOFS);
	if (!tmp)
		return -ENOMEM;
	ptr = btrfs_file_extent_inline_start(item);

	read_extent_buffer(leaf, tmp, ptr, inline_size);

	max_size = min_t(unsigned long, PAGE_SIZE, max_size);
	ret = btrfs_decompress(compress_type, tmp, page,
			       extent_offset, inline_size, max_size);

	/*
	 * The decompression code contains a memset to fill in any space
	 * between the end of the uncompressed data and the end of max_size in
	 * case the decompressed data ends up shorter than ram_bytes.  That
	 * doesn't cover the hole between the end of an inline extent and the
	 * beginning of the next block, so we cover that region here.
	 */

	if (max_size + pg_offset < PAGE_SIZE) {
		char *map = kmap(page);
		memset(map + pg_offset + max_size, 0, PAGE_SIZE - max_size - pg_offset);
		kunmap(page);
	}
	kfree(tmp);
	return ret;
}

6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430
/**
 * btrfs_get_extent - Lookup the first extent overlapping a range in a file.
 * @inode:	file to search in
 * @page:	page to read extent data into if the extent is inline
 * @pg_offset:	offset into @page to copy to
 * @start:	file offset
 * @len:	length of range starting at @start
 *
 * This returns the first &struct extent_map which overlaps with the given
 * range, reading it from the B-tree and caching it if necessary. Note that
 * there may be more extents which overlap the given range after the returned
 * extent_map.
C
Chris Mason 已提交
6431
 *
6432 6433 6434 6435
 * If @page is not NULL and the extent is inline, this also reads the extent
 * data directly into the page and marks the extent up to date in the io_tree.
 *
 * Return: ERR_PTR on error, non-NULL extent_map on success.
C
Chris Mason 已提交
6436
 */
6437
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
				    struct page *page, size_t pg_offset,
				    u64 start, u64 len)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	int ret;
	int err = 0;
	u64 extent_start = 0;
	u64 extent_end = 0;
	u64 objectid = btrfs_ino(inode);
	int extent_type = -1;
	struct btrfs_path *path = NULL;
	struct btrfs_root *root = inode->root;
	struct btrfs_file_extent_item *item;
	struct extent_buffer *leaf;
	struct btrfs_key found_key;
	struct extent_map *em = NULL;
	struct extent_map_tree *em_tree = &inode->extent_tree;
	struct extent_io_tree *io_tree = &inode->io_tree;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, len);
	read_unlock(&em_tree->lock);

	if (em) {
		if (em->start > start || em->start + em->len <= start)
			free_extent_map(em);
		else if (em->block_start == EXTENT_MAP_INLINE && page)
			free_extent_map(em);
		else
			goto out;
	}
	em = alloc_extent_map();
	if (!em) {
		err = -ENOMEM;
		goto out;
	}
	em->start = EXTENT_MAP_HOLE;
	em->orig_start = EXTENT_MAP_HOLE;
	em->len = (u64)-1;
	em->block_len = (u64)-1;

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	/* Chances are we'll be called again, so go ahead and do readahead */
	path->reada = READA_FORWARD;

	/*
	 * Unless we're going to uncompress the inline extent, no sleep would
	 * happen.
	 */
	path->leave_spinning = 1;

	ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
	if (ret < 0) {
		err = ret;
		goto out;
	} else if (ret > 0) {
		if (path->slots[0] == 0)
			goto not_found;
		path->slots[0]--;
	}

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0],
			      struct btrfs_file_extent_item);
	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
	if (found_key.objectid != objectid ||
	    found_key.type != BTRFS_EXTENT_DATA_KEY) {
		/*
		 * If we backup past the first extent we want to move forward
		 * and see if there is an extent in front of us, otherwise we'll
		 * say there is a hole for our whole search range which can
		 * cause problems.
		 */
		extent_end = start;
		goto next;
	}

	extent_type = btrfs_file_extent_type(leaf, item);
	extent_start = found_key.offset;
	extent_end = btrfs_file_extent_end(path);
	if (extent_type == BTRFS_FILE_EXTENT_REG ||
	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
		/* Only regular file could have regular/prealloc extent */
		if (!S_ISREG(inode->vfs_inode.i_mode)) {
			ret = -EUCLEAN;
			btrfs_crit(fs_info,
		"regular/prealloc extent found for non-regular inode %llu",
				   btrfs_ino(inode));
			goto out;
		}
		trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
						       extent_start);
	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
		trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
						      path->slots[0],
						      extent_start);
	}
next:
	if (start >= extent_end) {
		path->slots[0]++;
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0) {
				err = ret;
				goto out;
			} else if (ret > 0) {
				goto not_found;
			}
			leaf = path->nodes[0];
		}
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid != objectid ||
		    found_key.type != BTRFS_EXTENT_DATA_KEY)
			goto not_found;
		if (start + len <= found_key.offset)
			goto not_found;
		if (start > found_key.offset)
			goto next;

		/* New extent overlaps with existing one */
		em->start = start;
		em->orig_start = start;
		em->len = found_key.offset - start;
		em->block_start = EXTENT_MAP_HOLE;
		goto insert;
	}

	btrfs_extent_item_to_extent_map(inode, path, item, !page, em);

	if (extent_type == BTRFS_FILE_EXTENT_REG ||
	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
		goto insert;
	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
		unsigned long ptr;
		char *map;
		size_t size;
		size_t extent_offset;
		size_t copy_size;

		if (!page)
			goto out;

		size = btrfs_file_extent_ram_bytes(leaf, item);
		extent_offset = page_offset(page) + pg_offset - extent_start;
		copy_size = min_t(u64, PAGE_SIZE - pg_offset,
				  size - extent_offset);
		em->start = extent_start + extent_offset;
		em->len = ALIGN(copy_size, fs_info->sectorsize);
		em->orig_block_len = em->len;
		em->orig_start = em->start;
		ptr = btrfs_file_extent_inline_start(item) + extent_offset;

		btrfs_set_path_blocking(path);
		if (!PageUptodate(page)) {
			if (btrfs_file_extent_compression(leaf, item) !=
			    BTRFS_COMPRESS_NONE) {
				ret = uncompress_inline(path, page, pg_offset,
							extent_offset, item);
				if (ret) {
					err = ret;
					goto out;
				}
			} else {
				map = kmap(page);
				read_extent_buffer(leaf, map + pg_offset, ptr,
						   copy_size);
				if (pg_offset + copy_size < PAGE_SIZE) {
					memset(map + pg_offset + copy_size, 0,
					       PAGE_SIZE - pg_offset -
					       copy_size);
				}
				kunmap(page);
			}
			flush_dcache_page(page);
		}
		set_extent_uptodate(io_tree, em->start,
				    extent_map_end(em) - 1, NULL, GFP_NOFS);
		goto insert;
	}
not_found:
	em->start = start;
	em->orig_start = start;
	em->len = len;
	em->block_start = EXTENT_MAP_HOLE;
insert:
	btrfs_release_path(path);
	if (em->start > start || extent_map_end(em) <= start) {
		btrfs_err(fs_info,
			  "bad extent! em: [%llu %llu] passed [%llu %llu]",
			  em->start, em->len, start, len);
		err = -EIO;
		goto out;
	}

	err = 0;
	write_lock(&em_tree->lock);
	err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
	write_unlock(&em_tree->lock);
out:
	btrfs_free_path(path);

	trace_btrfs_get_extent(root, inode, em);

	if (err) {
		free_extent_map(em);
		return ERR_PTR(err);
	}
	BUG_ON(!em); /* Error is always set */
	return em;
}

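/*
 * Example usage (illustrative sketch only; nothing in this file calls it):
 * the typical caller pattern for btrfs_get_extent().  The returned map
 * carries a reference, so every successful lookup must be paired with a
 * free_extent_map().  The helper name "example_lookup_block" is hypothetical.
 */
#if 0
static int example_lookup_block(struct btrfs_inode *inode, u64 start)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_map *em;

	em = btrfs_get_extent(inode, NULL, 0, start, fs_info->sectorsize);
	if (IS_ERR(em))
		return PTR_ERR(em);

	if (em->block_start == EXTENT_MAP_HOLE) {
		/* no disk blocks back [em->start, extent_map_end(em)) */
	} else if (em->block_start == EXTENT_MAP_INLINE) {
		/* data lives inline in the fs tree leaf */
	}

	free_extent_map(em);
	return 0;
}
#endif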
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
					   u64 start, u64 len)
{
	struct extent_map *em;
	struct extent_map *hole_em = NULL;
	u64 delalloc_start = start;
	u64 end;
	u64 delalloc_len;
	u64 delalloc_end;
	int err = 0;

	em = btrfs_get_extent(inode, NULL, 0, start, len);
	if (IS_ERR(em))
		return em;
	/*
	 * If our em maps to:
	 * - a hole or
	 * - a pre-alloc extent,
	 * there might actually be delalloc bytes behind it.
	 */
	if (em->block_start != EXTENT_MAP_HOLE &&
	    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
		return em;
	else
		hole_em = em;

	/* check to see if we've wrapped (len == -1 or similar) */
	end = start + len;
	if (end < start)
		end = (u64)-1;
	else
		end -= 1;

	em = NULL;

	/* ok, we didn't find anything, lets look for delalloc */
	delalloc_len = count_range_bits(&inode->io_tree, &delalloc_start,
				 end, len, EXTENT_DELALLOC, 1);
	delalloc_end = delalloc_start + delalloc_len;
	if (delalloc_end < delalloc_start)
		delalloc_end = (u64)-1;

	/*
	 * We didn't find anything useful, return the original results from
	 * get_extent()
	 */
	if (delalloc_start > end || delalloc_end <= start) {
		em = hole_em;
		hole_em = NULL;
		goto out;
	}

	/*
	 * Adjust the delalloc_start to make sure it doesn't go backwards from
	 * the start they passed in
	 */
	delalloc_start = max(start, delalloc_start);
	delalloc_len = delalloc_end - delalloc_start;

	if (delalloc_len > 0) {
		u64 hole_start;
		u64 hole_len;
		const u64 hole_end = extent_map_end(hole_em);

		em = alloc_extent_map();
		if (!em) {
			err = -ENOMEM;
			goto out;
		}

		ASSERT(hole_em);
		/*
		 * When btrfs_get_extent can't find anything it returns one
		 * huge hole
		 *
		 * Make sure what it found really fits our range, and adjust to
		 * make sure it is based on the start from the caller
		 */
		if (hole_end <= start || hole_em->start > end) {
			free_extent_map(hole_em);
			hole_em = NULL;
		} else {
			hole_start = max(hole_em->start, start);
			hole_len = hole_end - hole_start;
		}

		if (hole_em && delalloc_start > hole_start) {
			/*
			 * Our hole starts before our delalloc, so we have to
			 * return just the parts of the hole that go until the
			 * delalloc starts
			 */
			em->len = min(hole_len, delalloc_start - hole_start);
			em->start = hole_start;
			em->orig_start = hole_start;
			/*
			 * Don't adjust block start at all, it is fixed at
			 * EXTENT_MAP_HOLE
			 */
			em->block_start = hole_em->block_start;
			em->block_len = hole_len;
			if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
				set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
		} else {
			/*
			 * Hole is out of passed range or it starts after
			 * delalloc range
			 */
			em->start = delalloc_start;
			em->len = delalloc_len;
			em->orig_start = delalloc_start;
			em->block_start = EXTENT_MAP_DELALLOC;
			em->block_len = delalloc_len;
		}
	} else {
		return hole_em;
	}
out:

	free_extent_map(hole_em);
	if (err) {
		free_extent_map(em);
		return ERR_PTR(err);
	}
	return em;
}

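/*
 * Example (illustrative sketch): callers that also care about buffered but
 * not yet allocated data use btrfs_get_extent_fiemap(); a block_start of
 * EXTENT_MAP_DELALLOC marks such a range.  "example_range_has_delalloc" is a
 * hypothetical name.
 */
#if 0
static bool example_range_has_delalloc(struct btrfs_inode *inode,
				       u64 start, u64 len)
{
	struct extent_map *em;
	bool delalloc;

	em = btrfs_get_extent_fiemap(inode, start, len);
	if (IS_ERR(em))
		return false;

	delalloc = (em->block_start == EXTENT_MAP_DELALLOC);
	free_extent_map(em);
	return delalloc;
}
#endif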
static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
						  const u64 start,
						  const u64 len,
						  const u64 orig_start,
						  const u64 block_start,
						  const u64 block_len,
						  const u64 orig_block_len,
						  const u64 ram_bytes,
						  const int type)
{
	struct extent_map *em = NULL;
	int ret;

	if (type != BTRFS_ORDERED_NOCOW) {
		em = create_io_em(inode, start, len, orig_start,
				  block_start, block_len, orig_block_len,
				  ram_bytes,
				  BTRFS_COMPRESS_NONE, /* compress_type */
				  type);
		if (IS_ERR(em))
			goto out;
	}
	ret = btrfs_add_ordered_extent_dio(inode, start, block_start,
					   len, block_len, type);
	if (ret) {
		if (em) {
			free_extent_map(em);
			btrfs_drop_extent_cache(BTRFS_I(inode), start,
						start + len - 1, 0);
		}
		em = ERR_PTR(ret);
	}
 out:

	return em;
}

static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
						  u64 start, u64 len)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_map *em;
	struct btrfs_key ins;
	u64 alloc_hint;
	int ret;

	alloc_hint = get_extent_allocation_hint(inode, start, len);
	ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
				   0, alloc_hint, &ins, 1, 1);
	if (ret)
		return ERR_PTR(ret);

	em = btrfs_create_dio_extent(inode, start, ins.offset, start,
				     ins.objectid, ins.offset, ins.offset,
				     ins.offset, BTRFS_ORDERED_REGULAR);
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	if (IS_ERR(em))
		btrfs_free_reserved_extent(fs_info, ins.objectid,
					   ins.offset, 1);

	return em;
}

/*
 * Returns 1 when the NOCOW write is safe, < 0 on error, and 0 if the block
 * must be COWed.
 */
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
			      u64 *orig_start, u64 *orig_block_len,
			      u64 *ram_bytes)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_path *path;
	int ret;
	struct extent_buffer *leaf;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	u64 disk_bytenr;
	u64 backref_offset;
	u64 extent_end;
	u64 num_bytes;
	int slot;
	int found_type;
	bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_lookup_file_extent(NULL, root, path,
			btrfs_ino(BTRFS_I(inode)), offset, 0);
	if (ret < 0)
		goto out;

	slot = path->slots[0];
	if (ret == 1) {
		if (slot == 0) {
			/* can't find the item, must cow */
			ret = 0;
			goto out;
		}
		slot--;
	}
	ret = 0;
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, slot);
	if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
	    key.type != BTRFS_EXTENT_DATA_KEY) {
		/* not our file or wrong item type, must cow */
		goto out;
	}

	if (key.offset > offset) {
		/* Wrong offset, must cow */
		goto out;
	}

	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	found_type = btrfs_file_extent_type(leaf, fi);
	if (found_type != BTRFS_FILE_EXTENT_REG &&
	    found_type != BTRFS_FILE_EXTENT_PREALLOC) {
		/* not a regular extent, must cow */
		goto out;
	}

	if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
		goto out;

	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if (extent_end <= offset)
		goto out;

	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	if (disk_bytenr == 0)
		goto out;

	if (btrfs_file_extent_compression(leaf, fi) ||
	    btrfs_file_extent_encryption(leaf, fi) ||
	    btrfs_file_extent_other_encoding(leaf, fi))
		goto out;

	/*
	 * Do the same check as in btrfs_cross_ref_exist but without the
	 * unnecessary search.
	 */
	if (btrfs_file_extent_generation(leaf, fi) <=
	    btrfs_root_last_snapshot(&root->root_item))
		goto out;

	backref_offset = btrfs_file_extent_offset(leaf, fi);

	if (orig_start) {
		*orig_start = key.offset - backref_offset;
		*orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
		*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
	}

	if (btrfs_extent_readonly(fs_info, disk_bytenr))
		goto out;

	num_bytes = min(offset + *len, extent_end) - offset;
	if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
		u64 range_end;

		range_end = round_up(offset + num_bytes,
				     root->fs_info->sectorsize) - 1;
		ret = test_range_bit(io_tree, offset, range_end,
				     EXTENT_DELALLOC, 0, NULL);
		if (ret) {
			ret = -EAGAIN;
			goto out;
		}
	}

	btrfs_release_path(path);

	/*
	 * look for other files referencing this extent, if we
	 * find any we must cow
	 */

	ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)),
				    key.offset - backref_offset, disk_bytenr);
	if (ret) {
		ret = 0;
		goto out;
	}

	/*
	 * adjust disk_bytenr and num_bytes to cover just the bytes
	 * in this extent we are about to write.  If there
	 * are any csums in that range we have to cow in order
	 * to keep the csums correct
	 */
	disk_bytenr += backref_offset;
	disk_bytenr += offset - key.offset;
	if (csum_exist_in_range(fs_info, disk_bytenr, num_bytes))
		goto out;
	/*
	 * all of the above have passed, it is safe to overwrite this extent
	 * without cow
	 */
	*len = num_bytes;
	ret = 1;
out:
	btrfs_free_path(path);
	return ret;
}

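/*
 * Example (illustrative sketch): how a write path would consult
 * can_nocow_extent() before overwriting in place.  On a return of 1, *len may
 * have been trimmed to the prefix that is actually safe to NOCOW.
 * "example_can_overwrite_in_place" is a hypothetical name.
 */
#if 0
static bool example_can_overwrite_in_place(struct inode *inode, u64 offset,
					   u64 *len)
{
	u64 orig_start, orig_block_len, ram_bytes;

	return can_nocow_extent(inode, offset, len, &orig_start,
				&orig_block_len, &ram_bytes) == 1;
}
#endif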
static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
			      struct extent_state **cached_state, int writing)
{
	struct btrfs_ordered_extent *ordered;
	int ret = 0;

	while (1) {
		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
				 cached_state);
		/*
		 * We're concerned with the entire range that we're going to be
		 * doing DIO to, so we need to make sure there are no ordered
		 * extents in this range.
		 */
		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
						     lockend - lockstart + 1);

		/*
		 * We need to make sure there are no buffered pages in this
		 * range either, we could have raced between the invalidate in
		 * generic_file_direct_write and locking the extent.  The
		 * invalidate needs to happen so that reads after a write do not
		 * get stale data.
		 */
		if (!ordered &&
		    (!writing || !filemap_range_has_page(inode->i_mapping,
							 lockstart, lockend)))
			break;

		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
				     cached_state);

		if (ordered) {
			/*
			 * If we are doing a DIO read and the ordered extent we
			 * found is for a buffered write, we can not wait for it
			 * to complete and retry, because if we do so we can
			 * deadlock with concurrent buffered writes on page
			 * locks. This happens only if our DIO read covers more
			 * than one extent map, if at this point we have already
			 * created an ordered extent for a previous extent map
			 * and locked its range in the inode's io tree, and a
			 * concurrent write against that previous extent map's
			 * range and this range has started (we unlock the
			 * ranges in the io tree only when the bios complete and
			 * buffered writes always lock pages before attempting
			 * to lock range in the io tree).
			 */
			if (writing ||
			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
				btrfs_start_ordered_extent(inode, ordered, 1);
			else
				ret = -ENOTBLK;
			btrfs_put_ordered_extent(ordered);
		} else {
			/*
			 * We could trigger writeback for this range (and wait
			 * for it to complete) and then invalidate the pages for
			 * this range (through invalidate_inode_pages2_range()),
			 * but that can lead us to a deadlock with a concurrent
			 * call to readpages() (a buffered read or a defrag call
			 * triggered a readahead) on a page lock due to an
			 * ordered dio extent we created before but did not have
			 * yet a corresponding bio submitted (whence it can not
			 * complete), which makes readpages() wait for that
			 * ordered extent to complete while holding a lock on
			 * that page.
			 */
			ret = -ENOTBLK;
		}

		if (ret)
			break;

		cond_resched();
	}

	return ret;
}

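/*
 * Example (illustrative sketch): the caller side of lock_extent_direct().  On
 * success the range is locked and known to be free of ordered extents (and,
 * for writes, of pagecache pages); it must later be released with
 * unlock_extent_cached().  "example_with_range_locked" is a hypothetical name.
 */
#if 0
static int example_with_range_locked(struct inode *inode, u64 start, u64 len,
				     int writing)
{
	struct extent_state *cached_state = NULL;
	u64 lockend = start + len - 1;

	if (lock_extent_direct(inode, start, lockend, &cached_state, writing))
		return -ENOTBLK;	/* fall back to buffered IO */

	/* ... set up the direct IO while the range is quiescent ... */

	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, lockend,
			     &cached_state);
	return 0;
}
#endif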
/* The callers of this must take lock_extent() */
static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
				       u64 orig_start, u64 block_start,
				       u64 block_len, u64 orig_block_len,
				       u64 ram_bytes, int compress_type,
				       int type)
{
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	int ret;

	ASSERT(type == BTRFS_ORDERED_PREALLOC ||
	       type == BTRFS_ORDERED_COMPRESSED ||
	       type == BTRFS_ORDERED_NOCOW ||
	       type == BTRFS_ORDERED_REGULAR);

	em_tree = &BTRFS_I(inode)->extent_tree;
	em = alloc_extent_map();
	if (!em)
		return ERR_PTR(-ENOMEM);

	em->start = start;
	em->orig_start = orig_start;
	em->len = len;
	em->block_len = block_len;
	em->block_start = block_start;
	em->orig_block_len = orig_block_len;
	em->ram_bytes = ram_bytes;
	em->generation = -1;
	set_bit(EXTENT_FLAG_PINNED, &em->flags);
	if (type == BTRFS_ORDERED_PREALLOC) {
		set_bit(EXTENT_FLAG_FILLING, &em->flags);
	} else if (type == BTRFS_ORDERED_COMPRESSED) {
		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
		em->compress_type = compress_type;
	}

	do {
		btrfs_drop_extent_cache(BTRFS_I(inode), em->start,
				em->start + em->len - 1, 0);
		write_lock(&em_tree->lock);
		ret = add_extent_mapping(em_tree, em, 1);
		write_unlock(&em_tree->lock);
		/*
		 * The caller has taken lock_extent(), who could race with us
		 * to add em?
		 */
	} while (ret == -EEXIST);

	if (ret) {
		free_extent_map(em);
		return ERR_PTR(ret);
	}

	/* em got two refs now; the caller needs to do free_extent_map() once. */
	return em;
}

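/*
 * Reference-count sketch for create_io_em() (illustrative; the argument
 * values below are placeholders): one reference belongs to the extent map
 * tree, the other is returned, so the caller balances a successful call with
 * exactly one free_extent_map().
 */
#if 0
static void example_create_io_em(struct inode *inode, u64 start, u64 len,
				 u64 block_start)
{
	struct extent_map *em;

	em = create_io_em(inode, start, len, start /* orig_start */,
			  block_start, len, len, len,
			  BTRFS_COMPRESS_NONE, BTRFS_ORDERED_REGULAR);
	if (IS_ERR(em))
		return;
	/* the extent map tree still holds its own reference after this put */
	free_extent_map(em);
}
#endif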
static int btrfs_get_blocks_direct_read(struct extent_map *em,
					struct buffer_head *bh_result,
					struct inode *inode,
					u64 start, u64 len)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

	if (em->block_start == EXTENT_MAP_HOLE ||
			test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
		return -ENOENT;

	len = min(len, em->len - (start - em->start));

	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
		inode->i_blkbits;
	bh_result->b_size = len;
	bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
	set_buffer_mapped(bh_result);

	return 0;
}

static int btrfs_get_blocks_direct_write(struct extent_map **map,
					 struct buffer_head *bh_result,
					 struct inode *inode,
					 struct btrfs_dio_data *dio_data,
					 u64 start, u64 len)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_map *em = *map;
	int ret = 0;

	/*
	 * We don't allocate a new extent in the following cases
	 *
	 * 1) The inode is marked as NODATACOW. In this case we'll just use the
	 * existing extent.
	 * 2) The extent is marked as PREALLOC. We're good to go here and can
	 * just use the extent.
	 *
	 */
	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
	     em->block_start != EXTENT_MAP_HOLE)) {
		int type;
		u64 block_start, orig_start, orig_block_len, ram_bytes;

		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			type = BTRFS_ORDERED_PREALLOC;
		else
			type = BTRFS_ORDERED_NOCOW;
		len = min(len, em->len - (start - em->start));
		block_start = em->block_start + (start - em->start);

		if (can_nocow_extent(inode, start, &len, &orig_start,
				     &orig_block_len, &ram_bytes) == 1 &&
		    btrfs_inc_nocow_writers(fs_info, block_start)) {
			struct extent_map *em2;

			em2 = btrfs_create_dio_extent(inode, start, len,
						      orig_start, block_start,
						      len, orig_block_len,
						      ram_bytes, type);
			btrfs_dec_nocow_writers(fs_info, block_start);
			if (type == BTRFS_ORDERED_PREALLOC) {
				free_extent_map(em);
				*map = em = em2;
			}

			if (em2 && IS_ERR(em2)) {
				ret = PTR_ERR(em2);
				goto out;
			}
			/*
			 * For an inode marked NODATACOW or an extent marked
			 * PREALLOC, we use the existing or preallocated
			 * extent, so there is no need to adjust
			 * btrfs_space_info's bytes_may_use.
			 */
			btrfs_free_reserved_data_space_noquota(inode, start,
							       len);
			goto skip_cow;
		}
	}

	/* this will cow the extent */
	len = bh_result->b_size;
	free_extent_map(em);
	*map = em = btrfs_new_extent_direct(inode, start, len);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out;
	}

	len = min(len, em->len - (start - em->start));

skip_cow:
	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
		inode->i_blkbits;
	bh_result->b_size = len;
	bh_result->b_bdev = fs_info->fs_devices->latest_bdev;
	set_buffer_mapped(bh_result);

	if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
		set_buffer_new(bh_result);

	/*
	 * Need to update the i_size under the extent lock so buffered
	 * readers will get the updated i_size when we unlock.
	 */
	if (!dio_data->overwrite && start + len > i_size_read(inode))
		i_size_write(inode, start + len);

	WARN_ON(dio_data->reserve < len);
	dio_data->reserve -= len;
	dio_data->unsubmitted_oe_range_end = start + len;
	current->journal_info = dio_data;
out:
	return ret;
}

static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
				   struct buffer_head *bh_result, int create)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_map *em;
	struct extent_state *cached_state = NULL;
	struct btrfs_dio_data *dio_data = NULL;
	u64 start = iblock << inode->i_blkbits;
	u64 lockstart, lockend;
	u64 len = bh_result->b_size;
	int ret = 0;

	if (!create)
		len = min_t(u64, len, fs_info->sectorsize);

	lockstart = start;
	lockend = start + len - 1;

	if (current->journal_info) {
		/*
		 * Need to pull our outstanding extents and set journal_info to NULL so
		 * that anything that needs to check if there's a transaction doesn't get
		 * confused.
		 */
		dio_data = current->journal_info;
		current->journal_info = NULL;
	}

	/*
	 * If this errors out it's because we couldn't invalidate pagecache for
	 * this range and we need to fallback to buffered.
	 */
	if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
			       create)) {
		ret = -ENOTBLK;
		goto err;
	}

	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto unlock_err;
	}

	/*
	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
	 * io.  INLINE is special, and we could probably kludge it in here, but
	 * it's still buffered so for safety lets just fall back to the generic
	 * buffered path.
	 *
	 * For COMPRESSED we _have_ to read the entire extent in so we can
	 * decompress it, so there will be buffering required no matter what we
	 * do, so go ahead and fallback to buffered.
	 *
	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
	 * to buffered IO.  Don't blame me, this is the price we pay for using
	 * the generic code.
	 */
	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
	    em->block_start == EXTENT_MAP_INLINE) {
		free_extent_map(em);
		ret = -ENOTBLK;
		goto unlock_err;
	}

	if (create) {
		ret = btrfs_get_blocks_direct_write(&em, bh_result, inode,
						    dio_data, start, len);
		if (ret < 0)
			goto unlock_err;

		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
				     lockend, &cached_state);
	} else {
		ret = btrfs_get_blocks_direct_read(em, bh_result, inode,
						   start, len);
		/* Can be negative only if we read from a hole */
		if (ret < 0) {
			ret = 0;
			free_extent_map(em);
			goto unlock_err;
		}
		/*
		 * We need to unlock only the end area that we aren't using.
		 * The rest is going to be unlocked by the endio routine.
		 */
		lockstart = start + bh_result->b_size;
		if (lockstart < lockend) {
			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
					     lockstart, lockend, &cached_state);
		} else {
			free_extent_state(cached_state);
		}
	}

	free_extent_map(em);

	return 0;

unlock_err:
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			     &cached_state);
err:
	if (dio_data)
		current->journal_info = dio_data;
	return ret;
}

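/*
 * Worked example for the get_block contract above (illustrative): with 4K
 * blocks (i_blkbits == 12), iblock 256 addresses byte 256 << 12 = 1 MiB.  On
 * return, bh_result->b_size has been trimmed to the span actually mapped and
 * bh_result->b_blocknr holds the matching disk block number, which is what
 * __blockdev_direct_IO() uses to build the bios.
 */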
static inline blk_status_t submit_dio_repair_bio(struct inode *inode,
						 struct bio *bio,
						 int mirror_num)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	blk_status_t ret;

	BUG_ON(bio_op(bio) == REQ_OP_WRITE);

	ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DIO_REPAIR);
	if (ret)
		return ret;

	ret = btrfs_map_bio(fs_info, bio, mirror_num);

	return ret;
}

static int btrfs_check_dio_repairable(struct inode *inode,
				      struct bio *failed_bio,
				      struct io_failure_record *failrec,
				      int failed_mirror)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	int num_copies;

	num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
	if (num_copies == 1) {
		/*
		 * we only have a single copy of the data, so don't bother with
		 * all the retry and error correction code that follows. no
		 * matter what the error is, it is very likely to persist.
		 */
		btrfs_debug(fs_info,
			"Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
			num_copies, failrec->this_mirror, failed_mirror);
		return 0;
	}

	failrec->failed_mirror = failed_mirror;
	failrec->this_mirror++;
	if (failrec->this_mirror == failed_mirror)
		failrec->this_mirror++;

	if (failrec->this_mirror > num_copies) {
		btrfs_debug(fs_info,
			"Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
			num_copies, failrec->this_mirror, failed_mirror);
		return 0;
	}

	return 1;
}

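/*
 * Worked example (illustrative): with num_copies == 3 and failed_mirror == 2,
 * successive calls advance this_mirror 1 -> 3, skipping the failed mirror;
 * once this_mirror would exceed num_copies the repair is abandoned and 0 is
 * returned.
 */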
static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio,
				   struct page *page, unsigned int pgoff,
				   u64 start, u64 end, int failed_mirror,
				   bio_end_io_t *repair_endio, void *repair_arg)
{
	struct io_failure_record *failrec;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
	struct bio *bio;
	int isector;
	unsigned int read_mode = 0;
	int segs;
	int ret;
	blk_status_t status;
	struct bio_vec bvec;

	BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);

	ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
	if (ret)
		return errno_to_blk_status(ret);

	ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
					 failed_mirror);
	if (!ret) {
		free_io_failure(failure_tree, io_tree, failrec);
		return BLK_STS_IOERR;
	}

	segs = bio_segments(failed_bio);
	bio_get_first_bvec(failed_bio, &bvec);
	if (segs > 1 ||
	    (bvec.bv_len > btrfs_inode_sectorsize(inode)))
		read_mode |= REQ_FAILFAST_DEV;

	isector = start - btrfs_io_bio(failed_bio)->logical;
	isector >>= inode->i_sb->s_blocksize_bits;
	bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
				pgoff, isector, repair_endio, repair_arg);
	bio->bi_opf = REQ_OP_READ | read_mode;

	btrfs_debug(BTRFS_I(inode)->root->fs_info,
		    "repair DIO read error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d",
		    read_mode, failrec->this_mirror, failrec->in_validation);

	status = submit_dio_repair_bio(inode, bio, failrec->this_mirror);
	if (status) {
		free_io_failure(failure_tree, io_tree, failrec);
		bio_put(bio);
	}

	return status;
}

struct btrfs_retry_complete {
	struct completion done;
	struct inode *inode;
	u64 start;
	int uptodate;
};

static void btrfs_retry_endio_nocsum(struct bio *bio)
{
	struct btrfs_retry_complete *done = bio->bi_private;
	struct inode *inode = done->inode;
	struct bio_vec *bvec;
	struct extent_io_tree *io_tree, *failure_tree;
	struct bvec_iter_all iter_all;

	if (bio->bi_status)
		goto end;

	ASSERT(bio->bi_vcnt == 1);
	io_tree = &BTRFS_I(inode)->io_tree;
	failure_tree = &BTRFS_I(inode)->io_failure_tree;
	ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(inode));

	done->uptodate = 1;
	ASSERT(!bio_flagged(bio, BIO_CLONED));
	bio_for_each_segment_all(bvec, bio, iter_all)
		clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
				 io_tree, done->start, bvec->bv_page,
				 btrfs_ino(BTRFS_I(inode)), 0);
end:
	complete(&done->done);
	bio_put(bio);
}

static blk_status_t __btrfs_correct_data_nocsum(struct inode *inode,
						struct btrfs_io_bio *io_bio)
{
	struct btrfs_fs_info *fs_info;
	struct bio_vec bvec;
	struct bvec_iter iter;
	struct btrfs_retry_complete done;
	u64 start;
	unsigned int pgoff;
	u32 sectorsize;
	int nr_sectors;
	blk_status_t ret;
	blk_status_t err = BLK_STS_OK;

	fs_info = BTRFS_I(inode)->root->fs_info;
	sectorsize = fs_info->sectorsize;

	start = io_bio->logical;
	done.inode = inode;
	io_bio->bio.bi_iter = io_bio->iter;

	bio_for_each_segment(bvec, &io_bio->bio, iter) {
		nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
		pgoff = bvec.bv_offset;

next_block_or_try_again:
		done.uptodate = 0;
		done.start = start;
		init_completion(&done.done);

		ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page,
				pgoff, start, start + sectorsize - 1,
				io_bio->mirror_num,
				btrfs_retry_endio_nocsum, &done);
		if (ret) {
			err = ret;
			goto next;
		}

		wait_for_completion_io(&done.done);

		if (!done.uptodate) {
			/* We might have another mirror, so try again */
			goto next_block_or_try_again;
		}

next:
		start += sectorsize;

		nr_sectors--;
		if (nr_sectors) {
			pgoff += sectorsize;
			ASSERT(pgoff < PAGE_SIZE);
			goto next_block_or_try_again;
		}
	}

	return err;
}

static void btrfs_retry_endio(struct bio *bio)
{
	struct btrfs_retry_complete *done = bio->bi_private;
	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
	struct extent_io_tree *io_tree, *failure_tree;
	struct inode *inode = done->inode;
	struct bio_vec *bvec;
	int uptodate;
	int ret;
	int i = 0;
	struct bvec_iter_all iter_all;

	if (bio->bi_status)
		goto end;

	uptodate = 1;

	ASSERT(bio->bi_vcnt == 1);
	ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(done->inode));

	io_tree = &BTRFS_I(inode)->io_tree;
	failure_tree = &BTRFS_I(inode)->io_failure_tree;

	ASSERT(!bio_flagged(bio, BIO_CLONED));
	bio_for_each_segment_all(bvec, bio, iter_all) {
		ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
					     bvec->bv_offset, done->start,
					     bvec->bv_len);
		if (!ret)
			clean_io_failure(BTRFS_I(inode)->root->fs_info,
					 failure_tree, io_tree, done->start,
					 bvec->bv_page,
					 btrfs_ino(BTRFS_I(inode)),
					 bvec->bv_offset);
		else
			uptodate = 0;
		i++;
	}

	done->uptodate = uptodate;
end:
	complete(&done->done);
	bio_put(bio);
}

static blk_status_t __btrfs_subio_endio_read(struct inode *inode,
		struct btrfs_io_bio *io_bio, blk_status_t err)
{
	struct btrfs_fs_info *fs_info;
	struct bio_vec bvec;
	struct bvec_iter iter;
	struct btrfs_retry_complete done;
	u64 start;
	u64 offset = 0;
	u32 sectorsize;
	int nr_sectors;
	unsigned int pgoff;
	int csum_pos;
	bool uptodate = (err == 0);
	int ret;
	blk_status_t status;

	fs_info = BTRFS_I(inode)->root->fs_info;
	sectorsize = fs_info->sectorsize;

	err = BLK_STS_OK;
	start = io_bio->logical;
	done.inode = inode;
	io_bio->bio.bi_iter = io_bio->iter;

	bio_for_each_segment(bvec, &io_bio->bio, iter) {
		nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);

		pgoff = bvec.bv_offset;
next_block:
		if (uptodate) {
			csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset);
			ret = __readpage_endio_check(inode, io_bio, csum_pos,
					bvec.bv_page, pgoff, start, sectorsize);
			if (likely(!ret))
				goto next;
		}
try_again:
		done.uptodate = 0;
		done.start = start;
		init_completion(&done.done);

		status = dio_read_error(inode, &io_bio->bio, bvec.bv_page,
					pgoff, start, start + sectorsize - 1,
					io_bio->mirror_num, btrfs_retry_endio,
					&done);
		if (status) {
			err = status;
			goto next;
		}

		wait_for_completion_io(&done.done);

		if (!done.uptodate) {
			/* We might have another mirror, so try again */
			goto try_again;
		}
next:
		offset += sectorsize;
		start += sectorsize;

		ASSERT(nr_sectors);

		nr_sectors--;
		if (nr_sectors) {
			pgoff += sectorsize;
			ASSERT(pgoff < PAGE_SIZE);
			goto next_block;
		}
	}

	return err;
}

static blk_status_t btrfs_subio_endio_read(struct inode *inode,
		struct btrfs_io_bio *io_bio, blk_status_t err)
{
	bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;

	if (skip_csum) {
		if (unlikely(err))
			return __btrfs_correct_data_nocsum(inode, io_bio);
		else
			return BLK_STS_OK;
	} else {
		return __btrfs_subio_endio_read(inode, io_bio, err);
	}
}

static void btrfs_endio_direct_read(struct bio *bio)
{
	struct btrfs_dio_private *dip = bio->bi_private;
	struct inode *inode = dip->inode;
	struct bio *dio_bio;
	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
	blk_status_t err = bio->bi_status;

	if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
		err = btrfs_subio_endio_read(inode, io_bio, err);

	unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
		      dip->logical_offset + dip->bytes - 1);
	dio_bio = dip->dio_bio;

	kfree(dip);

	dio_bio->bi_status = err;
	dio_end_io(dio_bio);
	btrfs_io_bio_free_csum(io_bio);
	bio_put(bio);
}

static void __endio_write_update_ordered(struct inode *inode,
					 const u64 offset, const u64 bytes,
					 const bool uptodate)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_ordered_extent *ordered = NULL;
	struct btrfs_workqueue *wq;
	u64 ordered_offset = offset;
	u64 ordered_bytes = bytes;
	u64 last_offset;

	if (btrfs_is_free_space_inode(BTRFS_I(inode)))
		wq = fs_info->endio_freespace_worker;
	else
		wq = fs_info->endio_write_workers;

	while (ordered_offset < offset + bytes) {
		last_offset = ordered_offset;
		if (btrfs_dec_test_first_ordered_pending(inode, &ordered,
							   &ordered_offset,
							   ordered_bytes,
							   uptodate)) {
			btrfs_init_work(&ordered->work, finish_ordered_fn, NULL,
					NULL);
			btrfs_queue_work(wq, &ordered->work);
		}
		/*
		 * If btrfs_dec_test_ordered_pending does not find any ordered
		 * extent in the range, we can exit.
		 */
		if (ordered_offset == last_offset)
			return;
		/*
		 * Our bio might span multiple ordered extents. In this case
		 * we keep going until we have accounted the whole dio.
		 */
		if (ordered_offset < offset + bytes) {
			ordered_bytes = offset + bytes - ordered_offset;
			ordered = NULL;
		}
	}
}

static void btrfs_endio_direct_write(struct bio *bio)
{
	struct btrfs_dio_private *dip = bio->bi_private;
	struct bio *dio_bio = dip->dio_bio;

	__endio_write_update_ordered(dip->inode, dip->logical_offset,
				     dip->bytes, !bio->bi_status);

	kfree(dip);

	dio_bio->bi_status = bio->bi_status;
	dio_end_io(dio_bio);
	bio_put(bio);
}

static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data,
				    struct bio *bio, u64 offset)
{
	struct inode *inode = private_data;
	blk_status_t ret;
	ret = btrfs_csum_one_bio(inode, bio, offset, 1);
	BUG_ON(ret); /* -ENOMEM */
	return 0;
}

static void btrfs_end_dio_bio(struct bio *bio)
{
	struct btrfs_dio_private *dip = bio->bi_private;
	blk_status_t err = bio->bi_status;

	if (err)
		btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
			   "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d",
			   btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio),
			   bio->bi_opf,
			   (unsigned long long)bio->bi_iter.bi_sector,
			   bio->bi_iter.bi_size, err);

	if (dip->subio_endio)
		err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);

	if (err) {
		/*
		 * We want to perceive the errors flag being set before
		 * decrementing the reference count. We don't need a barrier
		 * since atomic operations with a return value are fully
		 * ordered as per atomic_t.txt
		 */
		dip->errors = 1;
	}

	/* if there are more bios still pending for this dio, just exit */
	if (!atomic_dec_and_test(&dip->pending_bios))
		goto out;

	if (dip->errors) {
		bio_io_error(dip->orig_bio);
	} else {
		dip->dio_bio->bi_status = BLK_STS_OK;
		bio_endio(dip->orig_bio);
	}
out:
	bio_put(bio);
}

static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode,
						 struct btrfs_dio_private *dip,
						 struct bio *bio,
						 u64 file_offset)
{
	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
	struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
	u16 csum_size;
	blk_status_t ret;

	/*
	 * We load all the csum data we need when we submit
	 * the first bio to reduce the csum tree search and
	 * contention.
	 */
	if (dip->logical_offset == file_offset) {
		ret = btrfs_lookup_bio_sums(inode, dip->orig_bio, file_offset,
					    NULL);
		if (ret)
			return ret;
	}

	if (bio == dip->orig_bio)
		return 0;

	file_offset -= dip->logical_offset;
	file_offset >>= inode->i_sb->s_blocksize_bits;
	csum_size = btrfs_super_csum_size(btrfs_sb(inode->i_sb)->super_copy);
	io_bio->csum = orig_io_bio->csum + csum_size * file_offset;

	return 0;
}

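/*
 * Worked example (illustrative): for a split bio that starts 8K into the dip
 * with 4K blocks, file_offset becomes 8192 >> 12 = 2, so io_bio->csum points
 * at the third per-block checksum of the original bio (csum_size bytes per
 * block, e.g. 4 for crc32c).
 */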
static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
		struct inode *inode, u64 file_offset, int async_submit)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_dio_private *dip = bio->bi_private;
	bool write = bio_op(bio) == REQ_OP_WRITE;
	blk_status_t ret;

	/* Check btrfs_submit_bio_hook() for rules about async submit. */
	if (async_submit)
		async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);

	if (!write) {
		ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
		if (ret)
			goto err;
	}

	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
		goto map;

	if (write && async_submit) {
		ret = btrfs_wq_submit_bio(fs_info, bio, 0, 0,
					  file_offset, inode,
					  btrfs_submit_bio_start_direct_io);
		goto err;
	} else if (write) {
		/*
		 * If we aren't doing async submit, calculate the csum of the
		 * bio now.
		 */
		ret = btrfs_csum_one_bio(inode, bio, file_offset, 1);
		if (ret)
			goto err;
	} else {
		ret = btrfs_lookup_and_bind_dio_csum(inode, dip, bio,
						     file_offset);
		if (ret)
			goto err;
	}
map:
	ret = btrfs_map_bio(fs_info, bio, 0);
err:
	return ret;
}

static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
{
	struct inode *inode = dip->inode;
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct bio *bio;
	struct bio *orig_bio = dip->orig_bio;
	u64 start_sector = orig_bio->bi_iter.bi_sector;
	u64 file_offset = dip->logical_offset;
	int async_submit = 0;
	u64 submit_len;
	int clone_offset = 0;
	int clone_len;
	int ret;
	blk_status_t status;
	struct btrfs_io_geometry geom;

	submit_len = orig_bio->bi_iter.bi_size;
	ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio),
				    start_sector << 9, submit_len, &geom);
	if (ret)
		return -EIO;

	if (geom.len >= submit_len) {
		bio = orig_bio;
		dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
		goto submit;
	}

	/* async crcs make it difficult to collect full stripe writes. */
	if (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK)
		async_submit = 0;
	else
		async_submit = 1;

	/* bio split */
	ASSERT(geom.len <= INT_MAX);
	atomic_inc(&dip->pending_bios);
	do {
		clone_len = min_t(int, submit_len, geom.len);

		/*
		 * This will never fail as it's passing GFP_NOFS and
		 * the allocation is backed by btrfs_bioset.
		 */
		bio = btrfs_bio_clone_partial(orig_bio, clone_offset,
					      clone_len);
		bio->bi_private = dip;
		bio->bi_end_io = btrfs_end_dio_bio;
		btrfs_io_bio(bio)->logical = file_offset;

		ASSERT(submit_len >= clone_len);
		submit_len -= clone_len;
		if (submit_len == 0)
			break;

		/*
		 * Increase the count before we submit the bio so we know
		 * the end IO handler won't happen before we increase the
		 * count. Otherwise, the dip might get freed before we're
		 * done setting it up.
		 */
		atomic_inc(&dip->pending_bios);

		status = btrfs_submit_dio_bio(bio, inode, file_offset,
						async_submit);
		if (status) {
			bio_put(bio);
			atomic_dec(&dip->pending_bios);
			goto out_err;
		}

		clone_offset += clone_len;
		start_sector += clone_len >> 9;
		file_offset += clone_len;

		ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio),
				      start_sector << 9, submit_len, &geom);
		if (ret)
			goto out_err;
	} while (submit_len > 0);

submit:
	status = btrfs_submit_dio_bio(bio, inode, file_offset, async_submit);
	if (!status)
		return 0;

	bio_put(bio);
out_err:
	dip->errors = 1;
	/*
	 * Before the atomic variable goes to zero, we must make sure
	 * dip->errors is perceived to be set. This ordering is ensured by
	 * the fact that atomic operations with a return value are fully
	 * ordered as per atomic_t.txt
	 */
	if (atomic_dec_and_test(&dip->pending_bios))
		bio_io_error(dip->orig_bio);

	/* bio_end_io() will handle error, so we needn't return it */
	return 0;
}

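/*
 * Worked example (illustrative): a 1 MiB direct write whose first stripe has
 * only 192K left (geom.len) is cloned into a 192K bio; the geometry is then
 * re-queried at the advanced start_sector and the remaining 832K is split the
 * same way, one clone per btrfs_submit_dio_bio() call.
 */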
static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
				loff_t file_offset)
{
	struct btrfs_dio_private *dip = NULL;
	struct bio *bio = NULL;
	struct btrfs_io_bio *io_bio;
	bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
	int ret = 0;

	bio = btrfs_bio_clone(dio_bio);

	dip = kzalloc(sizeof(*dip), GFP_NOFS);
	if (!dip) {
		ret = -ENOMEM;
		goto free_ordered;
	}

	dip->private = dio_bio->bi_private;
	dip->inode = inode;
	dip->logical_offset = file_offset;
	dip->bytes = dio_bio->bi_iter.bi_size;
	dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
	bio->bi_private = dip;
	dip->orig_bio = bio;
	dip->dio_bio = dio_bio;
	atomic_set(&dip->pending_bios, 0);
	io_bio = btrfs_io_bio(bio);
	io_bio->logical = file_offset;

	if (write) {
		bio->bi_end_io = btrfs_endio_direct_write;
	} else {
		bio->bi_end_io = btrfs_endio_direct_read;
		dip->subio_endio = btrfs_subio_endio_read;
	}

	/*
	 * Reset the range for unsubmitted ordered extents (to a 0 length range)
	 * even if we fail to submit a bio, because in such case we do the
	 * corresponding error handling below and it must not be done a second
	 * time by btrfs_direct_IO().
	 */
	if (write) {
		struct btrfs_dio_data *dio_data = current->journal_info;

		dio_data->unsubmitted_oe_range_end = dip->logical_offset +
			dip->bytes;
		dio_data->unsubmitted_oe_range_start =
			dio_data->unsubmitted_oe_range_end;
	}

	ret = btrfs_submit_direct_hook(dip);
	if (!ret)
		return;

	btrfs_io_bio_free_csum(io_bio);

free_ordered:
	/*
	 * If we arrived here it means we failed to submit the dip, failed to
	 * clone the dio_bio, or failed to allocate the dip. If we cloned the
	 * dio_bio and allocated the dip, we can just call bio_endio against
	 * our io_bio so that we get proper resource cleanup if we fail to
	 * submit the dip, otherwise, we must do the same as
	 * btrfs_endio_direct_[write|read] because we can't call these
	 * callbacks - they require an allocated dip and a clone of dio_bio.
	 */
	if (bio && dip) {
		bio_io_error(bio);
		/*
		 * The end io callbacks free our dip, do the final put on bio
		 * and all the cleanup and final put for dio_bio (through
		 * dio_end_io()).
		 */
		dip = NULL;
		bio = NULL;
	} else {
		if (write)
			__endio_write_update_ordered(inode,
						file_offset,
						dio_bio->bi_iter.bi_size,
						false);
		else
			unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
			      file_offset + dio_bio->bi_iter.bi_size - 1);

		dio_bio->bi_status = BLK_STS_IOERR;
		/*
		 * Releases and cleans up our dio_bio, no need to bio_put()
		 * nor bio_endio()/bio_io_error() against dio_bio.
		 */
		dio_end_io(dio_bio);
	}
	if (bio)
		bio_put(bio);
	kfree(dip);
}

static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
			       const struct iov_iter *iter, loff_t offset)
{
	int seg;
	int i;
	unsigned int blocksize_mask = fs_info->sectorsize - 1;
	ssize_t retval = -EINVAL;

	if (offset & blocksize_mask)
		goto out;

	if (iov_iter_alignment(iter) & blocksize_mask)
		goto out;

	/* If this is a write we don't need to check anymore */
	if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter))
		return 0;
	/*
	 * Check to make sure we don't have duplicate iov_base's in this
	 * iovec, if so return EINVAL, otherwise we'll get csum errors
	 * when reading back.
	 */
	for (seg = 0; seg < iter->nr_segs; seg++) {
		for (i = seg + 1; i < iter->nr_segs; i++) {
			if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
				goto out;
		}
	}
	retval = 0;
out:
	return retval;
}

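/*
 * Worked example (illustrative): with a 4K sectorsize the mask is 0xfff, so a
 * read at offset 8192 with 4K-aligned iovec bases passes, while offset 512
 * (or any iovec base with the low 12 bits set) makes check_direct_IO() fail
 * and btrfs_direct_IO() return 0, which sends the IO down the buffered path.
 */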
static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_dio_data dio_data = { 0 };
	struct extent_changeset *data_reserved = NULL;
	loff_t offset = iocb->ki_pos;
	size_t count = 0;
	int flags = 0;
	bool wakeup = true;
	bool relock = false;
	ssize_t ret;

	if (check_direct_IO(fs_info, iter, offset))
		return 0;

	inode_dio_begin(inode);

	/*
	 * The generic stuff only does filemap_write_and_wait_range, which
	 * isn't enough if we've written compressed pages to this area, so
	 * we need to flush the dirty pages again to make absolutely sure
	 * that any outstanding dirty pages are on disk.
	 */
	count = iov_iter_count(iter);
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
		     &BTRFS_I(inode)->runtime_flags))
		filemap_fdatawrite_range(inode->i_mapping, offset,
					 offset + count - 1);

	if (iov_iter_rw(iter) == WRITE) {
		/*
		 * If the write DIO is beyond the EOF, we need to update
		 * the isize, but it is protected by i_mutex. So we cannot
		 * unlock the i_mutex in this case.
		 */
		if (offset + count <= inode->i_size) {
			dio_data.overwrite = 1;
			inode_unlock(inode);
			relock = true;
		} else if (iocb->ki_flags & IOCB_NOWAIT) {
			ret = -EAGAIN;
			goto out;
		}
		ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
						   offset, count);
		if (ret)
			goto out;

		/*
		 * We need to know how many extents we reserved so that we can
		 * do the accounting properly if we go over the number we
		 * originally calculated.  Abuse current->journal_info for this.
		 */
		dio_data.reserve = round_up(count,
					    fs_info->sectorsize);
		dio_data.unsubmitted_oe_range_start = (u64)offset;
		dio_data.unsubmitted_oe_range_end = (u64)offset;
		current->journal_info = &dio_data;
		down_read(&BTRFS_I(inode)->dio_sem);
	} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
				     &BTRFS_I(inode)->runtime_flags)) {
		inode_dio_end(inode);
		flags = DIO_LOCKING | DIO_SKIP_HOLES;
		wakeup = false;
	}

	ret = __blockdev_direct_IO(iocb, inode,
				   fs_info->fs_devices->latest_bdev,
				   iter, btrfs_get_blocks_direct, NULL,
				   btrfs_submit_direct, flags);
	if (iov_iter_rw(iter) == WRITE) {
		up_read(&BTRFS_I(inode)->dio_sem);
		current->journal_info = NULL;
		if (ret < 0 && ret != -EIOCBQUEUED) {
			if (dio_data.reserve)
				btrfs_delalloc_release_space(inode, data_reserved,
					offset, dio_data.reserve, true);
			/*
			 * On error we might have left some ordered extents
			 * without submitting corresponding bios for them, so
			 * clean them up to avoid other tasks getting them
			 * and waiting for them to complete forever.
			 */
			if (dio_data.unsubmitted_oe_range_start <
			    dio_data.unsubmitted_oe_range_end)
				__endio_write_update_ordered(inode,
					dio_data.unsubmitted_oe_range_start,
					dio_data.unsubmitted_oe_range_end -
					dio_data.unsubmitted_oe_range_start,
					false);
		} else if (ret >= 0 && (size_t)ret < count)
			btrfs_delalloc_release_space(inode, data_reserved,
					offset, count - (size_t)ret, true);
		btrfs_delalloc_release_extents(BTRFS_I(inode), count);
	}
out:
	if (wakeup)
		inode_dio_end(inode);
	if (relock)
		inode_lock(inode);

	extent_changeset_free(data_reserved);
	return ret;
}

#define BTRFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC)

static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		__u64 start, __u64 len)
{
	int	ret;

	ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
	if (ret)
		return ret;

	return extent_fiemap(inode, fieinfo, start, len);
}

int btrfs_readpage(struct file *file, struct page *page)
{
	return extent_read_full_page(page, btrfs_get_extent, 0);
}

static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	int ret;

	if (current->flags & PF_MEMALLOC) {
		redirty_page_for_writepage(wbc, page);
		unlock_page(page);
		return 0;
	}

	/*
	 * If we are under memory pressure we will call this directly from the
	 * VM, we need to make sure we have the inode referenced for the ordered
	 * extent.  If not just return like we didn't do anything.
	 */
	if (!igrab(inode)) {
		redirty_page_for_writepage(wbc, page);
		return AOP_WRITEPAGE_ACTIVATE;
	}
	ret = extent_write_full_page(page, wbc);
	btrfs_add_delayed_iput(inode);
	return ret;
}

static int btrfs_writepages(struct address_space *mapping,
			    struct writeback_control *wbc)
{
	return extent_writepages(mapping, wbc);
}

C
Chris Mason 已提交
8296 8297 8298 8299
static int
btrfs_readpages(struct file *file, struct address_space *mapping,
		struct list_head *pages, unsigned nr_pages)
{
8300
	return extent_readpages(mapping, pages, nr_pages);
C
Chris Mason 已提交
8301
}

static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
	int ret = try_release_extent_mapping(page, gfp_flags);
	if (ret == 1) {
		ClearPagePrivate(page);
		set_page_private(page, 0);
		put_page(page);
	}
	return ret;
}

static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
{
	if (PageWriteback(page) || PageDirty(page))
		return 0;
	return __btrfs_releasepage(page, gfp_flags);
}

#ifdef CONFIG_MIGRATION
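/*
 * Migration callback: relocate a btrfs page to a new page, carrying the
 * page->private state and the PagePrivate/PagePrivate2 bits along with it
 * (PagePrivate2 tracks pending ordered-extent accounting, see
 * btrfs_invalidatepage() below).
 */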
static int btrfs_migratepage(struct address_space *mapping,
			     struct page *newpage, struct page *page,
			     enum migrate_mode mode)
{
	int ret;

	ret = migrate_page_move_mapping(mapping, newpage, page, 0);
	if (ret != MIGRATEPAGE_SUCCESS)
		return ret;

	if (page_has_private(page)) {
		ClearPagePrivate(page);
		get_page(newpage);
		set_page_private(newpage, page_private(page));
		set_page_private(page, 0);
		put_page(page);
		SetPagePrivate(newpage);
	}

	if (PagePrivate2(page)) {
		ClearPagePrivate2(page);
		SetPagePrivate2(newpage);
	}

	if (mode != MIGRATE_SYNC_NO_COPY)
		migrate_page_copy(newpage, page);
	else
		migrate_page_states(newpage, page);
	return MIGRATEPAGE_SUCCESS;
}
#endif

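/*
 * Called when (part of) a page is being removed from the page cache: finish
 * the ordered-extent accounting the page still holds and release any
 * delalloc/qgroup reservations attached to the range.
 */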
static void btrfs_invalidatepage(struct page *page, unsigned int offset,
				 unsigned int length)
{
	struct inode *inode = page->mapping->host;
	struct extent_io_tree *tree;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	u64 page_start = page_offset(page);
	u64 page_end = page_start + PAGE_SIZE - 1;
	u64 start;
	u64 end;
	int inode_evicting = inode->i_state & I_FREEING;

	/*
	 * we have the page locked, so new writeback can't start,
	 * and the dirty bit won't be cleared while we are here.
	 *
	 * Wait for IO on this page so that we can safely clear
	 * the PagePrivate2 bit and do ordered accounting
	 */
	wait_on_page_writeback(page);

	tree = &BTRFS_I(inode)->io_tree;
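	/*
	 * A partial-page invalidation only tries to drop cached extent
	 * mappings; the full cleanup below runs when the whole page
	 * (offset == 0) goes away.
	 */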
	if (offset) {
		btrfs_releasepage(page, GFP_NOFS);
		return;
	}

	if (!inode_evicting)
		lock_extent_bits(tree, page_start, page_end, &cached_state);
again:
	start = page_start;
	ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
					page_end - start + 1);
	if (ordered) {
		end = min(page_end,
			  ordered->file_offset + ordered->num_bytes - 1);
		/*
		 * IO on this page will never be started, so we need
		 * to account for any ordered extents now
		 */
		if (!inode_evicting)
			clear_extent_bit(tree, start, end,
					 EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
					 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
					 EXTENT_DEFRAG, 1, 0, &cached_state);
		/*
		 * whoever cleared the private bit is responsible
		 * for the finish_ordered_io
		 */
		if (TestClearPagePrivate2(page)) {
			struct btrfs_ordered_inode_tree *tree;
			u64 new_len;

			tree = &BTRFS_I(inode)->ordered_tree;

			spin_lock_irq(&tree->lock);
			set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
			new_len = start - ordered->file_offset;
			if (new_len < ordered->truncated_len)
				ordered->truncated_len = new_len;
			spin_unlock_irq(&tree->lock);

			if (btrfs_dec_test_ordered_pending(inode, &ordered,
							   start,
							   end - start + 1, 1))
				btrfs_finish_ordered_io(ordered);
		}
		btrfs_put_ordered_extent(ordered);
		if (!inode_evicting) {
			cached_state = NULL;
			lock_extent_bits(tree, start, end,
					 &cached_state);
		}

		start = end + 1;
		if (start < page_end)
			goto again;
	}

	/*
	 * Qgroup reserved space handler
	 * Page here will be either
	 * 1) Already written to disk
	 *    In this case, its reserved space is released from data rsv map
	 *    and will be freed by delayed_ref handler finally.
	 *    So even if we call qgroup_free_data(), it won't decrease reserved
	 *    space.
	 * 2) Not written to disk
	 *    This means the reserved space should be freed here. However,
	 *    if a truncate invalidates the page (by clearing PageDirty)
	 *    and the page is accounted for while allocating extent
	 *    in btrfs_check_data_free_space() we let the delayed_ref handler
	 *    free the entire extent.
	 */
	if (PageDirty(page))
		btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
	if (!inode_evicting) {
		clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED |
				 EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
				 &cached_state);

		__btrfs_releasepage(page, GFP_NOFS);
	}

	ClearPageChecked(page);
	if (PagePrivate(page)) {
		ClearPagePrivate(page);
		set_page_private(page, 0);
		put_page(page);
	}
}

/*
 * btrfs_page_mkwrite() is not allowed to change the file size as it gets
 * called from a page fault handler when a page is first dirtied. Hence we must
 * be careful to check for EOF conditions here. We set the page up correctly
 * for a written page which means we get ENOSPC checking when writing into
 * holes and correct delalloc and unwritten extent mapping on filesystems that
 * support these features.
 *
 * We are not allowed to take the i_mutex here so we have to play games to
 * protect against truncate races as the page could now be beyond EOF.  Because
 * truncate_setsize() writes the inode size before removing pages, once we have
 * the page lock we can determine safely if the page is beyond EOF. If it is not
 * beyond EOF, then the page is guaranteed safe against truncation until we
 * unlock the page.
 */
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
{
	struct page *page = vmf->page;
	struct inode *inode = file_inode(vmf->vma->vm_file);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct btrfs_ordered_extent *ordered;
	struct extent_state *cached_state = NULL;
	struct extent_changeset *data_reserved = NULL;
	char *kaddr;
	unsigned long zero_start;
	loff_t size;
	vm_fault_t ret;
	int ret2;
	int reserved = 0;
	u64 reserved_space;
	u64 page_start;
	u64 page_end;
	u64 end;

	reserved_space = PAGE_SIZE;

	sb_start_pagefault(inode->i_sb);
	page_start = page_offset(page);
	page_end = page_start + PAGE_SIZE - 1;
	end = page_end;

	/*
	 * Reserving delalloc space after obtaining the page lock can lead to
	 * deadlock. For example, if a dirty page is locked by this function
	 * and the call to btrfs_delalloc_reserve_space() ends up triggering
	 * dirty page write out, then the btrfs_writepage() function could
	 * end up waiting indefinitely to get a lock on the page currently
	 * being processed by btrfs_page_mkwrite() function.
	 */
	ret2 = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
					   reserved_space);
	if (!ret2) {
		ret2 = file_update_time(vmf->vma->vm_file);
		reserved = 1;
	}
	if (ret2) {
		ret = vmf_error(ret2);
		if (reserved)
			goto out;
		goto out_noreserve;
	}

	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
again:
	lock_page(page);
	size = i_size_read(inode);

	if ((page->mapping != inode->i_mapping) ||
	    (page_start >= size)) {
		/* page got truncated out from underneath us */
		goto out_unlock;
	}
	wait_on_page_writeback(page);

	lock_extent_bits(io_tree, page_start, page_end, &cached_state);
	set_page_extent_mapped(page);

	/*
	 * we can't set the delalloc bits if there are pending ordered
	 * extents.  Drop our locks and wait for them to finish
	 */
	ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
			PAGE_SIZE);
	if (ordered) {
		unlock_extent_cached(io_tree, page_start, page_end,
				     &cached_state);
		unlock_page(page);
		btrfs_start_ordered_extent(inode, ordered, 1);
		btrfs_put_ordered_extent(ordered);
		goto again;
	}

	if (page->index == ((size - 1) >> PAGE_SHIFT)) {
		reserved_space = round_up(size - page_start,
					  fs_info->sectorsize);
		if (reserved_space < PAGE_SIZE) {
			end = page_start + reserved_space - 1;
			btrfs_delalloc_release_space(inode, data_reserved,
					page_start, PAGE_SIZE - reserved_space,
					true);
		}
	}

	/*
	 * page_mkwrite gets called when the page is first dirtied after it's
	 * faulted in, but write(2) could also dirty a page and set delalloc
	 * bits, thus in this case for space accounting reasons, we still need
	 * to clear any delalloc bits within this page range since we have to
	 * reserve data&meta space before lock_page() (see above comments).
	 */
	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
			  EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
			  EXTENT_DEFRAG, 0, 0, &cached_state);

	ret2 = btrfs_set_extent_delalloc(inode, page_start, end, 0,
					&cached_state);
	if (ret2) {
		unlock_extent_cached(io_tree, page_start, page_end,
				     &cached_state);
		ret = VM_FAULT_SIGBUS;
		goto out_unlock;
	}

	/* page is wholly or partially inside EOF */
	if (page_start + PAGE_SIZE > size)
		zero_start = offset_in_page(size);
	else
		zero_start = PAGE_SIZE;

	if (zero_start != PAGE_SIZE) {
		kaddr = kmap(page);
		memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
		flush_dcache_page(page);
		kunmap(page);
	}
	ClearPageChecked(page);
	set_page_dirty(page);
	SetPageUptodate(page);

	BTRFS_I(inode)->last_trans = fs_info->generation;
	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;

	unlock_extent_cached(io_tree, page_start, page_end, &cached_state);

	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
	sb_end_pagefault(inode->i_sb);
	extent_changeset_free(data_reserved);
	return VM_FAULT_LOCKED;

out_unlock:
	unlock_page(page);
out:
	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
	btrfs_delalloc_release_space(inode, data_reserved, page_start,
				     reserved_space, (ret != 0));
out_noreserve:
	sb_end_pagefault(inode->i_sb);
	extent_changeset_free(data_reserved);
	return ret;
}

static int btrfs_truncate(struct inode *inode, bool skip_writeback)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_block_rsv *rsv;
	int ret;
	struct btrfs_trans_handle *trans;
	u64 mask = fs_info->sectorsize - 1;
	u64 min_size = btrfs_calc_metadata_size(fs_info, 1);

	if (!skip_writeback) {
		ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
					       (u64)-1);
		if (ret)
			return ret;
	}

	/*
	 * Yes ladies and gentlemen, this is indeed ugly.  We have a couple of
	 * things going on here:
	 *
	 * 1) We need to reserve space to update our inode.
	 *
	 * 2) We need to have something to cache all the space that is going to
	 * be freed up by the truncate operation, but also have some slack
	 * space reserved in case it uses space during the truncate (thank you
	 * very much snapshotting).
	 *
	 * And we need these to be separate.  The fact is we can use a lot of
	 * space doing the truncate, and we have no earthly idea how much space
	 * we will use, so we need the truncate reservation to be separate so it
	 * doesn't end up using space reserved for updating the inode.  We also
	 * need to be able to stop the transaction and start a new one, which
	 * means we need to be able to update the inode several times, and we
	 * have no way of knowing how many times that will be, so we can't just
	 * reserve 1 item for the entirety of the operation, so that has to be
	 * done separately as well.
	 *
	 * So that leaves us with
	 *
	 * 1) rsv - for the truncate reservation, which we will steal from the
	 * transaction reservation.
	 * 2) fs_info->trans_block_rsv - this will have 1 item's worth left for
	 * updating the inode.
	 */
	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
	if (!rsv)
		return -ENOMEM;
	rsv->size = min_size;
	rsv->failfast = 1;

	/*
	 * 1 for the truncate slack space
	 * 1 for updating the inode.
	 */
	trans = btrfs_start_transaction(root, 2);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out;
	}

	/* Migrate the slack space for the truncate to our reserve */
	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
				      min_size, false);
	BUG_ON(ret);

	/*
	 * So if we truncate and then write and fsync we normally would just
	 * write the extents that changed, which is a problem if we need to
	 * first truncate that entire inode.  So set this flag so we write out
	 * all of the extents in the inode to the sync log so we're completely
	 * safe.
	 */
	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
	trans->block_rsv = rsv;

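	/*
	 * Truncate in chunks: -ENOSPC/-EAGAIN from
	 * btrfs_truncate_inode_items() means the reservation is used up, so
	 * update the inode, roll to a fresh transaction and refill rsv
	 * before continuing.
	 */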
	while (1) {
		ret = btrfs_truncate_inode_items(trans, root, inode,
						 inode->i_size,
						 BTRFS_EXTENT_DATA_KEY);
		trans->block_rsv = &fs_info->trans_block_rsv;
		if (ret != -ENOSPC && ret != -EAGAIN)
			break;

		ret = btrfs_update_inode(trans, root, inode);
		if (ret)
			break;

		btrfs_end_transaction(trans);
		btrfs_btree_balance_dirty(fs_info);

		trans = btrfs_start_transaction(root, 2);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			break;
		}

		btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
					      rsv, min_size, false);
		BUG_ON(ret);	/* shouldn't happen */
		trans->block_rsv = rsv;
	}

	/*
	 * We can't call btrfs_truncate_block inside a trans handle as we could
	 * deadlock with freeze, if we got NEED_TRUNCATE_BLOCK then we know
	 * we've truncated everything except the last little bit, and can do
	 * btrfs_truncate_block and then update the disk_i_size.
	 */
	if (ret == NEED_TRUNCATE_BLOCK) {
		btrfs_end_transaction(trans);
		btrfs_btree_balance_dirty(fs_info);

		ret = btrfs_truncate_block(inode, inode->i_size, 0, 0);
		if (ret)
			goto out;
		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			goto out;
		}
		btrfs_inode_safe_disk_i_size_write(inode, 0);
	}

	if (trans) {
		int ret2;

		trans->block_rsv = &fs_info->trans_block_rsv;
		ret2 = btrfs_update_inode(trans, root, inode);
		if (ret2 && !ret)
			ret = ret2;

		ret2 = btrfs_end_transaction(trans);
		if (ret2 && !ret)
			ret = ret2;
		btrfs_btree_balance_dirty(fs_info);
	}
out:
	btrfs_free_block_rsv(fs_info, rsv);

	return ret;
}

/*
 * create a new subvolume directory/inode (helper for the ioctl).
 */
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
			     struct btrfs_root *new_root,
			     struct btrfs_root *parent_root,
			     u64 new_dirid)
{
	struct inode *inode;
	int err;
	u64 index = 0;

	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
				new_dirid, new_dirid,
				S_IFDIR | (~current_umask() & S_IRWXUGO),
				&index);
	if (IS_ERR(inode))
		return PTR_ERR(inode);
	inode->i_op = &btrfs_dir_inode_operations;
	inode->i_fop = &btrfs_dir_file_operations;

	set_nlink(inode, 1);
	btrfs_i_size_write(BTRFS_I(inode), 0);
	unlock_new_inode(inode);

	err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
	if (err)
		btrfs_err(new_root->fs_info,
			  "error inheriting subvolume %llu properties: %d",
			  new_root->root_key.objectid, err);

	err = btrfs_update_inode(trans, new_root, inode);

	iput(inode);
	return err;
}

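/*
 * Allocate the in-memory representation of a btrfs inode and initialize the
 * btrfs-specific bookkeeping (extent trees, ordered tree, reservations) to
 * an empty state; the on-disk inode is read separately.
 */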
struct inode *btrfs_alloc_inode(struct super_block *sb)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
	struct btrfs_inode *ei;
	struct inode *inode;

	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL);
	if (!ei)
		return NULL;

	ei->root = NULL;
	ei->generation = 0;
	ei->last_trans = 0;
	ei->last_sub_trans = 0;
	ei->logged_trans = 0;
	ei->delalloc_bytes = 0;
	ei->new_delalloc_bytes = 0;
	ei->defrag_bytes = 0;
	ei->disk_i_size = 0;
	ei->flags = 0;
	ei->csum_bytes = 0;
	ei->index_cnt = (u64)-1;
	ei->dir_index = 0;
	ei->last_unlink_trans = 0;
	ei->last_log_commit = 0;

	spin_lock_init(&ei->lock);
	ei->outstanding_extents = 0;
	if (sb->s_magic != BTRFS_TEST_MAGIC)
		btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
					      BTRFS_BLOCK_RSV_DELALLOC);
	ei->runtime_flags = 0;
	ei->prop_compress = BTRFS_COMPRESS_NONE;
	ei->defrag_compress = BTRFS_COMPRESS_NONE;

	ei->delayed_node = NULL;

	ei->i_otime.tv_sec = 0;
	ei->i_otime.tv_nsec = 0;

	inode = &ei->vfs_inode;
	extent_map_tree_init(&ei->extent_tree);
	extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode);
	extent_io_tree_init(fs_info, &ei->io_failure_tree,
			    IO_TREE_INODE_IO_FAILURE, inode);
	extent_io_tree_init(fs_info, &ei->file_extent_tree,
			    IO_TREE_INODE_FILE_EXTENT, inode);
	ei->io_tree.track_uptodate = true;
	ei->io_failure_tree.track_uptodate = true;
	atomic_set(&ei->sync_writers, 0);
	mutex_init(&ei->log_mutex);
	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
	INIT_LIST_HEAD(&ei->delalloc_inodes);
	INIT_LIST_HEAD(&ei->delayed_iput);
	RB_CLEAR_NODE(&ei->rb_node);
	init_rwsem(&ei->dio_sem);

	return inode;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_destroy_inode(struct inode *inode)
{
	btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
#endif

void btrfs_free_inode(struct inode *inode)
{
	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}

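/*
 * Final teardown of an inode's in-memory state.  Any ordered extent still
 * attached here indicates a bug, so it is complained about and removed, and
 * the WARN_ONs below catch leaked space reservations.
 */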
void btrfs_destroy_inode(struct inode *inode)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_ordered_extent *ordered;
	struct btrfs_root *root = BTRFS_I(inode)->root;

	WARN_ON(!hlist_empty(&inode->i_dentry));
	WARN_ON(inode->i_data.nrpages);
	WARN_ON(BTRFS_I(inode)->block_rsv.reserved);
	WARN_ON(BTRFS_I(inode)->block_rsv.size);
	WARN_ON(BTRFS_I(inode)->outstanding_extents);
	WARN_ON(BTRFS_I(inode)->delalloc_bytes);
	WARN_ON(BTRFS_I(inode)->new_delalloc_bytes);
	WARN_ON(BTRFS_I(inode)->csum_bytes);
	WARN_ON(BTRFS_I(inode)->defrag_bytes);

	/*
	 * This can happen when we create an inode, but somebody else also
	 * created the same inode and we need to destroy the one we already
	 * created.
	 */
	if (!root)
		return;

	while (1) {
		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
		if (!ordered)
			break;
		else {
			btrfs_err(fs_info,
				  "found ordered extent %llu %llu on inode cleanup",
				  ordered->file_offset, ordered->num_bytes);
			btrfs_remove_ordered_extent(inode, ordered);
			btrfs_put_ordered_extent(ordered);
			btrfs_put_ordered_extent(ordered);
		}
	}
	btrfs_qgroup_check_reserved_leak(inode);
	inode_tree_del(inode);
	btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
	btrfs_inode_clear_file_extent_range(BTRFS_I(inode), 0, (u64)-1);
	btrfs_put_root(BTRFS_I(inode)->root);
}

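/*
 * VFS drop_inode callback: returning 1 evicts the inode from the icache
 * immediately, which we want while the subvolume/snapshot that owns it is
 * being deleted.
 */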
int btrfs_drop_inode(struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;

	if (root == NULL)
		return 1;

	/* the snapshot/subvolume tree is being deleted */
	if (btrfs_root_refs(&root->root_item) == 0)
		return 1;
	else
		return generic_drop_inode(inode);
}

static void init_once(void *foo)
{
	struct btrfs_inode *ei = (struct btrfs_inode *) foo;

	inode_init_once(&ei->vfs_inode);
}

void __cold btrfs_destroy_cachep(void)
{
	/*
	 * Make sure all delayed rcu free inodes are flushed before we
	 * destroy cache.
	 */
	rcu_barrier();
	kmem_cache_destroy(btrfs_inode_cachep);
	kmem_cache_destroy(btrfs_trans_handle_cachep);
	kmem_cache_destroy(btrfs_path_cachep);
	kmem_cache_destroy(btrfs_free_space_cachep);
	kmem_cache_destroy(btrfs_free_space_bitmap_cachep);
}

int __init btrfs_init_cachep(void)
{
	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
			sizeof(struct btrfs_inode), 0,
			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
			init_once);
	if (!btrfs_inode_cachep)
		goto fail;

	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
			sizeof(struct btrfs_trans_handle), 0,
			SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
	if (!btrfs_trans_handle_cachep)
		goto fail;

	btrfs_path_cachep = kmem_cache_create("btrfs_path",
			sizeof(struct btrfs_path), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!btrfs_path_cachep)
		goto fail;

	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
			sizeof(struct btrfs_free_space), 0,
			SLAB_MEM_SPREAD, NULL);
	if (!btrfs_free_space_cachep)
		goto fail;

	btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
							PAGE_SIZE, PAGE_SIZE,
							SLAB_RED_ZONE, NULL);
	if (!btrfs_free_space_bitmap_cachep)
		goto fail;

	return 0;
fail:
	btrfs_destroy_cachep();
	return -ENOMEM;
}

static int btrfs_getattr(const struct path *path, struct kstat *stat,
			 u32 request_mask, unsigned int flags)
{
	u64 delalloc_bytes;
	struct inode *inode = d_inode(path->dentry);
	u32 blocksize = inode->i_sb->s_blocksize;
	u32 bi_flags = BTRFS_I(inode)->flags;

	stat->result_mask |= STATX_BTIME;
	stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
	stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec;
	if (bi_flags & BTRFS_INODE_APPEND)
		stat->attributes |= STATX_ATTR_APPEND;
	if (bi_flags & BTRFS_INODE_COMPRESS)
		stat->attributes |= STATX_ATTR_COMPRESSED;
	if (bi_flags & BTRFS_INODE_IMMUTABLE)
		stat->attributes |= STATX_ATTR_IMMUTABLE;
	if (bi_flags & BTRFS_INODE_NODUMP)
		stat->attributes |= STATX_ATTR_NODUMP;

	stat->attributes_mask |= (STATX_ATTR_APPEND |
				  STATX_ATTR_COMPRESSED |
				  STATX_ATTR_IMMUTABLE |
				  STATX_ATTR_NODUMP);

	generic_fillattr(inode, stat);
	stat->dev = BTRFS_I(inode)->root->anon_dev;

	spin_lock(&BTRFS_I(inode)->lock);
	delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
	spin_unlock(&BTRFS_I(inode)->lock);
	stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
			ALIGN(delalloc_bytes, blocksize)) >> 9;
	return 0;
}

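/*
 * RENAME_EXCHANGE: atomically swap two directory entries.  Both names
 * survive, pointing at each other's inodes, so both directions must be
 * unlinked and re-linked and both inodes may need their new name logged.
 */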
static int btrfs_rename_exchange(struct inode *old_dir,
			      struct dentry *old_dentry,
			      struct inode *new_dir,
			      struct dentry *new_dentry)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(old_dir)->root;
	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
	struct inode *new_inode = new_dentry->d_inode;
	struct inode *old_inode = old_dentry->d_inode;
	struct timespec64 ctime = current_time(old_inode);
	struct dentry *parent;
	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
	u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
	u64 old_idx = 0;
	u64 new_idx = 0;
	int ret;
	bool root_log_pinned = false;
	bool dest_log_pinned = false;
	struct btrfs_log_ctx ctx_root;
	struct btrfs_log_ctx ctx_dest;
	bool sync_log_root = false;
	bool sync_log_dest = false;
	bool commit_transaction = false;

	/* we only allow rename subvolume link between subvolumes */
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
		return -EXDEV;

	btrfs_init_log_ctx(&ctx_root, old_inode);
	btrfs_init_log_ctx(&ctx_dest, new_inode);

	/* close the race window with snapshot create/destroy ioctl */
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
	    new_ino == BTRFS_FIRST_FREE_OBJECTID)
		down_read(&fs_info->subvol_sem);

	/*
	 * We want to reserve the absolute worst case amount of items.  So if
	 * both inodes are subvols and we need to unlink them then that would
	 * require 4 item modifications, but if they are both normal inodes it
	 * would require 5 item modifications, so we'll assume their normal
	 * inodes.  So 5 * 2 is 10, plus 2 for the new links, so 12 total items
	 * should cover the worst case number of items we'll modify.
	 */
	trans = btrfs_start_transaction(root, 12);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_notrans;
	}

	if (dest != root)
		btrfs_record_root_in_trans(trans, dest);

	/*
	 * We need to find a free sequence number both in the source and
	 * in the destination directory for the exchange.
	 */
	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
	if (ret)
		goto out_fail;
	ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
	if (ret)
		goto out_fail;

	BTRFS_I(old_inode)->dir_index = 0ULL;
	BTRFS_I(new_inode)->dir_index = 0ULL;

	/* Reference for the source. */
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
		/* force full log commit if subvolume involved. */
		btrfs_set_log_full_commit(trans);
	} else {
		btrfs_pin_log_trans(root);
		root_log_pinned = true;
		ret = btrfs_insert_inode_ref(trans, dest,
					     new_dentry->d_name.name,
					     new_dentry->d_name.len,
					     old_ino,
					     btrfs_ino(BTRFS_I(new_dir)),
					     old_idx);
		if (ret)
			goto out_fail;
	}

	/* And now for the dest. */
	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
		/* force full log commit if subvolume involved. */
		btrfs_set_log_full_commit(trans);
	} else {
		btrfs_pin_log_trans(dest);
		dest_log_pinned = true;
		ret = btrfs_insert_inode_ref(trans, root,
					     old_dentry->d_name.name,
					     old_dentry->d_name.len,
					     new_ino,
					     btrfs_ino(BTRFS_I(old_dir)),
					     new_idx);
		if (ret)
			goto out_fail;
	}

	/* Update inode version and ctime/mtime. */
	inode_inc_iversion(old_dir);
	inode_inc_iversion(new_dir);
	inode_inc_iversion(old_inode);
	inode_inc_iversion(new_inode);
	old_dir->i_ctime = old_dir->i_mtime = ctime;
	new_dir->i_ctime = new_dir->i_mtime = ctime;
	old_inode->i_ctime = ctime;
	new_inode->i_ctime = ctime;

	if (old_dentry->d_parent != new_dentry->d_parent) {
		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
				BTRFS_I(old_inode), 1);
		btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
				BTRFS_I(new_inode), 1);
	}

	/* src is a subvolume */
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
		ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
	} else { /* src is an inode */
		ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
					   BTRFS_I(old_dentry->d_inode),
					   old_dentry->d_name.name,
					   old_dentry->d_name.len);
		if (!ret)
			ret = btrfs_update_inode(trans, root, old_inode);
	}
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	/* dest is a subvolume */
	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
		ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
	} else { /* dest is an inode */
		ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
					   BTRFS_I(new_dentry->d_inode),
					   new_dentry->d_name.name,
					   new_dentry->d_name.len);
		if (!ret)
			ret = btrfs_update_inode(trans, dest, new_inode);
	}
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
			     new_dentry->d_name.name,
			     new_dentry->d_name.len, 0, old_idx);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
			     old_dentry->d_name.name,
			     old_dentry->d_name.len, 0, new_idx);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	if (old_inode->i_nlink == 1)
		BTRFS_I(old_inode)->dir_index = old_idx;
	if (new_inode->i_nlink == 1)
		BTRFS_I(new_inode)->dir_index = new_idx;

	if (root_log_pinned) {
		parent = new_dentry->d_parent;
		ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
					 BTRFS_I(old_dir), parent,
					 false, &ctx_root);
		if (ret == BTRFS_NEED_LOG_SYNC)
			sync_log_root = true;
		else if (ret == BTRFS_NEED_TRANS_COMMIT)
			commit_transaction = true;
		ret = 0;
		btrfs_end_log_trans(root);
		root_log_pinned = false;
	}
	if (dest_log_pinned) {
		if (!commit_transaction) {
			parent = old_dentry->d_parent;
			ret = btrfs_log_new_name(trans, BTRFS_I(new_inode),
						 BTRFS_I(new_dir), parent,
						 false, &ctx_dest);
			if (ret == BTRFS_NEED_LOG_SYNC)
				sync_log_dest = true;
			else if (ret == BTRFS_NEED_TRANS_COMMIT)
				commit_transaction = true;
			ret = 0;
		}
		btrfs_end_log_trans(dest);
		dest_log_pinned = false;
	}
out_fail:
	/*
	 * If we have pinned a log and an error happened, we unpin tasks
	 * trying to sync the log and force them to fall back to a transaction
	 * commit if the log currently contains any of the inodes involved in
	 * this rename operation (to ensure we do not persist a log with an
	 * inconsistent state for any of these inodes or leading to any
	 * inconsistencies when replayed). If the transaction was aborted, the
	 * abortion reason is propagated to userspace when attempting to commit
	 * the transaction. If the log does not contain any of these inodes, we
	 * allow the tasks to sync it.
	 */
	if (ret && (root_log_pinned || dest_log_pinned)) {
		if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
		    btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
		    btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
		    (new_inode &&
		     btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
			btrfs_set_log_full_commit(trans);

		if (root_log_pinned) {
			btrfs_end_log_trans(root);
			root_log_pinned = false;
		}
		if (dest_log_pinned) {
			btrfs_end_log_trans(dest);
			dest_log_pinned = false;
		}
	}
	if (!ret && sync_log_root && !commit_transaction) {
		ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root,
				     &ctx_root);
		if (ret)
			commit_transaction = true;
	}
	if (!ret && sync_log_dest && !commit_transaction) {
		ret = btrfs_sync_log(trans, BTRFS_I(new_inode)->root,
				     &ctx_dest);
		if (ret)
			commit_transaction = true;
	}
	if (commit_transaction) {
		/*
		 * We may have set commit_transaction when logging the new name
		 * in the destination root, in which case we left the source
		 * root context in the list of log contexts. So make sure we
		 * remove it to avoid invalid memory accesses, since the context
		 * was allocated in our stack frame.
		 */
		if (sync_log_root) {
			mutex_lock(&root->log_mutex);
			list_del_init(&ctx_root.list);
			mutex_unlock(&root->log_mutex);
		}
		ret = btrfs_commit_transaction(trans);
	} else {
		int ret2;

		ret2 = btrfs_end_transaction(trans);
		ret = ret ? ret : ret2;
	}
out_notrans:
	if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
	    old_ino == BTRFS_FIRST_FREE_OBJECTID)
		up_read(&fs_info->subvol_sem);

	ASSERT(list_empty(&ctx_root.list));
	ASSERT(list_empty(&ctx_dest.list));

	return ret;
}

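/*
 * RENAME_WHITEOUT: leave a whiteout (a char device node with device number
 * WHITEOUT_DEV) at the old name, so union/overlay filesystems see the old
 * entry as deleted.
 */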
static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct inode *dir,
				     struct dentry *dentry)
{
	int ret;
	struct inode *inode;
	u64 objectid;
	u64 index;

	ret = btrfs_find_free_ino(root, &objectid);
	if (ret)
		return ret;

	inode = btrfs_new_inode(trans, root, dir,
				dentry->d_name.name,
				dentry->d_name.len,
				btrfs_ino(BTRFS_I(dir)),
				objectid,
				S_IFCHR | WHITEOUT_MODE,
				&index);

	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		return ret;
	}

	inode->i_op = &btrfs_special_inode_operations;
	init_special_inode(inode, inode->i_mode,
		WHITEOUT_DEV);

	ret = btrfs_init_inode_security(trans, inode, dir,
				&dentry->d_name);
	if (ret)
		goto out;

	ret = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
				BTRFS_I(inode), 0, index);
	if (ret)
		goto out;

	ret = btrfs_update_inode(trans, root, inode);
out:
	unlock_new_inode(inode);
	if (ret)
		inode_dec_link_count(inode);
	iput(inode);

	return ret;
}

static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
			   struct inode *new_dir, struct dentry *new_dentry,
			   unsigned int flags)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
	struct btrfs_trans_handle *trans;
	unsigned int trans_num_items;
	struct btrfs_root *root = BTRFS_I(old_dir)->root;
	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
	struct inode *new_inode = d_inode(new_dentry);
	struct inode *old_inode = d_inode(old_dentry);
	u64 index = 0;
	int ret;
	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
	bool log_pinned = false;
	struct btrfs_log_ctx ctx;
	bool sync_log = false;
	bool commit_transaction = false;

	if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
		return -EPERM;

	/* we only allow rename subvolume link between subvolumes */
	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
		return -EXDEV;

	if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
	    (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
		return -ENOTEMPTY;

	if (S_ISDIR(old_inode->i_mode) && new_inode &&
	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
		return -ENOTEMPTY;


	/* check for collisions, even if the name isn't there */
	ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
			     new_dentry->d_name.name,
			     new_dentry->d_name.len);

	if (ret) {
		if (ret == -EEXIST) {
			/* we shouldn't get -EEXIST without a new_inode */
			if (WARN_ON(!new_inode)) {
				return ret;
			}
		} else {
			/* maybe -EOVERFLOW */
			return ret;
		}
	}
	ret = 0;

	/*
	 * we're using rename to replace one file with another.  Start IO on it
	 * now so we don't add too much work to the end of the transaction
	 */
	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
		filemap_flush(old_inode->i_mapping);

	/* close the race window with snapshot create/destroy ioctl */
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
		down_read(&fs_info->subvol_sem);
	/*
	 * We want to reserve the absolute worst case amount of items.  So if
	 * both inodes are subvols and we need to unlink them then that would
	 * require 4 item modifications, but if they are both normal inodes it
	 * would require 5 item modifications, so we'll assume they are normal
	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
	 * should cover the worst case number of items we'll modify.
	 * If our rename has the whiteout flag, we need 5 more units for the
	 * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
	 * when selinux is enabled).
	 */
	trans_num_items = 11;
	if (flags & RENAME_WHITEOUT)
		trans_num_items += 5;
	trans = btrfs_start_transaction(root, trans_num_items);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_notrans;
	}

	if (dest != root)
		btrfs_record_root_in_trans(trans, dest);

	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
	if (ret)
		goto out_fail;

	BTRFS_I(old_inode)->dir_index = 0ULL;
	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
		/* force full log commit if subvolume involved. */
		btrfs_set_log_full_commit(trans);
	} else {
		btrfs_pin_log_trans(root);
		log_pinned = true;
		ret = btrfs_insert_inode_ref(trans, dest,
					     new_dentry->d_name.name,
					     new_dentry->d_name.len,
					     old_ino,
					     btrfs_ino(BTRFS_I(new_dir)), index);
		if (ret)
			goto out_fail;
	}

	inode_inc_iversion(old_dir);
	inode_inc_iversion(new_dir);
	inode_inc_iversion(old_inode);
	old_dir->i_ctime = old_dir->i_mtime =
	new_dir->i_ctime = new_dir->i_mtime =
	old_inode->i_ctime = current_time(old_dir);

	if (old_dentry->d_parent != new_dentry->d_parent)
		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
				BTRFS_I(old_inode), 1);

	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
		ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
	} else {
		ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
					BTRFS_I(d_inode(old_dentry)),
					old_dentry->d_name.name,
					old_dentry->d_name.len);
		if (!ret)
			ret = btrfs_update_inode(trans, root, old_inode);
	}
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	if (new_inode) {
		inode_inc_iversion(new_inode);
		new_inode->i_ctime = current_time(new_inode);
		if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
			ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
			BUG_ON(new_inode->i_nlink == 0);
		} else {
			ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
						 BTRFS_I(d_inode(new_dentry)),
						 new_dentry->d_name.name,
						 new_dentry->d_name.len);
		}
		if (!ret && new_inode->i_nlink == 0)
			ret = btrfs_orphan_add(trans,
					BTRFS_I(d_inode(new_dentry)));
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out_fail;
		}
	}

	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
			     new_dentry->d_name.name,
			     new_dentry->d_name.len, 0, index);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_fail;
	}

	if (old_inode->i_nlink == 1)
		BTRFS_I(old_inode)->dir_index = index;

	if (log_pinned) {
		struct dentry *parent = new_dentry->d_parent;

		btrfs_init_log_ctx(&ctx, old_inode);
		ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
					 BTRFS_I(old_dir), parent,
					 false, &ctx);
		if (ret == BTRFS_NEED_LOG_SYNC)
			sync_log = true;
		else if (ret == BTRFS_NEED_TRANS_COMMIT)
			commit_transaction = true;
		ret = 0;
		btrfs_end_log_trans(root);
		log_pinned = false;
	}

	if (flags & RENAME_WHITEOUT) {
		ret = btrfs_whiteout_for_rename(trans, root, old_dir,
						old_dentry);

		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out_fail;
		}
	}
out_fail:
	/*
	 * If we have pinned the log and an error happened, we unpin tasks
	 * trying to sync the log and force them to fall back to a transaction
	 * commit if the log currently contains any of the inodes involved in
	 * this rename operation (to ensure we do not persist a log with an
	 * inconsistent state for any of these inodes or leading to any
	 * inconsistencies when replayed). If the transaction was aborted, the
	 * abortion reason is propagated to userspace when attempting to commit
	 * the transaction. If the log does not contain any of these inodes, we
	 * allow the tasks to sync it.
	 */
	if (ret && log_pinned) {
		if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
		    btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
		    btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
		    (new_inode &&
		     btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
			btrfs_set_log_full_commit(trans);

		btrfs_end_log_trans(root);
		log_pinned = false;
	}
	if (!ret && sync_log) {
		ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx);
		if (ret)
			commit_transaction = true;
	} else if (sync_log) {
		mutex_lock(&root->log_mutex);
		list_del(&ctx.list);
		mutex_unlock(&root->log_mutex);
	}
	if (commit_transaction) {
		ret = btrfs_commit_transaction(trans);
	} else {
		int ret2;

		ret2 = btrfs_end_transaction(trans);
		ret = ret ? ret : ret2;
	}
out_notrans:
	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
		up_read(&fs_info->subvol_sem);

}

M
static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
			 struct inode *new_dir, struct dentry *new_dentry,
			 unsigned int flags)
{
	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
		return -EINVAL;

	if (flags & RENAME_EXCHANGE)
		return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
					  new_dentry);

	return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
}

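/*
 * One unit of delalloc flushing work: queued per inode on the flush
 * workqueue and completed via @completion once the flush has run.
 */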
struct btrfs_delalloc_work {
	struct inode *inode;
	struct completion completion;
	struct list_head list;
	struct btrfs_work work;
};

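/* Worker callback: flush one inode's delalloc pages, then signal completion. */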
static void btrfs_run_delalloc_work(struct btrfs_work *work)
{
	struct btrfs_delalloc_work *delalloc_work;
	struct inode *inode;

	delalloc_work = container_of(work, struct btrfs_delalloc_work,
				     work);
	inode = delalloc_work->inode;
	filemap_flush(inode->i_mapping);
	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
				&BTRFS_I(inode)->runtime_flags))
		filemap_flush(inode->i_mapping);

	iput(inode);
	complete(&delalloc_work->completion);
}

static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
{
	struct btrfs_delalloc_work *work;

	work = kmalloc(sizeof(*work), GFP_NOFS);
	if (!work)
		return NULL;

	init_completion(&work->completion);
	INIT_LIST_HEAD(&work->list);
	work->inode = inode;
	btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);

	return work;
}

/*
 * some fairly slow code that needs optimization. This walks the list
 * of all the inodes with pending delalloc and forces them to disk.
 */
static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot)
{
	struct btrfs_inode *binode;
	struct inode *inode;
	struct btrfs_delalloc_work *work, *next;
	struct list_head works;
	struct list_head splice;
	int ret = 0;

	INIT_LIST_HEAD(&works);
	INIT_LIST_HEAD(&splice);

	mutex_lock(&root->delalloc_mutex);
	spin_lock(&root->delalloc_lock);
	list_splice_init(&root->delalloc_inodes, &splice);
	while (!list_empty(&splice)) {
		binode = list_entry(splice.next, struct btrfs_inode,
				    delalloc_inodes);

		list_move_tail(&binode->delalloc_inodes,
			       &root->delalloc_inodes);
		inode = igrab(&binode->vfs_inode);
		if (!inode) {
			cond_resched_lock(&root->delalloc_lock);
			continue;
		}
		spin_unlock(&root->delalloc_lock);

		if (snapshot)
			set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
				&binode->runtime_flags);
		work = btrfs_alloc_delalloc_work(inode);
		if (!work) {
			iput(inode);
			ret = -ENOMEM;
			goto out;
		}
		list_add_tail(&work->list, &works);
		btrfs_queue_work(root->fs_info->flush_workers,
				 &work->work);
		ret++;
		if (nr != -1 && ret >= nr)
			goto out;
		cond_resched();
		spin_lock(&root->delalloc_lock);
	}
	spin_unlock(&root->delalloc_lock);

out:
	list_for_each_entry_safe(work, next, &works, list) {
		list_del_init(&work->list);
		wait_for_completion(&work->completion);
		kfree(work);
	}

	if (!list_empty(&splice)) {
		spin_lock(&root->delalloc_lock);
		list_splice_tail(&splice, &root->delalloc_inodes);
		spin_unlock(&root->delalloc_lock);
	}
	mutex_unlock(&root->delalloc_mutex);
	return ret;
}

int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
		return -EROFS;

	ret = start_delalloc_inodes(root, -1, true);
	if (ret > 0)
		ret = 0;
	return ret;
}

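/*
 * Flush delalloc for all roots that have dirty inodes, limited to @nr
 * inodes in total (nr == -1 means no limit).
 */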
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
{
	struct btrfs_root *root;
	struct list_head splice;
	int ret;

	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
		return -EROFS;

	INIT_LIST_HEAD(&splice);

	mutex_lock(&fs_info->delalloc_root_mutex);
	spin_lock(&fs_info->delalloc_root_lock);
	list_splice_init(&fs_info->delalloc_roots, &splice);
	while (!list_empty(&splice) && nr) {
		root = list_first_entry(&splice, struct btrfs_root,
					delalloc_root);
		root = btrfs_grab_root(root);
		BUG_ON(!root);
		list_move_tail(&root->delalloc_root,
			       &fs_info->delalloc_roots);
		spin_unlock(&fs_info->delalloc_root_lock);

		ret = start_delalloc_inodes(root, nr, false);
		btrfs_put_root(root);
		if (ret < 0)
			goto out;

		if (nr != -1) {
			nr -= ret;
			WARN_ON(nr < 0);
		}
		spin_lock(&fs_info->delalloc_root_lock);
	}
	spin_unlock(&fs_info->delalloc_root_lock);

	ret = 0;
out:
	if (!list_empty(&splice)) {
		spin_lock(&fs_info->delalloc_root_lock);
		list_splice_tail(&splice, &fs_info->delalloc_roots);
		spin_unlock(&fs_info->delalloc_root_lock);
	}
	mutex_unlock(&fs_info->delalloc_root_mutex);
	return ret;
}

static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
			 const char *symname)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct inode *inode = NULL;
	int err;
	u64 objectid;
	u64 index = 0;
	int name_len;
	int datasize;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	struct extent_buffer *leaf;

	name_len = strlen(symname);
	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
		return -ENAMETOOLONG;

	/*
	 * 2 items for inode item and ref
	 * 2 items for dir items
	 * 1 item for updating parent inode item
	 * 1 item for the inline extent item
	 * 1 item for xattr if selinux is on
	 */
	trans = btrfs_start_transaction(root, 7);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	err = btrfs_find_free_ino(root, &objectid);
	if (err)
		goto out_unlock;

	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
				dentry->d_name.len, btrfs_ino(BTRFS_I(dir)),
				objectid, S_IFLNK|S_IRWXUGO, &index);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		inode = NULL;
		goto out_unlock;
	}

	/*
	* If the active LSM wants to access the inode during
	* d_instantiate it needs these. Smack checks to see
	* if the filesystem supports xattrs by looking at the
	* ops vector.
	*/
	inode->i_fop = &btrfs_file_operations;
	inode->i_op = &btrfs_file_inode_operations;
	inode->i_mapping->a_ops = &btrfs_aops;
	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;

	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
	if (err)
		goto out_unlock;

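	/* The symlink target is stored as an inline file extent item. */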
	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out_unlock;
	}
	key.objectid = btrfs_ino(BTRFS_I(inode));
	key.offset = 0;
	key.type = BTRFS_EXTENT_DATA_KEY;
	datasize = btrfs_file_extent_calc_inline_size(name_len);
	err = btrfs_insert_empty_item(trans, root, path, &key,
				      datasize);
	if (err) {
		btrfs_free_path(path);
		goto out_unlock;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei,
				   BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_compression(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);

	ptr = btrfs_file_extent_inline_start(ei);
	write_extent_buffer(leaf, symname, ptr, name_len);
	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);

	inode->i_op = &btrfs_symlink_inode_operations;
	inode_nohighmem(inode);
	inode_set_bytes(inode, name_len);
	btrfs_i_size_write(BTRFS_I(inode), name_len);
	err = btrfs_update_inode(trans, root, inode);
	/*
	 * Last step, add directory indexes for our symlink inode. This is the
	 * last step to avoid extra cleanup of these indexes if an error happens
	 * elsewhere above.
	 */
	if (!err)
		err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
				BTRFS_I(inode), 0, index);
	if (err)
		goto out_unlock;

	d_instantiate_new(dentry, inode);

out_unlock:
	btrfs_end_transaction(trans);
	if (err && inode) {
		inode_dec_link_count(inode);
		discard_new_inode(inode);
	}
	btrfs_btree_balance_dirty(fs_info);
	return err;
}

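/*
 * Worker for preallocation (fallocate): reserve extents for the range in
 * chunks, inserting a PREALLOC file extent item and a cached extent map for
 * each chunk.  With a NULL @trans a transaction is started per iteration;
 * otherwise the caller's transaction is used.
 */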
static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
				       u64 start, u64 num_bytes, u64 min_size,
				       loff_t actual_len, u64 *alloc_hint,
				       struct btrfs_trans_handle *trans)
Y
Yan Zheng 已提交
9909
{
9910
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
J
Josef Bacik 已提交
9911 9912
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *em;
Y
Yan Zheng 已提交
9913 9914 9915
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_key ins;
	u64 cur_offset = start;
	u64 clear_offset = start;
	u64 i_size;
	u64 cur_bytes;
	u64 last_alloc = (u64)-1;
	int ret = 0;
	bool own_trans = true;
	u64 end = start + num_bytes - 1;

	if (trans)
		own_trans = false;
	while (num_bytes > 0) {
		if (own_trans) {
			trans = btrfs_start_transaction(root, 3);
			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				break;
			}
		}

		cur_bytes = min_t(u64, num_bytes, SZ_256M);
		cur_bytes = max(cur_bytes, min_size);
		/*
		 * If we are severely fragmented we could end up with really
		 * small allocations, so if the allocator is returning small
		 * chunks, let's make its job easier by only searching for
		 * those sized chunks.
		 */
		cur_bytes = min(cur_bytes, last_alloc);
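		/*
		 * Worked example for the clamping above (illustrative
		 * numbers, not from the original code): with num_bytes = 1G,
		 * min_size = 64K and last_alloc = 128K from a previous
		 * iteration, cur_bytes goes 1G -> 256M (SZ_256M cap) ->
		 * 256M (already >= min_size) -> 128K (clamped to
		 * last_alloc), so the allocator is only asked for a size it
		 * recently proved it could satisfy.
		 */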
		ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
				min_size, 0, *alloc_hint, &ins, 1, 0);
		if (ret) {
			if (own_trans)
				btrfs_end_transaction(trans);
			break;
		}

		/*
		 * We've reserved this space, and thus converted it from
		 * ->bytes_may_use to ->bytes_reserved.  Any error that happens
		 * from here on out we will only need to clear our reservation
		 * for the remaining unreserved area, so advance our
		 * clear_offset by our extent size.
		 */
		clear_offset += ins.offset;
		btrfs_dec_block_group_reservations(fs_info, ins.objectid);

		last_alloc = ins.offset;
		ret = insert_reserved_file_extent(trans, inode,
						  cur_offset, ins.objectid,
						  ins.offset, ins.offset,
						  ins.offset, 0, 0, 0,
						  BTRFS_FILE_EXTENT_PREALLOC);
		if (ret) {
			btrfs_free_reserved_extent(fs_info, ins.objectid,
						   ins.offset, 0);
			btrfs_abort_transaction(trans, ret);
			if (own_trans)
				btrfs_end_transaction(trans);
			break;
		}

		btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
					cur_offset + ins.offset - 1, 0);

		em = alloc_extent_map();
		if (!em) {
			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
				&BTRFS_I(inode)->runtime_flags);
			goto next;
		}

		em->start = cur_offset;
		em->orig_start = cur_offset;
		em->len = ins.offset;
		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->orig_block_len = ins.offset;
		em->ram_bytes = ins.offset;
		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
		em->generation = trans->transid;

		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em, 1);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST)
				break;
			btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
						cur_offset + ins.offset - 1,
						0);
		}
		free_extent_map(em);
next:
		num_bytes -= ins.offset;
		cur_offset += ins.offset;
		*alloc_hint = ins.objectid + ins.offset;

		inode_inc_iversion(inode);
		inode->i_ctime = current_time(inode);
		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
		    (actual_len > inode->i_size) &&
		    (cur_offset > inode->i_size)) {
			if (cur_offset > actual_len)
				i_size = actual_len;
			else
				i_size = cur_offset;
			i_size_write(inode, i_size);
			btrfs_inode_safe_disk_i_size_write(inode, 0);
		}

		ret = btrfs_update_inode(trans, root, inode);

		if (ret) {
			btrfs_abort_transaction(trans, ret);
			if (own_trans)
				btrfs_end_transaction(trans);
			break;
		}

		if (own_trans)
			btrfs_end_transaction(trans);
	}
	if (clear_offset < end)
		btrfs_free_reserved_data_space(inode, NULL, clear_offset,
			end - clear_offset + 1);
	return ret;
}

int btrfs_prealloc_file_range(struct inode *inode, int mode,
			      u64 start, u64 num_bytes, u64 min_size,
			      loff_t actual_len, u64 *alloc_hint)
{
	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
					   min_size, actual_len, alloc_hint,
					   NULL);
}

int btrfs_prealloc_file_range_trans(struct inode *inode,
				    struct btrfs_trans_handle *trans, int mode,
				    u64 start, u64 num_bytes, u64 min_size,
				    loff_t actual_len, u64 *alloc_hint)
{
	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
					   min_size, actual_len, alloc_hint, trans);
}
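
/*
 * Usage sketch (hypothetical caller, shown for illustration only):
 * preallocate the first 1MiB of an inode without extending i_size and
 * let the helper manage its own transactions:
 *
 *	u64 alloc_hint = 0;
 *	int ret;
 *
 *	ret = btrfs_prealloc_file_range(inode, FALLOC_FL_KEEP_SIZE,
 *					0, SZ_1M, SZ_1M, SZ_1M,
 *					&alloc_hint);
 *
 * The _trans variant instead reuses the caller's transaction handle
 * rather than starting and ending one per reserved extent.
 */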

static int btrfs_set_page_dirty(struct page *page)
{
	return __set_page_dirty_nobuffers(page);
}

static int btrfs_permission(struct inode *inode, int mask)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	umode_t mode = inode->i_mode;

	if (mask & MAY_WRITE &&
	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
		if (btrfs_root_readonly(root))
			return -EROFS;
		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
			return -EACCES;
	}
	return generic_permission(inode, mask);
}
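
/*
 * Example outcomes (illustrative): a MAY_WRITE check on a regular file
 * in a read-only subvolume fails with -EROFS before the generic check
 * runs, an inode flagged BTRFS_INODE_READONLY fails with -EACCES, and
 * everything else falls through to generic_permission().
 */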
static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct inode *inode = NULL;
	u64 objectid;
	u64 index;
	int ret = 0;

	/*
	 * 5 units required for adding orphan entry
	 */
	trans = btrfs_start_transaction(root, 5);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	ret = btrfs_find_free_ino(root, &objectid);
	if (ret)
		goto out;

	inode = btrfs_new_inode(trans, root, dir, NULL, 0,
			btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		inode = NULL;
		goto out;
	}

	inode->i_fop = &btrfs_file_operations;
	inode->i_op = &btrfs_file_inode_operations;

	inode->i_mapping->a_ops = &btrfs_aops;
	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;

	ret = btrfs_init_inode_security(trans, inode, dir, NULL);
	if (ret)
		goto out;

	ret = btrfs_update_inode(trans, root, inode);
	if (ret)
		goto out;
	ret = btrfs_orphan_add(trans, BTRFS_I(inode));
	if (ret)
		goto out;
	/*
	 * We set number of links to 0 in btrfs_new_inode(), and here we set
	 * it to 1 because d_tmpfile() will issue a warning if the count is 0,
	 * through:
	 *
	 *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
	 */
	set_nlink(inode, 1);
	d_tmpfile(dentry, inode);
	unlock_new_inode(inode);
	mark_inode_dirty(inode);
out:
	btrfs_end_transaction(trans);
	if (ret && inode)
		discard_new_inode(inode);
	btrfs_btree_balance_dirty(fs_info);
	return ret;
}

void btrfs_set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
{
	struct inode *inode = tree->private_data;
	unsigned long index = start >> PAGE_SHIFT;
	unsigned long end_index = end >> PAGE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(inode->i_mapping, index);
		ASSERT(page); /* Pages should be in the extent_io_tree */
		set_page_writeback(page);
		put_page(page);
		index++;
	}
}
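
/*
 * Worked example (illustrative, 4K pages): for start = 8192 and
 * end = 20479, index runs from 2 (8192 >> PAGE_SHIFT) to 4
 * (20479 >> PAGE_SHIFT), so pages 2, 3 and 4 are marked writeback.
 */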

#ifdef CONFIG_SWAP
/*
 * Add an entry indicating a block group or device which is pinned by a
 * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
 * negative errno on failure.
 */
static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
				  bool is_block_group)
{
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	struct btrfs_swapfile_pin *sp, *entry;
	struct rb_node **p;
	struct rb_node *parent = NULL;

	sp = kmalloc(sizeof(*sp), GFP_NOFS);
	if (!sp)
		return -ENOMEM;
	sp->ptr = ptr;
	sp->inode = inode;
	sp->is_block_group = is_block_group;

	spin_lock(&fs_info->swapfile_pins_lock);
	p = &fs_info->swapfile_pins.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
		if (sp->ptr < entry->ptr ||
		    (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
			p = &(*p)->rb_left;
		} else if (sp->ptr > entry->ptr ||
			   (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
			p = &(*p)->rb_right;
		} else {
			spin_unlock(&fs_info->swapfile_pins_lock);
			kfree(sp);
			return 1;
		}
	}
	rb_link_node(&sp->node, parent, p);
	rb_insert_color(&sp->node, &fs_info->swapfile_pins);
	spin_unlock(&fs_info->swapfile_pins_lock);
	return 0;
}
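
/*
 * Note (an observation added for clarity, not from the original
 * source): entries are ordered by the (ptr, inode) pair, so the same
 * block group pinned by two different swapfiles yields two distinct
 * entries, while re-pinning it from the same swapfile hits the equal
 * case above and returns 1.
 */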

/* Free all of the entries pinned by this swapfile. */
static void btrfs_free_swapfile_pins(struct inode *inode)
{
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	struct btrfs_swapfile_pin *sp;
	struct rb_node *node, *next;

	spin_lock(&fs_info->swapfile_pins_lock);
	node = rb_first(&fs_info->swapfile_pins);
	while (node) {
		next = rb_next(node);
		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
		if (sp->inode == inode) {
			rb_erase(&sp->node, &fs_info->swapfile_pins);
			if (sp->is_block_group)
				btrfs_put_block_group(sp->ptr);
			kfree(sp);
		}
		node = next;
	}
	spin_unlock(&fs_info->swapfile_pins_lock);
}

struct btrfs_swap_info {
	u64 start;
	u64 block_start;
	u64 block_len;
	u64 lowest_ppage;
	u64 highest_ppage;
	unsigned long nr_pages;
	int nr_extents;
};

static int btrfs_add_swap_extent(struct swap_info_struct *sis,
				 struct btrfs_swap_info *bsi)
{
	unsigned long nr_pages;
	u64 first_ppage, first_ppage_reported, next_ppage;
	int ret;

	first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT;
	next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len,
				PAGE_SIZE) >> PAGE_SHIFT;

	if (first_ppage >= next_ppage)
		return 0;
	nr_pages = next_ppage - first_ppage;

	first_ppage_reported = first_ppage;
	if (bsi->start == 0)
		first_ppage_reported++;
	if (bsi->lowest_ppage > first_ppage_reported)
		bsi->lowest_ppage = first_ppage_reported;
	if (bsi->highest_ppage < (next_ppage - 1))
		bsi->highest_ppage = next_ppage - 1;

	ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
	if (ret < 0)
		return ret;
	bsi->nr_extents += ret;
	bsi->nr_pages += nr_pages;
	return 0;
}
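
/*
 * Worked example (illustrative numbers, 4K pages): for
 * block_start = 12800 and block_len = 20480, first_ppage =
 * ALIGN(12800, 4096) >> PAGE_SHIFT = 4 and next_ppage =
 * ALIGN_DOWN(33280, 4096) >> PAGE_SHIFT = 8, so the four whole pages
 * in between are handed to add_swap_extent() and the unaligned head
 * and tail bytes are simply never used for swap.
 */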

static void btrfs_swap_deactivate(struct file *file)
{
	struct inode *inode = file_inode(file);

	btrfs_free_swapfile_pins(inode);
	atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
}

static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
			       sector_t *span)
{
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	struct extent_state *cached_state = NULL;
	struct extent_map *em = NULL;
	struct btrfs_device *device = NULL;
	struct btrfs_swap_info bsi = {
		.lowest_ppage = (sector_t)-1ULL,
	};
	int ret = 0;
	u64 isize;
	u64 start;

	/*
	 * If the swap file was just created, make sure delalloc is done. If the
	 * file changes again after this, the user is doing something stupid and
	 * we don't really care.
	 */
	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
	if (ret)
		return ret;

	/*
	 * The inode is locked, so these flags won't change after we check them.
	 */
	if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
		btrfs_warn(fs_info, "swapfile must not be compressed");
		return -EINVAL;
	}
	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
		btrfs_warn(fs_info, "swapfile must not be copy-on-write");
		return -EINVAL;
	}
	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
		btrfs_warn(fs_info, "swapfile must not be checksummed");
		return -EINVAL;
	}
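
	/*
	 * A usage note (not part of the original code): together these
	 * checks mean the swapfile must have been created NOCOW from the
	 * start, e.g. created empty, marked with "chattr +C" and only
	 * then written, since that attribute cannot be applied once the
	 * file already has data.
	 */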

	/*
	 * Balance or device remove/replace/resize can move stuff around from
	 * under us. The EXCL_OP flag makes sure they aren't running/won't run
	 * concurrently while we are mapping the swap extents, and
	 * fs_info->swapfile_pins prevents them from running while the swap file
	 * is active and moving the extents. Note that this also prevents a
	 * concurrent device add which isn't actually necessary, but it's not
	 * really worth the trouble to allow it.
	 */
	if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
		btrfs_warn(fs_info,
	   "cannot activate swapfile while exclusive operation is running");
		return -EBUSY;
	}
	/*
	 * Snapshots can create extents which require COW even if NODATACOW is
	 * set. We use this counter to prevent snapshots. We must increment it
	 * before walking the extents because we don't want a concurrent
	 * snapshot to run after we've already checked the extents.
	 */
	atomic_inc(&BTRFS_I(inode)->root->nr_swapfiles);

	isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);

	lock_extent_bits(io_tree, 0, isize - 1, &cached_state);
	start = 0;
	while (start < isize) {
		u64 logical_block_start, physical_block_start;
		struct btrfs_block_group *bg;
		u64 len = isize - start;

		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}

		if (em->block_start == EXTENT_MAP_HOLE) {
			btrfs_warn(fs_info, "swapfile must not have holes");
			ret = -EINVAL;
			goto out;
		}
		if (em->block_start == EXTENT_MAP_INLINE) {
			/*
			 * It's unlikely we'll ever actually find ourselves
			 * here, as a file small enough to fit inline won't be
			 * big enough to store more than the swap header, but in
			 * case something changes in the future, let's catch it
			 * here rather than later.
			 */
			btrfs_warn(fs_info, "swapfile must not be inline");
			ret = -EINVAL;
			goto out;
		}
		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
			btrfs_warn(fs_info, "swapfile must not be compressed");
			ret = -EINVAL;
			goto out;
		}

		logical_block_start = em->block_start + (start - em->start);
		len = min(len, em->len - (start - em->start));
		free_extent_map(em);
		em = NULL;

		ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL);
		if (ret < 0) {
			goto out;
		} else if (ret) {
			ret = 0;
		} else {
			btrfs_warn(fs_info,
				   "swapfile must not be copy-on-write");
			ret = -EINVAL;
			goto out;
		}

		em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}

		if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
			btrfs_warn(fs_info,
				   "swapfile must have single data profile");
			ret = -EINVAL;
			goto out;
		}

		if (device == NULL) {
			device = em->map_lookup->stripes[0].dev;
			ret = btrfs_add_swapfile_pin(inode, device, false);
			if (ret == 1)
				ret = 0;
			else if (ret)
				goto out;
		} else if (device != em->map_lookup->stripes[0].dev) {
			btrfs_warn(fs_info, "swapfile must be on one device");
			ret = -EINVAL;
			goto out;
		}

		physical_block_start = (em->map_lookup->stripes[0].physical +
					(logical_block_start - em->start));
		len = min(len, em->len - (logical_block_start - em->start));
		free_extent_map(em);
		em = NULL;

		bg = btrfs_lookup_block_group(fs_info, logical_block_start);
		if (!bg) {
			btrfs_warn(fs_info,
			   "could not find block group containing swapfile");
			ret = -EINVAL;
			goto out;
		}

		ret = btrfs_add_swapfile_pin(inode, bg, true);
		if (ret) {
			btrfs_put_block_group(bg);
			if (ret == 1)
				ret = 0;
			else
				goto out;
		}

		if (bsi.block_len &&
		    bsi.block_start + bsi.block_len == physical_block_start) {
			bsi.block_len += len;
		} else {
			if (bsi.block_len) {
				ret = btrfs_add_swap_extent(sis, &bsi);
				if (ret)
					goto out;
			}
			bsi.start = start;
			bsi.block_start = physical_block_start;
			bsi.block_len = len;
		}

		start += len;
	}

	if (bsi.block_len)
		ret = btrfs_add_swap_extent(sis, &bsi);

out:
	if (!IS_ERR_OR_NULL(em))
		free_extent_map(em);

	unlock_extent_cached(io_tree, 0, isize - 1, &cached_state);

	if (ret)
		btrfs_swap_deactivate(file);

	clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);

	if (ret)
		return ret;

	if (device)
		sis->bdev = device->bdev;
	*span = bsi.highest_ppage - bsi.lowest_ppage + 1;
	sis->max = bsi.nr_pages;
	sis->pages = bsi.nr_pages - 1;
	sis->highest_bit = bsi.nr_pages - 1;
	return bsi.nr_extents;
}
#else
static void btrfs_swap_deactivate(struct file *file)
{
}

static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
			       sector_t *span)
{
	return -EOPNOTSUPP;
}
#endif

static const struct inode_operations btrfs_dir_inode_operations = {
	.getattr	= btrfs_getattr,
	.lookup		= btrfs_lookup,
	.create		= btrfs_create,
	.unlink		= btrfs_unlink,
	.link		= btrfs_link,
	.mkdir		= btrfs_mkdir,
	.rmdir		= btrfs_rmdir,
	.rename		= btrfs_rename2,
	.symlink	= btrfs_symlink,
	.setattr	= btrfs_setattr,
	.mknod		= btrfs_mknod,
	.listxattr	= btrfs_listxattr,
	.permission	= btrfs_permission,
	.get_acl	= btrfs_get_acl,
	.set_acl	= btrfs_set_acl,
	.update_time	= btrfs_update_time,
	.tmpfile        = btrfs_tmpfile,
};

static const struct file_operations btrfs_dir_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
	.iterate_shared	= btrfs_real_readdir,
	.open		= btrfs_opendir,
	.unlocked_ioctl	= btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_compat_ioctl,
#endif
	.release        = btrfs_release_file,
	.fsync		= btrfs_sync_file,
};

static const struct extent_io_ops btrfs_extent_io_ops = {
	/* mandatory callbacks */
	.submit_bio_hook = btrfs_submit_bio_hook,
	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
};

/*
 * btrfs doesn't support the bmap operation because swapfiles
 * use bmap to make a mapping of extents in the file.  They assume
 * these extents won't change over the life of the file and they
 * use the bmap result to do IO directly to the drive.
 *
 * The btrfs bmap call would return logical addresses that aren't
 * suitable for IO and they also will change frequently as COW
 * operations happen.  So, swapfile + btrfs == corruption.
 *
 * For now we're avoiding this by dropping bmap.
 */
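
/*
 * Consequence for userspace (an inference added here, not a statement
 * from the original author): with no ->bmap method in btrfs_aops below,
 * the FIBMAP ioctl is expected to fail with -EINVAL instead of
 * returning a physical block number, which is the safe behaviour.
 */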
static const struct address_space_operations btrfs_aops = {
	.readpage	= btrfs_readpage,
	.writepage	= btrfs_writepage,
	.writepages	= btrfs_writepages,
	.readpages	= btrfs_readpages,
	.direct_IO	= btrfs_direct_IO,
	.invalidatepage = btrfs_invalidatepage,
	.releasepage	= btrfs_releasepage,
#ifdef CONFIG_MIGRATION
	.migratepage	= btrfs_migratepage,
#endif
	.set_page_dirty	= btrfs_set_page_dirty,
	.error_remove_page = generic_error_remove_page,
	.swap_activate	= btrfs_swap_activate,
	.swap_deactivate = btrfs_swap_deactivate,
};

static const struct inode_operations btrfs_file_inode_operations = {
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.listxattr      = btrfs_listxattr,
	.permission	= btrfs_permission,
	.fiemap		= btrfs_fiemap,
	.get_acl	= btrfs_get_acl,
	.set_acl	= btrfs_set_acl,
	.update_time	= btrfs_update_time,
};
static const struct inode_operations btrfs_special_inode_operations = {
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.permission	= btrfs_permission,
	.listxattr	= btrfs_listxattr,
	.get_acl	= btrfs_get_acl,
	.set_acl	= btrfs_set_acl,
	.update_time	= btrfs_update_time,
};
static const struct inode_operations btrfs_symlink_inode_operations = {
	.get_link	= page_get_link,
	.getattr	= btrfs_getattr,
	.setattr	= btrfs_setattr,
	.permission	= btrfs_permission,
	.listxattr	= btrfs_listxattr,
	.update_time	= btrfs_update_time,
};

const struct dentry_operations btrfs_dentry_operations = {
	.d_delete	= btrfs_dentry_delete,
};