file.c 87.4 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
C
Chris Mason 已提交
2 3 4 5
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

C
Chris Mason 已提交
6 7 8 9 10 11
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
12
#include <linux/falloc.h>
C
Chris Mason 已提交
13 14
#include <linux/writeback.h>
#include <linux/compat.h>
15
#include <linux/slab.h>
16
#include <linux/btrfs.h>
17
#include <linux/uio.h>
18
#include <linux/iversion.h>
C
Chris Mason 已提交
19 20 21 22 23
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
24 25
#include "tree-log.h"
#include "locking.h"
J
Josef Bacik 已提交
26
#include "volumes.h"
J
Josef Bacik 已提交
27
#include "qgroup.h"
28
#include "compression.h"
C
Chris Mason 已提交
29

30
static struct kmem_cache *btrfs_inode_defrag_cachep;
C
Chris Mason 已提交
31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
/*
 * when auto defrag is enabled we
 * queue up these defrag structs to remember which
 * inodes need defragging passes
 */
struct inode_defrag {
	struct rb_node rb_node;
	/* objectid */
	u64 ino;
	/*
	 * transid where the defrag was added, we search for
	 * extents newer than this
	 */
	u64 transid;

	/* root objectid */
	u64 root;

	/* last offset we were able to defrag */
	u64 last_offset;

	/* if we've wrapped around back to zero once already */
	int cycled;
};

56 57 58 59 60 61 62 63 64 65 66 67 68 69 70
static int __compare_inode_defrag(struct inode_defrag *defrag1,
				  struct inode_defrag *defrag2)
{
	if (defrag1->root > defrag2->root)
		return 1;
	else if (defrag1->root < defrag2->root)
		return -1;
	else if (defrag1->ino > defrag2->ino)
		return 1;
	else if (defrag1->ino < defrag2->ino)
		return -1;
	else
		return 0;
}

C
Chris Mason 已提交
71 72 73 74 75 76 77 78 79
/* pop a record for an inode into the defrag tree.  The lock
 * must be held already
 *
 * If you're inserting a record for an older transid than an
 * existing record, the transid already in the tree is lowered
 *
 * If an existing record is found the defrag item you
 * pass in is freed
 */
80
static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
C
Chris Mason 已提交
81 82
				    struct inode_defrag *defrag)
{
83
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
C
Chris Mason 已提交
84 85 86
	struct inode_defrag *entry;
	struct rb_node **p;
	struct rb_node *parent = NULL;
87
	int ret;
C
Chris Mason 已提交
88

89
	p = &fs_info->defrag_inodes.rb_node;
C
Chris Mason 已提交
90 91 92 93
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

94 95
		ret = __compare_inode_defrag(defrag, entry);
		if (ret < 0)
C
Chris Mason 已提交
96
			p = &parent->rb_left;
97
		else if (ret > 0)
C
Chris Mason 已提交
98 99 100 101 102 103 104 105 106 107
			p = &parent->rb_right;
		else {
			/* if we're reinserting an entry for
			 * an old defrag run, make sure to
			 * lower the transid of our existing record
			 */
			if (defrag->transid < entry->transid)
				entry->transid = defrag->transid;
			if (defrag->last_offset > entry->last_offset)
				entry->last_offset = defrag->last_offset;
108
			return -EEXIST;
C
Chris Mason 已提交
109 110
		}
	}
111
	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
C
Chris Mason 已提交
112
	rb_link_node(&defrag->rb_node, parent, p);
113
	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
114 115
	return 0;
}
C
Chris Mason 已提交
116

117
static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
118
{
119
	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
120 121
		return 0;

122
	if (btrfs_fs_closing(fs_info))
123
		return 0;
C
Chris Mason 已提交
124

125
	return 1;
C
Chris Mason 已提交
126 127 128 129 130 131 132
}

/*
 * insert a defrag record for this inode if auto defrag is
 * enabled
 */
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
133
			   struct btrfs_inode *inode)
C
Chris Mason 已提交
134
{
135
	struct btrfs_root *root = inode->root;
136
	struct btrfs_fs_info *fs_info = root->fs_info;
C
Chris Mason 已提交
137 138
	struct inode_defrag *defrag;
	u64 transid;
139
	int ret;
C
Chris Mason 已提交
140

141
	if (!__need_auto_defrag(fs_info))
C
Chris Mason 已提交
142 143
		return 0;

144
	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
C
Chris Mason 已提交
145 146 147 148 149
		return 0;

	if (trans)
		transid = trans->transid;
	else
150
		transid = inode->root->last_trans;
C
Chris Mason 已提交
151

152
	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
C
Chris Mason 已提交
153 154 155
	if (!defrag)
		return -ENOMEM;

156
	defrag->ino = btrfs_ino(inode);
C
Chris Mason 已提交
157 158 159
	defrag->transid = transid;
	defrag->root = root->root_key.objectid;

160
	spin_lock(&fs_info->defrag_inodes_lock);
161
	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
162 163 164 165 166 167 168 169 170
		/*
		 * If we set IN_DEFRAG flag and evict the inode from memory,
		 * and then re-read this inode, this new inode doesn't have
		 * IN_DEFRAG flag. At the case, we may find the existed defrag.
		 */
		ret = __btrfs_add_inode_defrag(inode, defrag);
		if (ret)
			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	} else {
171
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
172
	}
173
	spin_unlock(&fs_info->defrag_inodes_lock);
174
	return 0;
C
Chris Mason 已提交
175 176 177
}

/*
178 179 180
 * Requeue the defrag object. If there is a defrag object that points to
 * the same inode in the tree, we will merge them together (by
 * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
C
Chris Mason 已提交
181
 */
182
static void btrfs_requeue_inode_defrag(struct btrfs_inode *inode,
183
				       struct inode_defrag *defrag)
184
{
185
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
186 187
	int ret;

188
	if (!__need_auto_defrag(fs_info))
189 190 191 192 193 194
		goto out;

	/*
	 * Here we don't check the IN_DEFRAG flag, because we need merge
	 * them together.
	 */
195
	spin_lock(&fs_info->defrag_inodes_lock);
196
	ret = __btrfs_add_inode_defrag(inode, defrag);
197
	spin_unlock(&fs_info->defrag_inodes_lock);
198 199 200 201 202 203 204
	if (ret)
		goto out;
	return;
out:
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
}

C
Chris Mason 已提交
205
/*
206 207
 * pick the defragable inode that we want, if it doesn't exist, we will get
 * the next one.
C
Chris Mason 已提交
208
 */
209 210
static struct inode_defrag *
btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
C
Chris Mason 已提交
211 212
{
	struct inode_defrag *entry = NULL;
213
	struct inode_defrag tmp;
C
Chris Mason 已提交
214 215
	struct rb_node *p;
	struct rb_node *parent = NULL;
216 217 218 219
	int ret;

	tmp.ino = ino;
	tmp.root = root;
C
Chris Mason 已提交
220

221 222
	spin_lock(&fs_info->defrag_inodes_lock);
	p = fs_info->defrag_inodes.rb_node;
C
Chris Mason 已提交
223 224 225 226
	while (p) {
		parent = p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

227 228
		ret = __compare_inode_defrag(&tmp, entry);
		if (ret < 0)
C
Chris Mason 已提交
229
			p = parent->rb_left;
230
		else if (ret > 0)
C
Chris Mason 已提交
231 232
			p = parent->rb_right;
		else
233
			goto out;
C
Chris Mason 已提交
234 235
	}

236 237 238
	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
		parent = rb_next(parent);
		if (parent)
C
Chris Mason 已提交
239
			entry = rb_entry(parent, struct inode_defrag, rb_node);
240 241
		else
			entry = NULL;
C
Chris Mason 已提交
242
	}
243 244 245 246 247
out:
	if (entry)
		rb_erase(parent, &fs_info->defrag_inodes);
	spin_unlock(&fs_info->defrag_inodes_lock);
	return entry;
C
Chris Mason 已提交
248 249
}

250
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
C
Chris Mason 已提交
251 252
{
	struct inode_defrag *defrag;
253 254 255 256 257 258 259 260 261
	struct rb_node *node;

	spin_lock(&fs_info->defrag_inodes_lock);
	node = rb_first(&fs_info->defrag_inodes);
	while (node) {
		rb_erase(node, &fs_info->defrag_inodes);
		defrag = rb_entry(node, struct inode_defrag, rb_node);
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);

262
		cond_resched_lock(&fs_info->defrag_inodes_lock);
263 264 265 266 267 268 269 270 271 272 273

		node = rb_first(&fs_info->defrag_inodes);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
}

#define BTRFS_DEFRAG_BATCH	1024

static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
				    struct inode_defrag *defrag)
{
C
Chris Mason 已提交
274 275 276 277 278
	struct btrfs_root *inode_root;
	struct inode *inode;
	struct btrfs_key key;
	struct btrfs_ioctl_defrag_range_args range;
	int num_defrag;
279 280
	int index;
	int ret;
C
Chris Mason 已提交
281

282 283
	/* get the inode */
	key.objectid = defrag->root;
284
	key.type = BTRFS_ROOT_ITEM_KEY;
285
	key.offset = (u64)-1;
286 287 288

	index = srcu_read_lock(&fs_info->subvol_srcu);

289 290
	inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
	if (IS_ERR(inode_root)) {
291 292 293
		ret = PTR_ERR(inode_root);
		goto cleanup;
	}
294 295

	key.objectid = defrag->ino;
296
	key.type = BTRFS_INODE_ITEM_KEY;
297 298 299
	key.offset = 0;
	inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
	if (IS_ERR(inode)) {
300 301
		ret = PTR_ERR(inode);
		goto cleanup;
302
	}
303
	srcu_read_unlock(&fs_info->subvol_srcu, index);
304 305 306

	/* do a chunk of defrag */
	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
C
Chris Mason 已提交
307 308
	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;
309
	range.start = defrag->last_offset;
M
Miao Xie 已提交
310 311

	sb_start_write(fs_info->sb);
312 313
	num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
				       BTRFS_DEFRAG_BATCH);
M
Miao Xie 已提交
314
	sb_end_write(fs_info->sb);
315 316 317 318 319 320 321
	/*
	 * if we filled the whole defrag batch, there
	 * must be more work to do.  Queue this defrag
	 * again
	 */
	if (num_defrag == BTRFS_DEFRAG_BATCH) {
		defrag->last_offset = range.start;
322
		btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
323 324 325 326 327 328 329 330
	} else if (defrag->last_offset && !defrag->cycled) {
		/*
		 * we didn't fill our defrag batch, but
		 * we didn't start at zero.  Make sure we loop
		 * around to the start of the file.
		 */
		defrag->last_offset = 0;
		defrag->cycled = 1;
331
		btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
332 333 334 335 336 337
	} else {
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	}

	iput(inode);
	return 0;
338 339 340 341
cleanup:
	srcu_read_unlock(&fs_info->subvol_srcu, index);
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	return ret;
342 343 344 345 346 347 348 349 350 351 352
}

/*
 * run through the list of inodes in the FS that need
 * defragging
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	u64 first_ino = 0;
	u64 root_objectid = 0;
C
Chris Mason 已提交
353 354

	atomic_inc(&fs_info->defrag_running);
355
	while (1) {
M
Miao Xie 已提交
356 357 358 359 360
		/* Pause the auto defragger. */
		if (test_bit(BTRFS_FS_STATE_REMOUNTING,
			     &fs_info->fs_state))
			break;

361
		if (!__need_auto_defrag(fs_info))
362
			break;
C
Chris Mason 已提交
363 364

		/* find an inode to defrag */
365 366
		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
						 first_ino);
C
Chris Mason 已提交
367
		if (!defrag) {
368
			if (root_objectid || first_ino) {
369
				root_objectid = 0;
C
Chris Mason 已提交
370 371 372 373 374 375 376 377
				first_ino = 0;
				continue;
			} else {
				break;
			}
		}

		first_ino = defrag->ino + 1;
378
		root_objectid = defrag->root;
C
Chris Mason 已提交
379

380
		__btrfs_run_defrag_inode(fs_info, defrag);
C
Chris Mason 已提交
381 382 383 384 385 386 387 388 389 390
	}
	atomic_dec(&fs_info->defrag_running);

	/*
	 * during unmount, we use the transaction_wait queue to
	 * wait for the defragger to stop
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}
C
Chris Mason 已提交
391

C
Chris Mason 已提交
392 393 394
/* simple helper to fault in pages and copy.  This should go away
 * and be replaced with calls into generic code.
 */
395
static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
396
					 struct page **prepared_pages,
397
					 struct iov_iter *i)
C
Chris Mason 已提交
398
{
399
	size_t copied = 0;
J
Josef Bacik 已提交
400
	size_t total_copied = 0;
401
	int pg = 0;
402
	int offset = offset_in_page(pos);
C
Chris Mason 已提交
403

404
	while (write_bytes > 0) {
C
Chris Mason 已提交
405
		size_t count = min_t(size_t,
406
				     PAGE_SIZE - offset, write_bytes);
407
		struct page *page = prepared_pages[pg];
408 409 410 411
		/*
		 * Copy data from userspace to the current page
		 */
		copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
412

C
Chris Mason 已提交
413 414
		/* Flush processor's dcache for this page */
		flush_dcache_page(page);
415 416 417 418 419 420 421 422 423 424 425 426 427

		/*
		 * if we get a partial write, we can end up with
		 * partially up to date pages.  These add
		 * a lot of complexity, so make sure they don't
		 * happen by forcing this copy to be retried.
		 *
		 * The rest of the btrfs_file_write code will fall
		 * back to page at a time copies after we return 0.
		 */
		if (!PageUptodate(page) && copied < count)
			copied = 0;

428 429
		iov_iter_advance(i, copied);
		write_bytes -= copied;
430
		total_copied += copied;
C
Chris Mason 已提交
431

A
Al Viro 已提交
432
		/* Return to btrfs_file_write_iter to fault page */
J
Josef Bacik 已提交
433
		if (unlikely(copied == 0))
434
			break;
435

436
		if (copied < PAGE_SIZE - offset) {
437 438 439 440 441
			offset += copied;
		} else {
			pg++;
			offset = 0;
		}
C
Chris Mason 已提交
442
	}
443
	return total_copied;
C
Chris Mason 已提交
444 445
}

C
Chris Mason 已提交
446 447 448
/*
 * unlocks pages after btrfs_file_write is done with them
 */
449
static void btrfs_drop_pages(struct page **pages, size_t num_pages)
C
Chris Mason 已提交
450 451 452
{
	size_t i;
	for (i = 0; i < num_pages; i++) {
C
Chris Mason 已提交
453 454
		/* page checked is some magic around finding pages that
		 * have been modified without going through btrfs_set_page_dirty
455 456 457
		 * clear it here. There should be no need to mark the pages
		 * accessed as prepare_pages should have marked them accessed
		 * in prepare_pages via find_or_create_page()
C
Chris Mason 已提交
458
		 */
C
Chris Mason 已提交
459
		ClearPageChecked(pages[i]);
C
Chris Mason 已提交
460
		unlock_page(pages[i]);
461
		put_page(pages[i]);
C
Chris Mason 已提交
462 463 464
	}
}

465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505
static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
					 const u64 start,
					 const u64 len,
					 struct extent_state **cached_state)
{
	u64 search_start = start;
	const u64 end = start + len - 1;

	while (search_start < end) {
		const u64 search_len = end - search_start + 1;
		struct extent_map *em;
		u64 em_len;
		int ret = 0;

		em = btrfs_get_extent(inode, NULL, 0, search_start,
				      search_len, 0);
		if (IS_ERR(em))
			return PTR_ERR(em);

		if (em->block_start != EXTENT_MAP_HOLE)
			goto next;

		em_len = em->len;
		if (em->start < search_start)
			em_len -= search_start - em->start;
		if (em_len > search_len)
			em_len = search_len;

		ret = set_extent_bit(&inode->io_tree, search_start,
				     search_start + em_len - 1,
				     EXTENT_DELALLOC_NEW,
				     NULL, cached_state, GFP_NOFS);
next:
		search_start = extent_map_end(em);
		free_extent_map(em);
		if (ret)
			return ret;
	}
	return 0;
}

C
Chris Mason 已提交
506 507 508 509 510 511 512 513
/*
 * after copy_from_user, pages need to be dirtied and we need to make
 * sure holes are created between the current EOF and the start of
 * any next extents (if required).
 *
 * this also makes the decision about creating an inline extent vs
 * doing real data extents, marking pages dirty and delalloc as required.
 */
514 515 516
int btrfs_dirty_pages(struct inode *inode, struct page **pages,
		      size_t num_pages, loff_t pos, size_t write_bytes,
		      struct extent_state **cached)
C
Chris Mason 已提交
517
{
518
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
C
Chris Mason 已提交
519
	int err = 0;
520
	int i;
521
	u64 num_bytes;
522 523 524 525
	u64 start_pos;
	u64 end_of_last_block;
	u64 end_pos = pos + write_bytes;
	loff_t isize = i_size_read(inode);
526
	unsigned int extra_bits = 0;
C
Chris Mason 已提交
527

528
	start_pos = pos & ~((u64) fs_info->sectorsize - 1);
529
	num_bytes = round_up(write_bytes + pos - start_pos,
530
			     fs_info->sectorsize);
C
Chris Mason 已提交
531

532
	end_of_last_block = start_pos + num_bytes - 1;
533

534 535 536 537 538 539 540 541
	/*
	 * The pages may have already been dirty, clear out old accounting so
	 * we can set things up properly
	 */
	clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block,
			 EXTENT_DIRTY | EXTENT_DELALLOC |
			 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, cached);

542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559
	if (!btrfs_is_free_space_inode(BTRFS_I(inode))) {
		if (start_pos >= isize &&
		    !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) {
			/*
			 * There can't be any extents following eof in this case
			 * so just set the delalloc new bit for the range
			 * directly.
			 */
			extra_bits |= EXTENT_DELALLOC_NEW;
		} else {
			err = btrfs_find_new_delalloc_bytes(BTRFS_I(inode),
							    start_pos,
							    num_bytes, cached);
			if (err)
				return err;
		}
	}

560
	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
561
					extra_bits, cached, 0);
J
Josef Bacik 已提交
562 563
	if (err)
		return err;
J
Josef Bacik 已提交
564

C
Chris Mason 已提交
565 566 567 568 569
	for (i = 0; i < num_pages; i++) {
		struct page *p = pages[i];
		SetPageUptodate(p);
		ClearPageChecked(p);
		set_page_dirty(p);
570
	}
J
Josef Bacik 已提交
571 572 573 574 575 576 577

	/*
	 * we've only changed i_size in ram, and we haven't updated
	 * the disk i_size.  There is no need to log the inode
	 * at this time.
	 */
	if (end_pos > isize)
578
		i_size_write(inode, end_pos);
579
	return 0;
C
Chris Mason 已提交
580 581
}

C
Chris Mason 已提交
582 583 584 585
/*
 * this drops all the extents in the cache that intersect the range
 * [start, end].  Existing extents are split as required.
 */
586
void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
587
			     int skip_pinned)
588 589
{
	struct extent_map *em;
590 591
	struct extent_map *split = NULL;
	struct extent_map *split2 = NULL;
592
	struct extent_map_tree *em_tree = &inode->extent_tree;
593
	u64 len = end - start + 1;
J
Josef Bacik 已提交
594
	u64 gen;
595 596
	int ret;
	int testend = 1;
597
	unsigned long flags;
C
Chris Mason 已提交
598
	int compressed = 0;
J
Josef Bacik 已提交
599
	bool modified;
600

601
	WARN_ON(end < start);
602
	if (end == (u64)-1) {
603
		len = (u64)-1;
604 605
		testend = 0;
	}
C
Chris Mason 已提交
606
	while (1) {
607 608
		int no_splits = 0;

J
Josef Bacik 已提交
609
		modified = false;
610
		if (!split)
611
			split = alloc_extent_map();
612
		if (!split2)
613
			split2 = alloc_extent_map();
614 615
		if (!split || !split2)
			no_splits = 1;
616

617
		write_lock(&em_tree->lock);
618
		em = lookup_extent_mapping(em_tree, start, len);
619
		if (!em) {
620
			write_unlock(&em_tree->lock);
621
			break;
622
		}
623
		flags = em->flags;
J
Josef Bacik 已提交
624
		gen = em->generation;
625
		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
626
			if (testend && em->start + em->len >= start + len) {
627
				free_extent_map(em);
C
Chris Mason 已提交
628
				write_unlock(&em_tree->lock);
629 630
				break;
			}
631 632
			start = em->start + em->len;
			if (testend)
633 634
				len = start + len - (em->start + em->len);
			free_extent_map(em);
C
Chris Mason 已提交
635
			write_unlock(&em_tree->lock);
636 637
			continue;
		}
C
Chris Mason 已提交
638
		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
639
		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
L
Liu Bo 已提交
640
		clear_bit(EXTENT_FLAG_LOGGING, &flags);
J
Josef Bacik 已提交
641
		modified = !list_empty(&em->list);
642 643
		if (no_splits)
			goto next;
644

645
		if (em->start < start) {
646 647
			split->start = em->start;
			split->len = start - em->start;
648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667

			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
				split->orig_start = em->orig_start;
				split->block_start = em->block_start;

				if (compressed)
					split->block_len = em->block_len;
				else
					split->block_len = split->len;
				split->orig_block_len = max(split->block_len,
						em->orig_block_len);
				split->ram_bytes = em->ram_bytes;
			} else {
				split->orig_start = split->start;
				split->block_len = 0;
				split->block_start = em->block_start;
				split->orig_block_len = 0;
				split->ram_bytes = split->len;
			}

J
Josef Bacik 已提交
668
			split->generation = gen;
669
			split->bdev = em->bdev;
670
			split->flags = flags;
671
			split->compress_type = em->compress_type;
672
			replace_extent_mapping(em_tree, em, split, modified);
673 674 675 676
			free_extent_map(split);
			split = split2;
			split2 = NULL;
		}
677
		if (testend && em->start + em->len > start + len) {
678 679 680 681 682
			u64 diff = start + len - em->start;

			split->start = start + len;
			split->len = em->start + em->len - (start + len);
			split->bdev = em->bdev;
683
			split->flags = flags;
684
			split->compress_type = em->compress_type;
J
Josef Bacik 已提交
685
			split->generation = gen;
686 687 688

			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
				split->orig_block_len = max(em->block_len,
689
						    em->orig_block_len);
690

691 692 693 694 695 696 697 698 699 700 701
				split->ram_bytes = em->ram_bytes;
				if (compressed) {
					split->block_len = em->block_len;
					split->block_start = em->block_start;
					split->orig_start = em->orig_start;
				} else {
					split->block_len = split->len;
					split->block_start = em->block_start
						+ diff;
					split->orig_start = em->orig_start;
				}
C
Chris Mason 已提交
702
			} else {
703 704 705 706 707
				split->ram_bytes = split->len;
				split->orig_start = split->start;
				split->block_len = 0;
				split->block_start = em->block_start;
				split->orig_block_len = 0;
C
Chris Mason 已提交
708
			}
709

710 711 712 713 714 715 716 717
			if (extent_map_in_tree(em)) {
				replace_extent_mapping(em_tree, em, split,
						       modified);
			} else {
				ret = add_extent_mapping(em_tree, split,
							 modified);
				ASSERT(ret == 0); /* Logic error */
			}
718 719 720
			free_extent_map(split);
			split = NULL;
		}
721
next:
722 723
		if (extent_map_in_tree(em))
			remove_extent_mapping(em_tree, em);
724
		write_unlock(&em_tree->lock);
725

726 727 728 729 730
		/* once for us */
		free_extent_map(em);
		/* once for the tree*/
		free_extent_map(em);
	}
731 732 733 734
	if (split)
		free_extent_map(split);
	if (split2)
		free_extent_map(split2);
735 736
}

C
Chris Mason 已提交
737 738 739 740 741 742 743 744 745
/*
 * this is very complex, but the basic idea is to drop all extents
 * in the range start - end.  hint_block is filled in with a block number
 * that would be a good hint to the block allocator for this file.
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split.  Anything entirely inside the range
 * is deleted from the tree.
 */
J
Josef Bacik 已提交
746 747 748
int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root, struct inode *inode,
			 struct btrfs_path *path, u64 start, u64 end,
749 750 751 752
			 u64 *drop_end, int drop_cache,
			 int replace_extent,
			 u32 extent_item_size,
			 int *key_inserted)
C
Chris Mason 已提交
753
{
754
	struct btrfs_fs_info *fs_info = root->fs_info;
755
	struct extent_buffer *leaf;
Y
Yan, Zheng 已提交
756
	struct btrfs_file_extent_item *fi;
757
	struct btrfs_key key;
Y
Yan, Zheng 已提交
758
	struct btrfs_key new_key;
759
	u64 ino = btrfs_ino(BTRFS_I(inode));
Y
Yan, Zheng 已提交
760 761 762 763 764
	u64 search_start = start;
	u64 disk_bytenr = 0;
	u64 num_bytes = 0;
	u64 extent_offset = 0;
	u64 extent_end = 0;
J
Josef Bacik 已提交
765
	u64 last_end = start;
Y
Yan, Zheng 已提交
766 767 768
	int del_nr = 0;
	int del_slot = 0;
	int extent_type;
C
Chris Mason 已提交
769
	int recow;
770
	int ret;
771
	int modify_tree = -1;
772
	int update_refs;
773
	int found = 0;
774
	int leafs_visited = 0;
C
Chris Mason 已提交
775

C
Chris Mason 已提交
776
	if (drop_cache)
777
		btrfs_drop_extent_cache(BTRFS_I(inode), start, end - 1, 0);
778

779
	if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
780 781
		modify_tree = 0;

782
	update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
783
		       root == fs_info->tree_root);
C
Chris Mason 已提交
784
	while (1) {
C
Chris Mason 已提交
785
		recow = 0;
L
Li Zefan 已提交
786
		ret = btrfs_lookup_file_extent(trans, root, path, ino,
787
					       search_start, modify_tree);
C
Chris Mason 已提交
788
		if (ret < 0)
Y
Yan, Zheng 已提交
789 790 791 792
			break;
		if (ret > 0 && path->slots[0] > 0 && search_start == start) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
L
Li Zefan 已提交
793
			if (key.objectid == ino &&
Y
Yan, Zheng 已提交
794 795
			    key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
C
Chris Mason 已提交
796
		}
Y
Yan, Zheng 已提交
797
		ret = 0;
798
		leafs_visited++;
799
next_slot:
800
		leaf = path->nodes[0];
Y
Yan, Zheng 已提交
801 802 803 804 805 806 807 808
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			BUG_ON(del_nr > 0);
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				break;
			if (ret > 0) {
				ret = 0;
				break;
809
			}
810
			leafs_visited++;
Y
Yan, Zheng 已提交
811 812 813 814 815
			leaf = path->nodes[0];
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
816 817 818 819 820 821 822 823 824 825

		if (key.objectid > ino)
			break;
		if (WARN_ON_ONCE(key.objectid < ino) ||
		    key.type < BTRFS_EXTENT_DATA_KEY) {
			ASSERT(del_nr == 0);
			path->slots[0]++;
			goto next_slot;
		}
		if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
Y
Yan, Zheng 已提交
826 827 828 829 830 831 832 833 834 835 836 837 838 839 840
			break;

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = key.offset +
841
				btrfs_file_extent_ram_bytes(leaf, fi);
842
		} else {
843 844
			/* can't happen */
			BUG();
C
Chris Mason 已提交
845 846
		}

847 848 849 850 851 852 853 854 855
		/*
		 * Don't skip extent items representing 0 byte lengths. They
		 * used to be created (bug) if while punching holes we hit
		 * -ENOSPC condition. So if we find one here, just ensure we
		 * delete it, otherwise we would insert a new file extent item
		 * with the same key (offset) as that 0 bytes length file
		 * extent item in the call to setup_items_for_insert() later
		 * in this function.
		 */
J
Josef Bacik 已提交
856 857
		if (extent_end == key.offset && extent_end >= search_start) {
			last_end = extent_end;
858
			goto delete_extent_item;
J
Josef Bacik 已提交
859
		}
860

Y
Yan, Zheng 已提交
861 862
		if (extent_end <= search_start) {
			path->slots[0]++;
863
			goto next_slot;
C
Chris Mason 已提交
864 865
		}

866
		found = 1;
Y
Yan, Zheng 已提交
867
		search_start = max(key.offset, start);
868 869
		if (recow || !modify_tree) {
			modify_tree = -1;
870
			btrfs_release_path(path);
Y
Yan, Zheng 已提交
871
			continue;
C
Chris Mason 已提交
872
		}
Y
Yan Zheng 已提交
873

Y
Yan, Zheng 已提交
874 875 876 877 878 879
		/*
		 *     | - range to drop - |
		 *  | -------- extent -------- |
		 */
		if (start > key.offset && end < extent_end) {
			BUG_ON(del_nr > 0);
880
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
881
				ret = -EOPNOTSUPP;
882 883
				break;
			}
Y
Yan, Zheng 已提交
884 885 886 887 888 889

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = start;
			ret = btrfs_duplicate_item(trans, root, path,
						   &new_key);
			if (ret == -EAGAIN) {
890
				btrfs_release_path(path);
Y
Yan, Zheng 已提交
891
				continue;
Y
Yan Zheng 已提交
892
			}
Y
Yan, Zheng 已提交
893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910
			if (ret < 0)
				break;

			leaf = path->nodes[0];
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);

			extent_offset += start - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - start);
			btrfs_mark_buffer_dirty(leaf);

J
Josef Bacik 已提交
911
			if (update_refs && disk_bytenr > 0) {
912
				ret = btrfs_inc_extent_ref(trans, root,
Y
Yan, Zheng 已提交
913 914 915
						disk_bytenr, num_bytes, 0,
						root->root_key.objectid,
						new_key.objectid,
916
						start - extent_offset);
917
				BUG_ON(ret); /* -ENOMEM */
918
			}
Y
Yan, Zheng 已提交
919
			key.offset = start;
Y
Yan Zheng 已提交
920
		}
J
Josef Bacik 已提交
921 922 923 924 925 926
		/*
		 * From here on out we will have actually dropped something, so
		 * last_end can be updated.
		 */
		last_end = extent_end;

Y
Yan, Zheng 已提交
927 928 929 930 931
		/*
		 *  | ---- range to drop ----- |
		 *      | -------- extent -------- |
		 */
		if (start <= key.offset && end < extent_end) {
932
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
933
				ret = -EOPNOTSUPP;
934 935
				break;
			}
Y
Yan Zheng 已提交
936

Y
Yan, Zheng 已提交
937 938
			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = end;
939
			btrfs_set_item_key_safe(fs_info, path, &new_key);
Y
Yan Zheng 已提交
940

Y
Yan, Zheng 已提交
941 942 943 944 945
			extent_offset += end - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - end);
			btrfs_mark_buffer_dirty(leaf);
946
			if (update_refs && disk_bytenr > 0)
Y
Yan, Zheng 已提交
947 948
				inode_sub_bytes(inode, end - key.offset);
			break;
C
Chris Mason 已提交
949
		}
950

Y
Yan, Zheng 已提交
951 952 953 954 955 956 957
		search_start = extent_end;
		/*
		 *       | ---- range to drop ----- |
		 *  | -------- extent -------- |
		 */
		if (start > key.offset && end >= extent_end) {
			BUG_ON(del_nr > 0);
958
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
959
				ret = -EOPNOTSUPP;
960 961
				break;
			}
962

Y
Yan, Zheng 已提交
963 964 965
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);
			btrfs_mark_buffer_dirty(leaf);
966
			if (update_refs && disk_bytenr > 0)
Y
Yan, Zheng 已提交
967 968 969
				inode_sub_bytes(inode, extent_end - start);
			if (end == extent_end)
				break;
C
Chris Mason 已提交
970

Y
Yan, Zheng 已提交
971 972
			path->slots[0]++;
			goto next_slot;
Z
Zheng Yan 已提交
973 974
		}

Y
Yan, Zheng 已提交
975 976 977 978 979
		/*
		 *  | ---- range to drop ----- |
		 *    | ------ extent ------ |
		 */
		if (start <= key.offset && end >= extent_end) {
980
delete_extent_item:
Y
Yan, Zheng 已提交
981 982 983 984 985 986 987
			if (del_nr == 0) {
				del_slot = path->slots[0];
				del_nr = 1;
			} else {
				BUG_ON(del_slot + del_nr != path->slots[0]);
				del_nr++;
			}
Z
Zheng Yan 已提交
988

J
Josef Bacik 已提交
989 990
			if (update_refs &&
			    extent_type == BTRFS_FILE_EXTENT_INLINE) {
991
				inode_sub_bytes(inode,
Y
Yan, Zheng 已提交
992 993
						extent_end - key.offset);
				extent_end = ALIGN(extent_end,
994
						   fs_info->sectorsize);
J
Josef Bacik 已提交
995
			} else if (update_refs && disk_bytenr > 0) {
996
				ret = btrfs_free_extent(trans, root,
Y
Yan, Zheng 已提交
997 998
						disk_bytenr, num_bytes, 0,
						root->root_key.objectid,
999
						key.objectid, key.offset -
1000
						extent_offset);
1001
				BUG_ON(ret); /* -ENOMEM */
Y
Yan, Zheng 已提交
1002 1003
				inode_sub_bytes(inode,
						extent_end - key.offset);
Z
Zheng Yan 已提交
1004 1005
			}

Y
Yan, Zheng 已提交
1006 1007 1008 1009 1010 1011 1012 1013 1014 1015
			if (end == extent_end)
				break;

			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
				path->slots[0]++;
				goto next_slot;
			}

			ret = btrfs_del_items(trans, root, path, del_slot,
					      del_nr);
1016
			if (ret) {
1017
				btrfs_abort_transaction(trans, ret);
J
Josef Bacik 已提交
1018
				break;
1019
			}
Y
Yan, Zheng 已提交
1020 1021 1022 1023

			del_nr = 0;
			del_slot = 0;

1024
			btrfs_release_path(path);
Y
Yan, Zheng 已提交
1025
			continue;
C
Chris Mason 已提交
1026
		}
Y
Yan, Zheng 已提交
1027 1028

		BUG_ON(1);
C
Chris Mason 已提交
1029
	}
Y
Yan, Zheng 已提交
1030

1031
	if (!ret && del_nr > 0) {
1032 1033 1034 1035
		/*
		 * Set path->slots[0] to first slot, so that after the delete
		 * if items are move off from our leaf to its immediate left or
		 * right neighbor leafs, we end up with a correct and adjusted
1036
		 * path->slots[0] for our insertion (if replace_extent != 0).
1037 1038
		 */
		path->slots[0] = del_slot;
Y
Yan, Zheng 已提交
1039
		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
1040
		if (ret)
1041
			btrfs_abort_transaction(trans, ret);
1042
	}
1043

1044 1045 1046 1047 1048 1049 1050 1051 1052
	leaf = path->nodes[0];
	/*
	 * If btrfs_del_items() was called, it might have deleted a leaf, in
	 * which case it unlocked our path, so check path->locks[0] matches a
	 * write lock.
	 */
	if (!ret && replace_extent && leafs_visited == 1 &&
	    (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
	     path->locks[0] == BTRFS_WRITE_LOCK) &&
1053
	    btrfs_leaf_free_space(fs_info, leaf) >=
1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064
	    sizeof(struct btrfs_item) + extent_item_size) {

		key.objectid = ino;
		key.type = BTRFS_EXTENT_DATA_KEY;
		key.offset = start;
		if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
			struct btrfs_key slot_key;

			btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
			if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
				path->slots[0]++;
1065
		}
1066 1067 1068 1069 1070 1071
		setup_items_for_insert(root, path, &key,
				       &extent_item_size,
				       extent_item_size,
				       sizeof(struct btrfs_item) +
				       extent_item_size, 1);
		*key_inserted = 1;
Y
Yan Zheng 已提交
1072
	}
Y
Yan, Zheng 已提交
1073

1074 1075
	if (!replace_extent || !(*key_inserted))
		btrfs_release_path(path);
J
Josef Bacik 已提交
1076
	if (drop_end)
J
Josef Bacik 已提交
1077
		*drop_end = found ? min(end, last_end) : end;
J
Josef Bacik 已提交
1078 1079 1080 1081 1082
	return ret;
}

int btrfs_drop_extents(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct inode *inode, u64 start,
1083
		       u64 end, int drop_cache)
J
Josef Bacik 已提交
1084 1085 1086 1087 1088 1089 1090
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
J
Josef Bacik 已提交
1091
	ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
1092
				   drop_cache, 0, 0, NULL);
Y
Yan, Zheng 已提交
1093
	btrfs_free_path(path);
C
Chris Mason 已提交
1094 1095 1096
	return ret;
}

Y
Yan Zheng 已提交
1097
static int extent_mergeable(struct extent_buffer *leaf, int slot,
1098 1099
			    u64 objectid, u64 bytenr, u64 orig_offset,
			    u64 *start, u64 *end)
Y
Yan Zheng 已提交
1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114
{
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	u64 extent_end;

	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
		return 0;

	btrfs_item_key_to_cpu(leaf, &key, slot);
	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
		return 0;

	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
1115
	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
Y
Yan Zheng 已提交
1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137
	    btrfs_file_extent_compression(leaf, fi) ||
	    btrfs_file_extent_encryption(leaf, fi) ||
	    btrfs_file_extent_other_encoding(leaf, fi))
		return 0;

	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if ((*start && *start != key.offset) || (*end && *end != extent_end))
		return 0;

	*start = key.offset;
	*end = extent_end;
	return 1;
}

/*
 * Mark extent in the range start - end as written.
 *
 * This changes extent type from 'pre-allocated' to 'regular'. If only
 * part of extent is marked as written, the extent will be split into
 * two or three.
 */
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
1138
			      struct btrfs_inode *inode, u64 start, u64 end)
Y
Yan Zheng 已提交
1139
{
1140
	struct btrfs_fs_info *fs_info = trans->fs_info;
1141
	struct btrfs_root *root = inode->root;
Y
Yan Zheng 已提交
1142 1143 1144 1145
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
Y
Yan, Zheng 已提交
1146
	struct btrfs_key new_key;
Y
Yan Zheng 已提交
1147 1148 1149
	u64 bytenr;
	u64 num_bytes;
	u64 extent_end;
1150
	u64 orig_offset;
Y
Yan Zheng 已提交
1151 1152
	u64 other_start;
	u64 other_end;
Y
Yan, Zheng 已提交
1153 1154 1155
	u64 split;
	int del_nr = 0;
	int del_slot = 0;
1156
	int recow;
Y
Yan Zheng 已提交
1157
	int ret;
1158
	u64 ino = btrfs_ino(inode);
Y
Yan Zheng 已提交
1159 1160

	path = btrfs_alloc_path();
1161 1162
	if (!path)
		return -ENOMEM;
Y
Yan Zheng 已提交
1163
again:
1164
	recow = 0;
Y
Yan, Zheng 已提交
1165
	split = start;
L
Li Zefan 已提交
1166
	key.objectid = ino;
Y
Yan Zheng 已提交
1167
	key.type = BTRFS_EXTENT_DATA_KEY;
Y
Yan, Zheng 已提交
1168
	key.offset = split;
Y
Yan Zheng 已提交
1169 1170

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1171 1172
	if (ret < 0)
		goto out;
Y
Yan Zheng 已提交
1173 1174 1175 1176 1177
	if (ret > 0 && path->slots[0] > 0)
		path->slots[0]--;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1178 1179 1180 1181 1182 1183
	if (key.objectid != ino ||
	    key.type != BTRFS_EXTENT_DATA_KEY) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
Y
Yan Zheng 已提交
1184 1185
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
1186 1187 1188 1189 1190
	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
Y
Yan Zheng 已提交
1191
	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
1192 1193 1194 1195 1196
	if (key.offset > start || extent_end < end) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
Y
Yan Zheng 已提交
1197 1198 1199

	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1200
	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
1201 1202 1203 1204 1205 1206
	memcpy(&new_key, &key, sizeof(new_key));

	if (start == key.offset && end < extent_end) {
		other_start = 0;
		other_end = start;
		if (extent_mergeable(leaf, path->slots[0] - 1,
L
Li Zefan 已提交
1207
				     ino, bytenr, orig_offset,
1208 1209
				     &other_start, &other_end)) {
			new_key.offset = end;
1210
			btrfs_set_item_key_safe(fs_info, path, &new_key);
1211 1212
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
1213 1214
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
1215 1216 1217 1218 1219 1220
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - end);
			btrfs_set_file_extent_offset(leaf, fi,
						     end - orig_offset);
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
1221 1222
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233
			btrfs_set_file_extent_num_bytes(leaf, fi,
							end - other_start);
			btrfs_mark_buffer_dirty(leaf);
			goto out;
		}
	}

	if (start > key.offset && end == extent_end) {
		other_start = end;
		other_end = 0;
		if (extent_mergeable(leaf, path->slots[0] + 1,
L
Li Zefan 已提交
1234
				     ino, bytenr, orig_offset,
1235 1236 1237 1238 1239
				     &other_start, &other_end)) {
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);
1240 1241
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
1242 1243
			path->slots[0]++;
			new_key.offset = start;
1244
			btrfs_set_item_key_safe(fs_info, path, &new_key);
1245 1246 1247

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
1248 1249
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
1250 1251 1252 1253 1254 1255 1256 1257
			btrfs_set_file_extent_num_bytes(leaf, fi,
							other_end - start);
			btrfs_set_file_extent_offset(leaf, fi,
						     start - orig_offset);
			btrfs_mark_buffer_dirty(leaf);
			goto out;
		}
	}
Y
Yan Zheng 已提交
1258

Y
Yan, Zheng 已提交
1259 1260 1261 1262 1263 1264 1265
	while (start > key.offset || end < extent_end) {
		if (key.offset == start)
			split = end;

		new_key.offset = split;
		ret = btrfs_duplicate_item(trans, root, path, &new_key);
		if (ret == -EAGAIN) {
1266
			btrfs_release_path(path);
Y
Yan, Zheng 已提交
1267
			goto again;
Y
Yan Zheng 已提交
1268
		}
1269
		if (ret < 0) {
1270
			btrfs_abort_transaction(trans, ret);
1271 1272
			goto out;
		}
Y
Yan Zheng 已提交
1273

Y
Yan, Zheng 已提交
1274 1275
		leaf = path->nodes[0];
		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
Y
Yan Zheng 已提交
1276
				    struct btrfs_file_extent_item);
1277
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
Y
Yan Zheng 已提交
1278
		btrfs_set_file_extent_num_bytes(leaf, fi,
Y
Yan, Zheng 已提交
1279 1280 1281 1282 1283
						split - key.offset);

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);

1284
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
Y
Yan, Zheng 已提交
1285 1286 1287
		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - split);
Y
Yan Zheng 已提交
1288 1289
		btrfs_mark_buffer_dirty(leaf);

1290
		ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
1291
					   0, root->root_key.objectid,
1292
					   ino, orig_offset);
1293 1294 1295 1296
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
Y
Yan Zheng 已提交
1297

Y
Yan, Zheng 已提交
1298 1299 1300
		if (split == start) {
			key.offset = start;
		} else {
1301 1302 1303 1304 1305
			if (start != key.offset) {
				ret = -EINVAL;
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
Y
Yan Zheng 已提交
1306
			path->slots[0]--;
Y
Yan, Zheng 已提交
1307
			extent_end = end;
Y
Yan Zheng 已提交
1308
		}
1309
		recow = 1;
Y
Yan Zheng 已提交
1310 1311
	}

Y
Yan, Zheng 已提交
1312 1313
	other_start = end;
	other_end = 0;
1314
	if (extent_mergeable(leaf, path->slots[0] + 1,
L
Li Zefan 已提交
1315
			     ino, bytenr, orig_offset,
1316 1317
			     &other_start, &other_end)) {
		if (recow) {
1318
			btrfs_release_path(path);
1319 1320
			goto again;
		}
Y
Yan, Zheng 已提交
1321 1322 1323
		extent_end = other_end;
		del_slot = path->slots[0] + 1;
		del_nr++;
1324
		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
Y
Yan, Zheng 已提交
1325
					0, root->root_key.objectid,
1326
					ino, orig_offset);
1327 1328 1329 1330
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
Y
Yan Zheng 已提交
1331
	}
Y
Yan, Zheng 已提交
1332 1333
	other_start = 0;
	other_end = start;
1334
	if (extent_mergeable(leaf, path->slots[0] - 1,
L
Li Zefan 已提交
1335
			     ino, bytenr, orig_offset,
1336 1337
			     &other_start, &other_end)) {
		if (recow) {
1338
			btrfs_release_path(path);
1339 1340
			goto again;
		}
Y
Yan, Zheng 已提交
1341 1342 1343
		key.offset = other_start;
		del_slot = path->slots[0];
		del_nr++;
1344
		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
Y
Yan, Zheng 已提交
1345
					0, root->root_key.objectid,
1346
					ino, orig_offset);
1347 1348 1349 1350
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
Y
Yan, Zheng 已提交
1351 1352
	}
	if (del_nr == 0) {
1353 1354
		fi = btrfs_item_ptr(leaf, path->slots[0],
			   struct btrfs_file_extent_item);
Y
Yan, Zheng 已提交
1355 1356
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
1357
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
Y
Yan, Zheng 已提交
1358
		btrfs_mark_buffer_dirty(leaf);
1359
	} else {
1360 1361
		fi = btrfs_item_ptr(leaf, del_slot - 1,
			   struct btrfs_file_extent_item);
1362 1363
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
1364
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1365 1366 1367
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - key.offset);
		btrfs_mark_buffer_dirty(leaf);
Y
Yan, Zheng 已提交
1368

1369
		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
1370
		if (ret < 0) {
1371
			btrfs_abort_transaction(trans, ret);
1372 1373
			goto out;
		}
1374
	}
Y
Yan, Zheng 已提交
1375
out:
Y
Yan Zheng 已提交
1376 1377 1378 1379
	btrfs_free_path(path);
	return 0;
}

1380 1381 1382 1383
/*
 * on error we return an unlocked page and the error value
 * on success we return a locked page and 0
 */
1384 1385
static int prepare_uptodate_page(struct inode *inode,
				 struct page *page, u64 pos,
1386
				 bool force_uptodate)
1387 1388 1389
{
	int ret = 0;

1390
	if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
1391
	    !PageUptodate(page)) {
1392 1393 1394 1395 1396 1397 1398 1399
		ret = btrfs_readpage(NULL, page);
		if (ret)
			return ret;
		lock_page(page);
		if (!PageUptodate(page)) {
			unlock_page(page);
			return -EIO;
		}
1400 1401 1402 1403
		if (page->mapping != inode->i_mapping) {
			unlock_page(page);
			return -EAGAIN;
		}
1404 1405 1406 1407
	}
	return 0;
}

C
Chris Mason 已提交
1408
/*
1409
 * this just gets pages into the page cache and locks them down.
C
Chris Mason 已提交
1410
 */
1411 1412 1413
static noinline int prepare_pages(struct inode *inode, struct page **pages,
				  size_t num_pages, loff_t pos,
				  size_t write_bytes, bool force_uptodate)
C
Chris Mason 已提交
1414 1415
{
	int i;
1416
	unsigned long index = pos >> PAGE_SHIFT;
1417
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1418
	int err = 0;
1419
	int faili;
1420

C
Chris Mason 已提交
1421
	for (i = 0; i < num_pages; i++) {
1422
again:
1423
		pages[i] = find_or_create_page(inode->i_mapping, index + i,
1424
					       mask | __GFP_WRITE);
C
Chris Mason 已提交
1425
		if (!pages[i]) {
1426 1427 1428 1429 1430 1431
			faili = i - 1;
			err = -ENOMEM;
			goto fail;
		}

		if (i == 0)
1432
			err = prepare_uptodate_page(inode, pages[i], pos,
1433
						    force_uptodate);
1434 1435
		if (!err && i == num_pages - 1)
			err = prepare_uptodate_page(inode, pages[i],
1436
						    pos + write_bytes, false);
1437
		if (err) {
1438
			put_page(pages[i]);
1439 1440 1441 1442
			if (err == -EAGAIN) {
				err = 0;
				goto again;
			}
1443 1444
			faili = i - 1;
			goto fail;
C
Chris Mason 已提交
1445
		}
C
Chris Mason 已提交
1446
		wait_on_page_writeback(pages[i]);
C
Chris Mason 已提交
1447
	}
1448 1449 1450 1451 1452

	return 0;
fail:
	while (faili >= 0) {
		unlock_page(pages[faili]);
1453
		put_page(pages[faili]);
1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470
		faili--;
	}
	return err;

}

/*
 * This function locks the extent and properly waits for data=ordered extents
 * to finish before allowing the pages to be modified if need.
 *
 * The return value:
 * 1 - the extent is locked
 * 0 - the extent is not locked, and everything is OK
 * -EAGAIN - need re-prepare the pages
 * the other < 0 number - Something wrong happens
 */
static noinline int
1471
lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
1472
				size_t num_pages, loff_t pos,
1473
				size_t write_bytes,
1474 1475 1476
				u64 *lockstart, u64 *lockend,
				struct extent_state **cached_state)
{
1477
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1478 1479 1480 1481 1482
	u64 start_pos;
	u64 last_pos;
	int i;
	int ret = 0;

1483
	start_pos = round_down(pos, fs_info->sectorsize);
1484
	last_pos = start_pos
1485
		+ round_up(pos + write_bytes - start_pos,
1486
			   fs_info->sectorsize) - 1;
1487

1488
	if (start_pos < inode->vfs_inode.i_size) {
1489
		struct btrfs_ordered_extent *ordered;
1490

1491 1492
		lock_extent_bits(&inode->io_tree, start_pos, last_pos,
				cached_state);
1493 1494
		ordered = btrfs_lookup_ordered_range(inode, start_pos,
						     last_pos - start_pos + 1);
1495 1496
		if (ordered &&
		    ordered->file_offset + ordered->len > start_pos &&
1497
		    ordered->file_offset <= last_pos) {
1498
			unlock_extent_cached(&inode->io_tree, start_pos,
1499
					last_pos, cached_state);
1500 1501
			for (i = 0; i < num_pages; i++) {
				unlock_page(pages[i]);
1502
				put_page(pages[i]);
1503
			}
1504 1505
			btrfs_start_ordered_extent(&inode->vfs_inode,
					ordered, 1);
1506 1507
			btrfs_put_ordered_extent(ordered);
			return -EAGAIN;
1508 1509 1510
		}
		if (ordered)
			btrfs_put_ordered_extent(ordered);
1511

1512 1513 1514
		*lockstart = start_pos;
		*lockend = last_pos;
		ret = 1;
1515
	}
1516

1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530
	/*
	 * It's possible the pages are dirty right now, but we don't want
	 * to clean them yet because copy_from_user may catch a page fault
	 * and we might have to fall back to one page at a time.  If that
	 * happens, we'll unlock these pages and we'd have a window where
	 * reclaim could sneak in and drop the once-dirty page on the floor
	 * without writing it.
	 *
	 * We have the pages locked and the extent range locked, so there's
	 * no way someone can start IO on any dirty pages in this range.
	 *
	 * We'll call btrfs_dirty_pages() later on, and that will flip around
	 * delalloc bits and dirty the pages as required.
	 */
1531 1532 1533 1534
	for (i = 0; i < num_pages; i++) {
		set_page_extent_mapped(pages[i]);
		WARN_ON(!PageLocked(pages[i]));
	}
1535

1536
	return ret;
C
Chris Mason 已提交
1537 1538
}

1539
static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
1540 1541
				    size_t *write_bytes)
{
1542
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1543
	struct btrfs_root *root = inode->root;
1544 1545 1546 1547 1548
	struct btrfs_ordered_extent *ordered;
	u64 lockstart, lockend;
	u64 num_bytes;
	int ret;

1549
	ret = btrfs_start_write_no_snapshotting(root);
1550 1551 1552
	if (!ret)
		return -ENOSPC;

1553
	lockstart = round_down(pos, fs_info->sectorsize);
1554
	lockend = round_up(pos + *write_bytes,
1555
			   fs_info->sectorsize) - 1;
1556 1557

	while (1) {
1558
		lock_extent(&inode->io_tree, lockstart, lockend);
1559 1560 1561 1562 1563
		ordered = btrfs_lookup_ordered_range(inode, lockstart,
						     lockend - lockstart + 1);
		if (!ordered) {
			break;
		}
1564 1565
		unlock_extent(&inode->io_tree, lockstart, lockend);
		btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
1566 1567 1568 1569
		btrfs_put_ordered_extent(ordered);
	}

	num_bytes = lockend - lockstart + 1;
1570 1571
	ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
			NULL, NULL, NULL);
1572 1573
	if (ret <= 0) {
		ret = 0;
1574
		btrfs_end_write_no_snapshotting(root);
1575
	} else {
1576 1577
		*write_bytes = min_t(size_t, *write_bytes ,
				     num_bytes - pos + lockstart);
1578 1579
	}

1580
	unlock_extent(&inode->io_tree, lockstart, lockend);
1581 1582 1583 1584

	return ret;
}

1585 1586
static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
					       struct iov_iter *i)
1587
{
1588 1589
	struct file *file = iocb->ki_filp;
	loff_t pos = iocb->ki_pos;
A
Al Viro 已提交
1590
	struct inode *inode = file_inode(file);
1591
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1592 1593
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct page **pages = NULL;
1594
	struct extent_state *cached_state = NULL;
1595
	struct extent_changeset *data_reserved = NULL;
1596
	u64 release_bytes = 0;
1597 1598
	u64 lockstart;
	u64 lockend;
J
Josef Bacik 已提交
1599 1600
	size_t num_written = 0;
	int nrptrs;
1601
	int ret = 0;
1602
	bool only_release_metadata = false;
1603
	bool force_page_uptodate = false;
1604

1605 1606
	nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
			PAGE_SIZE / (sizeof(struct page *)));
1607 1608
	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
	nrptrs = max(nrptrs, 8);
1609
	pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
J
Josef Bacik 已提交
1610 1611
	if (!pages)
		return -ENOMEM;
1612

J
Josef Bacik 已提交
1613
	while (iov_iter_count(i) > 0) {
1614
		size_t offset = offset_in_page(pos);
1615
		size_t sector_offset;
		size_t write_bytes = min(iov_iter_count(i),
					 nrptrs * (size_t)PAGE_SIZE -
					 offset);
		size_t num_pages = DIV_ROUND_UP(write_bytes + offset,
						PAGE_SIZE);
		size_t reserve_bytes;
		size_t dirty_pages;
		size_t copied;
		size_t dirty_sectors;
		size_t num_sectors;
		int extents_locked;

		WARN_ON(num_pages > nrptrs);

		/*
		 * Fault pages before locking them in prepare_pages
		 * to avoid recursive lock
		 */
		if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
			ret = -EFAULT;
			break;
		}

		sector_offset = pos & (fs_info->sectorsize - 1);
		reserve_bytes = round_up(write_bytes + sector_offset,
				fs_info->sectorsize);

		extent_changeset_release(data_reserved);
		ret = btrfs_check_data_free_space(inode, &data_reserved, pos,
						  write_bytes);
		if (ret < 0) {
			if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
						      BTRFS_INODE_PREALLOC)) &&
			    check_can_nocow(BTRFS_I(inode), pos,
					&write_bytes) > 0) {
				/*
				 * For nodata cow case, no need to reserve
				 * data space.
				 */
				only_release_metadata = true;
				/*
				 * our prealloc extent may be smaller than
				 * write_bytes, so scale down.
				 */
				num_pages = DIV_ROUND_UP(write_bytes + offset,
							 PAGE_SIZE);
				reserve_bytes = round_up(write_bytes +
							 sector_offset,
							 fs_info->sectorsize);
			} else {
				break;
			}
		}

		WARN_ON(reserve_bytes == 0);
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
				reserve_bytes);
		if (ret) {
			if (!only_release_metadata)
				btrfs_free_reserved_data_space(inode,
						data_reserved, pos,
						write_bytes);
			else
				btrfs_end_write_no_snapshotting(root);
			break;
		}

		release_bytes = reserve_bytes;
again:
		/*
		 * This is going to setup the pages array with the number of
		 * pages we want, so we don't really need to worry about the
		 * contents of pages from loop to loop
		 */
		ret = prepare_pages(inode, pages, num_pages,
				    pos, write_bytes,
				    force_page_uptodate);
		if (ret) {
			btrfs_delalloc_release_extents(BTRFS_I(inode),
						       reserve_bytes, true);
			break;
		}

		extents_locked = lock_and_cleanup_extent_if_need(
				BTRFS_I(inode), pages,
				num_pages, pos, write_bytes, &lockstart,
				&lockend, &cached_state);
		if (extents_locked < 0) {
			if (extents_locked == -EAGAIN)
				goto again;
			btrfs_delalloc_release_extents(BTRFS_I(inode),
						       reserve_bytes, true);
			ret = extents_locked;
			break;
		}

		copied = btrfs_copy_from_user(pos, write_bytes, pages, i);

		num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
		dirty_sectors = round_up(copied + sector_offset,
					fs_info->sectorsize);
		dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);

		/*
		 * if we have trouble faulting in the pages, fall
		 * back to one page at a time
		 */
		if (copied < write_bytes)
			nrptrs = 1;

		if (copied == 0) {
			force_page_uptodate = true;
			dirty_sectors = 0;
			dirty_pages = 0;
		} else {
			force_page_uptodate = false;
			dirty_pages = DIV_ROUND_UP(copied + offset,
						   PAGE_SIZE);
		}

		if (num_sectors > dirty_sectors) {
			/* release everything except the sectors we dirtied */
			release_bytes -= dirty_sectors <<
						fs_info->sb->s_blocksize_bits;
			if (only_release_metadata) {
				btrfs_delalloc_release_metadata(BTRFS_I(inode),
							release_bytes, true);
			} else {
				u64 __pos;

				__pos = round_down(pos,
						   fs_info->sectorsize) +
					(dirty_pages << PAGE_SHIFT);
				btrfs_delalloc_release_space(inode,
						data_reserved, __pos,
						release_bytes, true);
			}
		}

		release_bytes = round_up(copied + sector_offset,
					fs_info->sectorsize);

		if (copied > 0)
			ret = btrfs_dirty_pages(inode, pages, dirty_pages,
						pos, copied, &cached_state);
		if (extents_locked)
			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
					     lockstart, lockend, &cached_state);
		btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes,
					       true);
		if (ret) {
			btrfs_drop_pages(pages, num_pages);
			break;
		}

		release_bytes = 0;
		if (only_release_metadata)
			btrfs_end_write_no_snapshotting(root);

		if (only_release_metadata && copied > 0) {
			lockstart = round_down(pos,
					       fs_info->sectorsize);
			lockend = round_up(pos + copied,
					   fs_info->sectorsize) - 1;

			set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
				       lockend, EXTENT_NORESERVE, NULL,
				       NULL, GFP_NOFS);
			only_release_metadata = false;
		}

		btrfs_drop_pages(pages, num_pages);

		cond_resched();

		balance_dirty_pages_ratelimited(inode->i_mapping);
		if (dirty_pages < (fs_info->nodesize >> PAGE_SHIFT) + 1)
			btrfs_btree_balance_dirty(fs_info);

		pos += copied;
		num_written += copied;
	}

	kfree(pages);

	if (release_bytes) {
		if (only_release_metadata) {
			btrfs_end_write_no_snapshotting(root);
			btrfs_delalloc_release_metadata(BTRFS_I(inode),
					release_bytes, true);
		} else {
			btrfs_delalloc_release_space(inode, data_reserved,
					round_down(pos, fs_info->sectorsize),
					release_bytes, true);
		}
	}

	extent_changeset_free(data_reserved);
	return num_written ? num_written : ret;
}

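/*
 * Handle an O_DIRECT write.  If the direct write cannot be completed in
 * full, the remainder is written through the page cache and then flushed,
 * waited on, and invalidated so a subsequent direct read sees the new data.
 */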
static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	loff_t pos;
	ssize_t written;
	ssize_t written_buffered;
	loff_t endbyte;
	int err;

	written = generic_file_direct_write(iocb, from);

	if (written < 0 || !iov_iter_count(from))
		return written;

	pos = iocb->ki_pos;
	written_buffered = btrfs_buffered_write(iocb, from);
	if (written_buffered < 0) {
		err = written_buffered;
		goto out;
	}
	/*
	 * Ensure all data is persisted. We want the next direct IO read to be
	 * able to read what was just written.
	 */
	endbyte = pos + written_buffered - 1;
	err = btrfs_fdatawrite_range(inode, pos, endbyte);
	if (err)
		goto out;
	err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
	if (err)
		goto out;
	written += written_buffered;
	iocb->ki_pos = pos + written_buffered;
	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
				 endbyte >> PAGE_SHIFT);
out:
	return written ? written : err;
}

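/*
 * Bump mtime/ctime (and the inode version) ahead of a write.  Inodes
 * flagged NOCMTIME are left untouched; the inode item itself is persisted
 * later, together with the data.
 */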
static void update_time_for_write(struct inode *inode)
{
	struct timespec64 now;

	if (IS_NOCMTIME(inode))
		return;

	now = current_time(inode);
	if (!timespec64_equal(&inode->i_mtime, &now))
		inode->i_mtime = now;

	if (!timespec64_equal(&inode->i_ctime, &now))
		inode->i_ctime = now;

	if (IS_I_VERSION(inode))
		inode_inc_iversion(inode);
}

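/*
 * Entry point for write(2) and friends: runs the generic write checks,
 * expands a hole at the tail of the file if needed, then dispatches to the
 * direct or buffered write path and finally handles O_DSYNC semantics via
 * generic_write_sync().
 */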
static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
				    struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 start_pos;
	u64 end_pos;
	ssize_t num_written = 0;
	bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
	ssize_t err;
	loff_t pos;
	size_t count = iov_iter_count(from);
	loff_t oldsize;
	int clean_page = 0;

	if (!(iocb->ki_flags & IOCB_DIRECT) &&
	    (iocb->ki_flags & IOCB_NOWAIT))
		return -EOPNOTSUPP;

	if (!inode_trylock(inode)) {
		if (iocb->ki_flags & IOCB_NOWAIT)
			return -EAGAIN;
		inode_lock(inode);
	}

	err = generic_write_checks(iocb, from);
	if (err <= 0) {
		inode_unlock(inode);
		return err;
	}

	pos = iocb->ki_pos;
	if (iocb->ki_flags & IOCB_NOWAIT) {
		/*
		 * We will allocate space in case nodatacow is not set,
		 * so bail
		 */
		if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
					      BTRFS_INODE_PREALLOC)) ||
		    check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) {
			inode_unlock(inode);
			return -EAGAIN;
		}
	}

	current->backing_dev_info = inode_to_bdi(inode);
	err = file_remove_privs(file);
	if (err) {
		inode_unlock(inode);
		goto out;
	}

	/*
	 * If BTRFS flips readonly due to some impossible error
	 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
	 * although we have opened a file as writable, we have
	 * to stop this write operation to ensure FS consistency.
	 */
	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
		inode_unlock(inode);
		err = -EROFS;
		goto out;
	}

	/*
	 * We reserve space for updating the inode when we reserve space for the
	 * extent we are going to write, so we will enospc out there.  We don't
	 * need to start yet another transaction to update the inode as we will
	 * update the inode when we finish writing whatever data we write.
	 */
	update_time_for_write(inode);

	start_pos = round_down(pos, fs_info->sectorsize);
	oldsize = i_size_read(inode);
	if (start_pos > oldsize) {
		/* Expand hole size to cover write data, preventing empty gap */
		end_pos = round_up(pos + count,
				   fs_info->sectorsize);
		err = btrfs_cont_expand(inode, oldsize, end_pos);
		if (err) {
			inode_unlock(inode);
			goto out;
		}
		if (start_pos > round_up(oldsize, fs_info->sectorsize))
			clean_page = 1;
	}

	if (sync)
		atomic_inc(&BTRFS_I(inode)->sync_writers);

	if (iocb->ki_flags & IOCB_DIRECT) {
		num_written = __btrfs_direct_write(iocb, from);
	} else {
		num_written = btrfs_buffered_write(iocb, from);
		if (num_written > 0)
			iocb->ki_pos = pos + num_written;
		if (clean_page)
			pagecache_isize_extended(inode, oldsize,
						i_size_read(inode));
	}

	inode_unlock(inode);

	/*
	 * We also have to set last_sub_trans to the current log transid,
	 * otherwise subsequent syncs to a file that's been synced in this
	 * transaction will appear to have already occurred.
	 */
	spin_lock(&BTRFS_I(inode)->lock);
	BTRFS_I(inode)->last_sub_trans = root->log_transid;
	spin_unlock(&BTRFS_I(inode)->lock);
	if (num_written > 0)
		num_written = generic_write_sync(iocb, num_written);

	if (sync)
		atomic_dec(&BTRFS_I(inode)->sync_writers);
out:
	current->backing_dev_info = NULL;
	return num_written ? num_written : err;
}

int btrfs_release_file(struct inode *inode, struct file *filp)
{
	struct btrfs_file_private *private = filp->private_data;

	if (private && private->filldir_buf)
		kfree(private->filldir_buf);
	kfree(private);
	filp->private_data = NULL;

	/*
	 * ordered_data_close is set by setattr when we are about to truncate
	 * a file from a non-zero size to a zero size.  This tries to
	 * flush down new bytes that may have been written if the
	 * application were using truncate to replace a file in place.
	 */
	if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
			       &BTRFS_I(inode)->runtime_flags))
		filemap_flush(inode->i_mapping);
	return 0;
}

static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
{
	int ret;
	struct blk_plug plug;

	/*
	 * This is only called in fsync, which would do synchronous writes, so
	 * a plug can merge adjacent IOs as much as possible.  Esp. in case of
	 * multiple disks using raid profile, a large IO can be split to
	 * several segments of stripe length (currently 64K).
	 */
	blk_start_plug(&plug);
	atomic_inc(&BTRFS_I(inode)->sync_writers);
	ret = btrfs_fdatawrite_range(inode, start, end);
	atomic_dec(&BTRFS_I(inode)->sync_writers);
	blk_finish_plug(&plug);

	return ret;
}

/*
 * fsync call for both files and directories.  This logs the inode into
 * the tree log instead of forcing full commits whenever possible.
 *
 * It needs to call filemap_fdatawait so that all ordered extent updates in
 * the metadata btree are up to date for copying to the log.
 *
 * It drops the inode mutex before doing the tree log commit.  This is an
 * important optimization for directories because holding the mutex prevents
 * new operations on the dir while we write to disk.
 */
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct dentry *dentry = file_dentry(file);
	struct inode *inode = d_inode(dentry);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	struct btrfs_log_ctx ctx;
	int ret = 0, err;
	u64 len;

	/*
	 * The range length can be represented by u64, we have to do the typecasts
	 * to avoid signed overflow if it's [0, LLONG_MAX] eg. from fsync()
	 */
	len = (u64)end - (u64)start + 1;
	trace_btrfs_sync_file(file, datasync);

	btrfs_init_log_ctx(&ctx, inode);

	/*
	 * We write the dirty pages in the range and wait until they complete
	 * outside of the ->i_mutex, so the dirty pages can be flushed by
	 * multiple tasks, improving performance.  See
	 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
	 */
	ret = start_ordered_ops(inode, start, end);
	if (ret)
		goto out;

	inode_lock(inode);

	/*
	 * We take the dio_sem here because the tree log stuff can race with
	 * lockless dio writes and get an extent map logged for an extent we
	 * never waited on.  We need it this high up for lockdep reasons.
	 */
	down_write(&BTRFS_I(inode)->dio_sem);

	atomic_inc(&root->log_batch);

	/*
	 * Before we acquired the inode's lock, someone may have dirtied more
	 * pages in the target range. We need to make sure that writeback for
	 * any such pages does not start while we are logging the inode, because
	 * if it does, any of the following might happen when we are not doing a
	 * full inode sync:
	 *
	 * 1) We log an extent after its writeback finishes but before its
	 *    checksums are added to the csum tree, leading to -EIO errors
	 *    when attempting to read the extent after a log replay.
	 *
	 * 2) We can end up logging an extent before its writeback finishes.
	 *    Therefore after the log replay we will have a file extent item
	 *    pointing to an unwritten extent (and no data checksums as well).
	 *
	 * So trigger writeback for any eventual new dirty pages and then we
	 * wait for all ordered extents to complete below.
	 */
	ret = start_ordered_ops(inode, start, end);
	if (ret) {
		up_write(&BTRFS_I(inode)->dio_sem);
		inode_unlock(inode);
		goto out;
	}

	/*
	 * We have to do this here to avoid the priority inversion of waiting on
	 * IO of a lower priority task while holding a transaction open.
	 */
	ret = btrfs_wait_ordered_range(inode, start, len);
	if (ret) {
		up_write(&BTRFS_I(inode)->dio_sem);
		inode_unlock(inode);
		goto out;
	}
	atomic_inc(&root->log_batch);

	smp_mb();
	if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) ||
	    BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed) {
		/*
		 * We've had everything committed since the last time we were
		 * modified so clear this flag in case it was set for whatever
		 * reason, it's no longer relevant.
		 */
		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
			  &BTRFS_I(inode)->runtime_flags);
		/*
		 * An ordered extent might have started before and completed
		 * already with io errors, in which case the inode was not
		 * updated and we end up here. So check the inode's mapping
		 * for any errors that might have happened since we last
		 * called fsync.
		 */
		ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
		up_write(&BTRFS_I(inode)->dio_sem);
		inode_unlock(inode);
		goto out;
	}

	/*
	 * We use start here because we will need to wait on the IO to complete
	 * in btrfs_sync_log, which could require joining a transaction (for
	 * example checking cross references in the nocow path).  If we use join
	 * here we could get into a situation where we're waiting on IO to
	 * happen that is blocked on a transaction trying to commit.  With start
	 * we inc the extwriter counter, so we wait for all extwriters to exit
	 * before we start blocking joiners.  This comment is to keep somebody
	 * from thinking they are super smart and changing this to
	 * btrfs_join_transaction *cough*Josef*cough*.
	 */
	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		up_write(&BTRFS_I(inode)->dio_sem);
		inode_unlock(inode);
		goto out;
	}
	trans->sync = true;

	ret = btrfs_log_dentry_safe(trans, dentry, start, end, &ctx);
	if (ret < 0) {
		/* Fallthrough and commit/free transaction. */
		ret = 1;
	}

	/* we've logged all the items and now have a consistent
	 * version of the file in the log.  It is possible that
	 * someone will come in and modify the file, but that's
	 * fine because the log is consistent on disk, and we
	 * have references to all of the file's extents
	 *
	 * It is possible that someone will come in and log the
	 * file again, but that will end up using the synchronization
	 * inside btrfs_sync_log to keep things safe.
	 */
	up_write(&BTRFS_I(inode)->dio_sem);
	inode_unlock(inode);

	if (ret != BTRFS_NO_LOG_SYNC) {
		if (!ret) {
			ret = btrfs_sync_log(trans, root, &ctx);
			if (!ret) {
				ret = btrfs_end_transaction(trans);
				goto out;
			}
		}
		ret = btrfs_commit_transaction(trans);
	} else {
		ret = btrfs_end_transaction(trans);
	}
out:
	ASSERT(list_empty(&ctx.list));
	err = file_check_and_advance_wb_err(file);
	if (!ret)
		ret = err;
	return ret > 0 ? -EIO : ret;
}

static const struct vm_operations_struct btrfs_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= btrfs_page_mkwrite,
};

static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct address_space *mapping = filp->f_mapping;

	if (!mapping->a_ops->readpage)
		return -ENOEXEC;

	file_accessed(filp);
	vma->vm_ops = &btrfs_file_vm_ops;

	return 0;
}

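/*
 * Check whether the file extent item at @slot is an existing hole (a regular
 * extent with a zero disk_bytenr) directly adjacent to the range
 * [start, end), in which case fill_holes() can extend it instead of
 * inserting a new item.
 */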
static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
			  int slot, u64 start, u64 end)
{
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;

	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
		return 0;

	btrfs_item_key_to_cpu(leaf, &key, slot);
	if (key.objectid != btrfs_ino(inode) ||
	    key.type != BTRFS_EXTENT_DATA_KEY)
		return 0;

	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);

	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
		return 0;

	if (btrfs_file_extent_disk_bytenr(leaf, fi))
		return 0;

	if (key.offset == end)
		return 1;
	if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
		return 1;
	return 0;
}

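/*
 * Insert or extend a file extent item representing a hole over
 * [offset, end).  With the NO_HOLES feature enabled no item is needed; in
 * either case the in-memory extent map is updated so the fast fsync path
 * sees the hole.
 */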
static int fill_holes(struct btrfs_trans_handle *trans,
		struct btrfs_inode *inode,
		struct btrfs_path *path, u64 offset, u64 end)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = inode->root;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	struct extent_map *hole_em;
	struct extent_map_tree *em_tree = &inode->extent_tree;
	struct btrfs_key key;
	int ret;

	if (btrfs_fs_incompat(fs_info, NO_HOLES))
		goto out;

	key.objectid = btrfs_ino(inode);
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = offset;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret <= 0) {
		/*
		 * We should have dropped this offset, so if we find it then
		 * something has gone horribly wrong.
		 */
		if (ret == 0)
			ret = -EINVAL;
		return ret;
	}

	leaf = path->nodes[0];
	if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
		u64 num_bytes;

		path->slots[0]--;
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
			end - offset;
		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
		btrfs_set_file_extent_offset(leaf, fi, 0);
		btrfs_mark_buffer_dirty(leaf);
		goto out;
	}

	if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
		u64 num_bytes;

		key.offset = offset;
		btrfs_set_item_key_safe(fs_info, path, &key);
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
			offset;
		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
		btrfs_set_file_extent_offset(leaf, fi, 0);
		btrfs_mark_buffer_dirty(leaf);
		goto out;
	}
	btrfs_release_path(path);

	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
			offset, 0, 0, end - offset, 0, end - offset, 0, 0, 0);
	if (ret)
		return ret;

out:
	btrfs_release_path(path);

	hole_em = alloc_extent_map();
	if (!hole_em) {
		btrfs_drop_extent_cache(inode, offset, end - 1, 0);
		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
	} else {
		hole_em->start = offset;
		hole_em->len = end - offset;
		hole_em->ram_bytes = hole_em->len;
		hole_em->orig_start = offset;

		hole_em->block_start = EXTENT_MAP_HOLE;
		hole_em->block_len = 0;
		hole_em->orig_block_len = 0;
		hole_em->bdev = fs_info->fs_devices->latest_bdev;
		hole_em->compress_type = BTRFS_COMPRESS_NONE;
		hole_em->generation = trans->transid;

		do {
			btrfs_drop_extent_cache(inode, offset, end - 1, 0);
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, hole_em, 1);
			write_unlock(&em_tree->lock);
		} while (ret == -EEXIST);
		free_extent_map(hole_em);
		if (ret)
			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
					&inode->runtime_flags);
	}

	return 0;
}

/*
 * Find a hole extent on the given inode and change start/len to the end of
 * the hole extent (a hole/vacuum extent whose em->start <= start &&
 * em->start + em->len > start).
 * When a hole extent is found, return 1 and modify start/len.
 */
static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_map *em;
	int ret = 0;

	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
			      round_down(*start, fs_info->sectorsize),
			      round_up(*len, fs_info->sectorsize), 0);
	if (IS_ERR(em))
		return PTR_ERR(em);

	/* Hole or vacuum extent (the latter only exists in no-holes mode) */
	if (em->block_start == EXTENT_MAP_HOLE) {
		ret = 1;
		*len = em->start + em->len > *start + *len ?
		       0 : *start + *len - em->start - em->len;
		*start = em->start + em->len;
	}
	free_extent_map(em);
	return ret;
}

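/*
 * Truncate the page cache and lock the extent range [lockstart, lockend],
 * retrying until no ordered extent overlaps the range and no page has been
 * read back in behind our back.
 */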
static int btrfs_punch_hole_lock_range(struct inode *inode,
				       const u64 lockstart,
				       const u64 lockend,
				       struct extent_state **cached_state)
{
	while (1) {
		struct btrfs_ordered_extent *ordered;
		int ret;

		truncate_pagecache_range(inode, lockstart, lockend);

		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
				 cached_state);
		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);

		/*
		 * We need to make sure we have no ordered extents in this range
		 * and nobody raced in and read a page in this range, if we did
		 * we need to try again.
		 */
		if ((!ordered ||
		    (ordered->file_offset + ordered->len <= lockstart ||
		     ordered->file_offset > lockend)) &&
		     !filemap_range_has_page(inode->i_mapping,
					     lockstart, lockend)) {
			if (ordered)
				btrfs_put_ordered_extent(ordered);
			break;
		}
		if (ordered)
			btrfs_put_ordered_extent(ordered);
		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
				     lockend, cached_state);
		ret = btrfs_wait_ordered_range(inode, lockstart,
					       lockend - lockstart + 1);
		if (ret)
			return ret;
	}
	return 0;
}

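/*
 * Implement FALLOC_FL_PUNCH_HOLE: drop all extents in the block aligned
 * part of the range, zero any partial blocks at the edges in place, and
 * insert hole file extent items unless the NO_HOLES feature is enabled.
 */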
static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_state *cached_state = NULL;
	struct btrfs_path *path;
	struct btrfs_block_rsv *rsv;
	struct btrfs_trans_handle *trans;
	u64 lockstart;
	u64 lockend;
	u64 tail_start;
	u64 tail_len;
	u64 orig_start = offset;
	u64 cur_offset;
	u64 min_size = btrfs_calc_trans_metadata_size(fs_info, 1);
	u64 drop_end;
	int ret = 0;
	int err = 0;
	unsigned int rsv_count;
	bool same_block;
	bool no_holes = btrfs_fs_incompat(fs_info, NO_HOLES);
	u64 ino_size;
	bool truncated_block = false;
	bool updated_inode = false;

	ret = btrfs_wait_ordered_range(inode, offset, len);
	if (ret)
		return ret;

	inode_lock(inode);
	ino_size = round_up(inode->i_size, fs_info->sectorsize);
	ret = find_first_non_hole(inode, &offset, &len);
	if (ret < 0)
		goto out_only_mutex;
	if (ret && !len) {
		/* Already in a large hole */
		ret = 0;
		goto out_only_mutex;
	}

	lockstart = round_up(offset, btrfs_inode_sectorsize(inode));
	lockend = round_down(offset + len,
			     btrfs_inode_sectorsize(inode)) - 1;
	same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
		== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
	/*
	 * We needn't truncate any block which is beyond the end of the file
	 * because we are sure there is no data there.  Only do this if we are
	 * in the same block and we aren't doing the entire block.
	 */
	if (same_block && len < fs_info->sectorsize) {
		if (offset < ino_size) {
			truncated_block = true;
			ret = btrfs_truncate_block(inode, offset, len, 0);
		} else {
			ret = 0;
		}
		goto out_only_mutex;
	}

	/* zero back part of the first block */
	if (offset < ino_size) {
		truncated_block = true;
		ret = btrfs_truncate_block(inode, offset, 0, 0);
		if (ret) {
			inode_unlock(inode);
			return ret;
		}
	}

	/*
	 * Check the aligned pages after the first unaligned page; if
	 * offset != orig_start, the first unaligned page and several
	 * following pages are already in holes, so the extra check can
	 * be skipped.
	 */
	if (offset == orig_start) {
		/* after truncate page, check hole again */
		len = offset + len - lockstart;
		offset = lockstart;
		ret = find_first_non_hole(inode, &offset, &len);
		if (ret < 0)
			goto out_only_mutex;
		if (ret && !len) {
			ret = 0;
			goto out_only_mutex;
		}
		lockstart = offset;
	}

	/* Check the tail unaligned part is in a hole */
	tail_start = lockend + 1;
	tail_len = offset + len - tail_start;
	if (tail_len) {
		ret = find_first_non_hole(inode, &tail_start, &tail_len);
		if (unlikely(ret < 0))
			goto out_only_mutex;
		if (!ret) {
			/* zero the front end of the last page */
			if (tail_start + tail_len < ino_size) {
				truncated_block = true;
				ret = btrfs_truncate_block(inode,
							tail_start + tail_len,
							0, 1);
				if (ret)
					goto out_only_mutex;
			}
		}
	}

	if (lockend < lockstart) {
		ret = 0;
		goto out_only_mutex;
	}

	ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
					  &cached_state);
	if (ret)
		goto out_only_mutex;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
	if (!rsv) {
		ret = -ENOMEM;
		goto out_free;
	}
	rsv->size = btrfs_calc_trans_metadata_size(fs_info, 1);
	rsv->failfast = 1;

	/*
	 * 1 - update the inode
	 * 1 - removing the extents in the range
	 * 1 - adding the hole extent if no_holes isn't set
	 */
	rsv_count = no_holes ? 2 : 3;
	trans = btrfs_start_transaction(root, rsv_count);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_free;
	}

	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
				      min_size, false);
	BUG_ON(ret);
	trans->block_rsv = rsv;

	cur_offset = lockstart;
	len = lockend - cur_offset;
	while (cur_offset < lockend) {
		ret = __btrfs_drop_extents(trans, root, inode, path,
					   cur_offset, lockend + 1,
					   &drop_end, 1, 0, 0, NULL);
		if (ret != -ENOSPC)
			break;

		trans->block_rsv = &fs_info->trans_block_rsv;

		if (cur_offset < drop_end && cur_offset < ino_size) {
			ret = fill_holes(trans, BTRFS_I(inode), path,
					cur_offset, drop_end);
			if (ret) {
				/*
				 * If we failed then we didn't insert our hole
				 * entries for the area we dropped, so now the
				 * fs is corrupted, so we must abort the
				 * transaction.
				 */
				btrfs_abort_transaction(trans, ret);
				err = ret;
				break;
			}
		}

		cur_offset = drop_end;

		ret = btrfs_update_inode(trans, root, inode);
		if (ret) {
			err = ret;
			break;
		}

		btrfs_end_transaction(trans);
		btrfs_btree_balance_dirty(fs_info);

		trans = btrfs_start_transaction(root, rsv_count);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			break;
		}

		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
					      rsv, min_size, false);
		BUG_ON(ret);	/* shouldn't happen */
		trans->block_rsv = rsv;

		ret = find_first_non_hole(inode, &cur_offset, &len);
		if (unlikely(ret < 0))
			break;
		if (ret && !len) {
			ret = 0;
			break;
		}
	}

	if (ret) {
		err = ret;
		goto out_trans;
	}

	trans->block_rsv = &fs_info->trans_block_rsv;
	/*
	 * If we are using the NO_HOLES feature we might have had already a
	 * hole that overlaps a part of the region [lockstart, lockend] and
	 * ends at (or beyond) lockend. Since we have no file extent items to
	 * represent holes, drop_end can be less than lockend and so we must
	 * make sure we have an extent map representing the existing hole (the
	 * call to __btrfs_drop_extents() might have dropped the existing extent
	 * map representing the existing hole), otherwise the fast fsync path
	 * will not record the existence of the hole region
	 * [existing_hole_start, lockend].
	 */
	if (drop_end <= lockend)
		drop_end = lockend + 1;
	/*
	 * Don't insert file hole extent item if it's for a range beyond eof
	 * (because it's useless) or if it represents a 0 bytes range (when
	 * cur_offset == drop_end).
	 */
	if (cur_offset < ino_size && cur_offset < drop_end) {
		ret = fill_holes(trans, BTRFS_I(inode), path,
				cur_offset, drop_end);
		if (ret) {
			/* Same comment as above. */
			btrfs_abort_transaction(trans, ret);
			err = ret;
			goto out_trans;
		}
	}

out_trans:
	if (!trans)
		goto out_free;

	inode_inc_iversion(inode);
	inode->i_mtime = inode->i_ctime = current_time(inode);

	trans->block_rsv = &fs_info->trans_block_rsv;
	ret = btrfs_update_inode(trans, root, inode);
	updated_inode = true;
	btrfs_end_transaction(trans);
	btrfs_btree_balance_dirty(fs_info);
out_free:
	btrfs_free_path(path);
	btrfs_free_block_rsv(fs_info, rsv);
out:
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			     &cached_state);
out_only_mutex:
	if (!updated_inode && truncated_block && !ret && !err) {
		/*
		 * If we only end up zeroing part of a page, we still need to
		 * update the inode item, so that all the time fields are
		 * updated as well as the necessary btrfs inode in memory fields
		 * for detecting, at fsync time, if the inode isn't yet in the
		 * log tree or it's there but not up to date.
		 */
		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans)) {
			err = PTR_ERR(trans);
		} else {
			err = btrfs_update_inode(trans, root, inode);
			ret = btrfs_end_transaction(trans);
		}
	}
	inode_unlock(inode);
	if (ret && !err)
		err = ret;
	return err;
}

/* Helper structure to record which range is already reserved */
struct falloc_range {
	struct list_head list;
	u64 start;
	u64 len;
};

/*
 * Helper function to add a falloc range
 *
 * Caller should have locked the larger extent range containing
 * [start, len)
 */
static int add_falloc_range(struct list_head *head, u64 start, u64 len)
{
	struct falloc_range *prev = NULL;
	struct falloc_range *range = NULL;

	if (list_empty(head))
		goto insert;

	/*
	 * As fallocate iterates in bytenr order, we only need to check
	 * the last range.
	 */
	prev = list_entry(head->prev, struct falloc_range, list);
	if (prev->start + prev->len == start) {
		prev->len += len;
		return 0;
	}
insert:
	range = kmalloc(sizeof(*range), GFP_KERNEL);
	if (!range)
		return -ENOMEM;
	range->start = start;
	range->len = len;
	list_add_tail(&range->list, head);
	return 0;
}

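/*
 * If the fallocate extended the file (and FALLOC_FL_KEEP_SIZE was not
 * given), update i_size and persist the inode item in a small transaction.
 */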
static int btrfs_fallocate_update_isize(struct inode *inode,
					const u64 end,
					const int mode)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret;
	int ret2;

	if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
		return 0;

	trans = btrfs_start_transaction(root, 1);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	inode->i_ctime = current_time(inode);
	i_size_write(inode, end);
	btrfs_ordered_update_i_size(inode, end, NULL);
	ret = btrfs_update_inode(trans, root, inode);
	ret2 = btrfs_end_transaction(trans);

	return ret ? ret : ret2;
}

enum {
	RANGE_BOUNDARY_WRITTEN_EXTENT = 0,
	RANGE_BOUNDARY_PREALLOC_EXTENT = 1,
	RANGE_BOUNDARY_HOLE = 2,
};

static int btrfs_zero_range_check_range_boundary(struct inode *inode,
						 u64 offset)
{
	const u64 sectorsize = btrfs_inode_sectorsize(inode);
	struct extent_map *em;
	int ret;

	offset = round_down(offset, sectorsize);
	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
	if (IS_ERR(em))
		return PTR_ERR(em);

	if (em->block_start == EXTENT_MAP_HOLE)
		ret = RANGE_BOUNDARY_HOLE;
	else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
		ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
	else
		ret = RANGE_BOUNDARY_WRITTEN_EXTENT;

	free_extent_map(em);
	return ret;
}

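/*
 * Implement FALLOC_FL_ZERO_RANGE: make the range read back as zeroes by
 * preallocating unwritten extents over it, zeroing partial blocks at
 * unaligned boundaries in place.
 */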
static int btrfs_zero_range(struct inode *inode,
			    loff_t offset,
			    loff_t len,
			    const int mode)
{
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	struct extent_map *em;
	struct extent_changeset *data_reserved = NULL;
	int ret;
	u64 alloc_hint = 0;
	const u64 sectorsize = btrfs_inode_sectorsize(inode);
	u64 alloc_start = round_down(offset, sectorsize);
	u64 alloc_end = round_up(offset + len, sectorsize);
	u64 bytes_to_reserve = 0;
	bool space_reserved = false;

	inode_dio_wait(inode);

	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
			      alloc_start, alloc_end - alloc_start, 0);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out;
	}

	/*
	 * Avoid hole punching and extent allocation for some cases. More cases
	 * could be considered, but these are unlikely common and we keep things
	 * as simple as possible for now. Also, intentionally, if the target
	 * range contains one or more prealloc extents together with regular
	 * extents and holes, we drop all the existing extents and allocate a
	 * new prealloc extent, so that we get a larger contiguous disk extent.
	 */
	if (em->start <= alloc_start &&
	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
		const u64 em_end = em->start + em->len;

		if (em_end >= offset + len) {
			/*
			 * The whole range is already a prealloc extent,
			 * do nothing except updating the inode's i_size if
			 * needed.
			 */
			free_extent_map(em);
			ret = btrfs_fallocate_update_isize(inode, offset + len,
							   mode);
			goto out;
		}
		/*
		 * Part of the range is already a prealloc extent, so operate
		 * only on the remaining part of the range.
		 */
		alloc_start = em_end;
		ASSERT(IS_ALIGNED(alloc_start, sectorsize));
		len = offset + len - alloc_start;
		offset = alloc_start;
		alloc_hint = em->block_start + em->len;
	}
	free_extent_map(em);

	if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
	    BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
				      alloc_start, sectorsize, 0);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}

		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
			free_extent_map(em);
			ret = btrfs_fallocate_update_isize(inode, offset + len,
							   mode);
			goto out;
		}
		if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
			free_extent_map(em);
			ret = btrfs_truncate_block(inode, offset, len, 0);
			if (!ret)
				ret = btrfs_fallocate_update_isize(inode,
								   offset + len,
								   mode);
			return ret;
		}
		free_extent_map(em);
		alloc_start = round_down(offset, sectorsize);
		alloc_end = alloc_start + sectorsize;
		goto reserve_space;
	}

	alloc_start = round_up(offset, sectorsize);
	alloc_end = round_down(offset + len, sectorsize);

	/*
	 * For unaligned ranges, check the pages at the boundaries, they might
	 * map to an extent, in which case we need to partially zero them, or
	 * they might map to a hole, in which case we need our allocation range
	 * to cover them.
	 */
	if (!IS_ALIGNED(offset, sectorsize)) {
		ret = btrfs_zero_range_check_range_boundary(inode, offset);
		if (ret < 0)
			goto out;
		if (ret == RANGE_BOUNDARY_HOLE) {
			alloc_start = round_down(offset, sectorsize);
			ret = 0;
		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
			ret = btrfs_truncate_block(inode, offset, 0, 0);
			if (ret)
				goto out;
		} else {
			ret = 0;
		}
	}

	if (!IS_ALIGNED(offset + len, sectorsize)) {
		ret = btrfs_zero_range_check_range_boundary(inode,
							    offset + len);
		if (ret < 0)
			goto out;
		if (ret == RANGE_BOUNDARY_HOLE) {
			alloc_end = round_up(offset + len, sectorsize);
			ret = 0;
		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
			ret = btrfs_truncate_block(inode, offset + len, 0, 1);
			if (ret)
				goto out;
		} else {
			ret = 0;
		}
	}

reserve_space:
	if (alloc_start < alloc_end) {
		struct extent_state *cached_state = NULL;
		const u64 lockstart = alloc_start;
		const u64 lockend = alloc_end - 1;

		bytes_to_reserve = alloc_end - alloc_start;
		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
						      bytes_to_reserve);
		if (ret < 0)
			goto out;
		space_reserved = true;
		ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
						alloc_start, bytes_to_reserve);
		if (ret)
			goto out;
		ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
						  &cached_state);
		if (ret)
			goto out;
		ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
						alloc_end - alloc_start,
						i_blocksize(inode),
						offset + len, &alloc_hint);
		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
				     lockend, &cached_state);
		/* btrfs_prealloc_file_range releases reserved space on error */
		if (ret) {
			space_reserved = false;
			goto out;
		}
	}
	ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
out:
	if (ret && space_reserved)
		btrfs_free_reserved_data_space(inode, data_reserved,
					       alloc_start, bytes_to_reserve);
	extent_changeset_free(data_reserved);

	return ret;
}

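/*
 * fallocate(2) entry point.  Preallocates unwritten extents for the
 * requested range, dispatching to btrfs_punch_hole() and btrfs_zero_range()
 * for the PUNCH_HOLE and ZERO_RANGE modes.
 */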
static long btrfs_fallocate(struct file *file, int mode,
			    loff_t offset, loff_t len)
{
	struct inode *inode = file_inode(file);
	struct extent_state *cached_state = NULL;
	struct extent_changeset *data_reserved = NULL;
	struct falloc_range *range;
	struct falloc_range *tmp;
	struct list_head reserve_list;
	u64 cur_offset;
	u64 last_byte;
	u64 alloc_start;
	u64 alloc_end;
	u64 alloc_hint = 0;
	u64 locked_end;
	u64 actual_end = 0;
	struct extent_map *em;
	int blocksize = btrfs_inode_sectorsize(inode);
	int ret;

	alloc_start = round_down(offset, blocksize);
	alloc_end = round_up(offset + len, blocksize);
	cur_offset = alloc_start;

	/* Make sure we aren't being given some crap mode */
	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
		     FALLOC_FL_ZERO_RANGE))
		return -EOPNOTSUPP;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		return btrfs_punch_hole(inode, offset, len);

	/*
	 * Only trigger disk allocation, don't trigger qgroup reserve
	 *
	 * For qgroup space, it will be checked later.
	 */
	if (!(mode & FALLOC_FL_ZERO_RANGE)) {
		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
						      alloc_end - alloc_start);
		if (ret < 0)
			return ret;
	}

	inode_lock(inode);

	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
		ret = inode_newsize_ok(inode, offset + len);
		if (ret)
			goto out;
	}

	/*
	 * TODO: Move these two operations after we have checked
	 * accurate reserved space, or fallocate can still fail but
	 * with page truncated or size expanded.
	 *
	 * But that's a minor problem and won't do much harm BTW.
	 */
	if (alloc_start > inode->i_size) {
		ret = btrfs_cont_expand(inode, i_size_read(inode),
					alloc_start);
		if (ret)
			goto out;
	} else if (offset + len > inode->i_size) {
		/*
		 * If we are fallocating from the end of the file onward we
		 * need to zero out the end of the block if i_size lands in the
		 * middle of a block.
		 */
		ret = btrfs_truncate_block(inode, inode->i_size, 0, 0);
		if (ret)
			goto out;
	}

	/*
	 * wait for ordered IO before we have any locks.  We'll loop again
	 * below with the locks held.
	 */
	ret = btrfs_wait_ordered_range(inode, alloc_start,
				       alloc_end - alloc_start);
	if (ret)
		goto out;

	if (mode & FALLOC_FL_ZERO_RANGE) {
		ret = btrfs_zero_range(inode, offset, len, mode);
		inode_unlock(inode);
		return ret;
	}

	locked_end = alloc_end - 1;
	while (1) {
		struct btrfs_ordered_extent *ordered;

		/* the extent lock is ordered inside the running
		 * transaction
		 */
		lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
				 locked_end, &cached_state);
		ordered = btrfs_lookup_first_ordered_extent(inode, locked_end);

		if (ordered &&
		    ordered->file_offset + ordered->len > alloc_start &&
		    ordered->file_offset < alloc_end) {
			btrfs_put_ordered_extent(ordered);
			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
					     alloc_start, locked_end,
					     &cached_state);
			/*
			 * we can't wait on the range with the transaction
			 * running or with the extent lock held
			 */
			ret = btrfs_wait_ordered_range(inode, alloc_start,
						       alloc_end - alloc_start);
			if (ret)
				goto out;
		} else {
			if (ordered)
				btrfs_put_ordered_extent(ordered);
			break;
		}
	}

	/* First, check if we exceed the qgroup limit */
	INIT_LIST_HEAD(&reserve_list);
	while (cur_offset < alloc_end) {
		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
				      alloc_end - cur_offset, 0);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			break;
		}
		last_byte = min(extent_map_end(em), alloc_end);
		actual_end = min_t(u64, extent_map_end(em), offset + len);
		last_byte = ALIGN(last_byte, blocksize);
		if (em->block_start == EXTENT_MAP_HOLE ||
		    (cur_offset >= inode->i_size &&
		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
			ret = add_falloc_range(&reserve_list, cur_offset,
					       last_byte - cur_offset);
			if (ret < 0) {
				free_extent_map(em);
				break;
			}
			ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
					cur_offset, last_byte - cur_offset);
			if (ret < 0) {
				free_extent_map(em);
				break;
			}
		} else {
			/*
			 * Do not need to reserve unwritten extent for this
			 * range, free reserved data space first, otherwise
			 * it'll result in false ENOSPC error.
			 */
			btrfs_free_reserved_data_space(inode, data_reserved,
					cur_offset, last_byte - cur_offset);
		}
		free_extent_map(em);
		cur_offset = last_byte;
	}

	/*
	 * If ret is still 0, means we're OK to fallocate.
	 * Or just cleanup the list and exit.
	 */
	list_for_each_entry_safe(range, tmp, &reserve_list, list) {
		if (!ret)
			ret = btrfs_prealloc_file_range(inode, mode,
					range->start,
					range->len, i_blocksize(inode),
					offset + len, &alloc_hint);
		else
			btrfs_free_reserved_data_space(inode,
					data_reserved, range->start,
					range->len);
		list_del(&range->list);
		kfree(range);
	}
	if (ret < 0)
		goto out_unlock;

	/*
	 * We didn't need to allocate any more space, but we still extended the
	 * size of the file so we need to update i_size and the inode item.
	 */
	ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
out_unlock:
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
			     &cached_state);
out:
	inode_unlock(inode);
	/* Let go of our reservation. */
	if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
		btrfs_free_reserved_data_space(inode, data_reserved,
				alloc_start, alloc_end - cur_offset);
	extent_changeset_free(data_reserved);
	return ret;
}

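/*
 * Walk the extent maps to find the first DATA or HOLE offset at or after
 * *offset, for the SEEK_DATA/SEEK_HOLE cases of llseek.  Preallocated
 * (unwritten) extents are treated as holes.
 */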
static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_map *em = NULL;
	struct extent_state *cached_state = NULL;
	u64 lockstart;
	u64 lockend;
	u64 start;
	u64 len;
	int ret = 0;

	if (inode->i_size == 0)
		return -ENXIO;

	/*
	 * *offset can be negative, in this case we start finding DATA/HOLE from
	 * the very start of the file.
	 */
	start = max_t(loff_t, 0, *offset);

	lockstart = round_down(start, fs_info->sectorsize);
	lockend = round_up(i_size_read(inode),
			   fs_info->sectorsize);
	if (lockend <= lockstart)
		lockend = lockstart + fs_info->sectorsize;
	lockend--;
	len = lockend - lockstart + 1;

	lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			 &cached_state);

	while (start < inode->i_size) {
		em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0,
				start, len, 0);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			em = NULL;
			break;
		}

		if (whence == SEEK_HOLE &&
		    (em->block_start == EXTENT_MAP_HOLE ||
		     test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
			break;
		else if (whence == SEEK_DATA &&
			   (em->block_start != EXTENT_MAP_HOLE &&
			    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
			break;

		start = em->start + em->len;
		free_extent_map(em);
		em = NULL;
		cond_resched();
	}
	free_extent_map(em);
	if (!ret) {
		if (whence == SEEK_DATA && start >= inode->i_size)
			ret = -ENXIO;
		else
			*offset = min_t(loff_t, start, inode->i_size);
	}
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
			     &cached_state);
	return ret;
}

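/*
 * llseek implementation: SEEK_END/SEEK_CUR go through generic_file_llseek,
 * while SEEK_DATA/SEEK_HOLE are answered from the extent maps.
 */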
static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;
	int ret;

	inode_lock(inode);
	switch (whence) {
	case SEEK_END:
	case SEEK_CUR:
		offset = generic_file_llseek(file, offset, whence);
		goto out;
	case SEEK_DATA:
	case SEEK_HOLE:
		if (offset >= i_size_read(inode)) {
			inode_unlock(inode);
			return -ENXIO;
		}

		ret = find_desired_extent(inode, &offset, whence);
		if (ret) {
			inode_unlock(inode);
			return ret;
		}
	}

	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out:
	inode_unlock(inode);
	return offset;
}

static int btrfs_file_open(struct inode *inode, struct file *filp)
{
	filp->f_mode |= FMODE_NOWAIT;
	return generic_file_open(inode, filp);
}

const struct file_operations btrfs_file_operations = {
	.llseek		= btrfs_file_llseek,
	.read_iter      = generic_file_read_iter,
	.splice_read	= generic_file_splice_read,
	.write_iter	= btrfs_file_write_iter,
	.mmap		= btrfs_file_mmap,
	.open		= btrfs_file_open,
	.release	= btrfs_release_file,
	.fsync		= btrfs_sync_file,
	.fallocate	= btrfs_fallocate,
	.unlocked_ioctl	= btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_compat_ioctl,
#endif
	.remap_file_range = btrfs_remap_file_range,
};

void __cold btrfs_auto_defrag_exit(void)
{
	kmem_cache_destroy(btrfs_inode_defrag_cachep);
}

int __init btrfs_auto_defrag_init(void)
{
	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
					sizeof(struct inode_defrag), 0,
					SLAB_MEM_SPREAD,
					NULL);
	if (!btrfs_inode_defrag_cachep)
		return -ENOMEM;

	return 0;
}

int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
{
	int ret;

	/*
	 * So with compression we will find and lock a dirty page and clear the
	 * first one as dirty, setup an async extent, and immediately return
	 * with the entire range locked but with nobody actually marked with
	 * writeback.  So we can't just filemap_write_and_wait_range() and
	 * expect it to work since it will just kick off a thread to do the
	 * actual work.  So we need to call filemap_fdatawrite_range _again_
	 * since it will wait on the page lock, which won't be unlocked until
	 * after the pages have been marked as writeback and so we're good to go
	 * from there.  We have to do this otherwise we'll miss the ordered
	 * extents and that results in badness.  Please Josef, do not think you
	 * know better and pull this out at some point in the future, it is
	 * right and you are wrong.
	 */
	ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
			     &BTRFS_I(inode)->runtime_flags))
		ret = filemap_fdatawrite_range(inode->i_mapping, start, end);

	return ret;
}