file.c 90.3 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
C
Chris Mason 已提交
2 3 4 5
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

C
Chris Mason 已提交
6 7 8 9 10 11 12 13
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
14
#include <linux/falloc.h>
C
Chris Mason 已提交
15 16 17
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/compat.h>
18
#include <linux/slab.h>
19
#include <linux/btrfs.h>
20
#include <linux/uio.h>
21
#include <linux/iversion.h>
C
Chris Mason 已提交
22 23 24 25 26
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
27 28
#include "tree-log.h"
#include "locking.h"
J
Josef Bacik 已提交
29
#include "volumes.h"
J
Josef Bacik 已提交
30
#include "qgroup.h"
31
#include "compression.h"
C
Chris Mason 已提交
32

33
static struct kmem_cache *btrfs_inode_defrag_cachep;
C
Chris Mason 已提交
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58
/*
 * when auto defrag is enabled we
 * queue up these defrag structs to remember which
 * inodes need defragging passes
 */
struct inode_defrag {
	struct rb_node rb_node;
	/* objectid */
	u64 ino;
	/*
	 * transid where the defrag was added, we search for
	 * extents newer than this
	 */
	u64 transid;

	/* root objectid */
	u64 root;

	/* last offset we were able to defrag */
	u64 last_offset;

	/* if we've wrapped around back to zero once already */
	int cycled;
};

59 60 61 62 63 64 65 66 67 68 69 70 71 72 73
static int __compare_inode_defrag(struct inode_defrag *defrag1,
				  struct inode_defrag *defrag2)
{
	if (defrag1->root > defrag2->root)
		return 1;
	else if (defrag1->root < defrag2->root)
		return -1;
	else if (defrag1->ino > defrag2->ino)
		return 1;
	else if (defrag1->ino < defrag2->ino)
		return -1;
	else
		return 0;
}

C
Chris Mason 已提交
74 75 76 77 78 79 80 81 82
/* pop a record for an inode into the defrag tree.  The lock
 * must be held already
 *
 * If you're inserting a record for an older transid than an
 * existing record, the transid already in the tree is lowered
 *
 * If an existing record is found the defrag item you
 * pass in is freed
 */
83
static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
C
Chris Mason 已提交
84 85
				    struct inode_defrag *defrag)
{
86
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
C
Chris Mason 已提交
87 88 89
	struct inode_defrag *entry;
	struct rb_node **p;
	struct rb_node *parent = NULL;
90
	int ret;
C
Chris Mason 已提交
91

92
	p = &fs_info->defrag_inodes.rb_node;
C
Chris Mason 已提交
93 94 95 96
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

97 98
		ret = __compare_inode_defrag(defrag, entry);
		if (ret < 0)
C
Chris Mason 已提交
99
			p = &parent->rb_left;
100
		else if (ret > 0)
C
Chris Mason 已提交
101 102 103 104 105 106 107 108 109 110
			p = &parent->rb_right;
		else {
			/* if we're reinserting an entry for
			 * an old defrag run, make sure to
			 * lower the transid of our existing record
			 */
			if (defrag->transid < entry->transid)
				entry->transid = defrag->transid;
			if (defrag->last_offset > entry->last_offset)
				entry->last_offset = defrag->last_offset;
111
			return -EEXIST;
C
Chris Mason 已提交
112 113
		}
	}
114
	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
C
Chris Mason 已提交
115
	rb_link_node(&defrag->rb_node, parent, p);
116
	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
117 118
	return 0;
}
C
Chris Mason 已提交
119

120
static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
121
{
122
	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
123 124
		return 0;

125
	if (btrfs_fs_closing(fs_info))
126
		return 0;
C
Chris Mason 已提交
127

128
	return 1;
C
Chris Mason 已提交
129 130 131 132 133 134 135
}

/*
 * insert a defrag record for this inode if auto defrag is
 * enabled
 */
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
136
			   struct btrfs_inode *inode)
C
Chris Mason 已提交
137
{
138 139
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
	struct btrfs_root *root = inode->root;
C
Chris Mason 已提交
140 141
	struct inode_defrag *defrag;
	u64 transid;
142
	int ret;
C
Chris Mason 已提交
143

144
	if (!__need_auto_defrag(fs_info))
C
Chris Mason 已提交
145 146
		return 0;

147
	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
C
Chris Mason 已提交
148 149 150 151 152
		return 0;

	if (trans)
		transid = trans->transid;
	else
153
		transid = inode->root->last_trans;
C
Chris Mason 已提交
154

155
	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
C
Chris Mason 已提交
156 157 158
	if (!defrag)
		return -ENOMEM;

159
	defrag->ino = btrfs_ino(inode);
C
Chris Mason 已提交
160 161 162
	defrag->transid = transid;
	defrag->root = root->root_key.objectid;

163
	spin_lock(&fs_info->defrag_inodes_lock);
164
	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
165 166 167 168 169 170 171 172 173
		/*
		 * If we set IN_DEFRAG flag and evict the inode from memory,
		 * and then re-read this inode, this new inode doesn't have
		 * IN_DEFRAG flag. At the case, we may find the existed defrag.
		 */
		ret = __btrfs_add_inode_defrag(inode, defrag);
		if (ret)
			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	} else {
174
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
175
	}
176
	spin_unlock(&fs_info->defrag_inodes_lock);
177
	return 0;
C
Chris Mason 已提交
178 179 180
}

/*
181 182 183
 * Requeue the defrag object. If there is a defrag object that points to
 * the same inode in the tree, we will merge them together (by
 * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
C
Chris Mason 已提交
184
 */
185
static void btrfs_requeue_inode_defrag(struct btrfs_inode *inode,
186
				       struct inode_defrag *defrag)
187
{
188
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
189 190
	int ret;

191
	if (!__need_auto_defrag(fs_info))
192 193 194 195 196 197
		goto out;

	/*
	 * Here we don't check the IN_DEFRAG flag, because we need merge
	 * them together.
	 */
198
	spin_lock(&fs_info->defrag_inodes_lock);
199
	ret = __btrfs_add_inode_defrag(inode, defrag);
200
	spin_unlock(&fs_info->defrag_inodes_lock);
201 202 203 204 205 206 207
	if (ret)
		goto out;
	return;
out:
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
}

C
Chris Mason 已提交
208
/*
209 210
 * pick the defragable inode that we want, if it doesn't exist, we will get
 * the next one.
C
Chris Mason 已提交
211
 */
212 213
static struct inode_defrag *
btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
C
Chris Mason 已提交
214 215
{
	struct inode_defrag *entry = NULL;
216
	struct inode_defrag tmp;
C
Chris Mason 已提交
217 218
	struct rb_node *p;
	struct rb_node *parent = NULL;
219 220 221 222
	int ret;

	tmp.ino = ino;
	tmp.root = root;
C
Chris Mason 已提交
223

224 225
	spin_lock(&fs_info->defrag_inodes_lock);
	p = fs_info->defrag_inodes.rb_node;
C
Chris Mason 已提交
226 227 228 229
	while (p) {
		parent = p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

230 231
		ret = __compare_inode_defrag(&tmp, entry);
		if (ret < 0)
C
Chris Mason 已提交
232
			p = parent->rb_left;
233
		else if (ret > 0)
C
Chris Mason 已提交
234 235
			p = parent->rb_right;
		else
236
			goto out;
C
Chris Mason 已提交
237 238
	}

239 240 241
	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
		parent = rb_next(parent);
		if (parent)
C
Chris Mason 已提交
242
			entry = rb_entry(parent, struct inode_defrag, rb_node);
243 244
		else
			entry = NULL;
C
Chris Mason 已提交
245
	}
246 247 248 249 250
out:
	if (entry)
		rb_erase(parent, &fs_info->defrag_inodes);
	spin_unlock(&fs_info->defrag_inodes_lock);
	return entry;
C
Chris Mason 已提交
251 252
}

253
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
C
Chris Mason 已提交
254 255
{
	struct inode_defrag *defrag;
256 257 258 259 260 261 262 263 264
	struct rb_node *node;

	spin_lock(&fs_info->defrag_inodes_lock);
	node = rb_first(&fs_info->defrag_inodes);
	while (node) {
		rb_erase(node, &fs_info->defrag_inodes);
		defrag = rb_entry(node, struct inode_defrag, rb_node);
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);

265
		cond_resched_lock(&fs_info->defrag_inodes_lock);
266 267 268 269 270 271 272 273 274 275 276

		node = rb_first(&fs_info->defrag_inodes);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
}

#define BTRFS_DEFRAG_BATCH	1024

static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
				    struct inode_defrag *defrag)
{
C
Chris Mason 已提交
277 278 279 280 281
	struct btrfs_root *inode_root;
	struct inode *inode;
	struct btrfs_key key;
	struct btrfs_ioctl_defrag_range_args range;
	int num_defrag;
282 283
	int index;
	int ret;
C
Chris Mason 已提交
284

285 286
	/* get the inode */
	key.objectid = defrag->root;
287
	key.type = BTRFS_ROOT_ITEM_KEY;
288
	key.offset = (u64)-1;
289 290 291

	index = srcu_read_lock(&fs_info->subvol_srcu);

292 293
	inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
	if (IS_ERR(inode_root)) {
294 295 296
		ret = PTR_ERR(inode_root);
		goto cleanup;
	}
297 298

	key.objectid = defrag->ino;
299
	key.type = BTRFS_INODE_ITEM_KEY;
300 301 302
	key.offset = 0;
	inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
	if (IS_ERR(inode)) {
303 304
		ret = PTR_ERR(inode);
		goto cleanup;
305
	}
306
	srcu_read_unlock(&fs_info->subvol_srcu, index);
307 308 309

	/* do a chunk of defrag */
	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
C
Chris Mason 已提交
310 311
	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;
312
	range.start = defrag->last_offset;
M
Miao Xie 已提交
313 314

	sb_start_write(fs_info->sb);
315 316
	num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
				       BTRFS_DEFRAG_BATCH);
M
Miao Xie 已提交
317
	sb_end_write(fs_info->sb);
318 319 320 321 322 323 324
	/*
	 * if we filled the whole defrag batch, there
	 * must be more work to do.  Queue this defrag
	 * again
	 */
	if (num_defrag == BTRFS_DEFRAG_BATCH) {
		defrag->last_offset = range.start;
325
		btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
326 327 328 329 330 331 332 333
	} else if (defrag->last_offset && !defrag->cycled) {
		/*
		 * we didn't fill our defrag batch, but
		 * we didn't start at zero.  Make sure we loop
		 * around to the start of the file.
		 */
		defrag->last_offset = 0;
		defrag->cycled = 1;
334
		btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
335 336 337 338 339 340
	} else {
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	}

	iput(inode);
	return 0;
341 342 343 344
cleanup:
	srcu_read_unlock(&fs_info->subvol_srcu, index);
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	return ret;
345 346 347 348 349 350 351 352 353 354 355
}

/*
 * run through the list of inodes in the FS that need
 * defragging
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	u64 first_ino = 0;
	u64 root_objectid = 0;
C
Chris Mason 已提交
356 357

	atomic_inc(&fs_info->defrag_running);
358
	while (1) {
M
Miao Xie 已提交
359 360 361 362 363
		/* Pause the auto defragger. */
		if (test_bit(BTRFS_FS_STATE_REMOUNTING,
			     &fs_info->fs_state))
			break;

364
		if (!__need_auto_defrag(fs_info))
365
			break;
C
Chris Mason 已提交
366 367

		/* find an inode to defrag */
368 369
		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
						 first_ino);
C
Chris Mason 已提交
370
		if (!defrag) {
371
			if (root_objectid || first_ino) {
372
				root_objectid = 0;
C
Chris Mason 已提交
373 374 375 376 377 378 379 380
				first_ino = 0;
				continue;
			} else {
				break;
			}
		}

		first_ino = defrag->ino + 1;
381
		root_objectid = defrag->root;
C
Chris Mason 已提交
382

383
		__btrfs_run_defrag_inode(fs_info, defrag);
C
Chris Mason 已提交
384 385 386 387 388 389 390 391 392 393
	}
	atomic_dec(&fs_info->defrag_running);

	/*
	 * during unmount, we use the transaction_wait queue to
	 * wait for the defragger to stop
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}
C
Chris Mason 已提交
394

C
Chris Mason 已提交
395 396 397
/* simple helper to fault in pages and copy.  This should go away
 * and be replaced with calls into generic code.
 */
398
static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
399
					 struct page **prepared_pages,
400
					 struct iov_iter *i)
C
Chris Mason 已提交
401
{
402
	size_t copied = 0;
J
Josef Bacik 已提交
403
	size_t total_copied = 0;
404
	int pg = 0;
405
	int offset = pos & (PAGE_SIZE - 1);
C
Chris Mason 已提交
406

407
	while (write_bytes > 0) {
C
Chris Mason 已提交
408
		size_t count = min_t(size_t,
409
				     PAGE_SIZE - offset, write_bytes);
410
		struct page *page = prepared_pages[pg];
411 412 413 414
		/*
		 * Copy data from userspace to the current page
		 */
		copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
415

C
Chris Mason 已提交
416 417
		/* Flush processor's dcache for this page */
		flush_dcache_page(page);
418 419 420 421 422 423 424 425 426 427 428 429 430

		/*
		 * if we get a partial write, we can end up with
		 * partially up to date pages.  These add
		 * a lot of complexity, so make sure they don't
		 * happen by forcing this copy to be retried.
		 *
		 * The rest of the btrfs_file_write code will fall
		 * back to page at a time copies after we return 0.
		 */
		if (!PageUptodate(page) && copied < count)
			copied = 0;

431 432
		iov_iter_advance(i, copied);
		write_bytes -= copied;
433
		total_copied += copied;
C
Chris Mason 已提交
434

A
Al Viro 已提交
435
		/* Return to btrfs_file_write_iter to fault page */
J
Josef Bacik 已提交
436
		if (unlikely(copied == 0))
437
			break;
438

439
		if (copied < PAGE_SIZE - offset) {
440 441 442 443 444
			offset += copied;
		} else {
			pg++;
			offset = 0;
		}
C
Chris Mason 已提交
445
	}
446
	return total_copied;
C
Chris Mason 已提交
447 448
}

C
Chris Mason 已提交
449 450 451
/*
 * unlocks pages after btrfs_file_write is done with them
 */
452
static void btrfs_drop_pages(struct page **pages, size_t num_pages)
C
Chris Mason 已提交
453 454 455
{
	size_t i;
	for (i = 0; i < num_pages; i++) {
C
Chris Mason 已提交
456 457
		/* page checked is some magic around finding pages that
		 * have been modified without going through btrfs_set_page_dirty
458 459 460
		 * clear it here. There should be no need to mark the pages
		 * accessed as prepare_pages should have marked them accessed
		 * in prepare_pages via find_or_create_page()
C
Chris Mason 已提交
461
		 */
C
Chris Mason 已提交
462
		ClearPageChecked(pages[i]);
C
Chris Mason 已提交
463
		unlock_page(pages[i]);
464
		put_page(pages[i]);
C
Chris Mason 已提交
465 466 467
	}
}

468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508
static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
					 const u64 start,
					 const u64 len,
					 struct extent_state **cached_state)
{
	u64 search_start = start;
	const u64 end = start + len - 1;

	while (search_start < end) {
		const u64 search_len = end - search_start + 1;
		struct extent_map *em;
		u64 em_len;
		int ret = 0;

		em = btrfs_get_extent(inode, NULL, 0, search_start,
				      search_len, 0);
		if (IS_ERR(em))
			return PTR_ERR(em);

		if (em->block_start != EXTENT_MAP_HOLE)
			goto next;

		em_len = em->len;
		if (em->start < search_start)
			em_len -= search_start - em->start;
		if (em_len > search_len)
			em_len = search_len;

		ret = set_extent_bit(&inode->io_tree, search_start,
				     search_start + em_len - 1,
				     EXTENT_DELALLOC_NEW,
				     NULL, cached_state, GFP_NOFS);
next:
		search_start = extent_map_end(em);
		free_extent_map(em);
		if (ret)
			return ret;
	}
	return 0;
}

C
Chris Mason 已提交
509 510 511 512 513 514 515 516
/*
 * after copy_from_user, pages need to be dirtied and we need to make
 * sure holes are created between the current EOF and the start of
 * any next extents (if required).
 *
 * this also makes the decision about creating an inline extent vs
 * doing real data extents, marking pages dirty and delalloc as required.
 */
517 518 519
int btrfs_dirty_pages(struct inode *inode, struct page **pages,
		      size_t num_pages, loff_t pos, size_t write_bytes,
		      struct extent_state **cached)
C
Chris Mason 已提交
520
{
521
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
C
Chris Mason 已提交
522
	int err = 0;
523
	int i;
524
	u64 num_bytes;
525 526 527 528
	u64 start_pos;
	u64 end_of_last_block;
	u64 end_pos = pos + write_bytes;
	loff_t isize = i_size_read(inode);
529
	unsigned int extra_bits = 0;
C
Chris Mason 已提交
530

531
	start_pos = pos & ~((u64) fs_info->sectorsize - 1);
532
	num_bytes = round_up(write_bytes + pos - start_pos,
533
			     fs_info->sectorsize);
C
Chris Mason 已提交
534

535
	end_of_last_block = start_pos + num_bytes - 1;
536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554

	if (!btrfs_is_free_space_inode(BTRFS_I(inode))) {
		if (start_pos >= isize &&
		    !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) {
			/*
			 * There can't be any extents following eof in this case
			 * so just set the delalloc new bit for the range
			 * directly.
			 */
			extra_bits |= EXTENT_DELALLOC_NEW;
		} else {
			err = btrfs_find_new_delalloc_bytes(BTRFS_I(inode),
							    start_pos,
							    num_bytes, cached);
			if (err)
				return err;
		}
	}

555
	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
556
					extra_bits, cached, 0);
J
Josef Bacik 已提交
557 558
	if (err)
		return err;
J
Josef Bacik 已提交
559

C
Chris Mason 已提交
560 561 562 563 564
	for (i = 0; i < num_pages; i++) {
		struct page *p = pages[i];
		SetPageUptodate(p);
		ClearPageChecked(p);
		set_page_dirty(p);
565
	}
J
Josef Bacik 已提交
566 567 568 569 570 571 572

	/*
	 * we've only changed i_size in ram, and we haven't updated
	 * the disk i_size.  There is no need to log the inode
	 * at this time.
	 */
	if (end_pos > isize)
573
		i_size_write(inode, end_pos);
574
	return 0;
C
Chris Mason 已提交
575 576
}

C
Chris Mason 已提交
577 578 579 580
/*
 * this drops all the extents in the cache that intersect the range
 * [start, end].  Existing extents are split as required.
 */
581
void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
582
			     int skip_pinned)
583 584
{
	struct extent_map *em;
585 586
	struct extent_map *split = NULL;
	struct extent_map *split2 = NULL;
587
	struct extent_map_tree *em_tree = &inode->extent_tree;
588
	u64 len = end - start + 1;
J
Josef Bacik 已提交
589
	u64 gen;
590 591
	int ret;
	int testend = 1;
592
	unsigned long flags;
C
Chris Mason 已提交
593
	int compressed = 0;
J
Josef Bacik 已提交
594
	bool modified;
595

596
	WARN_ON(end < start);
597
	if (end == (u64)-1) {
598
		len = (u64)-1;
599 600
		testend = 0;
	}
C
Chris Mason 已提交
601
	while (1) {
602 603
		int no_splits = 0;

J
Josef Bacik 已提交
604
		modified = false;
605
		if (!split)
606
			split = alloc_extent_map();
607
		if (!split2)
608
			split2 = alloc_extent_map();
609 610
		if (!split || !split2)
			no_splits = 1;
611

612
		write_lock(&em_tree->lock);
613
		em = lookup_extent_mapping(em_tree, start, len);
614
		if (!em) {
615
			write_unlock(&em_tree->lock);
616
			break;
617
		}
618
		flags = em->flags;
J
Josef Bacik 已提交
619
		gen = em->generation;
620
		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
621
			if (testend && em->start + em->len >= start + len) {
622
				free_extent_map(em);
C
Chris Mason 已提交
623
				write_unlock(&em_tree->lock);
624 625
				break;
			}
626 627
			start = em->start + em->len;
			if (testend)
628 629
				len = start + len - (em->start + em->len);
			free_extent_map(em);
C
Chris Mason 已提交
630
			write_unlock(&em_tree->lock);
631 632
			continue;
		}
C
Chris Mason 已提交
633
		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
634
		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
L
Liu Bo 已提交
635
		clear_bit(EXTENT_FLAG_LOGGING, &flags);
J
Josef Bacik 已提交
636
		modified = !list_empty(&em->list);
637 638
		if (no_splits)
			goto next;
639

640
		if (em->start < start) {
641 642
			split->start = em->start;
			split->len = start - em->start;
643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662

			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
				split->orig_start = em->orig_start;
				split->block_start = em->block_start;

				if (compressed)
					split->block_len = em->block_len;
				else
					split->block_len = split->len;
				split->orig_block_len = max(split->block_len,
						em->orig_block_len);
				split->ram_bytes = em->ram_bytes;
			} else {
				split->orig_start = split->start;
				split->block_len = 0;
				split->block_start = em->block_start;
				split->orig_block_len = 0;
				split->ram_bytes = split->len;
			}

J
Josef Bacik 已提交
663
			split->generation = gen;
664
			split->bdev = em->bdev;
665
			split->flags = flags;
666
			split->compress_type = em->compress_type;
667
			replace_extent_mapping(em_tree, em, split, modified);
668 669 670 671
			free_extent_map(split);
			split = split2;
			split2 = NULL;
		}
672
		if (testend && em->start + em->len > start + len) {
673 674 675 676 677
			u64 diff = start + len - em->start;

			split->start = start + len;
			split->len = em->start + em->len - (start + len);
			split->bdev = em->bdev;
678
			split->flags = flags;
679
			split->compress_type = em->compress_type;
J
Josef Bacik 已提交
680
			split->generation = gen;
681 682 683

			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
				split->orig_block_len = max(em->block_len,
684
						    em->orig_block_len);
685

686 687 688 689 690 691 692 693 694 695 696
				split->ram_bytes = em->ram_bytes;
				if (compressed) {
					split->block_len = em->block_len;
					split->block_start = em->block_start;
					split->orig_start = em->orig_start;
				} else {
					split->block_len = split->len;
					split->block_start = em->block_start
						+ diff;
					split->orig_start = em->orig_start;
				}
C
Chris Mason 已提交
697
			} else {
698 699 700 701 702
				split->ram_bytes = split->len;
				split->orig_start = split->start;
				split->block_len = 0;
				split->block_start = em->block_start;
				split->orig_block_len = 0;
C
Chris Mason 已提交
703
			}
704

705 706 707 708 709 710 711 712
			if (extent_map_in_tree(em)) {
				replace_extent_mapping(em_tree, em, split,
						       modified);
			} else {
				ret = add_extent_mapping(em_tree, split,
							 modified);
				ASSERT(ret == 0); /* Logic error */
			}
713 714 715
			free_extent_map(split);
			split = NULL;
		}
716
next:
717 718
		if (extent_map_in_tree(em))
			remove_extent_mapping(em_tree, em);
719
		write_unlock(&em_tree->lock);
720

721 722 723 724 725
		/* once for us */
		free_extent_map(em);
		/* once for the tree*/
		free_extent_map(em);
	}
726 727 728 729
	if (split)
		free_extent_map(split);
	if (split2)
		free_extent_map(split2);
730 731
}

C
Chris Mason 已提交
732 733 734 735 736 737 738 739 740
/*
 * this is very complex, but the basic idea is to drop all extents
 * in the range start - end.  hint_block is filled in with a block number
 * that would be a good hint to the block allocator for this file.
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split.  Anything entirely inside the range
 * is deleted from the tree.
 */
J
Josef Bacik 已提交
741 742 743
int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root, struct inode *inode,
			 struct btrfs_path *path, u64 start, u64 end,
744 745 746 747
			 u64 *drop_end, int drop_cache,
			 int replace_extent,
			 u32 extent_item_size,
			 int *key_inserted)
C
Chris Mason 已提交
748
{
749
	struct btrfs_fs_info *fs_info = root->fs_info;
750
	struct extent_buffer *leaf;
Y
Yan, Zheng 已提交
751
	struct btrfs_file_extent_item *fi;
752
	struct btrfs_key key;
Y
Yan, Zheng 已提交
753
	struct btrfs_key new_key;
754
	u64 ino = btrfs_ino(BTRFS_I(inode));
Y
Yan, Zheng 已提交
755 756 757 758 759
	u64 search_start = start;
	u64 disk_bytenr = 0;
	u64 num_bytes = 0;
	u64 extent_offset = 0;
	u64 extent_end = 0;
J
Josef Bacik 已提交
760
	u64 last_end = start;
Y
Yan, Zheng 已提交
761 762 763
	int del_nr = 0;
	int del_slot = 0;
	int extent_type;
C
Chris Mason 已提交
764
	int recow;
765
	int ret;
766
	int modify_tree = -1;
767
	int update_refs;
768
	int found = 0;
769
	int leafs_visited = 0;
C
Chris Mason 已提交
770

C
Chris Mason 已提交
771
	if (drop_cache)
772
		btrfs_drop_extent_cache(BTRFS_I(inode), start, end - 1, 0);
773

774
	if (start >= BTRFS_I(inode)->disk_i_size && !replace_extent)
775 776
		modify_tree = 0;

777
	update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
778
		       root == fs_info->tree_root);
C
Chris Mason 已提交
779
	while (1) {
C
Chris Mason 已提交
780
		recow = 0;
L
Li Zefan 已提交
781
		ret = btrfs_lookup_file_extent(trans, root, path, ino,
782
					       search_start, modify_tree);
C
Chris Mason 已提交
783
		if (ret < 0)
Y
Yan, Zheng 已提交
784 785 786 787
			break;
		if (ret > 0 && path->slots[0] > 0 && search_start == start) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
L
Li Zefan 已提交
788
			if (key.objectid == ino &&
Y
Yan, Zheng 已提交
789 790
			    key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
C
Chris Mason 已提交
791
		}
Y
Yan, Zheng 已提交
792
		ret = 0;
793
		leafs_visited++;
794
next_slot:
795
		leaf = path->nodes[0];
Y
Yan, Zheng 已提交
796 797 798 799 800 801 802 803
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			BUG_ON(del_nr > 0);
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				break;
			if (ret > 0) {
				ret = 0;
				break;
804
			}
805
			leafs_visited++;
Y
Yan, Zheng 已提交
806 807 808 809 810
			leaf = path->nodes[0];
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
811 812 813 814 815 816 817 818 819 820

		if (key.objectid > ino)
			break;
		if (WARN_ON_ONCE(key.objectid < ino) ||
		    key.type < BTRFS_EXTENT_DATA_KEY) {
			ASSERT(del_nr == 0);
			path->slots[0]++;
			goto next_slot;
		}
		if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
Y
Yan, Zheng 已提交
821 822 823 824 825 826 827 828 829 830 831 832 833 834 835
			break;

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = key.offset +
836 837
				btrfs_file_extent_inline_len(leaf,
						     path->slots[0], fi);
838
		} else {
839 840
			/* can't happen */
			BUG();
C
Chris Mason 已提交
841 842
		}

843 844 845 846 847 848 849 850 851
		/*
		 * Don't skip extent items representing 0 byte lengths. They
		 * used to be created (bug) if while punching holes we hit
		 * -ENOSPC condition. So if we find one here, just ensure we
		 * delete it, otherwise we would insert a new file extent item
		 * with the same key (offset) as that 0 bytes length file
		 * extent item in the call to setup_items_for_insert() later
		 * in this function.
		 */
J
Josef Bacik 已提交
852 853
		if (extent_end == key.offset && extent_end >= search_start) {
			last_end = extent_end;
854
			goto delete_extent_item;
J
Josef Bacik 已提交
855
		}
856

Y
Yan, Zheng 已提交
857 858
		if (extent_end <= search_start) {
			path->slots[0]++;
859
			goto next_slot;
C
Chris Mason 已提交
860 861
		}

862
		found = 1;
Y
Yan, Zheng 已提交
863
		search_start = max(key.offset, start);
864 865
		if (recow || !modify_tree) {
			modify_tree = -1;
866
			btrfs_release_path(path);
Y
Yan, Zheng 已提交
867
			continue;
C
Chris Mason 已提交
868
		}
Y
Yan Zheng 已提交
869

Y
Yan, Zheng 已提交
870 871 872 873 874 875
		/*
		 *     | - range to drop - |
		 *  | -------- extent -------- |
		 */
		if (start > key.offset && end < extent_end) {
			BUG_ON(del_nr > 0);
876
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
877
				ret = -EOPNOTSUPP;
878 879
				break;
			}
Y
Yan, Zheng 已提交
880 881 882 883 884 885

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = start;
			ret = btrfs_duplicate_item(trans, root, path,
						   &new_key);
			if (ret == -EAGAIN) {
886
				btrfs_release_path(path);
Y
Yan, Zheng 已提交
887
				continue;
Y
Yan Zheng 已提交
888
			}
Y
Yan, Zheng 已提交
889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906
			if (ret < 0)
				break;

			leaf = path->nodes[0];
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);

			extent_offset += start - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - start);
			btrfs_mark_buffer_dirty(leaf);

J
Josef Bacik 已提交
907
			if (update_refs && disk_bytenr > 0) {
908
				ret = btrfs_inc_extent_ref(trans, root,
Y
Yan, Zheng 已提交
909 910 911
						disk_bytenr, num_bytes, 0,
						root->root_key.objectid,
						new_key.objectid,
912
						start - extent_offset);
913
				BUG_ON(ret); /* -ENOMEM */
914
			}
Y
Yan, Zheng 已提交
915
			key.offset = start;
Y
Yan Zheng 已提交
916
		}
J
Josef Bacik 已提交
917 918 919 920 921 922
		/*
		 * From here on out we will have actually dropped something, so
		 * last_end can be updated.
		 */
		last_end = extent_end;

Y
Yan, Zheng 已提交
923 924 925 926 927
		/*
		 *  | ---- range to drop ----- |
		 *      | -------- extent -------- |
		 */
		if (start <= key.offset && end < extent_end) {
928
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
929
				ret = -EOPNOTSUPP;
930 931
				break;
			}
Y
Yan Zheng 已提交
932

Y
Yan, Zheng 已提交
933 934
			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = end;
935
			btrfs_set_item_key_safe(fs_info, path, &new_key);
Y
Yan Zheng 已提交
936

Y
Yan, Zheng 已提交
937 938 939 940 941
			extent_offset += end - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - end);
			btrfs_mark_buffer_dirty(leaf);
942
			if (update_refs && disk_bytenr > 0)
Y
Yan, Zheng 已提交
943 944
				inode_sub_bytes(inode, end - key.offset);
			break;
C
Chris Mason 已提交
945
		}
946

Y
Yan, Zheng 已提交
947 948 949 950 951 952 953
		search_start = extent_end;
		/*
		 *       | ---- range to drop ----- |
		 *  | -------- extent -------- |
		 */
		if (start > key.offset && end >= extent_end) {
			BUG_ON(del_nr > 0);
954
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
955
				ret = -EOPNOTSUPP;
956 957
				break;
			}
958

Y
Yan, Zheng 已提交
959 960 961
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);
			btrfs_mark_buffer_dirty(leaf);
962
			if (update_refs && disk_bytenr > 0)
Y
Yan, Zheng 已提交
963 964 965
				inode_sub_bytes(inode, extent_end - start);
			if (end == extent_end)
				break;
C
Chris Mason 已提交
966

Y
Yan, Zheng 已提交
967 968
			path->slots[0]++;
			goto next_slot;
Z
Zheng Yan 已提交
969 970
		}

Y
Yan, Zheng 已提交
971 972 973 974 975
		/*
		 *  | ---- range to drop ----- |
		 *    | ------ extent ------ |
		 */
		if (start <= key.offset && end >= extent_end) {
976
delete_extent_item:
Y
Yan, Zheng 已提交
977 978 979 980 981 982 983
			if (del_nr == 0) {
				del_slot = path->slots[0];
				del_nr = 1;
			} else {
				BUG_ON(del_slot + del_nr != path->slots[0]);
				del_nr++;
			}
Z
Zheng Yan 已提交
984

J
Josef Bacik 已提交
985 986
			if (update_refs &&
			    extent_type == BTRFS_FILE_EXTENT_INLINE) {
987
				inode_sub_bytes(inode,
Y
Yan, Zheng 已提交
988 989
						extent_end - key.offset);
				extent_end = ALIGN(extent_end,
990
						   fs_info->sectorsize);
J
Josef Bacik 已提交
991
			} else if (update_refs && disk_bytenr > 0) {
992
				ret = btrfs_free_extent(trans, root,
Y
Yan, Zheng 已提交
993 994
						disk_bytenr, num_bytes, 0,
						root->root_key.objectid,
995
						key.objectid, key.offset -
996
						extent_offset);
997
				BUG_ON(ret); /* -ENOMEM */
Y
Yan, Zheng 已提交
998 999
				inode_sub_bytes(inode,
						extent_end - key.offset);
Z
Zheng Yan 已提交
1000 1001
			}

Y
Yan, Zheng 已提交
1002 1003 1004 1005 1006 1007 1008 1009 1010 1011
			if (end == extent_end)
				break;

			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
				path->slots[0]++;
				goto next_slot;
			}

			ret = btrfs_del_items(trans, root, path, del_slot,
					      del_nr);
1012
			if (ret) {
1013
				btrfs_abort_transaction(trans, ret);
J
Josef Bacik 已提交
1014
				break;
1015
			}
Y
Yan, Zheng 已提交
1016 1017 1018 1019

			del_nr = 0;
			del_slot = 0;

1020
			btrfs_release_path(path);
Y
Yan, Zheng 已提交
1021
			continue;
C
Chris Mason 已提交
1022
		}
Y
Yan, Zheng 已提交
1023 1024

		BUG_ON(1);
C
Chris Mason 已提交
1025
	}
Y
Yan, Zheng 已提交
1026

1027
	if (!ret && del_nr > 0) {
1028 1029 1030 1031
		/*
		 * Set path->slots[0] to first slot, so that after the delete
		 * if items are move off from our leaf to its immediate left or
		 * right neighbor leafs, we end up with a correct and adjusted
1032
		 * path->slots[0] for our insertion (if replace_extent != 0).
1033 1034
		 */
		path->slots[0] = del_slot;
Y
Yan, Zheng 已提交
1035
		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
1036
		if (ret)
1037
			btrfs_abort_transaction(trans, ret);
1038
	}
1039

1040 1041 1042 1043 1044 1045 1046 1047 1048
	leaf = path->nodes[0];
	/*
	 * If btrfs_del_items() was called, it might have deleted a leaf, in
	 * which case it unlocked our path, so check path->locks[0] matches a
	 * write lock.
	 */
	if (!ret && replace_extent && leafs_visited == 1 &&
	    (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
	     path->locks[0] == BTRFS_WRITE_LOCK) &&
1049
	    btrfs_leaf_free_space(fs_info, leaf) >=
1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060
	    sizeof(struct btrfs_item) + extent_item_size) {

		key.objectid = ino;
		key.type = BTRFS_EXTENT_DATA_KEY;
		key.offset = start;
		if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
			struct btrfs_key slot_key;

			btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
			if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
				path->slots[0]++;
1061
		}
1062 1063 1064 1065 1066 1067
		setup_items_for_insert(root, path, &key,
				       &extent_item_size,
				       extent_item_size,
				       sizeof(struct btrfs_item) +
				       extent_item_size, 1);
		*key_inserted = 1;
Y
Yan Zheng 已提交
1068
	}
Y
Yan, Zheng 已提交
1069

1070 1071
	if (!replace_extent || !(*key_inserted))
		btrfs_release_path(path);
J
Josef Bacik 已提交
1072
	if (drop_end)
J
Josef Bacik 已提交
1073
		*drop_end = found ? min(end, last_end) : end;
J
Josef Bacik 已提交
1074 1075 1076 1077 1078
	return ret;
}

int btrfs_drop_extents(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct inode *inode, u64 start,
1079
		       u64 end, int drop_cache)
J
Josef Bacik 已提交
1080 1081 1082 1083 1084 1085 1086
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
J
Josef Bacik 已提交
1087
	ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
1088
				   drop_cache, 0, 0, NULL);
Y
Yan, Zheng 已提交
1089
	btrfs_free_path(path);
C
Chris Mason 已提交
1090 1091 1092
	return ret;
}

Y
Yan Zheng 已提交
1093
static int extent_mergeable(struct extent_buffer *leaf, int slot,
1094 1095
			    u64 objectid, u64 bytenr, u64 orig_offset,
			    u64 *start, u64 *end)
Y
Yan Zheng 已提交
1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110
{
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	u64 extent_end;

	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
		return 0;

	btrfs_item_key_to_cpu(leaf, &key, slot);
	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
		return 0;

	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
1111
	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
Y
Yan Zheng 已提交
1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133
	    btrfs_file_extent_compression(leaf, fi) ||
	    btrfs_file_extent_encryption(leaf, fi) ||
	    btrfs_file_extent_other_encoding(leaf, fi))
		return 0;

	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if ((*start && *start != key.offset) || (*end && *end != extent_end))
		return 0;

	*start = key.offset;
	*end = extent_end;
	return 1;
}

/*
 * Mark extent in the range start - end as written.
 *
 * This changes extent type from 'pre-allocated' to 'regular'. If only
 * part of extent is marked as written, the extent will be split into
 * two or three.
 */
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
1134
			      struct btrfs_inode *inode, u64 start, u64 end)
Y
Yan Zheng 已提交
1135
{
1136 1137
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
	struct btrfs_root *root = inode->root;
Y
Yan Zheng 已提交
1138 1139 1140 1141
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
Y
Yan, Zheng 已提交
1142
	struct btrfs_key new_key;
Y
Yan Zheng 已提交
1143 1144 1145
	u64 bytenr;
	u64 num_bytes;
	u64 extent_end;
1146
	u64 orig_offset;
Y
Yan Zheng 已提交
1147 1148
	u64 other_start;
	u64 other_end;
Y
Yan, Zheng 已提交
1149 1150 1151
	u64 split;
	int del_nr = 0;
	int del_slot = 0;
1152
	int recow;
Y
Yan Zheng 已提交
1153
	int ret;
1154
	u64 ino = btrfs_ino(inode);
Y
Yan Zheng 已提交
1155 1156

	path = btrfs_alloc_path();
1157 1158
	if (!path)
		return -ENOMEM;
Y
Yan Zheng 已提交
1159
again:
1160
	recow = 0;
Y
Yan, Zheng 已提交
1161
	split = start;
L
Li Zefan 已提交
1162
	key.objectid = ino;
Y
Yan Zheng 已提交
1163
	key.type = BTRFS_EXTENT_DATA_KEY;
Y
Yan, Zheng 已提交
1164
	key.offset = split;
Y
Yan Zheng 已提交
1165 1166

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1167 1168
	if (ret < 0)
		goto out;
Y
Yan Zheng 已提交
1169 1170 1171 1172 1173
	if (ret > 0 && path->slots[0] > 0)
		path->slots[0]--;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1174 1175 1176 1177 1178 1179
	if (key.objectid != ino ||
	    key.type != BTRFS_EXTENT_DATA_KEY) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
Y
Yan Zheng 已提交
1180 1181
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
1182 1183 1184 1185 1186
	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
Y
Yan Zheng 已提交
1187
	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
1188 1189 1190 1191 1192
	if (key.offset > start || extent_end < end) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
Y
Yan Zheng 已提交
1193 1194 1195

	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1196
	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
1197 1198 1199 1200 1201 1202
	memcpy(&new_key, &key, sizeof(new_key));

	if (start == key.offset && end < extent_end) {
		other_start = 0;
		other_end = start;
		if (extent_mergeable(leaf, path->slots[0] - 1,
L
Li Zefan 已提交
1203
				     ino, bytenr, orig_offset,
1204 1205
				     &other_start, &other_end)) {
			new_key.offset = end;
1206
			btrfs_set_item_key_safe(fs_info, path, &new_key);
1207 1208
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
1209 1210
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
1211 1212 1213 1214 1215 1216
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - end);
			btrfs_set_file_extent_offset(leaf, fi,
						     end - orig_offset);
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
1217 1218
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229
			btrfs_set_file_extent_num_bytes(leaf, fi,
							end - other_start);
			btrfs_mark_buffer_dirty(leaf);
			goto out;
		}
	}

	if (start > key.offset && end == extent_end) {
		other_start = end;
		other_end = 0;
		if (extent_mergeable(leaf, path->slots[0] + 1,
L
Li Zefan 已提交
1230
				     ino, bytenr, orig_offset,
1231 1232 1233 1234 1235
				     &other_start, &other_end)) {
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);
1236 1237
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
1238 1239
			path->slots[0]++;
			new_key.offset = start;
1240
			btrfs_set_item_key_safe(fs_info, path, &new_key);
1241 1242 1243

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
1244 1245
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
1246 1247 1248 1249 1250 1251 1252 1253
			btrfs_set_file_extent_num_bytes(leaf, fi,
							other_end - start);
			btrfs_set_file_extent_offset(leaf, fi,
						     start - orig_offset);
			btrfs_mark_buffer_dirty(leaf);
			goto out;
		}
	}
Y
Yan Zheng 已提交
1254

Y
Yan, Zheng 已提交
1255 1256 1257 1258 1259 1260 1261
	while (start > key.offset || end < extent_end) {
		if (key.offset == start)
			split = end;

		new_key.offset = split;
		ret = btrfs_duplicate_item(trans, root, path, &new_key);
		if (ret == -EAGAIN) {
1262
			btrfs_release_path(path);
Y
Yan, Zheng 已提交
1263
			goto again;
Y
Yan Zheng 已提交
1264
		}
1265
		if (ret < 0) {
1266
			btrfs_abort_transaction(trans, ret);
1267 1268
			goto out;
		}
Y
Yan Zheng 已提交
1269

Y
Yan, Zheng 已提交
1270 1271
		leaf = path->nodes[0];
		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
Y
Yan Zheng 已提交
1272
				    struct btrfs_file_extent_item);
1273
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
Y
Yan Zheng 已提交
1274
		btrfs_set_file_extent_num_bytes(leaf, fi,
Y
Yan, Zheng 已提交
1275 1276 1277 1278 1279
						split - key.offset);

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);

1280
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
Y
Yan, Zheng 已提交
1281 1282 1283
		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - split);
Y
Yan Zheng 已提交
1284 1285
		btrfs_mark_buffer_dirty(leaf);

1286
		ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
1287
					   0, root->root_key.objectid,
1288
					   ino, orig_offset);
1289 1290 1291 1292
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
Y
Yan Zheng 已提交
1293

Y
Yan, Zheng 已提交
1294 1295 1296
		if (split == start) {
			key.offset = start;
		} else {
1297 1298 1299 1300 1301
			if (start != key.offset) {
				ret = -EINVAL;
				btrfs_abort_transaction(trans, ret);
				goto out;
			}
Y
Yan Zheng 已提交
1302
			path->slots[0]--;
Y
Yan, Zheng 已提交
1303
			extent_end = end;
Y
Yan Zheng 已提交
1304
		}
1305
		recow = 1;
Y
Yan Zheng 已提交
1306 1307
	}

Y
Yan, Zheng 已提交
1308 1309
	other_start = end;
	other_end = 0;
1310
	if (extent_mergeable(leaf, path->slots[0] + 1,
L
Li Zefan 已提交
1311
			     ino, bytenr, orig_offset,
1312 1313
			     &other_start, &other_end)) {
		if (recow) {
1314
			btrfs_release_path(path);
1315 1316
			goto again;
		}
Y
Yan, Zheng 已提交
1317 1318 1319
		extent_end = other_end;
		del_slot = path->slots[0] + 1;
		del_nr++;
1320
		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
Y
Yan, Zheng 已提交
1321
					0, root->root_key.objectid,
1322
					ino, orig_offset);
1323 1324 1325 1326
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
Y
Yan Zheng 已提交
1327
	}
Y
Yan, Zheng 已提交
1328 1329
	other_start = 0;
	other_end = start;
1330
	if (extent_mergeable(leaf, path->slots[0] - 1,
L
Li Zefan 已提交
1331
			     ino, bytenr, orig_offset,
1332 1333
			     &other_start, &other_end)) {
		if (recow) {
1334
			btrfs_release_path(path);
1335 1336
			goto again;
		}
Y
Yan, Zheng 已提交
1337 1338 1339
		key.offset = other_start;
		del_slot = path->slots[0];
		del_nr++;
1340
		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
Y
Yan, Zheng 已提交
1341
					0, root->root_key.objectid,
1342
					ino, orig_offset);
1343 1344 1345 1346
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
Y
Yan, Zheng 已提交
1347 1348
	}
	if (del_nr == 0) {
1349 1350
		fi = btrfs_item_ptr(leaf, path->slots[0],
			   struct btrfs_file_extent_item);
Y
Yan, Zheng 已提交
1351 1352
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
1353
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
Y
Yan, Zheng 已提交
1354
		btrfs_mark_buffer_dirty(leaf);
1355
	} else {
1356 1357
		fi = btrfs_item_ptr(leaf, del_slot - 1,
			   struct btrfs_file_extent_item);
1358 1359
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
1360
		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1361 1362 1363
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - key.offset);
		btrfs_mark_buffer_dirty(leaf);
Y
Yan, Zheng 已提交
1364

1365
		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
1366
		if (ret < 0) {
1367
			btrfs_abort_transaction(trans, ret);
1368 1369
			goto out;
		}
1370
	}
Y
Yan, Zheng 已提交
1371
out:
Y
Yan Zheng 已提交
1372 1373 1374 1375
	btrfs_free_path(path);
	return 0;
}

1376 1377 1378 1379
/*
 * on error we return an unlocked page and the error value
 * on success we return a locked page and 0
 */
1380 1381
static int prepare_uptodate_page(struct inode *inode,
				 struct page *page, u64 pos,
1382
				 bool force_uptodate)
1383 1384 1385
{
	int ret = 0;

1386
	if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
1387
	    !PageUptodate(page)) {
1388 1389 1390 1391 1392 1393 1394 1395
		ret = btrfs_readpage(NULL, page);
		if (ret)
			return ret;
		lock_page(page);
		if (!PageUptodate(page)) {
			unlock_page(page);
			return -EIO;
		}
1396 1397 1398 1399
		if (page->mapping != inode->i_mapping) {
			unlock_page(page);
			return -EAGAIN;
		}
1400 1401 1402 1403
	}
	return 0;
}

C
Chris Mason 已提交
1404
/*
1405
 * this just gets pages into the page cache and locks them down.
C
Chris Mason 已提交
1406
 */
1407 1408 1409
static noinline int prepare_pages(struct inode *inode, struct page **pages,
				  size_t num_pages, loff_t pos,
				  size_t write_bytes, bool force_uptodate)
C
Chris Mason 已提交
1410 1411
{
	int i;
1412
	unsigned long index = pos >> PAGE_SHIFT;
1413
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1414
	int err = 0;
1415
	int faili;
1416

C
Chris Mason 已提交
1417
	for (i = 0; i < num_pages; i++) {
1418
again:
1419
		pages[i] = find_or_create_page(inode->i_mapping, index + i,
1420
					       mask | __GFP_WRITE);
C
Chris Mason 已提交
1421
		if (!pages[i]) {
1422 1423 1424 1425 1426 1427
			faili = i - 1;
			err = -ENOMEM;
			goto fail;
		}

		if (i == 0)
1428
			err = prepare_uptodate_page(inode, pages[i], pos,
1429
						    force_uptodate);
1430 1431
		if (!err && i == num_pages - 1)
			err = prepare_uptodate_page(inode, pages[i],
1432
						    pos + write_bytes, false);
1433
		if (err) {
1434
			put_page(pages[i]);
1435 1436 1437 1438
			if (err == -EAGAIN) {
				err = 0;
				goto again;
			}
1439 1440
			faili = i - 1;
			goto fail;
C
Chris Mason 已提交
1441
		}
C
Chris Mason 已提交
1442
		wait_on_page_writeback(pages[i]);
C
Chris Mason 已提交
1443
	}
1444 1445 1446 1447 1448

	return 0;
fail:
	while (faili >= 0) {
		unlock_page(pages[faili]);
1449
		put_page(pages[faili]);
1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466
		faili--;
	}
	return err;

}

/*
 * This function locks the extent and properly waits for data=ordered extents
 * to finish before allowing the pages to be modified if need.
 *
 * The return value:
 * 1 - the extent is locked
 * 0 - the extent is not locked, and everything is OK
 * -EAGAIN - need re-prepare the pages
 * the other < 0 number - Something wrong happens
 */
static noinline int
1467
lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
1468
				size_t num_pages, loff_t pos,
1469
				size_t write_bytes,
1470 1471 1472
				u64 *lockstart, u64 *lockend,
				struct extent_state **cached_state)
{
1473
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1474 1475 1476 1477 1478
	u64 start_pos;
	u64 last_pos;
	int i;
	int ret = 0;

1479
	start_pos = round_down(pos, fs_info->sectorsize);
1480
	last_pos = start_pos
1481
		+ round_up(pos + write_bytes - start_pos,
1482
			   fs_info->sectorsize) - 1;
1483

1484
	if (start_pos < inode->vfs_inode.i_size) {
1485
		struct btrfs_ordered_extent *ordered;
1486

1487 1488
		lock_extent_bits(&inode->io_tree, start_pos, last_pos,
				cached_state);
1489 1490
		ordered = btrfs_lookup_ordered_range(inode, start_pos,
						     last_pos - start_pos + 1);
1491 1492
		if (ordered &&
		    ordered->file_offset + ordered->len > start_pos &&
1493
		    ordered->file_offset <= last_pos) {
1494
			unlock_extent_cached(&inode->io_tree, start_pos,
1495
					last_pos, cached_state);
1496 1497
			for (i = 0; i < num_pages; i++) {
				unlock_page(pages[i]);
1498
				put_page(pages[i]);
1499
			}
1500 1501
			btrfs_start_ordered_extent(&inode->vfs_inode,
					ordered, 1);
1502 1503
			btrfs_put_ordered_extent(ordered);
			return -EAGAIN;
1504 1505 1506
		}
		if (ordered)
			btrfs_put_ordered_extent(ordered);
1507 1508 1509
		clear_extent_bit(&inode->io_tree, start_pos, last_pos,
				 EXTENT_DIRTY | EXTENT_DELALLOC |
				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
1510
				 0, 0, cached_state);
1511 1512 1513
		*lockstart = start_pos;
		*lockend = last_pos;
		ret = 1;
1514
	}
1515

1516
	for (i = 0; i < num_pages; i++) {
1517 1518
		if (clear_page_dirty_for_io(pages[i]))
			account_page_redirty(pages[i]);
1519 1520 1521
		set_page_extent_mapped(pages[i]);
		WARN_ON(!PageLocked(pages[i]));
	}
1522

1523
	return ret;
C
Chris Mason 已提交
1524 1525
}

1526
static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
1527 1528
				    size_t *write_bytes)
{
1529 1530
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
	struct btrfs_root *root = inode->root;
1531 1532 1533 1534 1535
	struct btrfs_ordered_extent *ordered;
	u64 lockstart, lockend;
	u64 num_bytes;
	int ret;

1536
	ret = btrfs_start_write_no_snapshotting(root);
1537 1538 1539
	if (!ret)
		return -ENOSPC;

1540
	lockstart = round_down(pos, fs_info->sectorsize);
1541
	lockend = round_up(pos + *write_bytes,
1542
			   fs_info->sectorsize) - 1;
1543 1544

	while (1) {
1545
		lock_extent(&inode->io_tree, lockstart, lockend);
1546 1547 1548 1549 1550
		ordered = btrfs_lookup_ordered_range(inode, lockstart,
						     lockend - lockstart + 1);
		if (!ordered) {
			break;
		}
1551 1552
		unlock_extent(&inode->io_tree, lockstart, lockend);
		btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
1553 1554 1555 1556
		btrfs_put_ordered_extent(ordered);
	}

	num_bytes = lockend - lockstart + 1;
1557 1558
	ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
			NULL, NULL, NULL);
1559 1560
	if (ret <= 0) {
		ret = 0;
1561
		btrfs_end_write_no_snapshotting(root);
1562
	} else {
1563 1564
		*write_bytes = min_t(size_t, *write_bytes ,
				     num_bytes - pos + lockstart);
1565 1566
	}

1567
	unlock_extent(&inode->io_tree, lockstart, lockend);
1568 1569 1570 1571

	return ret;
}

J
Josef Bacik 已提交
1572 1573 1574
static noinline ssize_t __btrfs_buffered_write(struct file *file,
					       struct iov_iter *i,
					       loff_t pos)
1575
{
A
Al Viro 已提交
1576
	struct inode *inode = file_inode(file);
1577
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1578 1579
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct page **pages = NULL;
1580
	struct extent_state *cached_state = NULL;
1581
	struct extent_changeset *data_reserved = NULL;
1582
	u64 release_bytes = 0;
1583 1584
	u64 lockstart;
	u64 lockend;
J
Josef Bacik 已提交
1585 1586
	size_t num_written = 0;
	int nrptrs;
1587
	int ret = 0;
1588
	bool only_release_metadata = false;
1589
	bool force_page_uptodate = false;
1590

1591 1592
	nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
			PAGE_SIZE / (sizeof(struct page *)));
1593 1594
	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
	nrptrs = max(nrptrs, 8);
1595
	pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
J
Josef Bacik 已提交
1596 1597
	if (!pages)
		return -ENOMEM;
1598

J
Josef Bacik 已提交
1599
	while (iov_iter_count(i) > 0) {
1600
		size_t offset = pos & (PAGE_SIZE - 1);
1601
		size_t sector_offset;
J
Josef Bacik 已提交
1602
		size_t write_bytes = min(iov_iter_count(i),
1603
					 nrptrs * (size_t)PAGE_SIZE -
1604
					 offset);
1605
		size_t num_pages = DIV_ROUND_UP(write_bytes + offset,
1606
						PAGE_SIZE);
1607
		size_t reserve_bytes;
J
Josef Bacik 已提交
1608 1609
		size_t dirty_pages;
		size_t copied;
1610 1611
		size_t dirty_sectors;
		size_t num_sectors;
1612
		int extents_locked;
C
Chris Mason 已提交
1613

1614
		WARN_ON(num_pages > nrptrs);
1615

1616 1617 1618 1619
		/*
		 * Fault pages before locking them in prepare_pages
		 * to avoid recursive lock
		 */
J
Josef Bacik 已提交
1620
		if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
1621
			ret = -EFAULT;
J
Josef Bacik 已提交
1622
			break;
1623 1624
		}

1625
		sector_offset = pos & (fs_info->sectorsize - 1);
1626
		reserve_bytes = round_up(write_bytes + sector_offset,
1627
				fs_info->sectorsize);
1628

1629 1630 1631
		extent_changeset_release(data_reserved);
		ret = btrfs_check_data_free_space(inode, &data_reserved, pos,
						  write_bytes);
1632 1633 1634
		if (ret < 0) {
			if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
						      BTRFS_INODE_PREALLOC)) &&
1635 1636
			    check_can_nocow(BTRFS_I(inode), pos,
					&write_bytes) > 0) {
1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649
				/*
				 * For nodata cow case, no need to reserve
				 * data space.
				 */
				only_release_metadata = true;
				/*
				 * our prealloc extent may be smaller than
				 * write_bytes, so scale down.
				 */
				num_pages = DIV_ROUND_UP(write_bytes + offset,
							 PAGE_SIZE);
				reserve_bytes = round_up(write_bytes +
							 sector_offset,
1650
							 fs_info->sectorsize);
1651 1652 1653 1654
			} else {
				break;
			}
		}
1655

J
Josef Bacik 已提交
1656
		WARN_ON(reserve_bytes == 0);
1657 1658
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
				reserve_bytes);
1659 1660
		if (ret) {
			if (!only_release_metadata)
1661 1662 1663
				btrfs_free_reserved_data_space(inode,
						data_reserved, pos,
						write_bytes);
1664
			else
1665
				btrfs_end_write_no_snapshotting(root);
1666 1667 1668 1669
			break;
		}

		release_bytes = reserve_bytes;
1670
again:
1671 1672 1673 1674 1675
		/*
		 * This is going to setup the pages array with the number of
		 * pages we want, so we don't really need to worry about the
		 * contents of pages from loop to loop
		 */
1676 1677
		ret = prepare_pages(inode, pages, num_pages,
				    pos, write_bytes,
1678
				    force_page_uptodate);
J
Josef Bacik 已提交
1679 1680
		if (ret) {
			btrfs_delalloc_release_extents(BTRFS_I(inode),
1681
						       reserve_bytes, true);
J
Josef Bacik 已提交
1682
			break;
J
Josef Bacik 已提交
1683
		}
C
Chris Mason 已提交
1684

1685 1686
		extents_locked = lock_and_cleanup_extent_if_need(
				BTRFS_I(inode), pages,
1687 1688
				num_pages, pos, write_bytes, &lockstart,
				&lockend, &cached_state);
1689 1690
		if (extents_locked < 0) {
			if (extents_locked == -EAGAIN)
1691
				goto again;
J
Josef Bacik 已提交
1692
			btrfs_delalloc_release_extents(BTRFS_I(inode),
1693
						       reserve_bytes, true);
1694
			ret = extents_locked;
1695 1696 1697
			break;
		}

1698
		copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
1699

1700
		num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
1701
		dirty_sectors = round_up(copied + sector_offset,
1702 1703
					fs_info->sectorsize);
		dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
1704

1705 1706 1707 1708 1709 1710 1711
		/*
		 * if we have trouble faulting in the pages, fall
		 * back to one page at a time
		 */
		if (copied < write_bytes)
			nrptrs = 1;

1712 1713
		if (copied == 0) {
			force_page_uptodate = true;
1714
			dirty_sectors = 0;
1715
			dirty_pages = 0;
1716 1717
		} else {
			force_page_uptodate = false;
1718
			dirty_pages = DIV_ROUND_UP(copied + offset,
1719
						   PAGE_SIZE);
1720
		}
1721

1722
		if (num_sectors > dirty_sectors) {
1723 1724
			/* release everything except the sectors we dirtied */
			release_bytes -= dirty_sectors <<
1725
						fs_info->sb->s_blocksize_bits;
1726
			if (only_release_metadata) {
1727
				btrfs_delalloc_release_metadata(BTRFS_I(inode),
1728
							release_bytes, true);
1729 1730 1731
			} else {
				u64 __pos;

1732
				__pos = round_down(pos,
1733
						   fs_info->sectorsize) +
1734
					(dirty_pages << PAGE_SHIFT);
1735 1736
				btrfs_delalloc_release_space(inode,
						data_reserved, __pos,
1737
						release_bytes, true);
1738
			}
1739 1740
		}

1741
		release_bytes = round_up(copied + sector_offset,
1742
					fs_info->sectorsize);
1743 1744

		if (copied > 0)
1745
			ret = btrfs_dirty_pages(inode, pages, dirty_pages,
1746
						pos, copied, &cached_state);
1747
		if (extents_locked)
1748
			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1749
					     lockstart, lockend, &cached_state);
1750
		btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes,
1751
					       true);
1752 1753
		if (ret) {
			btrfs_drop_pages(pages, num_pages);
1754
			break;
1755
		}
C
Chris Mason 已提交
1756

1757
		release_bytes = 0;
1758
		if (only_release_metadata)
1759
			btrfs_end_write_no_snapshotting(root);
1760

1761
		if (only_release_metadata && copied > 0) {
1762
			lockstart = round_down(pos,
1763
					       fs_info->sectorsize);
1764
			lockend = round_up(pos + copied,
1765
					   fs_info->sectorsize) - 1;
1766 1767 1768 1769 1770 1771 1772

			set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
				       lockend, EXTENT_NORESERVE, NULL,
				       NULL, GFP_NOFS);
			only_release_metadata = false;
		}

1773 1774
		btrfs_drop_pages(pages, num_pages);

J
Josef Bacik 已提交
1775 1776
		cond_resched();

1777
		balance_dirty_pages_ratelimited(inode->i_mapping);
1778
		if (dirty_pages < (fs_info->nodesize >> PAGE_SHIFT) + 1)
1779
			btrfs_btree_balance_dirty(fs_info);
1780

1781 1782
		pos += copied;
		num_written += copied;
J
Josef Bacik 已提交
1783
	}
C
Chris Mason 已提交
1784

J
Josef Bacik 已提交
1785 1786
	kfree(pages);

1787
	if (release_bytes) {
1788
		if (only_release_metadata) {
1789
			btrfs_end_write_no_snapshotting(root);
1790
			btrfs_delalloc_release_metadata(BTRFS_I(inode),
1791
					release_bytes, true);
1792
		} else {
1793 1794
			btrfs_delalloc_release_space(inode, data_reserved,
					round_down(pos, fs_info->sectorsize),
1795
					release_bytes, true);
1796
		}
1797 1798
	}

1799
	extent_changeset_free(data_reserved);
J
Josef Bacik 已提交
1800 1801 1802
	return num_written ? num_written : ret;
}

1803
static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
J
Josef Bacik 已提交
1804 1805
{
	struct file *file = iocb->ki_filp;
1806
	struct inode *inode = file_inode(file);
1807
	loff_t pos = iocb->ki_pos;
J
Josef Bacik 已提交
1808 1809 1810 1811 1812
	ssize_t written;
	ssize_t written_buffered;
	loff_t endbyte;
	int err;

1813
	written = generic_file_direct_write(iocb, from);
J
Josef Bacik 已提交
1814

A
Al Viro 已提交
1815
	if (written < 0 || !iov_iter_count(from))
J
Josef Bacik 已提交
1816 1817 1818
		return written;

	pos += written;
1819
	written_buffered = __btrfs_buffered_write(file, from, pos);
J
Josef Bacik 已提交
1820 1821 1822
	if (written_buffered < 0) {
		err = written_buffered;
		goto out;
C
Chris Mason 已提交
1823
	}
1824 1825 1826 1827
	/*
	 * Ensure all data is persisted. We want the next direct IO read to be
	 * able to read what was just written.
	 */
J
Josef Bacik 已提交
1828
	endbyte = pos + written_buffered - 1;
1829
	err = btrfs_fdatawrite_range(inode, pos, endbyte);
1830 1831
	if (err)
		goto out;
1832
	err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
J
Josef Bacik 已提交
1833 1834 1835
	if (err)
		goto out;
	written += written_buffered;
1836
	iocb->ki_pos = pos + written_buffered;
1837 1838
	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
				 endbyte >> PAGE_SHIFT);
C
Chris Mason 已提交
1839
out:
J
Josef Bacik 已提交
1840 1841
	return written ? written : err;
}
1842

1843 1844 1845 1846 1847 1848 1849
static void update_time_for_write(struct inode *inode)
{
	struct timespec now;

	if (IS_NOCMTIME(inode))
		return;

1850
	now = current_time(inode);
1851 1852 1853 1854 1855 1856 1857 1858 1859 1860
	if (!timespec_equal(&inode->i_mtime, &now))
		inode->i_mtime = now;

	if (!timespec_equal(&inode->i_ctime, &now))
		inode->i_ctime = now;

	if (IS_I_VERSION(inode))
		inode_inc_iversion(inode);
}

A
Al Viro 已提交
1861 1862
static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
				    struct iov_iter *from)
J
Josef Bacik 已提交
1863 1864
{
	struct file *file = iocb->ki_filp;
A
Al Viro 已提交
1865
	struct inode *inode = file_inode(file);
1866
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
J
Josef Bacik 已提交
1867
	struct btrfs_root *root = BTRFS_I(inode)->root;
1868
	u64 start_pos;
1869
	u64 end_pos;
J
Josef Bacik 已提交
1870
	ssize_t num_written = 0;
1871
	bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
1872
	ssize_t err;
1873
	loff_t pos;
G
Goldwyn Rodrigues 已提交
1874
	size_t count = iov_iter_count(from);
1875 1876
	loff_t oldsize;
	int clean_page = 0;
J
Josef Bacik 已提交
1877

1878 1879 1880 1881
	if (!(iocb->ki_flags & IOCB_DIRECT) &&
	    (iocb->ki_flags & IOCB_NOWAIT))
		return -EOPNOTSUPP;

1882 1883
	if (!inode_trylock(inode)) {
		if (iocb->ki_flags & IOCB_NOWAIT)
G
Goldwyn Rodrigues 已提交
1884
			return -EAGAIN;
1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895
		inode_lock(inode);
	}

	err = generic_write_checks(iocb, from);
	if (err <= 0) {
		inode_unlock(inode);
		return err;
	}

	pos = iocb->ki_pos;
	if (iocb->ki_flags & IOCB_NOWAIT) {
G
Goldwyn Rodrigues 已提交
1896 1897 1898 1899 1900 1901 1902 1903 1904 1905
		/*
		 * We will allocate space in case nodatacow is not set,
		 * so bail
		 */
		if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
					      BTRFS_INODE_PREALLOC)) ||
		    check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) {
			inode_unlock(inode);
			return -EAGAIN;
		}
J
Josef Bacik 已提交
1906 1907
	}

1908
	current->backing_dev_info = inode_to_bdi(inode);
1909
	err = file_remove_privs(file);
J
Josef Bacik 已提交
1910
	if (err) {
A
Al Viro 已提交
1911
		inode_unlock(inode);
J
Josef Bacik 已提交
1912 1913 1914 1915 1916 1917 1918 1919 1920
		goto out;
	}

	/*
	 * If BTRFS flips readonly due to some impossible error
	 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
	 * although we have opened a file as writable, we have
	 * to stop this write operation to ensure FS consistency.
	 */
1921
	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
A
Al Viro 已提交
1922
		inode_unlock(inode);
J
Josef Bacik 已提交
1923 1924 1925 1926
		err = -EROFS;
		goto out;
	}

1927 1928 1929 1930 1931 1932 1933
	/*
	 * We reserve space for updating the inode when we reserve space for the
	 * extent we are going to write, so we will enospc out there.  We don't
	 * need to start yet another transaction to update the inode as we will
	 * update the inode when we finish writing whatever data we write.
	 */
	update_time_for_write(inode);
J
Josef Bacik 已提交
1934

1935
	start_pos = round_down(pos, fs_info->sectorsize);
1936 1937
	oldsize = i_size_read(inode);
	if (start_pos > oldsize) {
1938
		/* Expand hole size to cover write data, preventing empty gap */
1939
		end_pos = round_up(pos + count,
1940
				   fs_info->sectorsize);
1941
		err = btrfs_cont_expand(inode, oldsize, end_pos);
1942
		if (err) {
A
Al Viro 已提交
1943
			inode_unlock(inode);
1944 1945
			goto out;
		}
1946
		if (start_pos > round_up(oldsize, fs_info->sectorsize))
1947
			clean_page = 1;
1948 1949
	}

1950 1951 1952
	if (sync)
		atomic_inc(&BTRFS_I(inode)->sync_writers);

1953
	if (iocb->ki_flags & IOCB_DIRECT) {
1954
		num_written = __btrfs_direct_write(iocb, from);
J
Josef Bacik 已提交
1955
	} else {
A
Al Viro 已提交
1956
		num_written = __btrfs_buffered_write(file, from, pos);
J
Josef Bacik 已提交
1957
		if (num_written > 0)
1958
			iocb->ki_pos = pos + num_written;
1959 1960 1961
		if (clean_page)
			pagecache_isize_extended(inode, oldsize,
						i_size_read(inode));
J
Josef Bacik 已提交
1962 1963
	}

A
Al Viro 已提交
1964
	inode_unlock(inode);
1965

1966
	/*
1967 1968
	 * We also have to set last_sub_trans to the current log transid,
	 * otherwise subsequent syncs to a file that's been synced in this
1969
	 * transaction will appear to have already occurred.
1970
	 */
1971
	spin_lock(&BTRFS_I(inode)->lock);
1972
	BTRFS_I(inode)->last_sub_trans = root->log_transid;
1973
	spin_unlock(&BTRFS_I(inode)->lock);
1974 1975
	if (num_written > 0)
		num_written = generic_write_sync(iocb, num_written);
1976

1977 1978
	if (sync)
		atomic_dec(&BTRFS_I(inode)->sync_writers);
1979
out:
C
Chris Mason 已提交
1980 1981 1982 1983
	current->backing_dev_info = NULL;
	return num_written ? num_written : err;
}

C
Chris Mason 已提交
1984
int btrfs_release_file(struct inode *inode, struct file *filp)
1985
{
1986 1987 1988 1989 1990 1991 1992
	struct btrfs_file_private *private = filp->private_data;

	if (private && private->filldir_buf)
		kfree(private->filldir_buf);
	kfree(private);
	filp->private_data = NULL;

1993 1994 1995 1996 1997 1998 1999 2000 2001
	/*
	 * ordered_data_close is set by settattr when we are about to truncate
	 * a file from a non-zero size to a zero size.  This tries to
	 * flush down new bytes that may have been written if the
	 * application were using truncate to replace a file in place.
	 */
	if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
			       &BTRFS_I(inode)->runtime_flags))
			filemap_flush(inode->i_mapping);
2002 2003 2004
	return 0;
}

2005 2006 2007
static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
{
	int ret;
L
Liu Bo 已提交
2008
	struct blk_plug plug;
2009

L
Liu Bo 已提交
2010 2011 2012 2013 2014 2015 2016
	/*
	 * This is only called in fsync, which would do synchronous writes, so
	 * a plug can merge adjacent IOs as much as possible.  Esp. in case of
	 * multiple disks using raid profile, a large IO can be split to
	 * several segments of stripe length (currently 64K).
	 */
	blk_start_plug(&plug);
2017
	atomic_inc(&BTRFS_I(inode)->sync_writers);
2018
	ret = btrfs_fdatawrite_range(inode, start, end);
2019
	atomic_dec(&BTRFS_I(inode)->sync_writers);
L
Liu Bo 已提交
2020
	blk_finish_plug(&plug);
2021 2022 2023 2024

	return ret;
}

C
Chris Mason 已提交
2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035
/*
 * fsync call for both files and directories.  This logs the inode into
 * the tree log instead of forcing full commits whenever possible.
 *
 * It needs to call filemap_fdatawait so that all ordered extent updates are
 * in the metadata btree are up to date for copying to the log.
 *
 * It drops the inode mutex before doing the tree log commit.  This is an
 * important optimization for directories because holding the mutex prevents
 * new operations on the dir while we write to disk.
 */
2036
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
C
Chris Mason 已提交
2037
{
2038
	struct dentry *dentry = file_dentry(file);
2039
	struct inode *inode = d_inode(dentry);
2040
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
C
Chris Mason 已提交
2041 2042
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
2043
	struct btrfs_log_ctx ctx;
2044
	int ret = 0, err;
2045
	bool full_sync = false;
2046
	u64 len;
C
Chris Mason 已提交
2047

2048 2049 2050 2051 2052
	/*
	 * The range length can be represented by u64, we have to do the typecasts
	 * to avoid signed overflow if it's [0, LLONG_MAX] eg. from fsync()
	 */
	len = (u64)end - (u64)start + 1;
2053
	trace_btrfs_sync_file(file, datasync);
2054

2055 2056
	btrfs_init_log_ctx(&ctx, inode);

2057 2058 2059
	/*
	 * We write the dirty pages in the range and wait until they complete
	 * out of the ->i_mutex. If so, we can flush the dirty pages by
2060 2061
	 * multi-task, and make the performance up.  See
	 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
2062
	 */
2063
	ret = start_ordered_ops(inode, start, end);
2064
	if (ret)
2065
		goto out;
2066

A
Al Viro 已提交
2067
	inode_lock(inode);
M
Miao Xie 已提交
2068
	atomic_inc(&root->log_batch);
2069 2070
	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
			     &BTRFS_I(inode)->runtime_flags);
2071 2072 2073 2074
	/*
	 * We might have have had more pages made dirty after calling
	 * start_ordered_ops and before acquiring the inode's i_mutex.
	 */
2075
	if (full_sync) {
2076 2077 2078 2079 2080 2081
		/*
		 * For a full sync, we need to make sure any ordered operations
		 * start and finish before we start logging the inode, so that
		 * all extents are persisted and the respective file extent
		 * items are in the fs/subvol btree.
		 */
2082
		ret = btrfs_wait_ordered_range(inode, start, len);
2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118
	} else {
		/*
		 * Start any new ordered operations before starting to log the
		 * inode. We will wait for them to finish in btrfs_sync_log().
		 *
		 * Right before acquiring the inode's mutex, we might have new
		 * writes dirtying pages, which won't immediately start the
		 * respective ordered operations - that is done through the
		 * fill_delalloc callbacks invoked from the writepage and
		 * writepages address space operations. So make sure we start
		 * all ordered operations before starting to log our inode. Not
		 * doing this means that while logging the inode, writeback
		 * could start and invoke writepage/writepages, which would call
		 * the fill_delalloc callbacks (cow_file_range,
		 * submit_compressed_extents). These callbacks add first an
		 * extent map to the modified list of extents and then create
		 * the respective ordered operation, which means in
		 * tree-log.c:btrfs_log_inode() we might capture all existing
		 * ordered operations (with btrfs_get_logged_extents()) before
		 * the fill_delalloc callback adds its ordered operation, and by
		 * the time we visit the modified list of extent maps (with
		 * btrfs_log_changed_extents()), we see and process the extent
		 * map they created. We then use the extent map to construct a
		 * file extent item for logging without waiting for the
		 * respective ordered operation to finish - this file extent
		 * item points to a disk location that might not have yet been
		 * written to, containing random data - so after a crash a log
		 * replay will make our inode have file extent items that point
		 * to disk locations containing invalid data, as we returned
		 * success to userspace without waiting for the respective
		 * ordered operation to finish, because it wasn't captured by
		 * btrfs_get_logged_extents().
		 */
		ret = start_ordered_ops(inode, start, end);
	}
	if (ret) {
A
Al Viro 已提交
2119
		inode_unlock(inode);
2120
		goto out;
2121
	}
M
Miao Xie 已提交
2122
	atomic_inc(&root->log_batch);
2123

C
Chris Mason 已提交
2124
	/*
2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150
	 * If the last transaction that changed this file was before the current
	 * transaction and we have the full sync flag set in our inode, we can
	 * bail out now without any syncing.
	 *
	 * Note that we can't bail out if the full sync flag isn't set. This is
	 * because when the full sync flag is set we start all ordered extents
	 * and wait for them to fully complete - when they complete they update
	 * the inode's last_trans field through:
	 *
	 *     btrfs_finish_ordered_io() ->
	 *         btrfs_update_inode_fallback() ->
	 *             btrfs_update_inode() ->
	 *                 btrfs_set_inode_last_trans()
	 *
	 * So we are sure that last_trans is up to date and can do this check to
	 * bail out safely. For the fast path, when the full sync flag is not
	 * set in our inode, we can not do it because we start only our ordered
	 * extents and don't wait for them to complete (that is when
	 * btrfs_finish_ordered_io runs), so here at this point their last_trans
	 * value might be less than or equals to fs_info->last_trans_committed,
	 * and setting a speculative last_trans for an inode when a buffered
	 * write is made (such as fs_info->generation + 1 for example) would not
	 * be reliable since after setting the value and before fsync is called
	 * any number of transactions can start and commit (transaction kthread
	 * commits the current transaction periodically), and a transaction
	 * commit does not start nor waits for ordered extents to complete.
2151
	 */
J
Josef Bacik 已提交
2152
	smp_mb();
2153
	if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) ||
2154
	    (full_sync && BTRFS_I(inode)->last_trans <=
2155
	     fs_info->last_trans_committed) ||
2156 2157
	    (!btrfs_have_ordered_extents_in_range(inode, start, len) &&
	     BTRFS_I(inode)->last_trans
2158
	     <= fs_info->last_trans_committed)) {
J
Josef Bacik 已提交
2159
		/*
2160
		 * We've had everything committed since the last time we were
J
Josef Bacik 已提交
2161 2162 2163 2164 2165
		 * modified so clear this flag in case it was set for whatever
		 * reason, it's no longer relevant.
		 */
		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
			  &BTRFS_I(inode)->runtime_flags);
2166 2167 2168 2169
		/*
		 * An ordered extent might have started before and completed
		 * already with io errors, in which case the inode was not
		 * updated and we end up here. So check the inode's mapping
2170 2171
		 * for any errors that might have happened since we last
		 * checked called fsync.
2172
		 */
2173
		ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
A
Al Viro 已提交
2174
		inode_unlock(inode);
2175 2176 2177
		goto out;
	}

2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188
	/*
	 * We use start here because we will need to wait on the IO to complete
	 * in btrfs_sync_log, which could require joining a transaction (for
	 * example checking cross references in the nocow path).  If we use join
	 * here we could get into a situation where we're waiting on IO to
	 * happen that is blocked on a transaction trying to commit.  With start
	 * we inc the extwriter counter, so we wait for all extwriters to exit
	 * before we start blocking join'ers.  This comment is to keep somebody
	 * from thinking they are super smart and changing this to
	 * btrfs_join_transaction *cough*Josef*cough*.
	 */
2189 2190 2191
	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
A
Al Viro 已提交
2192
		inode_unlock(inode);
C
Chris Mason 已提交
2193 2194
		goto out;
	}
2195
	trans->sync = true;
2196

2197
	ret = btrfs_log_dentry_safe(trans, dentry, start, end, &ctx);
2198
	if (ret < 0) {
2199 2200
		/* Fallthrough and commit/free transaction. */
		ret = 1;
2201
	}
C
Chris Mason 已提交
2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212

	/* we've logged all the items and now have a consistent
	 * version of the file in the log.  It is possible that
	 * someone will come in and modify the file, but that's
	 * fine because the log is consistent on disk, and we
	 * have references to all of the file's extents
	 *
	 * It is possible that someone will come in and log the
	 * file again, but that will end up using the synchronization
	 * inside btrfs_sync_log to keep things safe.
	 */
A
Al Viro 已提交
2213
	inode_unlock(inode);
C
Chris Mason 已提交
2214

2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228
	/*
	 * If any of the ordered extents had an error, just return it to user
	 * space, so that the application knows some writes didn't succeed and
	 * can take proper action (retry for e.g.). Blindly committing the
	 * transaction in this case, would fool userspace that everything was
	 * successful. And we also want to make sure our log doesn't contain
	 * file extent items pointing to extents that weren't fully written to -
	 * just like in the non fast fsync path, where we check for the ordered
	 * operation's error flag before writing to the log tree and return -EIO
	 * if any of them had this flag set (btrfs_wait_ordered_range) -
	 * therefore we need to check for errors in the ordered operations,
	 * which are indicated by ctx.io_err.
	 */
	if (ctx.io_err) {
2229
		btrfs_end_transaction(trans);
2230 2231 2232 2233
		ret = ctx.io_err;
		goto out;
	}

2234
	if (ret != BTRFS_NO_LOG_SYNC) {
2235
		if (!ret) {
2236
			ret = btrfs_sync_log(trans, root, &ctx);
2237
			if (!ret) {
2238
				ret = btrfs_end_transaction(trans);
2239
				goto out;
2240
			}
2241
		}
2242
		if (!full_sync) {
2243
			ret = btrfs_wait_ordered_range(inode, start, len);
2244
			if (ret) {
2245
				btrfs_end_transaction(trans);
2246
				goto out;
2247
			}
2248
		}
2249
		ret = btrfs_commit_transaction(trans);
2250
	} else {
2251
		ret = btrfs_end_transaction(trans);
2252
	}
C
Chris Mason 已提交
2253
out:
2254
	ASSERT(list_empty(&ctx.list));
2255 2256 2257
	err = file_check_and_advance_wb_err(file);
	if (!ret)
		ret = err;
2258
	return ret > 0 ? -EIO : ret;
C
Chris Mason 已提交
2259 2260
}

2261
static const struct vm_operations_struct btrfs_file_vm_ops = {
2262
	.fault		= filemap_fault,
2263
	.map_pages	= filemap_map_pages,
C
Chris Mason 已提交
2264 2265 2266 2267 2268
	.page_mkwrite	= btrfs_page_mkwrite,
};

static int btrfs_file_mmap(struct file	*filp, struct vm_area_struct *vma)
{
M
Miao Xie 已提交
2269 2270 2271 2272 2273
	struct address_space *mapping = filp->f_mapping;

	if (!mapping->a_ops->readpage)
		return -ENOEXEC;

C
Chris Mason 已提交
2274
	file_accessed(filp);
M
Miao Xie 已提交
2275 2276
	vma->vm_ops = &btrfs_file_vm_ops;

C
Chris Mason 已提交
2277 2278 2279
	return 0;
}

2280
static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
J
Josef Bacik 已提交
2281 2282 2283 2284 2285 2286 2287 2288 2289
			  int slot, u64 start, u64 end)
{
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;

	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
		return 0;

	btrfs_item_key_to_cpu(leaf, &key, slot);
2290
	if (key.objectid != btrfs_ino(inode) ||
J
Josef Bacik 已提交
2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308
	    key.type != BTRFS_EXTENT_DATA_KEY)
		return 0;

	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);

	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
		return 0;

	if (btrfs_file_extent_disk_bytenr(leaf, fi))
		return 0;

	if (key.offset == end)
		return 1;
	if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
		return 1;
	return 0;
}

2309 2310 2311
static int fill_holes(struct btrfs_trans_handle *trans,
		struct btrfs_inode *inode,
		struct btrfs_path *path, u64 offset, u64 end)
J
Josef Bacik 已提交
2312
{
2313 2314
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
	struct btrfs_root *root = inode->root;
J
Josef Bacik 已提交
2315 2316 2317
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	struct extent_map *hole_em;
2318
	struct extent_map_tree *em_tree = &inode->extent_tree;
J
Josef Bacik 已提交
2319 2320 2321
	struct btrfs_key key;
	int ret;

2322
	if (btrfs_fs_incompat(fs_info, NO_HOLES))
2323 2324
		goto out;

2325
	key.objectid = btrfs_ino(inode);
J
Josef Bacik 已提交
2326 2327 2328 2329
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = offset;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2330 2331 2332 2333 2334 2335 2336
	if (ret <= 0) {
		/*
		 * We should have dropped this offset, so if we find it then
		 * something has gone horribly wrong.
		 */
		if (ret == 0)
			ret = -EINVAL;
J
Josef Bacik 已提交
2337
		return ret;
2338
	}
J
Josef Bacik 已提交
2339 2340

	leaf = path->nodes[0];
2341
	if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
J
Josef Bacik 已提交
2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355
		u64 num_bytes;

		path->slots[0]--;
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
			end - offset;
		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
		btrfs_set_file_extent_offset(leaf, fi, 0);
		btrfs_mark_buffer_dirty(leaf);
		goto out;
	}

2356
	if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
J
Josef Bacik 已提交
2357 2358 2359
		u64 num_bytes;

		key.offset = offset;
2360
		btrfs_set_item_key_safe(fs_info, path, &key);
J
Josef Bacik 已提交
2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
			offset;
		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
		btrfs_set_file_extent_offset(leaf, fi, 0);
		btrfs_mark_buffer_dirty(leaf);
		goto out;
	}
	btrfs_release_path(path);

2373
	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
2374
			offset, 0, 0, end - offset, 0, end - offset, 0, 0, 0);
J
Josef Bacik 已提交
2375 2376 2377 2378 2379 2380 2381 2382 2383
	if (ret)
		return ret;

out:
	btrfs_release_path(path);

	hole_em = alloc_extent_map();
	if (!hole_em) {
		btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2384
		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
J
Josef Bacik 已提交
2385 2386 2387
	} else {
		hole_em->start = offset;
		hole_em->len = end - offset;
J
Josef Bacik 已提交
2388
		hole_em->ram_bytes = hole_em->len;
J
Josef Bacik 已提交
2389 2390 2391 2392
		hole_em->orig_start = offset;

		hole_em->block_start = EXTENT_MAP_HOLE;
		hole_em->block_len = 0;
2393
		hole_em->orig_block_len = 0;
2394
		hole_em->bdev = fs_info->fs_devices->latest_bdev;
J
Josef Bacik 已提交
2395 2396 2397 2398 2399 2400
		hole_em->compress_type = BTRFS_COMPRESS_NONE;
		hole_em->generation = trans->transid;

		do {
			btrfs_drop_extent_cache(inode, offset, end - 1, 0);
			write_lock(&em_tree->lock);
J
Josef Bacik 已提交
2401
			ret = add_extent_mapping(em_tree, hole_em, 1);
J
Josef Bacik 已提交
2402 2403 2404 2405 2406
			write_unlock(&em_tree->lock);
		} while (ret == -EEXIST);
		free_extent_map(hole_em);
		if (ret)
			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2407
					&inode->runtime_flags);
J
Josef Bacik 已提交
2408 2409 2410 2411 2412
	}

	return 0;
}

2413 2414 2415 2416 2417 2418 2419 2420
/*
 * Find a hole extent on given inode and change start/len to the end of hole
 * extent.(hole/vacuum extent whose em->start <= start &&
 *	   em->start + em->len > start)
 * When a hole extent is found, return 1 and modify start/len.
 */
static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
{
2421
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2422 2423 2424
	struct extent_map *em;
	int ret = 0;

2425 2426 2427
	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
			      round_down(*start, fs_info->sectorsize),
			      round_up(*len, fs_info->sectorsize), 0);
2428 2429
	if (IS_ERR(em))
		return PTR_ERR(em);
2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441

	/* Hole or vacuum extent(only exists in no-hole mode) */
	if (em->block_start == EXTENT_MAP_HOLE) {
		ret = 1;
		*len = em->start + em->len > *start + *len ?
		       0 : *start + *len - em->start - em->len;
		*start = em->start + em->len;
	}
	free_extent_map(em);
	return ret;
}

2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464
static int btrfs_punch_hole_lock_range(struct inode *inode,
				       const u64 lockstart,
				       const u64 lockend,
				       struct extent_state **cached_state)
{
	while (1) {
		struct btrfs_ordered_extent *ordered;
		int ret;

		truncate_pagecache_range(inode, lockstart, lockend);

		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
				 cached_state);
		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);

		/*
		 * We need to make sure we have no ordered extents in this range
		 * and nobody raced in and read a page in this range, if we did
		 * we need to try again.
		 */
		if ((!ordered ||
		    (ordered->file_offset + ordered->len <= lockstart ||
		     ordered->file_offset > lockend)) &&
2465 2466
		     !filemap_range_has_page(inode->i_mapping,
					     lockstart, lockend)) {
2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482
			if (ordered)
				btrfs_put_ordered_extent(ordered);
			break;
		}
		if (ordered)
			btrfs_put_ordered_extent(ordered);
		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
				     lockend, cached_state);
		ret = btrfs_wait_ordered_range(inode, lockstart,
					       lockend - lockstart + 1);
		if (ret)
			return ret;
	}
	return 0;
}

J
Josef Bacik 已提交
2483 2484
static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
2485
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
J
Josef Bacik 已提交
2486 2487 2488 2489 2490
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_state *cached_state = NULL;
	struct btrfs_path *path;
	struct btrfs_block_rsv *rsv;
	struct btrfs_trans_handle *trans;
2491 2492 2493 2494 2495 2496
	u64 lockstart;
	u64 lockend;
	u64 tail_start;
	u64 tail_len;
	u64 orig_start = offset;
	u64 cur_offset;
C
Chris Mason 已提交
2497
	u64 min_size = btrfs_calc_trans_metadata_size(fs_info, 1);
J
Josef Bacik 已提交
2498 2499 2500
	u64 drop_end;
	int ret = 0;
	int err = 0;
2501
	unsigned int rsv_count;
2502
	bool same_block;
2503
	bool no_holes = btrfs_fs_incompat(fs_info, NO_HOLES);
2504
	u64 ino_size;
2505
	bool truncated_block = false;
2506
	bool updated_inode = false;
J
Josef Bacik 已提交
2507

2508 2509 2510
	ret = btrfs_wait_ordered_range(inode, offset, len);
	if (ret)
		return ret;
J
Josef Bacik 已提交
2511

A
Al Viro 已提交
2512
	inode_lock(inode);
2513
	ino_size = round_up(inode->i_size, fs_info->sectorsize);
2514 2515 2516 2517 2518 2519 2520 2521 2522
	ret = find_first_non_hole(inode, &offset, &len);
	if (ret < 0)
		goto out_only_mutex;
	if (ret && !len) {
		/* Already in a large hole */
		ret = 0;
		goto out_only_mutex;
	}

2523
	lockstart = round_up(offset, btrfs_inode_sectorsize(inode));
2524
	lockend = round_down(offset + len,
2525
			     btrfs_inode_sectorsize(inode)) - 1;
2526 2527
	same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
		== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
2528
	/*
2529
	 * We needn't truncate any block which is beyond the end of the file
2530 2531
	 * because we are sure there is no data there.
	 */
J
Josef Bacik 已提交
2532
	/*
2533 2534
	 * Only do this if we are in the same block and we aren't doing the
	 * entire block.
J
Josef Bacik 已提交
2535
	 */
2536
	if (same_block && len < fs_info->sectorsize) {
2537
		if (offset < ino_size) {
2538 2539
			truncated_block = true;
			ret = btrfs_truncate_block(inode, offset, len, 0);
2540 2541 2542
		} else {
			ret = 0;
		}
2543
		goto out_only_mutex;
J
Josef Bacik 已提交
2544 2545
	}

2546
	/* zero back part of the first block */
2547
	if (offset < ino_size) {
2548 2549
		truncated_block = true;
		ret = btrfs_truncate_block(inode, offset, 0, 0);
2550
		if (ret) {
A
Al Viro 已提交
2551
			inode_unlock(inode);
2552 2553
			return ret;
		}
J
Josef Bacik 已提交
2554 2555
	}

2556 2557
	/* Check the aligned pages after the first unaligned page,
	 * if offset != orig_start, which means the first unaligned page
2558
	 * including several following pages are already in holes,
2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583
	 * the extra check can be skipped */
	if (offset == orig_start) {
		/* after truncate page, check hole again */
		len = offset + len - lockstart;
		offset = lockstart;
		ret = find_first_non_hole(inode, &offset, &len);
		if (ret < 0)
			goto out_only_mutex;
		if (ret && !len) {
			ret = 0;
			goto out_only_mutex;
		}
		lockstart = offset;
	}

	/* Check the tail unaligned part is in a hole */
	tail_start = lockend + 1;
	tail_len = offset + len - tail_start;
	if (tail_len) {
		ret = find_first_non_hole(inode, &tail_start, &tail_len);
		if (unlikely(ret < 0))
			goto out_only_mutex;
		if (!ret) {
			/* zero the front end of the last page */
			if (tail_start + tail_len < ino_size) {
2584 2585 2586 2587
				truncated_block = true;
				ret = btrfs_truncate_block(inode,
							tail_start + tail_len,
							0, 1);
2588 2589
				if (ret)
					goto out_only_mutex;
2590
			}
M
Miao Xie 已提交
2591
		}
J
Josef Bacik 已提交
2592 2593 2594
	}

	if (lockend < lockstart) {
2595 2596
		ret = 0;
		goto out_only_mutex;
J
Josef Bacik 已提交
2597 2598
	}

2599 2600 2601 2602 2603
	ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
					  &cached_state);
	if (ret) {
		inode_unlock(inode);
		goto out_only_mutex;
J
Josef Bacik 已提交
2604 2605 2606 2607 2608 2609 2610 2611
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

2612
	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
J
Josef Bacik 已提交
2613 2614 2615 2616
	if (!rsv) {
		ret = -ENOMEM;
		goto out_free;
	}
C
Chris Mason 已提交
2617
	rsv->size = btrfs_calc_trans_metadata_size(fs_info, 1);
J
Josef Bacik 已提交
2618 2619 2620 2621 2622
	rsv->failfast = 1;

	/*
	 * 1 - update the inode
	 * 1 - removing the extents in the range
2623
	 * 1 - adding the hole extent if no_holes isn't set
J
Josef Bacik 已提交
2624
	 */
2625 2626
	rsv_count = no_holes ? 2 : 3;
	trans = btrfs_start_transaction(root, rsv_count);
J
Josef Bacik 已提交
2627 2628 2629 2630 2631
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_free;
	}

2632
	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
2633
				      min_size, 0);
J
Josef Bacik 已提交
2634 2635 2636
	BUG_ON(ret);
	trans->block_rsv = rsv;

2637 2638
	cur_offset = lockstart;
	len = lockend - cur_offset;
J
Josef Bacik 已提交
2639 2640 2641
	while (cur_offset < lockend) {
		ret = __btrfs_drop_extents(trans, root, inode, path,
					   cur_offset, lockend + 1,
2642
					   &drop_end, 1, 0, 0, NULL);
J
Josef Bacik 已提交
2643 2644 2645
		if (ret != -ENOSPC)
			break;

2646
		trans->block_rsv = &fs_info->trans_block_rsv;
J
Josef Bacik 已提交
2647

J
Josef Bacik 已提交
2648
		if (cur_offset < drop_end && cur_offset < ino_size) {
2649 2650
			ret = fill_holes(trans, BTRFS_I(inode), path,
					cur_offset, drop_end);
2651
			if (ret) {
2652 2653 2654 2655 2656 2657 2658
				/*
				 * If we failed then we didn't insert our hole
				 * entries for the area we dropped, so now the
				 * fs is corrupted, so we must abort the
				 * transaction.
				 */
				btrfs_abort_transaction(trans, ret);
2659 2660 2661
				err = ret;
				break;
			}
J
Josef Bacik 已提交
2662 2663 2664 2665 2666 2667 2668 2669 2670 2671
		}

		cur_offset = drop_end;

		ret = btrfs_update_inode(trans, root, inode);
		if (ret) {
			err = ret;
			break;
		}

2672
		btrfs_end_transaction(trans);
2673
		btrfs_btree_balance_dirty(fs_info);
J
Josef Bacik 已提交
2674

2675
		trans = btrfs_start_transaction(root, rsv_count);
J
Josef Bacik 已提交
2676 2677 2678 2679 2680 2681
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			break;
		}

2682
		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
2683
					      rsv, min_size, 0);
J
Josef Bacik 已提交
2684 2685
		BUG_ON(ret);	/* shouldn't happen */
		trans->block_rsv = rsv;
2686 2687 2688 2689 2690 2691 2692 2693

		ret = find_first_non_hole(inode, &cur_offset, &len);
		if (unlikely(ret < 0))
			break;
		if (ret && !len) {
			ret = 0;
			break;
		}
J
Josef Bacik 已提交
2694 2695 2696 2697 2698 2699 2700
	}

	if (ret) {
		err = ret;
		goto out_trans;
	}

2701
	trans->block_rsv = &fs_info->trans_block_rsv;
2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714
	/*
	 * If we are using the NO_HOLES feature we might have had already an
	 * hole that overlaps a part of the region [lockstart, lockend] and
	 * ends at (or beyond) lockend. Since we have no file extent items to
	 * represent holes, drop_end can be less than lockend and so we must
	 * make sure we have an extent map representing the existing hole (the
	 * call to __btrfs_drop_extents() might have dropped the existing extent
	 * map representing the existing hole), otherwise the fast fsync path
	 * will not record the existence of the hole region
	 * [existing_hole_start, lockend].
	 */
	if (drop_end <= lockend)
		drop_end = lockend + 1;
2715 2716 2717 2718 2719 2720
	/*
	 * Don't insert file hole extent item if it's for a range beyond eof
	 * (because it's useless) or if it represents a 0 bytes range (when
	 * cur_offset == drop_end).
	 */
	if (cur_offset < ino_size && cur_offset < drop_end) {
2721 2722
		ret = fill_holes(trans, BTRFS_I(inode), path,
				cur_offset, drop_end);
2723
		if (ret) {
2724 2725
			/* Same comment as above. */
			btrfs_abort_transaction(trans, ret);
2726 2727 2728
			err = ret;
			goto out_trans;
		}
J
Josef Bacik 已提交
2729 2730 2731 2732 2733 2734
	}

out_trans:
	if (!trans)
		goto out_free;

2735
	inode_inc_iversion(inode);
2736
	inode->i_mtime = inode->i_ctime = current_time(inode);
2737

2738
	trans->block_rsv = &fs_info->trans_block_rsv;
J
Josef Bacik 已提交
2739
	ret = btrfs_update_inode(trans, root, inode);
2740
	updated_inode = true;
2741
	btrfs_end_transaction(trans);
2742
	btrfs_btree_balance_dirty(fs_info);
J
Josef Bacik 已提交
2743 2744
out_free:
	btrfs_free_path(path);
2745
	btrfs_free_block_rsv(fs_info, rsv);
J
Josef Bacik 已提交
2746 2747
out:
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2748
			     &cached_state);
2749
out_only_mutex:
2750
	if (!updated_inode && truncated_block && !ret && !err) {
2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762
		/*
		 * If we only end up zeroing part of a page, we still need to
		 * update the inode item, so that all the time fields are
		 * updated as well as the necessary btrfs inode in memory fields
		 * for detecting, at fsync time, if the inode isn't yet in the
		 * log tree or it's there but not up to date.
		 */
		trans = btrfs_start_transaction(root, 1);
		if (IS_ERR(trans)) {
			err = PTR_ERR(trans);
		} else {
			err = btrfs_update_inode(trans, root, inode);
2763
			ret = btrfs_end_transaction(trans);
2764 2765
		}
	}
A
Al Viro 已提交
2766
	inode_unlock(inode);
J
Josef Bacik 已提交
2767 2768 2769 2770 2771
	if (ret && !err)
		err = ret;
	return err;
}

2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802
/* Helper structure to record which range is already reserved */
struct falloc_range {
	struct list_head list;
	u64 start;
	u64 len;
};

/*
 * Helper function to add falloc range
 *
 * Caller should have locked the larger range of extent containing
 * [start, len)
 */
static int add_falloc_range(struct list_head *head, u64 start, u64 len)
{
	struct falloc_range *prev = NULL;
	struct falloc_range *range = NULL;

	if (list_empty(head))
		goto insert;

	/*
	 * As fallocate iterate by bytenr order, we only need to check
	 * the last range.
	 */
	prev = list_entry(head->prev, struct falloc_range, list);
	if (prev->start + prev->len == start) {
		prev->len += len;
		return 0;
	}
insert:
D
David Sterba 已提交
2803
	range = kmalloc(sizeof(*range), GFP_KERNEL);
2804 2805 2806 2807 2808 2809 2810 2811
	if (!range)
		return -ENOMEM;
	range->start = start;
	range->len = len;
	list_add_tail(&range->list, head);
	return 0;
}

2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836
static int btrfs_fallocate_update_isize(struct inode *inode,
					const u64 end,
					const int mode)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret;
	int ret2;

	if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
		return 0;

	trans = btrfs_start_transaction(root, 1);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	inode->i_ctime = current_time(inode);
	i_size_write(inode, end);
	btrfs_ordered_update_i_size(inode, end, NULL);
	ret = btrfs_update_inode(trans, root, inode);
	ret2 = btrfs_end_transaction(trans);

	return ret ? ret : ret2;
}

2837 2838 2839 2840 2841 2842
enum {
	RANGE_BOUNDARY_WRITTEN_EXTENT = 0,
	RANGE_BOUNDARY_PREALLOC_EXTENT = 1,
	RANGE_BOUNDARY_HOLE = 2,
};

2843 2844 2845 2846 2847
static int btrfs_zero_range_check_range_boundary(struct inode *inode,
						 u64 offset)
{
	const u64 sectorsize = btrfs_inode_sectorsize(inode);
	struct extent_map *em;
2848
	int ret;
2849 2850 2851 2852 2853 2854 2855

	offset = round_down(offset, sectorsize);
	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
	if (IS_ERR(em))
		return PTR_ERR(em);

	if (em->block_start == EXTENT_MAP_HOLE)
2856 2857 2858 2859 2860
		ret = RANGE_BOUNDARY_HOLE;
	else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
		ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
	else
		ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968

	free_extent_map(em);
	return ret;
}

static int btrfs_zero_range(struct inode *inode,
			    loff_t offset,
			    loff_t len,
			    const int mode)
{
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	struct extent_map *em;
	struct extent_changeset *data_reserved = NULL;
	int ret;
	u64 alloc_hint = 0;
	const u64 sectorsize = btrfs_inode_sectorsize(inode);
	u64 alloc_start = round_down(offset, sectorsize);
	u64 alloc_end = round_up(offset + len, sectorsize);
	u64 bytes_to_reserve = 0;
	bool space_reserved = false;

	inode_dio_wait(inode);

	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
			      alloc_start, alloc_end - alloc_start, 0);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out;
	}

	/*
	 * Avoid hole punching and extent allocation for some cases. More cases
	 * could be considered, but these are unlikely common and we keep things
	 * as simple as possible for now. Also, intentionally, if the target
	 * range contains one or more prealloc extents together with regular
	 * extents and holes, we drop all the existing extents and allocate a
	 * new prealloc extent, so that we get a larger contiguous disk extent.
	 */
	if (em->start <= alloc_start &&
	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
		const u64 em_end = em->start + em->len;

		if (em_end >= offset + len) {
			/*
			 * The whole range is already a prealloc extent,
			 * do nothing except updating the inode's i_size if
			 * needed.
			 */
			free_extent_map(em);
			ret = btrfs_fallocate_update_isize(inode, offset + len,
							   mode);
			goto out;
		}
		/*
		 * Part of the range is already a prealloc extent, so operate
		 * only on the remaining part of the range.
		 */
		alloc_start = em_end;
		ASSERT(IS_ALIGNED(alloc_start, sectorsize));
		len = offset + len - alloc_start;
		offset = alloc_start;
		alloc_hint = em->block_start + em->len;
	}
	free_extent_map(em);

	if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
	    BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
				      alloc_start, sectorsize, 0);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			goto out;
		}

		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
			free_extent_map(em);
			ret = btrfs_fallocate_update_isize(inode, offset + len,
							   mode);
			goto out;
		}
		if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
			free_extent_map(em);
			ret = btrfs_truncate_block(inode, offset, len, 0);
			if (!ret)
				ret = btrfs_fallocate_update_isize(inode,
								   offset + len,
								   mode);
			return ret;
		}
		free_extent_map(em);
		alloc_start = round_down(offset, sectorsize);
		alloc_end = alloc_start + sectorsize;
		goto reserve_space;
	}

	alloc_start = round_up(offset, sectorsize);
	alloc_end = round_down(offset + len, sectorsize);

	/*
	 * For unaligned ranges, check the pages at the boundaries, they might
	 * map to an extent, in which case we need to partially zero them, or
	 * they might map to a hole, in which case we need our allocation range
	 * to cover them.
	 */
	if (!IS_ALIGNED(offset, sectorsize)) {
		ret = btrfs_zero_range_check_range_boundary(inode, offset);
		if (ret < 0)
			goto out;
2969
		if (ret == RANGE_BOUNDARY_HOLE) {
2970 2971
			alloc_start = round_down(offset, sectorsize);
			ret = 0;
2972
		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
2973 2974 2975
			ret = btrfs_truncate_block(inode, offset, 0, 0);
			if (ret)
				goto out;
2976 2977
		} else {
			ret = 0;
2978 2979 2980 2981 2982 2983 2984 2985
		}
	}

	if (!IS_ALIGNED(offset + len, sectorsize)) {
		ret = btrfs_zero_range_check_range_boundary(inode,
							    offset + len);
		if (ret < 0)
			goto out;
2986
		if (ret == RANGE_BOUNDARY_HOLE) {
2987 2988
			alloc_end = round_up(offset + len, sectorsize);
			ret = 0;
2989
		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
2990 2991 2992
			ret = btrfs_truncate_block(inode, offset + len, 0, 1);
			if (ret)
				goto out;
2993 2994
		} else {
			ret = 0;
2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024
		}
	}

reserve_space:
	if (alloc_start < alloc_end) {
		struct extent_state *cached_state = NULL;
		const u64 lockstart = alloc_start;
		const u64 lockend = alloc_end - 1;

		bytes_to_reserve = alloc_end - alloc_start;
		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
						      bytes_to_reserve);
		if (ret < 0)
			goto out;
		space_reserved = true;
		ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
						alloc_start, bytes_to_reserve);
		if (ret)
			goto out;
		ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
						  &cached_state);
		if (ret)
			goto out;
		ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
						alloc_end - alloc_start,
						i_blocksize(inode),
						offset + len, &alloc_hint);
		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
				     lockend, &cached_state);
		/* btrfs_prealloc_file_range releases reserved space on error */
3025
		if (ret) {
3026
			space_reserved = false;
3027 3028
			goto out;
		}
3029
	}
3030
	ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
3031 3032 3033 3034 3035 3036 3037 3038 3039
 out:
	if (ret && space_reserved)
		btrfs_free_reserved_data_space(inode, data_reserved,
					       alloc_start, bytes_to_reserve);
	extent_changeset_free(data_reserved);

	return ret;
}

3040 3041 3042
static long btrfs_fallocate(struct file *file, int mode,
			    loff_t offset, loff_t len)
{
A
Al Viro 已提交
3043
	struct inode *inode = file_inode(file);
3044
	struct extent_state *cached_state = NULL;
3045
	struct extent_changeset *data_reserved = NULL;
3046 3047 3048
	struct falloc_range *range;
	struct falloc_range *tmp;
	struct list_head reserve_list;
3049 3050 3051 3052 3053 3054
	u64 cur_offset;
	u64 last_byte;
	u64 alloc_start;
	u64 alloc_end;
	u64 alloc_hint = 0;
	u64 locked_end;
3055
	u64 actual_end = 0;
3056
	struct extent_map *em;
3057
	int blocksize = btrfs_inode_sectorsize(inode);
3058 3059
	int ret;

3060 3061
	alloc_start = round_down(offset, blocksize);
	alloc_end = round_up(offset + len, blocksize);
3062
	cur_offset = alloc_start;
3063

J
Josef Bacik 已提交
3064
	/* Make sure we aren't being give some crap mode */
3065 3066
	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
		     FALLOC_FL_ZERO_RANGE))
3067 3068
		return -EOPNOTSUPP;

J
Josef Bacik 已提交
3069 3070 3071
	if (mode & FALLOC_FL_PUNCH_HOLE)
		return btrfs_punch_hole(inode, offset, len);

3072
	/*
3073 3074 3075
	 * Only trigger disk allocation, don't trigger qgroup reserve
	 *
	 * For qgroup space, it will be checked later.
3076
	 */
3077 3078 3079 3080 3081 3082
	if (!(mode & FALLOC_FL_ZERO_RANGE)) {
		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
						      alloc_end - alloc_start);
		if (ret < 0)
			return ret;
	}
3083

A
Al Viro 已提交
3084
	inode_lock(inode);
3085 3086 3087 3088 3089 3090

	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
		ret = inode_newsize_ok(inode, offset + len);
		if (ret)
			goto out;
	}
3091

3092 3093 3094 3095 3096 3097 3098
	/*
	 * TODO: Move these two operations after we have checked
	 * accurate reserved space, or fallocate can still fail but
	 * with page truncated or size expanded.
	 *
	 * But that's a minor problem and won't do much harm BTW.
	 */
3099
	if (alloc_start > inode->i_size) {
3100 3101
		ret = btrfs_cont_expand(inode, i_size_read(inode),
					alloc_start);
3102 3103
		if (ret)
			goto out;
3104
	} else if (offset + len > inode->i_size) {
3105 3106
		/*
		 * If we are fallocating from the end of the file onward we
3107 3108
		 * need to zero out the end of the block if i_size lands in the
		 * middle of a block.
3109
		 */
3110
		ret = btrfs_truncate_block(inode, inode->i_size, 0, 0);
3111 3112
		if (ret)
			goto out;
3113 3114
	}

3115 3116 3117 3118
	/*
	 * wait for ordered IO before we have any locks.  We'll loop again
	 * below with the locks held.
	 */
3119 3120 3121 3122
	ret = btrfs_wait_ordered_range(inode, alloc_start,
				       alloc_end - alloc_start);
	if (ret)
		goto out;
3123

3124 3125 3126 3127 3128 3129
	if (mode & FALLOC_FL_ZERO_RANGE) {
		ret = btrfs_zero_range(inode, offset, len, mode);
		inode_unlock(inode);
		return ret;
	}

3130 3131 3132 3133 3134 3135 3136 3137
	locked_end = alloc_end - 1;
	while (1) {
		struct btrfs_ordered_extent *ordered;

		/* the extent lock is ordered inside the running
		 * transaction
		 */
		lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
3138
				 locked_end, &cached_state);
3139 3140
		ordered = btrfs_lookup_first_ordered_extent(inode, locked_end);

3141 3142 3143 3144 3145 3146
		if (ordered &&
		    ordered->file_offset + ordered->len > alloc_start &&
		    ordered->file_offset < alloc_end) {
			btrfs_put_ordered_extent(ordered);
			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
					     alloc_start, locked_end,
3147
					     &cached_state);
3148 3149 3150 3151
			/*
			 * we can't wait on the range with the transaction
			 * running or with the extent lock held
			 */
3152 3153 3154 3155
			ret = btrfs_wait_ordered_range(inode, alloc_start,
						       alloc_end - alloc_start);
			if (ret)
				goto out;
3156 3157 3158 3159 3160 3161 3162
		} else {
			if (ordered)
				btrfs_put_ordered_extent(ordered);
			break;
		}
	}

3163 3164
	/* First, check if we exceed the qgroup limit */
	INIT_LIST_HEAD(&reserve_list);
3165
	while (cur_offset < alloc_end) {
3166
		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
3167
				      alloc_end - cur_offset, 0);
3168 3169
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
3170 3171
			break;
		}
3172
		last_byte = min(extent_map_end(em), alloc_end);
3173
		actual_end = min_t(u64, extent_map_end(em), offset + len);
3174
		last_byte = ALIGN(last_byte, blocksize);
3175 3176 3177
		if (em->block_start == EXTENT_MAP_HOLE ||
		    (cur_offset >= inode->i_size &&
		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
3178 3179 3180 3181 3182
			ret = add_falloc_range(&reserve_list, cur_offset,
					       last_byte - cur_offset);
			if (ret < 0) {
				free_extent_map(em);
				break;
3183
			}
3184 3185
			ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
					cur_offset, last_byte - cur_offset);
3186 3187
			if (ret < 0) {
				free_extent_map(em);
3188
				break;
3189
			}
3190 3191 3192 3193 3194 3195
		} else {
			/*
			 * Do not need to reserve unwritten extent for this
			 * range, free reserved data space first, otherwise
			 * it'll result in false ENOSPC error.
			 */
3196 3197
			btrfs_free_reserved_data_space(inode, data_reserved,
					cur_offset, last_byte - cur_offset);
3198 3199 3200
		}
		free_extent_map(em);
		cur_offset = last_byte;
3201 3202 3203 3204 3205 3206 3207 3208 3209 3210
	}

	/*
	 * If ret is still 0, means we're OK to fallocate.
	 * Or just cleanup the list and exit.
	 */
	list_for_each_entry_safe(range, tmp, &reserve_list, list) {
		if (!ret)
			ret = btrfs_prealloc_file_range(inode, mode,
					range->start,
F
Fabian Frederick 已提交
3211
					range->len, i_blocksize(inode),
3212
					offset + len, &alloc_hint);
3213
		else
3214 3215 3216
			btrfs_free_reserved_data_space(inode,
					data_reserved, range->start,
					range->len);
3217 3218 3219 3220 3221 3222
		list_del(&range->list);
		kfree(range);
	}
	if (ret < 0)
		goto out_unlock;

3223 3224 3225 3226 3227
	/*
	 * We didn't need to allocate any more space, but we still extended the
	 * size of the file so we need to update i_size and the inode item.
	 */
	ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
3228
out_unlock:
3229
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
3230
			     &cached_state);
3231
out:
A
Al Viro 已提交
3232
	inode_unlock(inode);
3233
	/* Let go of our reservation. */
3234
	if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
3235 3236
		btrfs_free_reserved_data_space(inode, data_reserved,
				alloc_start, alloc_end - cur_offset);
3237
	extent_changeset_free(data_reserved);
3238 3239 3240
	return ret;
}

3241
static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
J
Josef Bacik 已提交
3242
{
3243
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3244
	struct extent_map *em = NULL;
J
Josef Bacik 已提交
3245
	struct extent_state *cached_state = NULL;
L
Liu Bo 已提交
3246 3247 3248 3249
	u64 lockstart;
	u64 lockend;
	u64 start;
	u64 len;
J
Josef Bacik 已提交
3250 3251
	int ret = 0;

L
Liu Bo 已提交
3252 3253 3254 3255 3256 3257 3258 3259 3260
	if (inode->i_size == 0)
		return -ENXIO;

	/*
	 * *offset can be negative, in this case we start finding DATA/HOLE from
	 * the very start of the file.
	 */
	start = max_t(loff_t, 0, *offset);

3261
	lockstart = round_down(start, fs_info->sectorsize);
3262
	lockend = round_up(i_size_read(inode),
3263
			   fs_info->sectorsize);
J
Josef Bacik 已提交
3264
	if (lockend <= lockstart)
3265
		lockend = lockstart + fs_info->sectorsize;
L
Liu Bo 已提交
3266
	lockend--;
J
Josef Bacik 已提交
3267 3268
	len = lockend - lockstart + 1;

3269
	lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
3270
			 &cached_state);
J
Josef Bacik 已提交
3271

3272
	while (start < inode->i_size) {
3273 3274
		em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0,
				start, len, 0);
J
Josef Bacik 已提交
3275
		if (IS_ERR(em)) {
3276
			ret = PTR_ERR(em);
3277
			em = NULL;
J
Josef Bacik 已提交
3278 3279 3280
			break;
		}

3281 3282 3283 3284 3285 3286 3287 3288
		if (whence == SEEK_HOLE &&
		    (em->block_start == EXTENT_MAP_HOLE ||
		     test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
			break;
		else if (whence == SEEK_DATA &&
			   (em->block_start != EXTENT_MAP_HOLE &&
			    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
			break;
J
Josef Bacik 已提交
3289 3290 3291

		start = em->start + em->len;
		free_extent_map(em);
3292
		em = NULL;
J
Josef Bacik 已提交
3293 3294
		cond_resched();
	}
3295 3296 3297 3298 3299 3300 3301
	free_extent_map(em);
	if (!ret) {
		if (whence == SEEK_DATA && start >= inode->i_size)
			ret = -ENXIO;
		else
			*offset = min_t(loff_t, start, inode->i_size);
	}
J
Josef Bacik 已提交
3302
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
3303
			     &cached_state);
J
Josef Bacik 已提交
3304 3305 3306
	return ret;
}

3307
static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
J
Josef Bacik 已提交
3308 3309 3310 3311
{
	struct inode *inode = file->f_mapping->host;
	int ret;

A
Al Viro 已提交
3312
	inode_lock(inode);
3313
	switch (whence) {
J
Josef Bacik 已提交
3314 3315
	case SEEK_END:
	case SEEK_CUR:
3316
		offset = generic_file_llseek(file, offset, whence);
J
Josef Bacik 已提交
3317 3318 3319
		goto out;
	case SEEK_DATA:
	case SEEK_HOLE:
3320
		if (offset >= i_size_read(inode)) {
A
Al Viro 已提交
3321
			inode_unlock(inode);
3322 3323 3324
			return -ENXIO;
		}

3325
		ret = find_desired_extent(inode, &offset, whence);
J
Josef Bacik 已提交
3326
		if (ret) {
A
Al Viro 已提交
3327
			inode_unlock(inode);
J
Josef Bacik 已提交
3328 3329 3330 3331
			return ret;
		}
	}

J
Jie Liu 已提交
3332
	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
J
Josef Bacik 已提交
3333
out:
A
Al Viro 已提交
3334
	inode_unlock(inode);
J
Josef Bacik 已提交
3335 3336 3337
	return offset;
}

G
Goldwyn Rodrigues 已提交
3338 3339
static int btrfs_file_open(struct inode *inode, struct file *filp)
{
3340
	filp->f_mode |= FMODE_NOWAIT;
G
Goldwyn Rodrigues 已提交
3341 3342 3343
	return generic_file_open(inode, filp);
}

3344
const struct file_operations btrfs_file_operations = {
J
Josef Bacik 已提交
3345
	.llseek		= btrfs_file_llseek,
3346
	.read_iter      = generic_file_read_iter,
C
Chris Mason 已提交
3347
	.splice_read	= generic_file_splice_read,
A
Al Viro 已提交
3348
	.write_iter	= btrfs_file_write_iter,
C
Chris Mason 已提交
3349
	.mmap		= btrfs_file_mmap,
G
Goldwyn Rodrigues 已提交
3350
	.open		= btrfs_file_open,
3351
	.release	= btrfs_release_file,
C
Chris Mason 已提交
3352
	.fsync		= btrfs_sync_file,
3353
	.fallocate	= btrfs_fallocate,
C
Christoph Hellwig 已提交
3354
	.unlocked_ioctl	= btrfs_ioctl,
C
Chris Mason 已提交
3355
#ifdef CONFIG_COMPAT
3356
	.compat_ioctl	= btrfs_compat_ioctl,
C
Chris Mason 已提交
3357
#endif
3358
	.clone_file_range = btrfs_clone_file_range,
3359
	.dedupe_file_range = btrfs_dedupe_file_range,
C
Chris Mason 已提交
3360
};
3361

3362
void __cold btrfs_auto_defrag_exit(void)
3363
{
3364
	kmem_cache_destroy(btrfs_inode_defrag_cachep);
3365 3366
}

3367
int __init btrfs_auto_defrag_init(void)
3368 3369 3370
{
	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
					sizeof(struct inode_defrag), 0,
3371
					SLAB_MEM_SPREAD,
3372 3373 3374 3375 3376 3377
					NULL);
	if (!btrfs_inode_defrag_cachep)
		return -ENOMEM;

	return 0;
}
3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403

int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
{
	int ret;

	/*
	 * So with compression we will find and lock a dirty page and clear the
	 * first one as dirty, setup an async extent, and immediately return
	 * with the entire range locked but with nobody actually marked with
	 * writeback.  So we can't just filemap_write_and_wait_range() and
	 * expect it to work since it will just kick off a thread to do the
	 * actual work.  So we need to call filemap_fdatawrite_range _again_
	 * since it will wait on the page lock, which won't be unlocked until
	 * after the pages have been marked as writeback and so we're good to go
	 * from there.  We have to do this otherwise we'll miss the ordered
	 * extents and that results in badness.  Please Josef, do not think you
	 * know better and pull this out at some point in the future, it is
	 * right and you are wrong.
	 */
	ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
			     &BTRFS_I(inode)->runtime_flags))
		ret = filemap_fdatawrite_range(inode->i_mapping, start, end);

	return ret;
}