file.c 31.2 KB
Newer Older
C
Chris Mason 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

C
Chris Mason 已提交
19 20 21 22 23 24 25 26 27 28 29 30 31
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/smp_lock.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
32
#include <linux/version.h>
C
Chris Mason 已提交
33 34 35 36 37 38
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "ioctl.h"
#include "print-tree.h"
39 40
#include "tree-log.h"
#include "locking.h"
41
#include "compat.h"
C
Chris Mason 已提交
42 43


C
Chris Mason 已提交
44 45 46
/* simple helper to fault in pages and copy.  This should go away
 * and be replaced with calls into generic code.
 */
47 48 49 50
static int noinline btrfs_copy_from_user(loff_t pos, int num_pages,
					 int write_bytes,
					 struct page **prepared_pages,
					 const char __user * buf)
C
Chris Mason 已提交
51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
{
	long page_fault = 0;
	int i;
	int offset = pos & (PAGE_CACHE_SIZE - 1);

	for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
		size_t count = min_t(size_t,
				     PAGE_CACHE_SIZE - offset, write_bytes);
		struct page *page = prepared_pages[i];
		fault_in_pages_readable(buf, count);

		/* Copy data from userspace to the current page */
		kmap(page);
		page_fault = __copy_from_user(page_address(page) + offset,
					      buf, count);
		/* Flush processor's dcache for this page */
		flush_dcache_page(page);
		kunmap(page);
		buf += count;
		write_bytes -= count;

		if (page_fault)
			break;
	}
	return page_fault ? -EFAULT : 0;
}

C
Chris Mason 已提交
78 79 80
/*
 * unlocks pages after btrfs_file_write is done with them
 */
81
static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
C
Chris Mason 已提交
82 83 84 85 86
{
	size_t i;
	for (i = 0; i < num_pages; i++) {
		if (!pages[i])
			break;
C
Chris Mason 已提交
87 88 89 90
		/* page checked is some magic around finding pages that
		 * have been modified without going through btrfs_set_page_dirty
		 * clear it here
		 */
C
Chris Mason 已提交
91
		ClearPageChecked(pages[i]);
C
Chris Mason 已提交
92 93 94 95 96 97
		unlock_page(pages[i]);
		mark_page_accessed(pages[i]);
		page_cache_release(pages[i]);
	}
}

C
Chris Mason 已提交
98 99 100 101
/* this does all the hard work for inserting an inline extent into
 * the btree.  Any existing inline extent is extended as required to make room,
 * otherwise things are inserted as required into the btree
 */
102
static int noinline insert_inline_extent(struct btrfs_trans_handle *trans,
103
				struct btrfs_root *root, struct inode *inode,
104 105 106
				u64 offset, size_t size,
				struct page **pages, size_t page_offset,
				int num_pages)
107 108 109
{
	struct btrfs_key key;
	struct btrfs_path *path;
110 111 112
	struct extent_buffer *leaf;
	char *kaddr;
	unsigned long ptr;
113
	struct btrfs_file_extent_item *ei;
114
	struct page *page;
115 116 117
	u32 datasize;
	int err = 0;
	int ret;
118 119
	int i;
	ssize_t cur_size;
120 121 122 123 124 125 126 127 128 129 130

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	btrfs_set_trans_block_group(trans, inode);

	key.objectid = inode->i_ino;
	key.offset = offset;
	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);

131 132
	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0) {
133 134 135
		err = ret;
		goto fail;
	}
136
	if (ret == 1) {
137 138 139 140 141
		struct btrfs_key found_key;

		if (path->slots[0] == 0)
			goto insert;

142 143
		path->slots[0]--;
		leaf = path->nodes[0];
144 145 146 147 148 149 150
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		if (found_key.objectid != inode->i_ino)
			goto insert;

		if (found_key.type != BTRFS_EXTENT_DATA_KEY)
			goto insert;
151 152 153 154 155 156 157 158 159 160 161 162
		ei = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);

		if (btrfs_file_extent_type(leaf, ei) !=
		    BTRFS_FILE_EXTENT_INLINE) {
			goto insert;
		}
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		ret = 0;
	}
	if (ret == 0) {
		u32 found_size;
163
		u64 found_end;
164 165 166 167 168 169 170 171 172 173 174 175 176 177 178

		leaf = path->nodes[0];
		ei = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);

		if (btrfs_file_extent_type(leaf, ei) !=
		    BTRFS_FILE_EXTENT_INLINE) {
			err = ret;
			btrfs_print_leaf(root, leaf);
			printk("found wasn't inline offset %Lu inode %lu\n",
			       offset, inode->i_ino);
			goto fail;
		}
		found_size = btrfs_file_extent_inline_len(leaf,
					  btrfs_item_nr(leaf, path->slots[0]));
179
		found_end = key.offset + found_size;
180

181
		if (found_end < offset + size) {
182 183
			btrfs_release_path(root, path);
			ret = btrfs_search_slot(trans, root, &key, path,
184
						offset + size - found_end, 1);
185
			BUG_ON(ret != 0);
186

187
			ret = btrfs_extend_item(trans, root, path,
188
						offset + size - found_end);
189 190 191 192 193 194 195
			if (ret) {
				err = ret;
				goto fail;
			}
			leaf = path->nodes[0];
			ei = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
C
Chris Mason 已提交
196
			inode->i_blocks += (offset + size - found_end) >> 9;
197
		}
198 199 200 201
		if (found_end < offset) {
			ptr = btrfs_file_extent_inline_start(ei) + found_size;
			memset_extent_buffer(leaf, 0, ptr, offset - found_end);
		}
202 203 204
	} else {
insert:
		btrfs_release_path(root, path);
205
		datasize = offset + size - key.offset;
C
Chris Mason 已提交
206
		inode->i_blocks += datasize >> 9;
207
		datasize = btrfs_file_extent_calc_inline_size(datasize);
208 209 210 211 212 213 214 215 216 217 218 219 220
		ret = btrfs_insert_empty_item(trans, root, path, &key,
					      datasize);
		if (ret) {
			err = ret;
			printk("got bad ret %d\n", ret);
			goto fail;
		}
		leaf = path->nodes[0];
		ei = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_generation(leaf, ei, trans->transid);
		btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	}
221
	ptr = btrfs_file_extent_inline_start(ei) + offset - key.offset;
222 223 224 225 226 227

	cur_size = size;
	i = 0;
	while (size > 0) {
		page = pages[i];
		kaddr = kmap_atomic(page, KM_USER0);
J
Jens Axboe 已提交
228
		cur_size = min_t(size_t, PAGE_CACHE_SIZE - page_offset, size);
229 230 231 232 233 234 235 236 237 238
		write_extent_buffer(leaf, kaddr + page_offset, ptr, cur_size);
		kunmap_atomic(kaddr, KM_USER0);
		page_offset = 0;
		ptr += cur_size;
		size -= cur_size;
		if (i >= num_pages) {
			printk("i %d num_pages %d\n", i, num_pages);
		}
		i++;
	}
239
	btrfs_mark_buffer_dirty(leaf);
240 241 242 243 244
fail:
	btrfs_free_path(path);
	return err;
}

C
Chris Mason 已提交
245 246 247 248 249 250 251 252
/*
 * after copy_from_user, pages need to be dirtied and we need to make
 * sure holes are created between the current EOF and the start of
 * any next extents (if required).
 *
 * this also makes the decision about creating an inline extent vs
 * doing real data extents, marking pages dirty and delalloc as required.
 */
253
static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
C
Chris Mason 已提交
254 255 256 257 258 259 260 261
				   struct btrfs_root *root,
				   struct file *file,
				   struct page **pages,
				   size_t num_pages,
				   loff_t pos,
				   size_t write_bytes)
{
	int err = 0;
262
	int i;
263
	struct inode *inode = fdentry(file)->d_inode;
264
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
265 266
	u64 hint_byte;
	u64 num_bytes;
267 268 269
	u64 start_pos;
	u64 end_of_last_block;
	u64 end_pos = pos + write_bytes;
270
	u64 inline_size;
271
	int did_inline = 0;
272
	loff_t isize = i_size_read(inode);
C
Chris Mason 已提交
273

274
	start_pos = pos & ~((u64)root->sectorsize - 1);
275 276
	num_bytes = (write_bytes + pos - start_pos +
		    root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
C
Chris Mason 已提交
277

278 279
	end_of_last_block = start_pos + num_bytes - 1;

280
	lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
C
Chris Mason 已提交
281
	trans = btrfs_join_transaction(root, 1);
282 283 284 285 286
	if (!trans) {
		err = -ENOMEM;
		goto out_unlock;
	}
	btrfs_set_trans_block_group(trans, inode);
287
	hint_byte = 0;
288 289

	if ((end_of_last_block & 4095) == 0) {
290
		printk("strange end of last %Lu %zu %Lu\n", start_pos, write_bytes, end_of_last_block);
291
	}
292
	set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
293 294 295

	/* FIXME...EIEIO, ENOSPC and more */
	/* insert any holes we need to create */
296
	if (isize < start_pos) {
297 298
		u64 last_pos_in_file;
		u64 hole_size;
299
		u64 mask = root->sectorsize - 1;
300
		last_pos_in_file = (isize + mask) & ~mask;
301
		hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
302 303 304
		if (hole_size > 0) {
			btrfs_wait_ordered_range(inode, last_pos_in_file,
						 last_pos_in_file + hole_size);
305
			mutex_lock(&BTRFS_I(inode)->extent_mutex);
C
Chris Mason 已提交
306 307 308
			err = btrfs_drop_extents(trans, root, inode,
						 last_pos_in_file,
						 last_pos_in_file + hole_size,
309
						 last_pos_in_file,
310
						 &hint_byte);
C
Chris Mason 已提交
311 312 313
			if (err)
				goto failed;

314 315 316
			err = btrfs_insert_file_extent(trans, root,
						       inode->i_ino,
						       last_pos_in_file,
S
Sage Weil 已提交
317
						       0, 0, hole_size, 0);
318
			btrfs_drop_extent_cache(inode, last_pos_in_file,
319
					last_pos_in_file + hole_size - 1, 0);
320
			mutex_unlock(&BTRFS_I(inode)->extent_mutex);
321
			btrfs_check_file(root, inode);
322 323
		}
		if (err)
C
Chris Mason 已提交
324
			goto failed;
325 326 327 328 329 330
	}

	/*
	 * either allocate an extent for the new bytes or setup the key
	 * to show we are doing inline data in the extent
	 */
331 332
	inline_size = end_pos;
	if (isize >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
333 334
	    inline_size > root->fs_info->max_inline ||
	    (inline_size & (root->sectorsize -1)) == 0 ||
335
	    inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) {
336 337 338 339
		/* check for reserved extents on each page, we don't want
		 * to reset the delalloc bit on things that already have
		 * extents reserved.
		 */
340
		btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
341 342 343
		for (i = 0; i < num_pages; i++) {
			struct page *p = pages[i];
			SetPageUptodate(p);
344
			ClearPageChecked(p);
345
			set_page_dirty(p);
C
Chris Mason 已提交
346
		}
347
	} else {
348
		u64 aligned_end;
349
		/* step one, delete the existing extents in this range */
350 351
		aligned_end = (pos + write_bytes + root->sectorsize - 1) &
			~((u64)root->sectorsize - 1);
352
		mutex_lock(&BTRFS_I(inode)->extent_mutex);
C
Chris Mason 已提交
353
		err = btrfs_drop_extents(trans, root, inode, start_pos,
354
					 aligned_end, aligned_end, &hint_byte);
C
Chris Mason 已提交
355 356
		if (err)
			goto failed;
357 358 359
		if (isize > inline_size)
			inline_size = min_t(u64, isize, aligned_end);
		inline_size -= start_pos;
360
		err = insert_inline_extent(trans, root, inode, start_pos,
361
					   inline_size, pages, 0, num_pages);
362
		btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1, 0);
363
		BUG_ON(err);
364
		mutex_unlock(&BTRFS_I(inode)->extent_mutex);
365 366 367 368 369 370 371

		/*
		 * an ugly way to do all the prop accounting around
		 * the page bits and mapping tags
		 */
		set_page_writeback(pages[0]);
		end_page_writeback(pages[0]);
372
		did_inline = 1;
373 374 375
	}
	if (end_pos > isize) {
		i_size_write(inode, end_pos);
376 377
		if (did_inline)
			BTRFS_I(inode)->disk_i_size = end_pos;
378
		btrfs_update_inode(trans, root, inode);
C
Chris Mason 已提交
379 380
	}
failed:
C
Chris Mason 已提交
381
	err = btrfs_end_transaction(trans, root);
382
out_unlock:
383
	unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
C
Chris Mason 已提交
384 385 386
	return err;
}

C
Chris Mason 已提交
387 388 389 390
/*
 * this drops all the extents in the cache that intersect the range
 * [start, end].  Existing extents are split as required.
 */
391 392
int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
			    int skip_pinned)
393 394
{
	struct extent_map *em;
395 396
	struct extent_map *split = NULL;
	struct extent_map *split2 = NULL;
397
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
398
	u64 len = end - start + 1;
399 400
	int ret;
	int testend = 1;
401
	unsigned long flags;
402

403
	WARN_ON(end < start);
404
	if (end == (u64)-1) {
405
		len = (u64)-1;
406 407
		testend = 0;
	}
408
	while(1) {
409 410 411 412 413
		if (!split)
			split = alloc_extent_map(GFP_NOFS);
		if (!split2)
			split2 = alloc_extent_map(GFP_NOFS);

414
		spin_lock(&em_tree->lock);
415
		em = lookup_extent_mapping(em_tree, start, len);
416 417
		if (!em) {
			spin_unlock(&em_tree->lock);
418
			break;
419
		}
420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436
		flags = em->flags;
		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
			spin_unlock(&em_tree->lock);
			if (em->start <= start &&
			    (!testend || em->start + em->len >= start + len)) {
				free_extent_map(em);
				break;
			}
			if (start < em->start) {
				len = em->start - start;
			} else {
				len = start + len - (em->start + em->len);
				start = em->start + em->len;
			}
			free_extent_map(em);
			continue;
		}
437
		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
438
		remove_extent_mapping(em_tree, em);
439 440 441 442 443 444 445

		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
		    em->start < start) {
			split->start = em->start;
			split->len = start - em->start;
			split->block_start = em->block_start;
			split->bdev = em->bdev;
446
			split->flags = flags;
447 448 449 450 451 452 453 454 455 456 457 458 459
			ret = add_extent_mapping(em_tree, split);
			BUG_ON(ret);
			free_extent_map(split);
			split = split2;
			split2 = NULL;
		}
		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
		    testend && em->start + em->len > start + len) {
			u64 diff = start + len - em->start;

			split->start = start + len;
			split->len = em->start + em->len - (start + len);
			split->bdev = em->bdev;
460
			split->flags = flags;
461 462 463 464 465 466 467 468

			split->block_start = em->block_start + diff;

			ret = add_extent_mapping(em_tree, split);
			BUG_ON(ret);
			free_extent_map(split);
			split = NULL;
		}
469 470
		spin_unlock(&em_tree->lock);

471 472 473 474 475
		/* once for us */
		free_extent_map(em);
		/* once for the tree*/
		free_extent_map(em);
	}
476 477 478 479
	if (split)
		free_extent_map(split);
	if (split2)
		free_extent_map(split2);
480 481 482
	return 0;
}

483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517
/*
 * Debugging hook called after file extent items are changed.
 *
 * The extent walk that used to live here was compiled out and sat
 * behind an unconditional return, so it could never run.  The function
 * is a no-op that ignores both arguments and always returns 0.
 */
int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
{
	return 0;
}

C
Chris Mason 已提交
557 558 559 560 561 562 563 564
/*
 * this is very complex, but the basic idea is to drop all extents
 * in the range start - end.  hint_block is filled in with a block number
 * that would be a good hint to the block allocator for this file.
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split.  Anything entirely inside the range
 * is deleted from the tree.
C
Chris Mason 已提交
565 566 567
 *
 * inline_limit is used to tell this code which offsets in the file to keep
 * if they contain inline extents.
C
Chris Mason 已提交
568
 */
569
int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
C
Chris Mason 已提交
570
		       struct btrfs_root *root, struct inode *inode,
571
		       u64 start, u64 end, u64 inline_limit, u64 *hint_byte)
C
Chris Mason 已提交
572
{
573 574
	u64 extent_end = 0;
	u64 search_start = start;
Z
Zheng Yan 已提交
575 576 577
	u64 leaf_start;
	u64 root_gen;
	u64 root_owner;
578
	struct extent_buffer *leaf;
C
Chris Mason 已提交
579 580
	struct btrfs_file_extent_item *extent;
	struct btrfs_path *path;
581 582 583 584
	struct btrfs_key key;
	struct btrfs_file_extent_item old;
	int keep;
	int slot;
C
Chris Mason 已提交
585 586 587 588
	int bookend;
	int found_type;
	int found_extent;
	int found_inline;
C
Chris Mason 已提交
589
	int recow;
590
	int ret;
C
Chris Mason 已提交
591

592
	btrfs_drop_extent_cache(inode, start, end - 1, 0);
593

C
Chris Mason 已提交
594 595 596 597
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	while(1) {
C
Chris Mason 已提交
598
		recow = 0;
C
Chris Mason 已提交
599 600 601 602 603 604 605 606 607 608 609 610
		btrfs_release_path(root, path);
		ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
					       search_start, -1);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			if (path->slots[0] == 0) {
				ret = 0;
				goto out;
			}
			path->slots[0]--;
		}
611
next_slot:
C
Chris Mason 已提交
612 613 614 615
		keep = 0;
		bookend = 0;
		found_extent = 0;
		found_inline = 0;
Z
Zheng Yan 已提交
616 617 618
		leaf_start = 0;
		root_gen = 0;
		root_owner = 0;
C
Chris Mason 已提交
619
		extent = NULL;
620
		leaf = path->nodes[0];
C
Chris Mason 已提交
621
		slot = path->slots[0];
622
		ret = 0;
623
		btrfs_item_key_to_cpu(leaf, &key, slot);
624 625
		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY &&
		    key.offset >= end) {
C
Chris Mason 已提交
626 627
			goto out;
		}
628 629
		if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
		    key.objectid != inode->i_ino) {
C
Chris Mason 已提交
630 631
			goto out;
		}
C
Chris Mason 已提交
632 633 634 635
		if (recow) {
			search_start = key.offset;
			continue;
		}
636 637 638
		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
			extent = btrfs_item_ptr(leaf, slot,
						struct btrfs_file_extent_item);
639
			found_type = btrfs_file_extent_type(leaf, extent);
640
			if (found_type == BTRFS_FILE_EXTENT_REG) {
641 642 643 644 645 646
				extent_end =
				     btrfs_file_extent_disk_bytenr(leaf,
								   extent);
				if (extent_end)
					*hint_byte = extent_end;

647
				extent_end = key.offset +
648
				     btrfs_file_extent_num_bytes(leaf, extent);
649 650
				found_extent = 1;
			} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
651 652
				struct btrfs_item *item;
				item = btrfs_item_nr(leaf, slot);
653 654
				found_inline = 1;
				extent_end = key.offset +
655
				     btrfs_file_extent_inline_len(leaf, item);
656 657 658
			}
		} else {
			extent_end = search_start;
C
Chris Mason 已提交
659 660 661
		}

		/* we found nothing we can drop */
662 663 664 665
		if ((!found_extent && !found_inline) ||
		    search_start >= extent_end) {
			int nextret;
			u32 nritems;
666
			nritems = btrfs_header_nritems(leaf);
667 668 669 670
			if (slot >= nritems - 1) {
				nextret = btrfs_next_leaf(root, path);
				if (nextret)
					goto out;
C
Chris Mason 已提交
671
				recow = 1;
672 673 674 675
			} else {
				path->slots[0]++;
			}
			goto next_slot;
C
Chris Mason 已提交
676 677 678
		}

		if (found_inline) {
679
			u64 mask = root->sectorsize - 1;
C
Chris Mason 已提交
680 681 682
			search_start = (extent_end + mask) & ~mask;
		} else
			search_start = extent_end;
Y
Yan 已提交
683
		if (end <= extent_end && start >= key.offset && found_inline) {
684
			*hint_byte = EXTENT_MAP_INLINE;
Z
Zheng Yan 已提交
685 686 687 688 689 690 691 692 693
			goto out;
		}

		if (found_extent) {
			read_extent_buffer(leaf, &old, (unsigned long)extent,
					   sizeof(old));
			root_gen = btrfs_header_generation(leaf);
			root_owner = btrfs_header_owner(leaf);
			leaf_start = leaf->start;
694
		}
Z
Zheng Yan 已提交
695

C
Chris Mason 已提交
696
		if (end < extent_end && end >= key.offset) {
697
			bookend = 1;
698
			if (found_inline && start <= key.offset)
699
				keep = 1;
C
Chris Mason 已提交
700 701 702 703 704 705
		}
		/* truncate existing extent */
		if (start > key.offset) {
			u64 new_num;
			u64 old_num;
			keep = 1;
706
			WARN_ON(start & (root->sectorsize - 1));
C
Chris Mason 已提交
707
			if (found_extent) {
708 709 710 711 712 713 714 715
				new_num = start - key.offset;
				old_num = btrfs_file_extent_num_bytes(leaf,
								      extent);
				*hint_byte =
					btrfs_file_extent_disk_bytenr(leaf,
								      extent);
				if (btrfs_file_extent_disk_bytenr(leaf,
								  extent)) {
C
Chris Mason 已提交
716
					dec_i_blocks(inode, old_num - new_num);
C
Chris Mason 已提交
717
				}
718 719
				btrfs_set_file_extent_num_bytes(leaf, extent,
								new_num);
720
				btrfs_mark_buffer_dirty(leaf);
721 722 723
			} else if (key.offset < inline_limit &&
				   (end > extent_end) &&
				   (inline_limit < extent_end)) {
724 725
				u32 new_size;
				new_size = btrfs_file_extent_calc_inline_size(
726
						   inline_limit - key.offset);
C
Chris Mason 已提交
727 728
				dec_i_blocks(inode, (extent_end - key.offset) -
					(inline_limit - key.offset));
729
				btrfs_truncate_item(trans, root, path,
730
						    new_size, 1);
C
Chris Mason 已提交
731 732 733 734 735
			}
		}
		/* delete the entire extent */
		if (!keep) {
			ret = btrfs_del_item(trans, root, path);
736
			/* TODO update progress marker and return */
C
Chris Mason 已提交
737 738
			BUG_ON(ret);
			extent = NULL;
Z
Zheng Yan 已提交
739 740
			btrfs_release_path(root, path);
			/* the extent will be freed later */
C
Chris Mason 已提交
741
		}
742
		if (bookend && found_inline && start <= key.offset) {
743 744
			u32 new_size;
			new_size = btrfs_file_extent_calc_inline_size(
745
						   extent_end - end);
C
Chris Mason 已提交
746 747
			dec_i_blocks(inode, (extent_end - key.offset) -
					(extent_end - end));
Z
Zheng Yan 已提交
748 749 750
			ret = btrfs_truncate_item(trans, root, path,
						  new_size, 0);
			BUG_ON(ret);
751
		}
C
Chris Mason 已提交
752 753
		/* create bookend, splitting the extent in two */
		if (bookend && found_extent) {
Z
Zheng Yan 已提交
754
			u64 disk_bytenr;
C
Chris Mason 已提交
755 756 757 758 759 760 761
			struct btrfs_key ins;
			ins.objectid = inode->i_ino;
			ins.offset = end;
			btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
			btrfs_release_path(root, path);
			ret = btrfs_insert_empty_item(trans, root, path, &ins,
						      sizeof(*extent));
Z
Zheng Yan 已提交
762
			BUG_ON(ret);
763

764 765 766 767 768 769 770
			leaf = path->nodes[0];
			extent = btrfs_item_ptr(leaf, path->slots[0],
						struct btrfs_file_extent_item);
			write_extent_buffer(leaf, &old,
					    (unsigned long)extent, sizeof(old));

			btrfs_set_file_extent_offset(leaf, extent,
771 772 773 774 775
				    le64_to_cpu(old.offset) + end - key.offset);
			WARN_ON(le64_to_cpu(old.num_bytes) <
				(extent_end - end));
			btrfs_set_file_extent_num_bytes(leaf, extent,
							extent_end - end);
776
			btrfs_set_file_extent_type(leaf, extent,
C
Chris Mason 已提交
777
						   BTRFS_FILE_EXTENT_REG);
778

C
Chris Mason 已提交
779
			btrfs_mark_buffer_dirty(path->nodes[0]);
Z
Zheng Yan 已提交
780 781 782 783 784 785 786 787 788 789 790 791 792 793

			disk_bytenr = le64_to_cpu(old.disk_bytenr);
			if (disk_bytenr != 0) {
				ret = btrfs_inc_extent_ref(trans, root,
						disk_bytenr,
						le64_to_cpu(old.disk_num_bytes),
						leaf->start,
						root->root_key.objectid,
						trans->transid,
						ins.objectid, ins.offset);
				BUG_ON(ret);
			}
			btrfs_release_path(root, path);
			if (disk_bytenr != 0) {
C
Chris Mason 已提交
794
				inode->i_blocks +=
795 796
				      btrfs_file_extent_num_bytes(leaf,
								  extent) >> 9;
C
Chris Mason 已提交
797
			}
Z
Zheng Yan 已提交
798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816
		}

		if (found_extent && !keep) {
			u64 disk_bytenr = le64_to_cpu(old.disk_bytenr);

			if (disk_bytenr != 0) {
				dec_i_blocks(inode, le64_to_cpu(old.num_bytes));
				ret = btrfs_free_extent(trans, root,
						disk_bytenr,
						le64_to_cpu(old.disk_num_bytes),
						leaf_start, root_owner,
						root_gen, key.objectid,
						key.offset, 0);
				BUG_ON(ret);
				*hint_byte = disk_bytenr;
			}
		}

		if (search_start >= end) {
C
Chris Mason 已提交
817 818 819 820 821 822
			ret = 0;
			goto out;
		}
	}
out:
	btrfs_free_path(path);
C
Chris Mason 已提交
823
	btrfs_check_file(root, inode);
C
Chris Mason 已提交
824 825 826 827
	return ret;
}

/*
C
Chris Mason 已提交
828 829 830
 * this gets pages into the page cache and locks them down, it also properly
 * waits for data=ordered extents to finish before allowing the pages to be
 * modified.
C
Chris Mason 已提交
831
 */
832
static int noinline prepare_pages(struct btrfs_root *root, struct file *file,
833 834 835
			 struct page **pages, size_t num_pages,
			 loff_t pos, unsigned long first_index,
			 unsigned long last_index, size_t write_bytes)
C
Chris Mason 已提交
836 837 838
{
	int i;
	unsigned long index = pos >> PAGE_CACHE_SHIFT;
839
	struct inode *inode = fdentry(file)->d_inode;
C
Chris Mason 已提交
840
	int err = 0;
841
	u64 start_pos;
842
	u64 last_pos;
843

844
	start_pos = pos & ~((u64)root->sectorsize - 1);
845
	last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
C
Chris Mason 已提交
846 847

	memset(pages, 0, num_pages * sizeof(struct page *));
848
again:
C
Chris Mason 已提交
849 850 851 852
	for (i = 0; i < num_pages; i++) {
		pages[i] = grab_cache_page(inode->i_mapping, index + i);
		if (!pages[i]) {
			err = -ENOMEM;
853
			BUG_ON(1);
C
Chris Mason 已提交
854
		}
C
Chris Mason 已提交
855
		wait_on_page_writeback(pages[i]);
C
Chris Mason 已提交
856
	}
857
	if (start_pos < inode->i_size) {
858
		struct btrfs_ordered_extent *ordered;
859 860
		lock_extent(&BTRFS_I(inode)->io_tree,
			    start_pos, last_pos - 1, GFP_NOFS);
861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878
		ordered = btrfs_lookup_first_ordered_extent(inode, last_pos -1);
		if (ordered &&
		    ordered->file_offset + ordered->len > start_pos &&
		    ordered->file_offset < last_pos) {
			btrfs_put_ordered_extent(ordered);
			unlock_extent(&BTRFS_I(inode)->io_tree,
				      start_pos, last_pos - 1, GFP_NOFS);
			for (i = 0; i < num_pages; i++) {
				unlock_page(pages[i]);
				page_cache_release(pages[i]);
			}
			btrfs_wait_ordered_range(inode, start_pos,
						 last_pos - start_pos);
			goto again;
		}
		if (ordered)
			btrfs_put_ordered_extent(ordered);

879 880 881
		clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
				  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
				  GFP_NOFS);
882 883
		unlock_extent(&BTRFS_I(inode)->io_tree,
			      start_pos, last_pos - 1, GFP_NOFS);
884
	}
885
	for (i = 0; i < num_pages; i++) {
886
		clear_page_dirty_for_io(pages[i]);
887 888 889
		set_page_extent_mapped(pages[i]);
		WARN_ON(!PageLocked(pages[i]));
	}
C
Chris Mason 已提交
890 891 892 893 894 895 896
	return 0;
}

/*
 * write(2) entry point for regular btrfs files.
 *
 * Copies up to @count bytes from the user buffer @buf into the page cache
 * starting at *@ppos, in batches of at most one page worth of page pointers,
 * and marks the copied range dirty through dirty_and_release_pages().
 *
 * For O_SYNC / IS_SYNC / O_DIRECT writers the written range is pushed out
 * with btrfs_fdatawrite_range() as it is produced, and after the copy loop
 * the function waits for the ordered extents covering what was written.
 * O_SYNC / IS_SYNC writers additionally log the dentry (or commit the
 * transaction if logging asks for it); O_DIRECT writers have the written
 * pages invalidated from the mapping.
 *
 * *@ppos is updated to the position after the last byte written.
 *
 * Returns the number of bytes written, or a negative errno if nothing was
 * written and an error occurred.
 */
static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
				size_t count, loff_t *ppos)
{
	loff_t pos;
	loff_t start_pos;
	ssize_t num_written = 0;
	ssize_t err = 0;
	int ret = 0;
	struct inode *inode = fdentry(file)->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct page **pages = NULL;
	int nrptrs;
	struct page *pinned[2];
	unsigned long first_index;
	unsigned long last_index;
	int will_write;

	will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) ||
		      (file->f_flags & O_DIRECT));

	/* never track more page pointers per batch than fit in one page */
	nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
		     PAGE_CACHE_SIZE / (sizeof(struct page *)));
	pinned[0] = NULL;
	pinned[1] = NULL;

	pos = *ppos;
	start_pos = pos;

	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
	current->backing_dev_info = inode->i_mapping->backing_dev_info;
	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
	if (err)
		goto out_nolock;
	if (count == 0)
		goto out_nolock;

	/*
	 * generic_write_checks() may have moved pos (it receives &pos), so
	 * remember the real start of the write for the sync and invalidate
	 * work done after the copy loop.
	 */
	start_pos = pos;

	err = file_remove_suid(file);
	if (err)
		goto out_nolock;
	file_update_time(file);

	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
	if (!pages) {
		err = -ENOMEM;
		goto out_nolock;
	}

	mutex_lock(&inode->i_mutex);
	first_index = pos >> PAGE_CACHE_SHIFT;
	last_index = (pos + count) >> PAGE_CACHE_SHIFT;

	/*
	 * if this is a nodatasum mount, force summing off for the inode
	 * all the time.  That way a later mount with summing on won't
	 * get confused
	 */
	if (btrfs_test_opt(root, NODATASUM))
		btrfs_set_flag(inode, NODATASUM);

	/*
	 * there are lots of better ways to do this, but this code
	 * makes sure the first and last page in the file range are
	 * up to date and ready for cow
	 */
	if ((pos & (PAGE_CACHE_SIZE - 1))) {
		pinned[0] = grab_cache_page(inode->i_mapping, first_index);
		if (!pinned[0]) {
			err = -ENOMEM;
			goto out;
		}
		if (!PageUptodate(pinned[0])) {
			ret = btrfs_readpage(NULL, pinned[0]);
			BUG_ON(ret);
			wait_on_page_locked(pinned[0]);
		} else {
			unlock_page(pinned[0]);
		}
	}
	if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
		pinned[1] = grab_cache_page(inode->i_mapping, last_index);
		if (!pinned[1]) {
			err = -ENOMEM;
			goto out;
		}
		if (!PageUptodate(pinned[1])) {
			ret = btrfs_readpage(NULL, pinned[1]);
			BUG_ON(ret);
			wait_on_page_locked(pinned[1]);
		} else {
			unlock_page(pinned[1]);
		}
	}

	while (count > 0) {
		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
		size_t write_bytes = min(count, nrptrs *
					(size_t)PAGE_CACHE_SIZE -
					 offset);
		size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
					PAGE_CACHE_SHIFT;

		WARN_ON(num_pages > nrptrs);
		/* clear the whole pointer array, not just sizeof(pointer) */
		memset(pages, 0, nrptrs * sizeof(struct page *));

		ret = btrfs_check_free_space(root, write_bytes, 0);
		if (ret) {
			err = ret;
			goto out;
		}

		ret = prepare_pages(root, file, pages, num_pages,
				    pos, first_index, last_index,
				    write_bytes);
		if (ret) {
			err = ret;
			goto out;
		}

		ret = btrfs_copy_from_user(pos, num_pages,
					   write_bytes, pages, buf);
		if (ret) {
			err = ret;
			btrfs_drop_pages(pages, num_pages);
			goto out;
		}

		ret = dirty_and_release_pages(NULL, root, file, pages,
					      num_pages, pos, write_bytes);
		btrfs_drop_pages(pages, num_pages);
		if (ret) {
			err = ret;
			goto out;
		}

		if (will_write) {
			btrfs_fdatawrite_range(inode->i_mapping, pos,
					       pos + write_bytes - 1,
					       WB_SYNC_NONE);
		} else {
			balance_dirty_pages_ratelimited_nr(inode->i_mapping,
							   num_pages);
			if (num_pages <
			    (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
				btrfs_btree_balance_dirty(root, 1);
			btrfs_throttle(root);
		}

		buf += write_bytes;
		count -= write_bytes;
		pos += write_bytes;
		num_written += write_bytes;

		cond_resched();
	}
out:
	mutex_unlock(&inode->i_mutex);

out_nolock:
	kfree(pages);
	if (pinned[0])
		page_cache_release(pinned[0]);
	if (pinned[1])
		page_cache_release(pinned[1]);
	*ppos = pos;

	if (num_written > 0 && will_write) {
		struct btrfs_trans_handle *trans;

		err = btrfs_wait_ordered_range(inode, start_pos, num_written);
		if (err)
			num_written = err;

		if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
			trans = btrfs_start_transaction(root, 1);
			ret = btrfs_log_dentry_safe(trans, root,
						    file->f_dentry);
			if (ret == 0) {
				btrfs_sync_log(trans, root);
				btrfs_end_transaction(trans, root);
			} else {
				btrfs_commit_transaction(trans, root);
			}
		}
		if (file->f_flags & O_DIRECT) {
			invalidate_mapping_pages(inode->i_mapping,
			      start_pos >> PAGE_CACHE_SHIFT,
			     (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
		}
	}
	current->backing_dev_info = NULL;
	/* partial progress wins over a late error, as in the original */
	return num_written ? num_written : err;
}

S
Sage Weil 已提交
1067
int btrfs_release_file(struct inode * inode, struct file * filp)
1068
{
S
Sage Weil 已提交
1069 1070
	if (filp->private_data)
		btrfs_ioctl_trans_end(filp);
1071 1072 1073
	return 0;
}

C
Chris Mason 已提交
1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084
/*
 * fsync call for both files and directories.  This logs the inode into
 * the tree log instead of forcing full commits whenever possible.
 *
 * It needs to call filemap_fdatawait so that all ordered extent updates are
 * in the metadata btree are up to date for copying to the log.
 *
 * It drops the inode mutex before doing the tree log commit.  This is an
 * important optimization for directories because holding the mutex prevents
 * new operations on the dir while we write to disk.
 */
1085
int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
C
Chris Mason 已提交
1086 1087 1088
{
	struct inode *inode = dentry->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
1089
	int ret = 0;
C
Chris Mason 已提交
1090 1091 1092
	struct btrfs_trans_handle *trans;

	/*
1093 1094
	 * check the transaction that last modified this inode
	 * and see if its already been committed
C
Chris Mason 已提交
1095
	 */
1096 1097
	if (!BTRFS_I(inode)->last_trans)
		goto out;
1098

1099 1100 1101 1102 1103 1104 1105 1106 1107
	mutex_lock(&root->fs_info->trans_mutex);
	if (BTRFS_I(inode)->last_trans <=
	    root->fs_info->last_trans_committed) {
		BTRFS_I(inode)->last_trans = 0;
		mutex_unlock(&root->fs_info->trans_mutex);
		goto out;
	}
	mutex_unlock(&root->fs_info->trans_mutex);

C
Chris Mason 已提交
1108
	root->fs_info->tree_log_batch++;
1109
	filemap_fdatawait(inode->i_mapping);
C
Chris Mason 已提交
1110
	root->fs_info->tree_log_batch++;
1111

1112
	/*
1113 1114
	 * ok we haven't committed the transaction yet, lets do a commit
	 */
S
Sage Weil 已提交
1115 1116 1117
	if (file->private_data)
		btrfs_ioctl_trans_end(file);

C
Chris Mason 已提交
1118 1119 1120 1121 1122
	trans = btrfs_start_transaction(root, 1);
	if (!trans) {
		ret = -ENOMEM;
		goto out;
	}
1123 1124

	ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
C
Chris Mason 已提交
1125
	if (ret < 0) {
1126
		goto out;
C
Chris Mason 已提交
1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140
	}

	/* we've logged all the items and now have a consistent
	 * version of the file in the log.  It is possible that
	 * someone will come in and modify the file, but that's
	 * fine because the log is consistent on disk, and we
	 * have references to all of the file's extents
	 *
	 * It is possible that someone will come in and log the
	 * file again, but that will end up using the synchronization
	 * inside btrfs_sync_log to keep things safe.
	 */
	mutex_unlock(&file->f_dentry->d_inode->i_mutex);

1141 1142 1143 1144 1145 1146
	if (ret > 0) {
		ret = btrfs_commit_transaction(trans, root);
	} else {
		btrfs_sync_log(trans, root);
		ret = btrfs_end_transaction(trans, root);
	}
C
Chris Mason 已提交
1147
	mutex_lock(&file->f_dentry->d_inode->i_mutex);
C
Chris Mason 已提交
1148 1149 1150 1151
out:
	return ret > 0 ? EIO : ret;
}

C
Chris Mason 已提交
1152
static struct vm_operations_struct btrfs_file_vm_ops = {
1153
	.fault		= filemap_fault,
C
Chris Mason 已提交
1154 1155 1156 1157 1158 1159 1160 1161 1162 1163
	.page_mkwrite	= btrfs_page_mkwrite,
};

/*
 * ->mmap handler: records the access on @filp and points @vma at the btrfs
 * VM operations table.  Always returns 0.
 */
static int btrfs_file_mmap(struct file	*filp, struct vm_area_struct *vma)
{
	file_accessed(filp);
	vma->vm_ops = &btrfs_file_vm_ops;

	return 0;
}

C
Chris Mason 已提交
1164 1165 1166
struct file_operations btrfs_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
C
Chris Mason 已提交
1167
	.aio_read       = generic_file_aio_read,
C
Chris Mason 已提交
1168
	.splice_read	= generic_file_splice_read,
C
Chris Mason 已提交
1169
	.write		= btrfs_file_write,
C
Chris Mason 已提交
1170
	.mmap		= btrfs_file_mmap,
C
Chris Mason 已提交
1171
	.open		= generic_file_open,
1172
	.release	= btrfs_release_file,
C
Chris Mason 已提交
1173
	.fsync		= btrfs_sync_file,
C
Christoph Hellwig 已提交
1174
	.unlocked_ioctl	= btrfs_ioctl,
C
Chris Mason 已提交
1175
#ifdef CONFIG_COMPAT
C
Christoph Hellwig 已提交
1176
	.compat_ioctl	= btrfs_ioctl,
C
Chris Mason 已提交
1177 1178
#endif
};