file.c 27.3 KB
Newer Older
C
Chris Mason 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

C
Chris Mason 已提交
19 20 21 22 23 24 25 26 27 28 29 30 31
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/smp_lock.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
32
#include <linux/version.h>
C
Chris Mason 已提交
33 34 35 36 37 38
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "ioctl.h"
#include "print-tree.h"
39 40
#include "tree-log.h"
#include "locking.h"
41
#include "compat.h"
C
Chris Mason 已提交
42 43


C
Chris Mason 已提交
44 45 46
/* simple helper to fault in pages and copy.  This should go away
 * and be replaced with calls into generic code.
 */
47 48 49 50
static int noinline btrfs_copy_from_user(loff_t pos, int num_pages,
					 int write_bytes,
					 struct page **prepared_pages,
					 const char __user * buf)
C
Chris Mason 已提交
51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
{
	long page_fault = 0;
	int i;
	int offset = pos & (PAGE_CACHE_SIZE - 1);

	for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
		size_t count = min_t(size_t,
				     PAGE_CACHE_SIZE - offset, write_bytes);
		struct page *page = prepared_pages[i];
		fault_in_pages_readable(buf, count);

		/* Copy data from userspace to the current page */
		kmap(page);
		page_fault = __copy_from_user(page_address(page) + offset,
					      buf, count);
		/* Flush processor's dcache for this page */
		flush_dcache_page(page);
		kunmap(page);
		buf += count;
		write_bytes -= count;

		if (page_fault)
			break;
	}
	return page_fault ? -EFAULT : 0;
}

C
Chris Mason 已提交
78 79 80
/*
 * unlocks pages after btrfs_file_write is done with them
 */
81
static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
C
Chris Mason 已提交
82 83 84 85 86
{
	size_t i;
	for (i = 0; i < num_pages; i++) {
		if (!pages[i])
			break;
C
Chris Mason 已提交
87 88 89 90
		/* page checked is some magic around finding pages that
		 * have been modified without going through btrfs_set_page_dirty
		 * clear it here
		 */
C
Chris Mason 已提交
91
		ClearPageChecked(pages[i]);
C
Chris Mason 已提交
92 93 94 95 96 97
		unlock_page(pages[i]);
		mark_page_accessed(pages[i]);
		page_cache_release(pages[i]);
	}
}

C
Chris Mason 已提交
98 99 100 101 102 103 104 105
/*
 * after copy_from_user, pages need to be dirtied and we need to make
 * sure holes are created between the current EOF and the start of
 * any next extents (if required).
 *
 * this also makes the decision about creating an inline extent vs
 * doing real data extents, marking pages dirty and delalloc as required.
 */
106
static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
C
Chris Mason 已提交
107 108 109 110 111 112 113 114
				   struct btrfs_root *root,
				   struct file *file,
				   struct page **pages,
				   size_t num_pages,
				   loff_t pos,
				   size_t write_bytes)
{
	int err = 0;
115
	int i;
116
	struct inode *inode = fdentry(file)->d_inode;
117
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
118 119
	u64 hint_byte;
	u64 num_bytes;
120 121 122 123
	u64 start_pos;
	u64 end_of_last_block;
	u64 end_pos = pos + write_bytes;
	loff_t isize = i_size_read(inode);
C
Chris Mason 已提交
124

125
	start_pos = pos & ~((u64)root->sectorsize - 1);
126 127
	num_bytes = (write_bytes + pos - start_pos +
		    root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
C
Chris Mason 已提交
128

129 130
	end_of_last_block = start_pos + num_bytes - 1;

131
	lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
C
Chris Mason 已提交
132
	trans = btrfs_join_transaction(root, 1);
133 134 135 136 137
	if (!trans) {
		err = -ENOMEM;
		goto out_unlock;
	}
	btrfs_set_trans_block_group(trans, inode);
138
	hint_byte = 0;
139 140

	if ((end_of_last_block & 4095) == 0) {
141
		printk("strange end of last %Lu %zu %Lu\n", start_pos, write_bytes, end_of_last_block);
142
	}
143
	set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
144 145 146

	/* FIXME...EIEIO, ENOSPC and more */
	/* insert any holes we need to create */
147
	if (isize < start_pos) {
148 149
		u64 last_pos_in_file;
		u64 hole_size;
150
		u64 mask = root->sectorsize - 1;
151
		last_pos_in_file = (isize + mask) & ~mask;
152
		hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
153 154 155
		if (hole_size > 0) {
			btrfs_wait_ordered_range(inode, last_pos_in_file,
						 last_pos_in_file + hole_size);
156
			mutex_lock(&BTRFS_I(inode)->extent_mutex);
C
Chris Mason 已提交
157 158 159
			err = btrfs_drop_extents(trans, root, inode,
						 last_pos_in_file,
						 last_pos_in_file + hole_size,
160
						 last_pos_in_file,
161
						 &hint_byte);
C
Chris Mason 已提交
162 163 164
			if (err)
				goto failed;

165 166 167
			err = btrfs_insert_file_extent(trans, root,
						       inode->i_ino,
						       last_pos_in_file,
C
Chris Mason 已提交
168 169
						       0, 0, hole_size, 0,
						       hole_size, 0, 0, 0);
170
			btrfs_drop_extent_cache(inode, last_pos_in_file,
171
					last_pos_in_file + hole_size - 1, 0);
172
			mutex_unlock(&BTRFS_I(inode)->extent_mutex);
173
			btrfs_check_file(root, inode);
174 175
		}
		if (err)
C
Chris Mason 已提交
176
			goto failed;
177 178
	}

C
Chris Mason 已提交
179 180 181
	/* check for reserved extents on each page, we don't want
	 * to reset the delalloc bit on things that already have
	 * extents reserved.
182
	 */
C
Chris Mason 已提交
183 184 185 186 187 188
	btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
	for (i = 0; i < num_pages; i++) {
		struct page *p = pages[i];
		SetPageUptodate(p);
		ClearPageChecked(p);
		set_page_dirty(p);
189 190 191 192
	}
	if (end_pos > isize) {
		i_size_write(inode, end_pos);
		btrfs_update_inode(trans, root, inode);
C
Chris Mason 已提交
193 194
	}
failed:
C
Chris Mason 已提交
195
	err = btrfs_end_transaction(trans, root);
196
out_unlock:
197
	unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
C
Chris Mason 已提交
198 199 200
	return err;
}

C
Chris Mason 已提交
201 202 203 204
/*
 * this drops all the extents in the cache that intersect the range
 * [start, end].  Existing extents are split as required.
 */
205 206
int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
			    int skip_pinned)
207 208
{
	struct extent_map *em;
209 210
	struct extent_map *split = NULL;
	struct extent_map *split2 = NULL;
211
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
212
	u64 len = end - start + 1;
213 214
	int ret;
	int testend = 1;
215
	unsigned long flags;
C
Chris Mason 已提交
216
	int compressed = 0;
217

218
	WARN_ON(end < start);
219
	if (end == (u64)-1) {
220
		len = (u64)-1;
221 222
		testend = 0;
	}
223
	while(1) {
224 225 226 227 228
		if (!split)
			split = alloc_extent_map(GFP_NOFS);
		if (!split2)
			split2 = alloc_extent_map(GFP_NOFS);

229
		spin_lock(&em_tree->lock);
230
		em = lookup_extent_mapping(em_tree, start, len);
231 232
		if (!em) {
			spin_unlock(&em_tree->lock);
233
			break;
234
		}
235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251
		flags = em->flags;
		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
			spin_unlock(&em_tree->lock);
			if (em->start <= start &&
			    (!testend || em->start + em->len >= start + len)) {
				free_extent_map(em);
				break;
			}
			if (start < em->start) {
				len = em->start - start;
			} else {
				len = start + len - (em->start + em->len);
				start = em->start + em->len;
			}
			free_extent_map(em);
			continue;
		}
C
Chris Mason 已提交
252
		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
253
		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
254
		remove_extent_mapping(em_tree, em);
255 256 257 258 259 260

		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
		    em->start < start) {
			split->start = em->start;
			split->len = start - em->start;
			split->block_start = em->block_start;
C
Chris Mason 已提交
261 262 263 264 265 266

			if (compressed)
				split->block_len = em->block_len;
			else
				split->block_len = split->len;

267
			split->bdev = em->bdev;
268
			split->flags = flags;
269 270 271 272 273 274 275 276 277 278 279 280 281
			ret = add_extent_mapping(em_tree, split);
			BUG_ON(ret);
			free_extent_map(split);
			split = split2;
			split2 = NULL;
		}
		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
		    testend && em->start + em->len > start + len) {
			u64 diff = start + len - em->start;

			split->start = start + len;
			split->len = em->start + em->len - (start + len);
			split->bdev = em->bdev;
282
			split->flags = flags;
283

C
Chris Mason 已提交
284 285 286 287 288 289 290
			if (compressed) {
				split->block_len = em->block_len;
				split->block_start = em->block_start;
			} else {
				split->block_len = split->len;
				split->block_start = em->block_start + diff;
			}
291 292 293 294 295 296

			ret = add_extent_mapping(em_tree, split);
			BUG_ON(ret);
			free_extent_map(split);
			split = NULL;
		}
297 298
		spin_unlock(&em_tree->lock);

299 300 301 302 303
		/* once for us */
		free_extent_map(em);
		/* once for the tree*/
		free_extent_map(em);
	}
304 305 306 307
	if (split)
		free_extent_map(split);
	if (split2)
		free_extent_map(split2);
308 309 310
	return 0;
}

311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345
/*
 * Consistency-check hook for the file extents of an inode.
 *
 * The check itself is disabled: this always returns 0 and reads neither
 * root nor inode, so callers can keep invoking it unconditionally.
 */
int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
{
	return 0;
}

C
Chris Mason 已提交
385 386 387 388 389 390 391 392
/*
 * this is very complex, but the basic idea is to drop all extents
 * in the range start - end.  hint_block is filled in with a block number
 * that would be a good hint to the block allocator for this file.
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split.  Anything entirely inside the range
 * is deleted from the tree.
C
Chris Mason 已提交
393 394 395
 *
 * inline_limit is used to tell this code which offsets in the file to keep
 * if they contain inline extents.
C
Chris Mason 已提交
396
 */
397
int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
C
Chris Mason 已提交
398
		       struct btrfs_root *root, struct inode *inode,
399
		       u64 start, u64 end, u64 inline_limit, u64 *hint_byte)
C
Chris Mason 已提交
400
{
401 402
	u64 extent_end = 0;
	u64 search_start = start;
Z
Zheng Yan 已提交
403
	u64 leaf_start;
C
Chris Mason 已提交
404 405 406 407
	u64 ram_bytes = 0;
	u8 compression = 0;
	u8 encryption = 0;
	u16 other_encoding = 0;
Z
Zheng Yan 已提交
408 409
	u64 root_gen;
	u64 root_owner;
410
	struct extent_buffer *leaf;
C
Chris Mason 已提交
411 412
	struct btrfs_file_extent_item *extent;
	struct btrfs_path *path;
413 414 415 416
	struct btrfs_key key;
	struct btrfs_file_extent_item old;
	int keep;
	int slot;
C
Chris Mason 已提交
417 418 419 420
	int bookend;
	int found_type;
	int found_extent;
	int found_inline;
C
Chris Mason 已提交
421
	int recow;
422
	int ret;
C
Chris Mason 已提交
423

C
Chris Mason 已提交
424
	inline_limit = 0;
425
	btrfs_drop_extent_cache(inode, start, end - 1, 0);
426

C
Chris Mason 已提交
427 428 429 430
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	while(1) {
C
Chris Mason 已提交
431
		recow = 0;
C
Chris Mason 已提交
432 433 434 435 436 437 438 439 440 441 442 443
		btrfs_release_path(root, path);
		ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
					       search_start, -1);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			if (path->slots[0] == 0) {
				ret = 0;
				goto out;
			}
			path->slots[0]--;
		}
444
next_slot:
C
Chris Mason 已提交
445 446 447 448
		keep = 0;
		bookend = 0;
		found_extent = 0;
		found_inline = 0;
Z
Zheng Yan 已提交
449 450 451
		leaf_start = 0;
		root_gen = 0;
		root_owner = 0;
C
Chris Mason 已提交
452
		extent = NULL;
453
		leaf = path->nodes[0];
C
Chris Mason 已提交
454
		slot = path->slots[0];
455
		ret = 0;
456
		btrfs_item_key_to_cpu(leaf, &key, slot);
457 458
		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY &&
		    key.offset >= end) {
C
Chris Mason 已提交
459 460
			goto out;
		}
461 462
		if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
		    key.objectid != inode->i_ino) {
C
Chris Mason 已提交
463 464
			goto out;
		}
C
Chris Mason 已提交
465 466 467 468
		if (recow) {
			search_start = key.offset;
			continue;
		}
469 470 471
		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
			extent = btrfs_item_ptr(leaf, slot,
						struct btrfs_file_extent_item);
472
			found_type = btrfs_file_extent_type(leaf, extent);
C
Chris Mason 已提交
473 474 475 476 477 478
			compression = btrfs_file_extent_compression(leaf,
								    extent);
			encryption = btrfs_file_extent_encryption(leaf,
								  extent);
			other_encoding = btrfs_file_extent_other_encoding(leaf,
								  extent);
479
			if (found_type == BTRFS_FILE_EXTENT_REG) {
480 481 482 483 484 485
				extent_end =
				     btrfs_file_extent_disk_bytenr(leaf,
								   extent);
				if (extent_end)
					*hint_byte = extent_end;

486
				extent_end = key.offset +
487
				     btrfs_file_extent_num_bytes(leaf, extent);
C
Chris Mason 已提交
488 489
				ram_bytes = btrfs_file_extent_ram_bytes(leaf,
								extent);
490 491 492 493
				found_extent = 1;
			} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
				found_inline = 1;
				extent_end = key.offset +
C
Chris Mason 已提交
494
				     btrfs_file_extent_inline_len(leaf, extent);
495 496 497
			}
		} else {
			extent_end = search_start;
C
Chris Mason 已提交
498 499 500
		}

		/* we found nothing we can drop */
501 502 503 504
		if ((!found_extent && !found_inline) ||
		    search_start >= extent_end) {
			int nextret;
			u32 nritems;
505
			nritems = btrfs_header_nritems(leaf);
506 507 508 509
			if (slot >= nritems - 1) {
				nextret = btrfs_next_leaf(root, path);
				if (nextret)
					goto out;
C
Chris Mason 已提交
510
				recow = 1;
511 512 513 514
			} else {
				path->slots[0]++;
			}
			goto next_slot;
C
Chris Mason 已提交
515 516 517
		}

		if (found_inline) {
518
			u64 mask = root->sectorsize - 1;
C
Chris Mason 已提交
519 520 521
			search_start = (extent_end + mask) & ~mask;
		} else
			search_start = extent_end;
C
Chris Mason 已提交
522 523

		if (end <= extent_end && start >= key.offset && found_inline)
524
			*hint_byte = EXTENT_MAP_INLINE;
Z
Zheng Yan 已提交
525 526 527 528 529 530 531

		if (found_extent) {
			read_extent_buffer(leaf, &old, (unsigned long)extent,
					   sizeof(old));
			root_gen = btrfs_header_generation(leaf);
			root_owner = btrfs_header_owner(leaf);
			leaf_start = leaf->start;
532
		}
Z
Zheng Yan 已提交
533

C
Chris Mason 已提交
534
		if (end < extent_end && end >= key.offset) {
535
			bookend = 1;
536
			if (found_inline && start <= key.offset)
537
				keep = 1;
C
Chris Mason 已提交
538 539 540 541 542 543
		}
		/* truncate existing extent */
		if (start > key.offset) {
			u64 new_num;
			u64 old_num;
			keep = 1;
544
			WARN_ON(start & (root->sectorsize - 1));
C
Chris Mason 已提交
545
			if (found_extent) {
546 547 548 549 550 551 552 553
				new_num = start - key.offset;
				old_num = btrfs_file_extent_num_bytes(leaf,
								      extent);
				*hint_byte =
					btrfs_file_extent_disk_bytenr(leaf,
								      extent);
				if (btrfs_file_extent_disk_bytenr(leaf,
								  extent)) {
554 555
					inode_sub_bytes(inode, old_num -
							new_num);
C
Chris Mason 已提交
556
				}
557 558
				btrfs_set_file_extent_num_bytes(leaf, extent,
								new_num);
559
				btrfs_mark_buffer_dirty(leaf);
560 561 562
			} else if (key.offset < inline_limit &&
				   (end > extent_end) &&
				   (inline_limit < extent_end)) {
563 564
				u32 new_size;
				new_size = btrfs_file_extent_calc_inline_size(
565
						   inline_limit - key.offset);
566 567
				inode_sub_bytes(inode, extent_end -
						inline_limit);
568
				btrfs_truncate_item(trans, root, path,
569
						    new_size, 1);
C
Chris Mason 已提交
570 571 572 573
			}
		}
		/* delete the entire extent */
		if (!keep) {
574 575 576
			if (found_inline)
				inode_sub_bytes(inode, extent_end -
						key.offset);
C
Chris Mason 已提交
577
			ret = btrfs_del_item(trans, root, path);
578
			/* TODO update progress marker and return */
C
Chris Mason 已提交
579 580
			BUG_ON(ret);
			extent = NULL;
Z
Zheng Yan 已提交
581 582
			btrfs_release_path(root, path);
			/* the extent will be freed later */
C
Chris Mason 已提交
583
		}
584
		if (bookend && found_inline && start <= key.offset) {
585 586
			u32 new_size;
			new_size = btrfs_file_extent_calc_inline_size(
587
						   extent_end - end);
588
			inode_sub_bytes(inode, end - key.offset);
Z
Zheng Yan 已提交
589 590 591
			ret = btrfs_truncate_item(trans, root, path,
						  new_size, 0);
			BUG_ON(ret);
592
		}
C
Chris Mason 已提交
593 594
		/* create bookend, splitting the extent in two */
		if (bookend && found_extent) {
Z
Zheng Yan 已提交
595
			u64 disk_bytenr;
C
Chris Mason 已提交
596 597 598 599 600 601 602
			struct btrfs_key ins;
			ins.objectid = inode->i_ino;
			ins.offset = end;
			btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
			btrfs_release_path(root, path);
			ret = btrfs_insert_empty_item(trans, root, path, &ins,
						      sizeof(*extent));
Z
Zheng Yan 已提交
603
			BUG_ON(ret);
604

605 606 607 608 609 610
			leaf = path->nodes[0];
			extent = btrfs_item_ptr(leaf, path->slots[0],
						struct btrfs_file_extent_item);
			write_extent_buffer(leaf, &old,
					    (unsigned long)extent, sizeof(old));

C
Chris Mason 已提交
611 612 613 614 615 616
			btrfs_set_file_extent_compression(leaf, extent,
							  compression);
			btrfs_set_file_extent_encryption(leaf, extent,
							 encryption);
			btrfs_set_file_extent_other_encoding(leaf, extent,
							     other_encoding);
617
			btrfs_set_file_extent_offset(leaf, extent,
618 619 620 621 622
				    le64_to_cpu(old.offset) + end - key.offset);
			WARN_ON(le64_to_cpu(old.num_bytes) <
				(extent_end - end));
			btrfs_set_file_extent_num_bytes(leaf, extent,
							extent_end - end);
C
Chris Mason 已提交
623 624 625 626 627 628 629 630 631

			/*
			 * set the ram bytes to the size of the full extent
			 * before splitting.  This is a worst case flag,
			 * but its the best we can do because we don't know
			 * how splitting affects compression
			 */
			btrfs_set_file_extent_ram_bytes(leaf, extent,
							ram_bytes);
632
			btrfs_set_file_extent_type(leaf, extent,
C
Chris Mason 已提交
633
						   BTRFS_FILE_EXTENT_REG);
634

C
Chris Mason 已提交
635
			btrfs_mark_buffer_dirty(path->nodes[0]);
Z
Zheng Yan 已提交
636 637 638 639 640 641 642 643

			disk_bytenr = le64_to_cpu(old.disk_bytenr);
			if (disk_bytenr != 0) {
				ret = btrfs_inc_extent_ref(trans, root,
						disk_bytenr,
						le64_to_cpu(old.disk_num_bytes),
						leaf->start,
						root->root_key.objectid,
644
						trans->transid, ins.objectid);
Z
Zheng Yan 已提交
645 646 647 648
				BUG_ON(ret);
			}
			btrfs_release_path(root, path);
			if (disk_bytenr != 0) {
649
				inode_add_bytes(inode, extent_end - end);
C
Chris Mason 已提交
650
			}
Z
Zheng Yan 已提交
651 652 653 654 655 656
		}

		if (found_extent && !keep) {
			u64 disk_bytenr = le64_to_cpu(old.disk_bytenr);

			if (disk_bytenr != 0) {
657 658
				inode_sub_bytes(inode,
						le64_to_cpu(old.num_bytes));
Z
Zheng Yan 已提交
659 660 661 662
				ret = btrfs_free_extent(trans, root,
						disk_bytenr,
						le64_to_cpu(old.disk_num_bytes),
						leaf_start, root_owner,
663
						root_gen, key.objectid, 0);
Z
Zheng Yan 已提交
664 665 666 667 668 669
				BUG_ON(ret);
				*hint_byte = disk_bytenr;
			}
		}

		if (search_start >= end) {
C
Chris Mason 已提交
670 671 672 673 674 675
			ret = 0;
			goto out;
		}
	}
out:
	btrfs_free_path(path);
C
Chris Mason 已提交
676
	btrfs_check_file(root, inode);
C
Chris Mason 已提交
677 678 679 680
	return ret;
}

/*
C
Chris Mason 已提交
681 682 683
 * this gets pages into the page cache and locks them down, it also properly
 * waits for data=ordered extents to finish before allowing the pages to be
 * modified.
C
Chris Mason 已提交
684
 */
685
static int noinline prepare_pages(struct btrfs_root *root, struct file *file,
686 687 688
			 struct page **pages, size_t num_pages,
			 loff_t pos, unsigned long first_index,
			 unsigned long last_index, size_t write_bytes)
C
Chris Mason 已提交
689 690 691
{
	int i;
	unsigned long index = pos >> PAGE_CACHE_SHIFT;
692
	struct inode *inode = fdentry(file)->d_inode;
C
Chris Mason 已提交
693
	int err = 0;
694
	u64 start_pos;
695
	u64 last_pos;
696

697
	start_pos = pos & ~((u64)root->sectorsize - 1);
698
	last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
C
Chris Mason 已提交
699 700

	memset(pages, 0, num_pages * sizeof(struct page *));
701
again:
C
Chris Mason 已提交
702 703 704 705
	for (i = 0; i < num_pages; i++) {
		pages[i] = grab_cache_page(inode->i_mapping, index + i);
		if (!pages[i]) {
			err = -ENOMEM;
706
			BUG_ON(1);
C
Chris Mason 已提交
707
		}
C
Chris Mason 已提交
708
		wait_on_page_writeback(pages[i]);
C
Chris Mason 已提交
709
	}
710
	if (start_pos < inode->i_size) {
711
		struct btrfs_ordered_extent *ordered;
712 713
		lock_extent(&BTRFS_I(inode)->io_tree,
			    start_pos, last_pos - 1, GFP_NOFS);
714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731
		ordered = btrfs_lookup_first_ordered_extent(inode, last_pos -1);
		if (ordered &&
		    ordered->file_offset + ordered->len > start_pos &&
		    ordered->file_offset < last_pos) {
			btrfs_put_ordered_extent(ordered);
			unlock_extent(&BTRFS_I(inode)->io_tree,
				      start_pos, last_pos - 1, GFP_NOFS);
			for (i = 0; i < num_pages; i++) {
				unlock_page(pages[i]);
				page_cache_release(pages[i]);
			}
			btrfs_wait_ordered_range(inode, start_pos,
						 last_pos - start_pos);
			goto again;
		}
		if (ordered)
			btrfs_put_ordered_extent(ordered);

732 733 734
		clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
				  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
				  GFP_NOFS);
735 736
		unlock_extent(&BTRFS_I(inode)->io_tree,
			      start_pos, last_pos - 1, GFP_NOFS);
737
	}
738
	for (i = 0; i < num_pages; i++) {
739
		clear_page_dirty_for_io(pages[i]);
740 741 742
		set_page_extent_mapped(pages[i]);
		WARN_ON(!PageLocked(pages[i]));
	}
C
Chris Mason 已提交
743 744 745 746 747 748 749
	return 0;
}

/*
 * write(2) implementation for btrfs regular files.
 *
 * Copies count bytes from the user buffer buf into the page cache at
 * *ppos in batches of at most nrptrs pages, marking them dirty/delalloc
 * via dirty_and_release_pages().  For O_SYNC, IS_SYNC inodes and O_DIRECT
 * the written range is pushed out and waited on before returning.
 *
 * *ppos is advanced past the bytes written.  Returns the number of bytes
 * written if any were written, otherwise 0 or a negative error.
 */
static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
				size_t count, loff_t *ppos)
{
	loff_t pos;
	loff_t start_pos;
	ssize_t num_written = 0;
	ssize_t err = 0;
	int ret = 0;
	struct inode *inode = fdentry(file)->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct page **pages = NULL;
	int nrptrs;
	struct page *pinned[2];
	unsigned long first_index;
	unsigned long last_index;
	int will_write;

	will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) ||
		      (file->f_flags & O_DIRECT));

	/* batch size: enough pointers for count, capped at one page of them */
	nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
		     PAGE_CACHE_SIZE / (sizeof(struct page *)));
	pinned[0] = NULL;
	pinned[1] = NULL;

	pos = *ppos;
	start_pos = pos;

	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
	current->backing_dev_info = inode->i_mapping->backing_dev_info;
	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
	if (err)
		goto out_nolock;
	if (count == 0)
		goto out_nolock;

	/*
	 * generic_write_checks() takes &pos and may have adjusted it, so
	 * remember where the data really starts for the sync paths below
	 */
	start_pos = pos;

	err = file_remove_suid(file);
	if (err)
		goto out_nolock;
	file_update_time(file);

	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
	if (!pages) {
		err = -ENOMEM;
		goto out_nolock;
	}

	mutex_lock(&inode->i_mutex);
	first_index = pos >> PAGE_CACHE_SHIFT;
	last_index = (pos + count) >> PAGE_CACHE_SHIFT;

	/*
	 * if this is a nodatasum mount, force summing off for the inode
	 * all the time.  That way a later mount with summing on won't
	 * get confused
	 */
	if (btrfs_test_opt(root, NODATASUM))
		btrfs_set_flag(inode, NODATASUM);

	/*
	 * there are lots of better ways to do this, but this code
	 * makes sure the first and last page in the file range are
	 * up to date and ready for cow
	 */
	if ((pos & (PAGE_CACHE_SIZE - 1))) {
		pinned[0] = grab_cache_page(inode->i_mapping, first_index);
		if (!pinned[0]) {
			ret = -ENOMEM;
			goto out;
		}
		if (!PageUptodate(pinned[0])) {
			ret = btrfs_readpage(NULL, pinned[0]);
			BUG_ON(ret);
			wait_on_page_locked(pinned[0]);
		} else {
			unlock_page(pinned[0]);
		}
	}
	if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
		pinned[1] = grab_cache_page(inode->i_mapping, last_index);
		if (!pinned[1]) {
			ret = -ENOMEM;
			goto out;
		}
		if (!PageUptodate(pinned[1])) {
			ret = btrfs_readpage(NULL, pinned[1]);
			BUG_ON(ret);
			wait_on_page_locked(pinned[1]);
		} else {
			unlock_page(pinned[1]);
		}
	}

	while(count > 0) {
		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
		size_t write_bytes = min(count, nrptrs *
					(size_t)PAGE_CACHE_SIZE -
					 offset);
		size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
					PAGE_CACHE_SHIFT;

		WARN_ON(num_pages > nrptrs);
		/* clear the whole array, not just sizeof(pointer) bytes */
		memset(pages, 0, nrptrs * sizeof(struct page *));

		ret = btrfs_check_free_space(root, write_bytes, 0);
		if (ret)
			goto out;

		ret = prepare_pages(root, file, pages, num_pages,
				    pos, first_index, last_index,
				    write_bytes);
		if (ret)
			goto out;

		ret = btrfs_copy_from_user(pos, num_pages,
					   write_bytes, pages, buf);
		if (ret) {
			btrfs_drop_pages(pages, num_pages);
			goto out;
		}

		ret = dirty_and_release_pages(NULL, root, file, pages,
					      num_pages, pos, write_bytes);
		btrfs_drop_pages(pages, num_pages);
		if (ret)
			goto out;

		if (will_write) {
			btrfs_fdatawrite_range(inode->i_mapping, pos,
					       pos + write_bytes - 1,
					       WB_SYNC_NONE);
		} else {
			balance_dirty_pages_ratelimited_nr(inode->i_mapping,
							   num_pages);
			if (num_pages <
			    (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
				btrfs_btree_balance_dirty(root, 1);
			btrfs_throttle(root);
		}

		buf += write_bytes;
		count -= write_bytes;
		pos += write_bytes;
		num_written += write_bytes;

		cond_resched();
	}
out:
	mutex_unlock(&inode->i_mutex);
	/* errors from the write loop live in ret; make them reach the return */
	if (ret)
		err = ret;

out_nolock:
	kfree(pages);
	if (pinned[0])
		page_cache_release(pinned[0]);
	if (pinned[1])
		page_cache_release(pinned[1]);
	*ppos = pos;

	if (num_written > 0 && will_write) {
		struct btrfs_trans_handle *trans;

		err = btrfs_wait_ordered_range(inode, start_pos, num_written);
		if (err)
			num_written = err;

		if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
			trans = btrfs_start_transaction(root, 1);
			ret = btrfs_log_dentry_safe(trans, root,
						    file->f_dentry);
			if (ret == 0) {
				btrfs_sync_log(trans, root);
				btrfs_end_transaction(trans, root);
			} else {
				btrfs_commit_transaction(trans, root);
			}
		}
		if (file->f_flags & O_DIRECT) {
			invalidate_mapping_pages(inode->i_mapping,
			      start_pos >> PAGE_CACHE_SHIFT,
			     (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
		}
	}
	current->backing_dev_info = NULL;
	return num_written ? num_written : err;
}

S
Sage Weil 已提交
920
int btrfs_release_file(struct inode * inode, struct file * filp)
921
{
S
Sage Weil 已提交
922 923
	if (filp->private_data)
		btrfs_ioctl_trans_end(filp);
924 925 926
	return 0;
}

C
Chris Mason 已提交
927 928 929 930 931 932 933 934 935 936 937
/*
 * fsync call for both files and directories.  This logs the inode into
 * the tree log instead of forcing full commits whenever possible.
 *
 * It needs to call filemap_fdatawait so that all ordered extent updates are
 * in the metadata btree are up to date for copying to the log.
 *
 * It drops the inode mutex before doing the tree log commit.  This is an
 * important optimization for directories because holding the mutex prevents
 * new operations on the dir while we write to disk.
 */
938
int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
C
Chris Mason 已提交
939 940 941
{
	struct inode *inode = dentry->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
942
	int ret = 0;
C
Chris Mason 已提交
943 944 945
	struct btrfs_trans_handle *trans;

	/*
946 947
	 * check the transaction that last modified this inode
	 * and see if its already been committed
C
Chris Mason 已提交
948
	 */
949 950
	if (!BTRFS_I(inode)->last_trans)
		goto out;
951

952 953 954 955 956 957 958 959 960
	mutex_lock(&root->fs_info->trans_mutex);
	if (BTRFS_I(inode)->last_trans <=
	    root->fs_info->last_trans_committed) {
		BTRFS_I(inode)->last_trans = 0;
		mutex_unlock(&root->fs_info->trans_mutex);
		goto out;
	}
	mutex_unlock(&root->fs_info->trans_mutex);

C
Chris Mason 已提交
961
	root->fs_info->tree_log_batch++;
962
	filemap_fdatawait(inode->i_mapping);
C
Chris Mason 已提交
963
	root->fs_info->tree_log_batch++;
964

965
	/*
966 967
	 * ok we haven't committed the transaction yet, lets do a commit
	 */
S
Sage Weil 已提交
968 969 970
	if (file->private_data)
		btrfs_ioctl_trans_end(file);

C
Chris Mason 已提交
971 972 973 974 975
	trans = btrfs_start_transaction(root, 1);
	if (!trans) {
		ret = -ENOMEM;
		goto out;
	}
976 977

	ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
C
Chris Mason 已提交
978
	if (ret < 0) {
979
		goto out;
C
Chris Mason 已提交
980 981 982 983 984 985 986 987 988 989 990 991 992 993
	}

	/* we've logged all the items and now have a consistent
	 * version of the file in the log.  It is possible that
	 * someone will come in and modify the file, but that's
	 * fine because the log is consistent on disk, and we
	 * have references to all of the file's extents
	 *
	 * It is possible that someone will come in and log the
	 * file again, but that will end up using the synchronization
	 * inside btrfs_sync_log to keep things safe.
	 */
	mutex_unlock(&file->f_dentry->d_inode->i_mutex);

994 995 996 997 998 999
	if (ret > 0) {
		ret = btrfs_commit_transaction(trans, root);
	} else {
		btrfs_sync_log(trans, root);
		ret = btrfs_end_transaction(trans, root);
	}
C
Chris Mason 已提交
1000
	mutex_lock(&file->f_dentry->d_inode->i_mutex);
C
Chris Mason 已提交
1001 1002 1003 1004
out:
	return ret > 0 ? EIO : ret;
}

C
Chris Mason 已提交
1005
static struct vm_operations_struct btrfs_file_vm_ops = {
1006
	.fault		= filemap_fault,
C
Chris Mason 已提交
1007 1008 1009 1010 1011 1012 1013 1014 1015 1016
	.page_mkwrite	= btrfs_page_mkwrite,
};

/*
 * mmap handler: calls file_accessed() on the file and points the new
 * mapping at btrfs_file_vm_ops.  Always returns 0.
 */
static int btrfs_file_mmap(struct file	*filp, struct vm_area_struct *vma)
{
	file_accessed(filp);
	vma->vm_ops = &btrfs_file_vm_ops;

	return 0;
}

C
Chris Mason 已提交
1017 1018 1019
struct file_operations btrfs_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
C
Chris Mason 已提交
1020
	.aio_read       = generic_file_aio_read,
C
Chris Mason 已提交
1021
	.splice_read	= generic_file_splice_read,
C
Chris Mason 已提交
1022
	.write		= btrfs_file_write,
C
Chris Mason 已提交
1023
	.mmap		= btrfs_file_mmap,
C
Chris Mason 已提交
1024
	.open		= generic_file_open,
1025
	.release	= btrfs_release_file,
C
Chris Mason 已提交
1026
	.fsync		= btrfs_sync_file,
C
Christoph Hellwig 已提交
1027
	.unlocked_ioctl	= btrfs_ioctl,
C
Chris Mason 已提交
1028
#ifdef CONFIG_COMPAT
C
Christoph Hellwig 已提交
1029
	.compat_ioctl	= btrfs_ioctl,
C
Chris Mason 已提交
1030 1031
#endif
};