/*
 * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd.
 * Written by Takashi Sato <t-sato@yk.jp.nec.com>
 *            Akira Fujita <a-fujita@rs.jp.nec.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2.1 of the GNU Lesser General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/fs.h>
#include <linux/quotaops.h>
#include <linux/slab.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "ext4_extents.h"

23 24 25 26 27 28 29
/**
 * get_ext_path - Find an extent path for designated logical block number.
 *
 * @inode:	an inode which is searched
 * @lblock:	logical block number to find an extent path
 * @path:	pointer to an extent path pointer (for output)
 *
30
 * ext4_find_extent wrapper. Return 0 on success, or a negative error value
31 32 33 34
 * on failure.
 */
static inline int
get_ext_path(struct inode *inode, ext4_lblk_t lblock,
35
		struct ext4_ext_path **ppath)
36
{
37
	struct ext4_ext_path *path;
38

39
	path = ext4_find_extent(inode, lblock, ppath, EXT4_EX_NOCACHE);
40
	if (IS_ERR(path))
41 42 43 44 45 46 47 48 49
		return PTR_ERR(path);
	if (path[ext_depth(inode)].p_ext == NULL) {
		ext4_ext_drop_refs(path);
		kfree(path);
		*ppath = NULL;
		return -ENODATA;
	}
	*ppath = path;
	return 0;
50
}
51 52

/**
53 54
 * ext4_double_down_write_data_sem - Acquire two inodes' write lock
 *                                   of i_data_sem
55
 *
D
Dmitry Monakhov 已提交
56
 * Acquire write lock of i_data_sem of the two inodes
57
 */
58 59
void
ext4_double_down_write_data_sem(struct inode *first, struct inode *second)
60
{
D
Dmitry Monakhov 已提交
61 62 63 64 65 66
	if (first < second) {
		down_write(&EXT4_I(first)->i_data_sem);
		down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
	} else {
		down_write(&EXT4_I(second)->i_data_sem);
		down_write_nested(&EXT4_I(first)->i_data_sem, SINGLE_DEPTH_NESTING);
67 68 69 70 71

	}
}

/**
72
 * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem
73 74 75
 *
 * @orig_inode:		original inode structure to be released its lock first
 * @donor_inode:	donor inode structure to be released its lock second
76
 * Release write lock of i_data_sem of two inodes (orig and donor).
77
 */
78 79 80
void
ext4_double_up_write_data_sem(struct inode *orig_inode,
			      struct inode *donor_inode)
81 82 83 84 85
{
	up_write(&EXT4_I(orig_inode)->i_data_sem);
	up_write(&EXT4_I(donor_inode)->i_data_sem);
}

86 87 88 89 90 91
/**
 * mext_check_coverage - Check that all extents in range has the same type
 *
 * @inode:		inode in question
 * @from:		block offset of inode
 * @count:		block count to be checked
92
 * @unwritten:		extents expected to be unwritten
93 94 95 96 97 98
 * @err:		pointer to save error value
 *
 * Return 1 if all extents in range has expected type, and zero otherwise.
 */
static int
mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
99
		    int unwritten, int *err)
100 101 102
{
	struct ext4_ext_path *path = NULL;
	struct ext4_extent *ext;
103
	int ret = 0;
104 105 106 107
	ext4_lblk_t last = from + count;
	while (from < last) {
		*err = get_ext_path(inode, from, &path);
		if (*err)
108
			goto out;
109
		ext = path[ext_depth(inode)].p_ext;
110
		if (unwritten != ext4_ext_is_unwritten(ext))
111
			goto out;
112 113 114
		from += ext4_ext_get_actual_len(ext);
		ext4_ext_drop_refs(path);
	}
115 116
	ret = 1;
out:
117 118
	ext4_ext_drop_refs(path);
	kfree(path);
119
	return ret;
120 121
}

122 123 124 125 126
/**
 * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2
 *
 * @inode1:	the inode structure
 * @inode2:	the inode structure
127 128
 * @index1:	page index
 * @index2:	page index
129 130 131 132 133 134
 * @page:	result page vector
 *
 * Grab two locked pages for inode's by inode order
 */
static int
mext_page_double_lock(struct inode *inode1, struct inode *inode2,
135
		      pgoff_t index1, pgoff_t index2, struct page *page[2])
136 137 138 139 140 141 142 143 144
{
	struct address_space *mapping[2];
	unsigned fl = AOP_FLAG_NOFS;

	BUG_ON(!inode1 || !inode2);
	if (inode1 < inode2) {
		mapping[0] = inode1->i_mapping;
		mapping[1] = inode2->i_mapping;
	} else {
145 146 147
		pgoff_t tmp = index1;
		index1 = index2;
		index2 = tmp;
148 149 150 151
		mapping[0] = inode2->i_mapping;
		mapping[1] = inode1->i_mapping;
	}

152
	page[0] = grab_cache_page_write_begin(mapping[0], index1, fl);
153 154 155
	if (!page[0])
		return -ENOMEM;

156
	page[1] = grab_cache_page_write_begin(mapping[1], index2, fl);
157 158 159 160 161
	if (!page[1]) {
		unlock_page(page[0]);
		page_cache_release(page[0]);
		return -ENOMEM;
	}
162 163 164 165 166 167 168
	/*
	 * grab_cache_page_write_begin() may not wait on page's writeback if
	 * BDI not demand that. But it is reasonable to be very conservative
	 * here and explicitly wait on page's writeback
	 */
	wait_on_page_writeback(page[0]);
	wait_on_page_writeback(page[1]);
169 170 171
	if (inode1 > inode2)
		swap(page[0], page[1]);

172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213
	return 0;
}

/**
 * mext_page_mkuptodate - Force page buffers uptodate w/o dropping page's lock
 *
 * @page:	locked page that is not under writeback (both are BUG_ON()ed)
 * @from:	start of the byte range within the page
 * @to:		end (exclusive) of the byte range within the page
 *
 * Buffers that intersect [@from, @to) and are not uptodate are mapped with
 * ext4_get_block() (create == 0). A buffer that is still unmapped afterwards
 * is zero-filled and marked uptodate; the remaining ones are read with
 * bh_submit_read(). The page itself is marked uptodate only when no buffer
 * outside the range was found to be non-uptodate.
 *
 * Return 0 on success (including when the page was already uptodate), or
 * the error returned by ext4_get_block() / bh_submit_read().
 */
static int
mext_page_mkuptodate(struct page *page, unsigned from, unsigned to)
{
	struct inode *inode = page->mapping->host;
	sector_t block;
	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
	unsigned int blocksize, block_start, block_end;
	int i, err,  nr = 0, partial = 0;
	BUG_ON(!PageLocked(page));
	BUG_ON(PageWriteback(page));

	if (PageUptodate(page))
		return 0;

	blocksize = 1 << inode->i_blkbits;
	if (!page_has_buffers(page))
		create_empty_buffers(page, blocksize, 0);

	head = page_buffers(page);
	/* Logical block number of the first buffer in this page */
	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
	/*
	 * Walk the circular buffer list once; "!block_start" lets the very
	 * first iteration through even though bh == head at that point.
	 */
	for (bh = head, block_start = 0; bh != head || !block_start;
	     block++, block_start = block_end, bh = bh->b_this_page) {
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			/* Outside the range: only remember if it is stale */
			if (!buffer_uptodate(bh))
				partial = 1;
			continue;
		}
		if (buffer_uptodate(bh))
			continue;
		if (!buffer_mapped(bh)) {
			err = ext4_get_block(inode, block, bh, 0);
			if (err) {
				SetPageError(page);
				return err;
			}
			if (!buffer_mapped(bh)) {
				/* Still unmapped: treat as zeroes, no read */
				zero_user(page, block_start, blocksize);
				set_buffer_uptodate(bh);
				continue;
			}
		}
		BUG_ON(nr >= MAX_BUF_PER_PAGE);
		arr[nr++] = bh;
	}
	/* No io required */
	if (!nr)
		goto out;

	/* Read every collected buffer that is still not uptodate */
	for (i = 0; i < nr; i++) {
		bh = arr[i];
		if (!bh_uptodate_or_lock(bh)) {
			err = bh_submit_read(bh);
			if (err)
				return err;
		}
	}
out:
	if (!partial)
		SetPageUptodate(page);
	return 0;
}

239 240 241 242 243 244
/**
 * move_extent_per_page - Move extent data per page
 *
 * @o_filp:			file structure of original file
 * @donor_inode:		donor inode
 * @orig_page_offset:		page index on original file
245
 * @donor_page_offset:		page index on donor file
246 247
 * @data_offset_in_page:	block index where data swapping starts
 * @block_len_in_page:		the number of blocks to be swapped
248
 * @unwritten:			orig extent is unwritten or not
249
 * @err:			pointer to save return value
250 251
 *
 * Save the data in original inode blocks and replace original inode extents
252
 * with donor inode extents by calling ext4_swap_extents().
253 254
 * Finally, write out the saved data in new original inode blocks. Return
 * replaced block count.
255 256
 */
static int
257
move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
258 259 260
		     pgoff_t orig_page_offset, pgoff_t donor_page_offset,
		     int data_offset_in_page,
		     int block_len_in_page, int unwritten, int *err)
261
{
A
Al Viro 已提交
262
	struct inode *orig_inode = file_inode(o_filp);
263
	struct page *pagep[2] = {NULL, NULL};
264
	handle_t *handle;
265
	ext4_lblk_t orig_blk_offset, donor_blk_offset;
266
	unsigned long blocksize = orig_inode->i_sb->s_blocksize;
267
	unsigned int tmp_data_size, data_size, replaced_size;
268
	int err2, jblocks, retries = 0;
269
	int replaced_count = 0;
270
	int from = data_offset_in_page << orig_inode->i_blkbits;
271
	int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
272
	struct super_block *sb = orig_inode->i_sb;
273 274 275 276 277

	/*
	 * It needs twice the amount of ordinary journal buffers because
	 * inode and donor_inode may change each different metadata blocks.
	 */
278 279
again:
	*err = 0;
280
	jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
281
	handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks);
282
	if (IS_ERR(handle)) {
283 284
		*err = PTR_ERR(handle);
		return 0;
285 286 287 288 289
	}

	orig_blk_offset = orig_page_offset * blocks_per_page +
		data_offset_in_page;

290 291 292
	donor_blk_offset = donor_page_offset * blocks_per_page +
		data_offset_in_page;

293
	/* Calculate data_size */
294 295 296
	if ((orig_blk_offset + block_len_in_page - 1) ==
	    ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
		/* Replace the last block */
297
		tmp_data_size = orig_inode->i_size & (blocksize - 1);
298
		/*
299
		 * If data_size equal zero, it shows data_size is multiples of
300 301
		 * blocksize. So we set appropriate value.
		 */
302 303
		if (tmp_data_size == 0)
			tmp_data_size = blocksize;
304

305
		data_size = tmp_data_size +
306
			((block_len_in_page - 1) << orig_inode->i_blkbits);
307 308 309 310
	} else
		data_size = block_len_in_page << orig_inode->i_blkbits;

	replaced_size = data_size;
311

312
	*err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset,
313
				     donor_page_offset, pagep);
314
	if (unlikely(*err < 0))
315
		goto stop_journal;
316
	/*
317
	 * If orig extent was unwritten it can become initialized
318 319 320 321 322
	 * at any time after i_data_sem was dropped, in order to
	 * serialize with delalloc we have recheck extent while we
	 * hold page's lock, if it is still the case data copy is not
	 * necessary, just swap data blocks between orig and donor.
	 */
323
	if (unwritten) {
324
		ext4_double_down_write_data_sem(orig_inode, donor_inode);
325 326
		/* If any of extents in range became initialized we have to
		 * fallback to data copying */
327 328
		unwritten = mext_check_coverage(orig_inode, orig_blk_offset,
						block_len_in_page, 1, err);
329 330
		if (*err)
			goto drop_data_sem;
331

332
		unwritten &= mext_check_coverage(donor_inode, donor_blk_offset,
333
						 block_len_in_page, 1, err);
334 335 336
		if (*err)
			goto drop_data_sem;

337
		if (!unwritten) {
338
			ext4_double_up_write_data_sem(orig_inode, donor_inode);
339 340 341 342 343 344 345 346 347
			goto data_copy;
		}
		if ((page_has_private(pagep[0]) &&
		     !try_to_release_page(pagep[0], 0)) ||
		    (page_has_private(pagep[1]) &&
		     !try_to_release_page(pagep[1], 0))) {
			*err = -EBUSY;
			goto drop_data_sem;
		}
348 349 350 351
		replaced_count = ext4_swap_extents(handle, orig_inode,
						   donor_inode, orig_blk_offset,
						   donor_blk_offset,
						   block_len_in_page, 1, err);
352
	drop_data_sem:
353
		ext4_double_up_write_data_sem(orig_inode, donor_inode);
354 355 356
		goto unlock_pages;
	}
data_copy:
357 358 359 360 361 362 363 364 365 366
	*err = mext_page_mkuptodate(pagep[0], from, from + replaced_size);
	if (*err)
		goto unlock_pages;

	/* At this point all buffers in range are uptodate, old mapping layout
	 * is no longer required, try to drop it now. */
	if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) ||
	    (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) {
		*err = -EBUSY;
		goto unlock_pages;
367
	}
368
	ext4_double_down_write_data_sem(orig_inode, donor_inode);
369 370 371
	replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode,
					       orig_blk_offset, donor_blk_offset,
					   block_len_in_page, 1, err);
372
	ext4_double_up_write_data_sem(orig_inode, donor_inode);
373
	if (*err) {
374 375 376 377
		if (replaced_count) {
			block_len_in_page = replaced_count;
			replaced_size =
				block_len_in_page << orig_inode->i_blkbits;
378
		} else
379
			goto unlock_pages;
380
	}
381 382
	/* Perform all necessary steps similar write_begin()/write_end()
	 * but keeping in mind that i_size will not change */
383
	*err = __block_write_begin(pagep[0], from, replaced_size,
384 385 386
				   ext4_get_block);
	if (!*err)
		*err = block_commit_write(pagep[0], from, from + replaced_size);
387

388 389 390 391 392 393 394 395 396 397 398 399 400
	if (unlikely(*err < 0))
		goto repair_branches;

	/* Even in case of data=writeback it is reasonable to pin
	 * inode to transaction, to prevent unexpected data loss */
	*err = ext4_jbd2_file_inode(handle, orig_inode);

unlock_pages:
	unlock_page(pagep[0]);
	page_cache_release(pagep[0]);
	unlock_page(pagep[1]);
	page_cache_release(pagep[1]);
stop_journal:
401
	ext4_journal_stop(handle);
402 403 404
	if (*err == -ENOSPC &&
	    ext4_should_retry_alloc(sb, &retries))
		goto again;
405 406
	/* Buffer was busy because probably is pinned to journal transaction,
	 * force transaction commit may help to free it. */
407 408
	if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal &&
	    jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal))
409
		goto again;
410
	return replaced_count;
411 412 413 414 415 416 417

repair_branches:
	/*
	 * This should never ever happen!
	 * Extents are swapped already, but we are not able to copy data.
	 * Try to swap extents to it's original places
	 */
418
	ext4_double_down_write_data_sem(orig_inode, donor_inode);
419 420 421
	replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode,
					       orig_blk_offset, donor_blk_offset,
					   block_len_in_page, 0, &err2);
422
	ext4_double_up_write_data_sem(orig_inode, donor_inode);
423 424 425 426 427 428 429 430
	if (replaced_count != block_len_in_page) {
		EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
				       "Unable to copy data block,"
				       " data will be lost.");
		*err = -EIO;
	}
	replaced_count = 0;
	goto unlock_pages;
431 432 433
}

/**
 * mext_check_arguments - Check whether move extent can be done
 *
 * @orig_inode:		original inode
 * @donor_inode:	donor inode
 * @orig_start:		logical start offset in block for orig
 * @donor_start:	logical start offset in block for donor
 * @len:		the number of blocks to be moved; may be shrunk here so
 *			that the range does not run past the end of either
 *			file
 *
 * Check the arguments of ext4_move_extents() whether the files can be
 * exchanged with each other.
 * Return 0 on success, or a negative error value on failure.
 */
static int
mext_check_arguments(struct inode *orig_inode,
		     struct inode *donor_inode, __u64 orig_start,
		     __u64 donor_start, __u64 *len)
{
	__u64 orig_eof, donor_eof;
	unsigned int blkbits = orig_inode->i_blkbits;
	unsigned int blocksize = 1 << blkbits;

	/* File sizes rounded up to whole blocks */
	orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits;
	donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits;

	if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
		ext4_debug("ext4 move extent: suid or sgid is set"
			   " to donor file [ino:orig %lu, donor %lu]\n",
			   orig_inode->i_ino, donor_inode->i_ino);
		return -EINVAL;
	}

	if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
		return -EPERM;

	/* Ext4 move extent does not support swapfile */
	if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
		ext4_debug("ext4 move extent: The argument files should "
			"not be swapfile [ino:orig %lu, donor %lu]\n",
			orig_inode->i_ino, donor_inode->i_ino);
		return -EBUSY;
	}

	/* Ext4 move extent supports only extent based file */
	if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
		ext4_debug("ext4 move extent: orig file is not extents "
			"based file [ino:orig %lu]\n", orig_inode->i_ino);
		return -EOPNOTSUPP;
	} else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
		ext4_debug("ext4 move extent: donor file is not extents "
			"based file [ino:donor %lu]\n", donor_inode->i_ino);
		return -EOPNOTSUPP;
	}

	if ((!orig_inode->i_size) || (!donor_inode->i_size)) {
		ext4_debug("ext4 move extent: File size is 0 byte\n");
		return -EINVAL;
	}

	/* Start offset should be same */
	if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) !=
	    (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) {
		ext4_debug("ext4 move extent: orig and donor's start "
			"offset are not alligned [ino:orig %lu, donor %lu]\n",
			orig_inode->i_ino, donor_inode->i_ino);
		return -EINVAL;
	}

	if ((orig_start >= EXT_MAX_BLOCKS) ||
	    (donor_start >= EXT_MAX_BLOCKS) ||
	    (*len > EXT_MAX_BLOCKS) ||
	    (donor_start + *len >= EXT_MAX_BLOCKS) ||
	    (orig_start + *len >= EXT_MAX_BLOCKS))  {
		ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
			"[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS,
			orig_inode->i_ino, donor_inode->i_ino);
		return -EINVAL;
	}
	/*
	 * Trim the request to the end of each file. A start block at or
	 * beyond EOF leaves nothing to move: force *len to 0 instead of
	 * computing "eof - start", which would wrap around as an unsigned
	 * value and yield a huge length. The zero length is rejected below.
	 */
	if (orig_eof <= orig_start)
		*len = 0;
	else if (orig_eof < orig_start + *len - 1)
		*len = orig_eof - orig_start;
	if (donor_eof <= donor_start)
		*len = 0;
	else if (donor_eof < donor_start + *len - 1)
		*len = donor_eof - donor_start;
	if (!*len) {
		ext4_debug("ext4 move extent: len should not be 0 "
			"[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
			donor_inode->i_ino);
		return -EINVAL;
	}

	return 0;
}

/**
 * ext4_move_extents - Exchange the specified range of a file
 *
 * @o_filp:		file structure of the original file
 * @d_filp:		file structure of the donor file
531 532
 * @orig_blk:		start offset in block for orig
 * @donor_blk:		start offset in block for donor
533 534 535 536 537 538 539 540
 * @len:		the number of blocks to be moved
 * @moved_len:		moved block length
 *
 * This function returns 0 and moved block length is set in moved_len
 * if succeed, otherwise returns error value.
 *
 */
int
541 542
ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
		  __u64 donor_blk, __u64 len, __u64 *moved_len)
543
{
A
Al Viro 已提交
544 545
	struct inode *orig_inode = file_inode(o_filp);
	struct inode *donor_inode = file_inode(d_filp);
546
	struct ext4_ext_path *path = NULL;
547
	int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
548 549 550
	ext4_lblk_t o_end, o_start = orig_blk;
	ext4_lblk_t d_start = donor_blk;
	int ret;
551

D
Dmitry Monakhov 已提交
552 553 554 555 556 557 558 559 560
	if (orig_inode->i_sb != donor_inode->i_sb) {
		ext4_debug("ext4 move extent: The argument files "
			"should be in same FS [ino:orig %lu, donor %lu]\n",
			orig_inode->i_ino, donor_inode->i_ino);
		return -EINVAL;
	}

	/* orig and donor should be different inodes */
	if (orig_inode == donor_inode) {
561
		ext4_debug("ext4 move extent: The argument files should not "
D
Dmitry Monakhov 已提交
562
			"be same inode [ino:orig %lu, donor %lu]\n",
563 564 565 566
			orig_inode->i_ino, donor_inode->i_ino);
		return -EINVAL;
	}

567 568 569 570 571 572 573
	/* Regular file check */
	if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) {
		ext4_debug("ext4 move extent: The argument files should be "
			"regular file [ino:orig %lu, donor %lu]\n",
			orig_inode->i_ino, donor_inode->i_ino);
		return -EINVAL;
	}
574 575 576 577 578 579
	/* TODO: This is non obvious task to swap blocks for inodes with full
	   jornaling enabled */
	if (ext4_should_journal_data(orig_inode) ||
	    ext4_should_journal_data(donor_inode)) {
		return -EINVAL;
	}
580
	/* Protect orig and donor inodes against a truncate */
581
	lock_two_nondirectories(orig_inode, donor_inode);
582

583 584 585 586 587 588
	/* Wait for all existing dio workers */
	ext4_inode_block_unlocked_dio(orig_inode);
	ext4_inode_block_unlocked_dio(donor_inode);
	inode_dio_wait(orig_inode);
	inode_dio_wait(donor_inode);

589
	/* Protect extent tree against block allocations via delalloc */
590
	ext4_double_down_write_data_sem(orig_inode, donor_inode);
591
	/* Check the filesystem environment whether move_extent can be done */
592 593
	ret = mext_check_arguments(orig_inode, donor_inode, orig_blk,
				    donor_blk, &len);
D
Dmitry Monakhov 已提交
594
	if (ret)
595
		goto out;
596
	o_end = o_start + len;
597

598 599 600 601 602 603
	while (o_start < o_end) {
		struct ext4_extent *ex;
		ext4_lblk_t cur_blk, next_blk;
		pgoff_t orig_page_index, donor_page_index;
		int offset_in_page;
		int unwritten, cur_len;
604

605 606
		ret = get_ext_path(orig_inode, o_start, &path);
		if (ret)
607
			goto out;
608 609 610 611 612 613 614 615 616 617 618 619 620
		ex = path[path->p_depth].p_ext;
		next_blk = ext4_ext_next_allocated_block(path);
		cur_blk = le32_to_cpu(ex->ee_block);
		cur_len = ext4_ext_get_actual_len(ex);
		/* Check hole before the start pos */
		if (cur_blk + cur_len - 1 < o_start) {
			if (next_blk == EXT_MAX_BLOCKS) {
				o_start = o_end;
				ret = -ENODATA;
				goto out;
			}
			d_start += next_blk - o_start;
			o_start = next_blk;
621
			continue;
622 623 624 625 626 627 628 629 630 631
		/* Check hole after the start pos */
		} else if (cur_blk > o_start) {
			/* Skip hole */
			d_start += cur_blk - o_start;
			o_start = cur_blk;
			/* Extent inside requested range ?*/
			if (cur_blk >= o_end)
				goto out;
		} else { /* in_range(o_start, o_blk, o_len) */
			cur_len += cur_blk - o_start;
632
		}
633 634 635 636 637 638 639 640 641 642 643
		unwritten = ext4_ext_is_unwritten(ex);
		if (o_end - o_start < cur_len)
			cur_len = o_end - o_start;

		orig_page_index = o_start >> (PAGE_CACHE_SHIFT -
					       orig_inode->i_blkbits);
		donor_page_index = d_start >> (PAGE_CACHE_SHIFT -
					       donor_inode->i_blkbits);
		offset_in_page = o_start % blocks_per_page;
		if (cur_len > blocks_per_page- offset_in_page)
			cur_len = blocks_per_page - offset_in_page;
644 645 646 647 648 649 650
		/*
		 * Up semaphore to avoid following problems:
		 * a. transaction deadlock among ext4_journal_start,
		 *    ->write_begin via pagefault, and jbd2_journal_commit
		 * b. racing with ->readpage, ->write_begin, and ext4_get_block
		 *    in move_extent_per_page
		 */
651
		ext4_double_up_write_data_sem(orig_inode, donor_inode);
652 653 654 655 656
		/* Swap original branches with new branches */
		move_extent_per_page(o_filp, donor_inode,
				     orig_page_index, donor_page_index,
				     offset_in_page, cur_len,
				     unwritten, &ret);
657
		ext4_double_down_write_data_sem(orig_inode, donor_inode);
D
Dmitry Monakhov 已提交
658
		if (ret < 0)
659
			break;
660 661
		o_start += cur_len;
		d_start += cur_len;
662
	}
663 664 665 666
	*moved_len = o_start - orig_blk;
	if (*moved_len > len)
		*moved_len = len;

667
out:
668 669 670 671 672
	if (*moved_len) {
		ext4_discard_preallocations(orig_inode);
		ext4_discard_preallocations(donor_inode);
	}

673 674
	ext4_ext_drop_refs(path);
	kfree(path);
675
	ext4_double_up_write_data_sem(orig_inode, donor_inode);
676 677
	ext4_inode_resume_unlocked_dio(orig_inode);
	ext4_inode_resume_unlocked_dio(donor_inode);
678
	unlock_two_nondirectories(orig_inode, donor_inode);
679

D
Dmitry Monakhov 已提交
680
	return ret;
681
}